### **Import the necessary libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,precision_score,recall_score,f1_score

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

import scipy.stats as stats

from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

In [2]:
#import the data
df2_train = pd.read_csv("Surveydata_train.csv")
df2_test = pd.read_csv("Surveydata_test.csv")

In [5]:
#this one cell cleans all the data for the survey train/test
def clean_survey(df):
    #create a list of ratings columns. Exclude ID and Overall experience
    #remove seat class - it is a classifier
    #reomve Platform location for separate treatment
    cols_rating = df.columns[2:]
    cols_rating = cols_rating.drop(['Seat_Class','Platform_Location'])
    #fill NA in both train and test
    for i in cols_rating:
        df[i].fillna('no answer', inplace = True)
    #set values to replace
    names = [
        'Excellent',
        'Good',
        'Acceptable',
        'Needs Improvement',
        'Poor',
        'Extremely Poor',
        'no answer'
    ]
    #set values to replace for Platform_Location
    names_plat = [
        'Very Convenient',
        'Convenient',
        'Manageable',
        'Needs Improvement', 
        'Inconvenient',
        'Very Inconvenient',
        'no answer',
    ]
    #numbers that will replace inputs
    numbers = [6,5,4,3,2,1,0]
    #replace the values in the dataframes
    for i in df[cols_rating]:
        df[i].replace(to_replace = names, value = numbers, inplace=True)
    #Also replace 'Platform_Location' column with numbers
    df['Platform_Location'].replace(to_replace = names_plat, value = numbers, inplace=True)
    #Set seat class as a category
    df['Seat_Class'].astype('category')
    
    return df

In [6]:
survey_train = clean_survey(df2_train)
survey_test = clean_survey(df2_test)

In [7]:
survey_train.head()

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,0,3,Green Car,6,6,6.0,5,3,4,3,3,4,3,5,3,2
1,98800002,0,2,Ordinary,6,2,3.0,5,2,5,5,6,3,2,3,5,5
2,98800003,1,3,Green Car,3,3,3.0,3,5,6,6,6,6,6,5,6,6
3,98800004,0,4,Ordinary,3,0,3.0,4,3,4,4,4,4,4,5,4,4
4,98800005,1,4,Ordinary,4,4,4.0,3,5,6,5,5,5,5,5,5,5


In [8]:
survey_test.head()

Unnamed: 0,ID,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,99900001,Acceptable,Green Car,4,4,4.0,3,6,5,6,6,6,6,5,6,2
1,99900002,Extremely Poor,Ordinary,5,2,4.0,4,2,4,4,6,4,5,4,6,4
2,99900003,Excellent,Ordinary,6,6,6.0,6,6,6,3,3,3,3,5,3,6
3,99900004,Acceptable,Green Car,6,4,6.0,2,4,6,2,4,3,6,6,6,2
4,99900005,Excellent,Ordinary,1,6,3.0,6,6,6,6,0,4,6,6,6,6


In [9]:
survey_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Overall_Experience       94379 non-null  int64  
 2   Seat_Comfort             94379 non-null  int64  
 3   Seat_Class               94379 non-null  object 
 4   Arrival_Time_Convenient  94379 non-null  int64  
 5   Catering                 94379 non-null  int64  
 6   Platform_Location        94349 non-null  float64
 7   Onboard_Wifi_Service     94379 non-null  int64  
 8   Onboard_Entertainment    94379 non-null  int64  
 9   Online_Support           94379 non-null  int64  
 10  Ease_of_Online_Booking   94379 non-null  int64  
 11  Onboard_Service          94379 non-null  int64  
 12  Legroom                  94379 non-null  int64  
 13  Baggage_Handling         94379 non-null  int64  
 14  CheckIn_Service       

In [10]:
survey_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35602 entries, 0 to 35601
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       35602 non-null  int64  
 1   Seat_Comfort             35580 non-null  object 
 2   Seat_Class               35602 non-null  object 
 3   Arrival_Time_Convenient  35602 non-null  int64  
 4   Catering                 35602 non-null  int64  
 5   Platform_Location        35590 non-null  float64
 6   Onboard_Wifi_Service     35602 non-null  int64  
 7   Onboard_Entertainment    35602 non-null  int64  
 8   Online_Support           35602 non-null  int64  
 9   Ease_of_Online_Booking   35602 non-null  int64  
 10  Onboard_Service          35602 non-null  int64  
 11  Legroom                  35602 non-null  int64  
 12  Baggage_Handling         35602 non-null  int64  
 13  CheckIn_Service          35602 non-null  int64  
 14  Cleanliness           