### **Import the necessary libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,precision_score,recall_score,f1_score

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

import scipy.stats as stats

from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

### **Import and clean survey data**

In [2]:
#import the data
df2_train = pd.read_csv("Surveydata_train.csv")
df2_test = pd.read_csv("Surveydata_test.csv")

In [3]:
#this one cell cleans all the data for the survey train/test
def clean_survey(df):
    #create a list of ratings columns. Exclude ID and Overall experience
    #remove seat class - it is a classifier
    #reomve Platform location for separate treatment
    cols_rating = df.columns[2:]
    cols_rating = cols_rating.drop(['Seat_Class','Platform_Location'])
    #fill NA in both train and test
    for i in cols_rating:
        df[i].fillna('no answer', inplace = True)
    #***NOTE - I moved no answer to the middle value instead of the low value.  may want to test both.
    #***NOTE - What happens if we increase the weights on thses numbers doubled? squared? etc
    #set values to replace
    names = [
        'Excellent',
        'Good',
        'Acceptable',
        'no answer',
        'Needs Improvement',
        'Poor',
        'Extremely Poor'
    ]
    #set values to replace for Platform_Location
    names_plat = [
        'Very Convenient',
        'Convenient',
        'Manageable',
        'no answer',
        'Needs Improvement', 
        'Inconvenient',
        'Very Inconvenient',
    ]
    #numbers that will replace inputs
    numbers = [6,5,4,3,2,1,0]
    #replace the values in the dataframes
    for i in df[cols_rating]:
        df[i].replace(to_replace = names, value = numbers, inplace=True)
    #Also replace 'Platform_Location' column with numbers
    df['Platform_Location'].replace(to_replace = names_plat, value = numbers, inplace=True)
    #Set seat class as a category
    df['Seat_Class'].astype('category')
    
    return df

In [4]:
survey_train = clean_survey(df2_train)
survey_test = clean_survey(df2_test)

In [6]:
display(survey_train.head())
display(survey_train.info())
print('-'*40)
display(survey_test.head())
display(survey_test.info())

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,0,2,Green Car,6,6,6.0,5,2,4,2,2,4,2,5,2,1
1,98800002,0,1,Ordinary,6,1,2.0,5,1,5,5,6,2,1,2,5,5
2,98800003,1,2,Green Car,2,2,2.0,2,5,6,6,6,6,6,5,6,6
3,98800004,0,4,Ordinary,2,3,2.0,4,2,4,4,4,4,4,5,4,4
4,98800005,1,4,Ordinary,4,4,4.0,2,5,6,5,5,5,5,5,5,5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Overall_Experience       94379 non-null  int64  
 2   Seat_Comfort             94379 non-null  int64  
 3   Seat_Class               94379 non-null  object 
 4   Arrival_Time_Convenient  94379 non-null  int64  
 5   Catering                 94379 non-null  int64  
 6   Platform_Location        94349 non-null  float64
 7   Onboard_Wifi_Service     94379 non-null  int64  
 8   Onboard_Entertainment    94379 non-null  int64  
 9   Online_Support           94379 non-null  int64  
 10  Ease_of_Online_Booking   94379 non-null  int64  
 11  Onboard_Service          94379 non-null  int64  
 12  Legroom                  94379 non-null  int64  
 13  Baggage_Handling         94379 non-null  int64  
 14  CheckIn_Service       

None

----------------------------------------


Unnamed: 0,ID,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,99900001,Acceptable,Green Car,4,4,4.0,2,6,5,6,6,6,6,5,6,1
1,99900002,Extremely Poor,Ordinary,5,1,4.0,4,1,4,4,6,4,5,4,6,4
2,99900003,Excellent,Ordinary,6,6,6.0,6,6,6,2,2,2,2,5,2,6
3,99900004,Acceptable,Green Car,6,4,6.0,1,4,6,1,4,2,6,6,6,1
4,99900005,Excellent,Ordinary,0,6,2.0,6,6,6,6,3,4,6,6,6,6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35602 entries, 0 to 35601
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       35602 non-null  int64  
 1   Seat_Comfort             35580 non-null  object 
 2   Seat_Class               35602 non-null  object 
 3   Arrival_Time_Convenient  35602 non-null  int64  
 4   Catering                 35602 non-null  int64  
 5   Platform_Location        35590 non-null  float64
 6   Onboard_Wifi_Service     35602 non-null  int64  
 7   Onboard_Entertainment    35602 non-null  int64  
 8   Online_Support           35602 non-null  int64  
 9   Ease_of_Online_Booking   35602 non-null  int64  
 10  Onboard_Service          35602 non-null  int64  
 11  Legroom                  35602 non-null  int64  
 12  Baggage_Handling         35602 non-null  int64  
 13  CheckIn_Service          35602 non-null  int64  
 14  Cleanliness           

None

### **Import and clean travel data**

In [7]:
# Importing data
df1_train = pd.read_csv("Traveldata_train.csv")
df1_test = pd.read_csv("Traveldata_test.csv")

In [8]:
def clean_travel(df):
    # Separating categories and numerics 
    cat_list = df.select_dtypes(['object']).columns.tolist()
    num_list = df.select_dtypes(['number']).columns.tolist()

    # Replacing NaN values in the data
    #10% of values for cust type and travel type are unknown - this could be significant
    #df.loc[df['Customer_Type'].isnull(),'Customer_Type'] =  df['Customer_Type'].mode()[0]
    df.loc[df['Customer_Type'].isnull(),'Customer_Type'] =  "Unknown"
    #df.loc[df['Type_Travel'].isnull(),'Type_Travel'] =  df['Type_Travel'].mode()[0]
    df.loc[df['Type_Travel'].isnull(),'Type_Travel'] =  "Unknown"

    df.loc[df['Age'].isnull(),'Age'] =  df['Age'].mean()
    
    df.loc[df['Departure_Delay_in_Mins'].isnull(),'Departure_Delay_in_Mins'] = df['Departure_Delay_in_Mins'].median() 
    df.loc[df['Arrival_Delay_in_Mins'].isnull(),'Arrival_Delay_in_Mins'] = df['Arrival_Delay_in_Mins'].median() 

    df.loc[df['Gender'].isnull(),'Gender'] = df['Gender'].mode() 
    #dropping Gender Na would lose too much info however less than 0.1% of samples are unknown.  we will just use mode
    
    for col in cat_list:
        df[col] = df[col].astype('category')
    
    return df

In [9]:
travel_train = clean_travel(df1_train)
travel_test = clean_travel(df1_test)

In [10]:
display(travel_train.head())
display(travel_train.info())
print('-'*40)
display(travel_test.head())
display(travel_test.info())

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
0,98800001,Female,Loyal Customer,52.0,Unknown,Business,272,0.0,5.0
1,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0
2,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0
3,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0
4,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   ID                       94379 non-null  int64   
 1   Gender                   94302 non-null  category
 2   Customer_Type            94379 non-null  category
 3   Age                      94379 non-null  float64 
 4   Type_Travel              94379 non-null  category
 5   Travel_Class             94379 non-null  category
 6   Travel_Distance          94379 non-null  int64   
 7   Departure_Delay_in_Mins  94379 non-null  float64 
 8   Arrival_Delay_in_Mins    94379 non-null  float64 
dtypes: category(4), float64(3), int64(2)
memory usage: 4.0 MB


None

----------------------------------------


Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
0,99900001,Female,Unknown,36.0,Business Travel,Business,532,0.0,0.0
1,99900002,Female,Disloyal Customer,21.0,Business Travel,Business,1425,9.0,28.0
2,99900003,Male,Loyal Customer,60.0,Business Travel,Business,2832,0.0,0.0
3,99900004,Female,Loyal Customer,29.0,Personal Travel,Eco,1352,0.0,0.0
4,99900005,Male,Disloyal Customer,18.0,Business Travel,Business,1610,17.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35602 entries, 0 to 35601
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   ID                       35602 non-null  int64   
 1   Gender                   35572 non-null  category
 2   Customer_Type            35602 non-null  category
 3   Age                      35602 non-null  float64 
 4   Type_Travel              35602 non-null  category
 5   Travel_Class             35602 non-null  category
 6   Travel_Distance          35602 non-null  int64   
 7   Departure_Delay_in_Mins  35602 non-null  float64 
 8   Arrival_Delay_in_Mins    35602 non-null  float64 
dtypes: category(4), float64(3), int64(2)
memory usage: 1.5 MB


None

### **Multi variable analysis**

In [None]:
'''
******NOTE*******
Couldn't sleep...
I tried to condence all of Jonathan's code into a function.
Thses things worked funny
had to set catagory columns equil to catagory columns for it to take
cannot get the gender null values to take the mode of gender (it is Female, thinking of hard coding it in)
please review all and then see what you make of it all.

'''