# Install dependencies
```shell
sudo apt update
sudo apt upgrade
sudo apt install python3 python3-pip

pip install numpy
pip install pandas
pip install sklearn
pip install xgboost
```

# Init Libraries and Dataset

In [162]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

testSet = pd.read_csv('test.csv')
trainSet = pd.read_csv('train.csv')

# Prelim Transformations
This section aims to convert all columns with multiple string values into multiple columns with a single string value
This section then aims to convert the string values of the solution column into proper numbers (1 or 0)

## Break text columns into binary columns

In [163]:
# For every text column, break into multiple binary columns
testSet_transformed = testSet
for col in testSet.axes[1]:
    if testSet[col].dtype == 'object' and col != 'satisfaction':
        testSet_transformed = pd.get_dummies(testSet_transformed, prefix=[col], columns=[col])

trainSet_transformed = trainSet
for col in trainSet.axes[1]:
    if trainSet[col].dtype == 'object' and col != 'satisfaction':
        trainSet_transformed = pd.get_dummies(trainSet_transformed, prefix=[col], columns=[col])

## drop the not necessary columns

In [164]:
testSet_transformed = testSet_transformed.drop("Unnamed: 0", axis=1)
testSet_transformed = testSet_transformed.drop("id", axis=1)

trainSet_transformed = trainSet_transformed.drop("Unnamed: 0", axis=1)
trainSet_transformed = trainSet_transformed.drop("id", axis=1)

# testSet_transformed

## Transform Satisfaction axes
- move satisfaction to last axes
- convert the strings of satisfied or dissatisfied into binary 1 or 0 respectfully

In [165]:
# Reorganize the data such that the solution axes is last
solutionAxes = ['satisfaction']
testSet_transformed = testSet_transformed[
    [col for col in testSet_transformed if col not in solutionAxes] + 
    [col for col in solutionAxes if col in testSet_transformed]
]
trainSet_transformed = trainSet_transformed[
    [col for col in trainSet_transformed if col not in solutionAxes] + 
    [col for col in solutionAxes if col in trainSet_transformed]
]

# Replace solutionAxes values of 'satisfied' or 'neutral or dissatisfied' with 1 or 0 respectively
testSet_transformed['satisfaction'] = testSet_transformed['satisfaction'].replace(['satisfied', 'neutral or dissatisfied'], [1, 0])
trainSet_transformed['satisfaction'] = trainSet_transformed['satisfaction'].replace(['satisfied', 'neutral or dissatisfied'], [1, 0])

## Standardize the missing data

In [166]:
# replace all nan values with 0's
trainSet_nan_col = trainSet_transformed.isna().any()
for i in trainSet_nan_col.keys():
    if trainSet_nan_col[i] == True: 
        # print("Replacing ", i, " NaN Values with ", testSet_transformed[i].mean())
        testSet_transformed[i] = testSet_transformed[i].replace(np.nan, testSet_transformed[i].mean())

testSet_nan_col = trainSet_transformed.isna().any()
for i in testSet_nan_col.keys():
    if testSet_nan_col[i] == True: 
        # print("Replacing ", i, " NaN Values with ", testSet_transformed[i].mean())
        trainSet_transformed[i] = trainSet_transformed[i].replace(np.nan, trainSet_transformed[i].mean())

trainSet_transformed

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Gender_Female,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction
0,13,460,3,4,3,1,5,3,5,5,...,0,1,1,0,0,1,0,0,1,0
1,25,235,3,2,3,3,1,3,1,1,...,0,1,0,1,1,0,1,0,0,0
2,26,1142,2,2,2,2,5,5,5,5,...,1,0,1,0,1,0,1,0,0,1
3,25,562,2,5,5,5,2,2,2,2,...,1,0,1,0,1,0,1,0,0,0
4,61,214,3,3,3,3,4,5,5,3,...,0,1,1,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,23,192,2,1,2,3,2,2,2,2,...,1,0,0,1,1,0,0,1,0,0
103900,49,2347,4,4,4,4,2,4,5,5,...,0,1,1,0,1,0,1,0,0,1
103901,30,1995,1,1,1,3,4,1,5,4,...,0,1,0,1,1,0,1,0,0,0
103902,22,1000,1,1,1,5,1,1,1,1,...,1,0,0,1,1,0,0,1,0,0


In [167]:
# trainSet_transformed.plot.scatter(x="satisfaction", y="Age", s = 100)
# trainSet_transformed.plot.scatter(x="satisfaction", y="Flight Distance",)

# Split the data into test and train sets

In [168]:
# Load the test and train data
testData = testSet_transformed[testSet_transformed.axes[1][2:-1]]
testSolution = testSet_transformed[testSet_transformed.axes[1][-1]]
trainData = trainSet_transformed[trainSet_transformed.axes[1][2:-1]]
trainSolution = trainSet_transformed[trainSet_transformed.axes[1][-1]]

# AI's

In [169]:
aiScores = []

## KNN Classification

In [170]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(trainData, trainSolution)

testPredict = knn.predict(testData)
accuracy = accuracy_score(testSolution,testPredict)
aiScores.append({"Name":"KNN", "Score":accuracy})


In [171]:
## Decision Tree
my_DecisionTree = DecisionTreeClassifier(random_state=3)
my_DecisionTree.fit(trainData, trainSolution)
decisionTreePredict = my_DecisionTree.predict(testData)
decisionTreeAccuracy = accuracy_score(testSolution, decisionTreePredict)
aiScores.append({"Name":"Decision Tree", "Score":decisionTreeAccuracy})

In [172]:
## Random Forest
my_RandomForest = RandomForestClassifier(n_estimators=32, bootstrap=True, random_state=1)
my_RandomForest.fit(trainData, trainSolution)

randomForestPredict = my_RandomForest.predict(testData)
randomForestAccuracy = accuracy_score(testSolution, randomForestPredict)
aiScores.append({"Name":"Random Forest", "Score":randomForestAccuracy })

0.9603480135509701


In [173]:
## XGBoost
xgb = XGBClassifier()
xgb.fit(trainData, trainSolution)
xgbPredict = xgb.predict(testData)
xgbAccuracy = accuracy_score(testSolution, xgbPredict)
aiScores.append({"Name":"XGBoost", "Score":xgbAccuracy })

In [176]:
accuracyScores = pd.json_normalize(aiScores)
print("Accuracy Scores")
accuracyScores

Accuracy Scores


Unnamed: 0,Name,Score
0,KNN,0.899908
1,Decision Tree,0.944641
2,Random Forest,0.960348
3,XGBoost,0.961156
