In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
train_df = pd.read_csv("test.csv", index_col=0)
train_df.drop("id", axis=1, inplace=True)
test_df = pd.read_csv("train.csv", index_col=0)
test_df.drop("id", axis=1, inplace=True)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [5]:
#Encoding the satisfaction manually to 0 for a neutral or dissatisfied customer and 1 for a satisfied customer.
train_df["satisfaction"] = train_df["satisfaction"].map(
    {"satisfied": 0, "neutral or dissatisfied": 1}
)


In [6]:
#Creating the training dataset
y_train = train_df["satisfaction"]
X_train = train_df.drop("satisfaction", axis=1)

In [7]:
#Creating the test datset
test_df["satisfaction"] = test_df["satisfaction"].map(
    {"satisfied": 0, "neutral or dissatisfied": 1}
)

y_test = train_df["satisfaction"]
X_test = train_df.drop("satisfaction", axis=1)

In [14]:
# Using a sklearn pipeline
nomimal_variables = ["Gender", "Type of Travel", "Class", "Customer Type"]
nominal_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("nom_encode", nominal_encoder, nomimal_variables),
    ]
)
model_list = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    XGBClassifier()
]
for model in model_list:
    model_names = ["Decision Tree","Random Forest","Logitic Regression","KNN","XGB"]
    clf = Pipeline(
        steps=[
            ("prep", preprocessor),
            ("model", model),
        ]
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{model}:")
    print(classification_report(y_pred, y_test))


DecisionTreeClassifier():
              precision    recall  f1-score   support

           0       0.69      0.78      0.73     10106
           1       0.85      0.78      0.81     15870

    accuracy                           0.78     25976
   macro avg       0.77      0.78      0.77     25976
weighted avg       0.79      0.78      0.78     25976

RandomForestClassifier():
              precision    recall  f1-score   support

           0       0.69      0.78      0.73     10106
           1       0.85      0.78      0.81     15870

    accuracy                           0.78     25976
   macro avg       0.77      0.78      0.77     25976
weighted avg       0.79      0.78      0.78     25976

LogisticRegression():
              precision    recall  f1-score   support

           0       0.79      0.72      0.76     12464
           1       0.76      0.82      0.79     13512

    accuracy                           0.78     25976
   macro avg       0.78      0.77      0.77     25976
