In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [12]:
train_df = pd.read_csv("test.csv", index_col=0)
train_df.drop("id", axis=1, inplace=True)
test_df = pd.read_csv("train.csv", index_col=0)
test_df.drop("id", axis=1, inplace=True)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [14]:
#Encoding the satisfaction manually to 0 for a neutral or dissatisfied customer and 1 for a satisfied customer.
train_df["satisfaction"] = train_df["satisfaction"].map(
    {"satisfied": 0, "neutral or dissatisfied": 1}
)


In [15]:
#Creating the training dataset
y_train = train_df["satisfaction"]
X_train = train_df.drop("satisfaction", axis=1)

In [16]:
#Creating the test datset
test_df["satisfaction"] = test_df["satisfaction"].map(
    {"satisfied": 0, "neutral or dissatisfied": 1}
)

y_test = train_df["satisfaction"]
X_test = train_df.drop("satisfaction", axis=1)

In [17]:
# Using a sklearn pipeline
nomimal_variables = ["Gender", "Type of Travel", "Class", "Customer Type"]
nominal_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("nom_encode", nominal_encoder, nomimal_variables),
    ]
)
model_list = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
]
for model in model_list:
    model_names = ["Decision Tree","Random Forest","Logitic Regression","KNN"]
    clf = Pipeline(
        steps=[
            ("prep", preprocessor),
            ("model", model),
        ]
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    for i in model_names:
        print(f"{i}:")
        print(classification_report(y_pred, y_test))


Decision Tree:
              precision    recall  f1-score   support

           0       0.69      0.78      0.73     10106
           1       0.85      0.78      0.81     15870

    accuracy                           0.78     25976
   macro avg       0.77      0.78      0.77     25976
weighted avg       0.79      0.78      0.78     25976

Random Forest:
              precision    recall  f1-score   support

           0       0.69      0.78      0.73     10106
           1       0.85      0.78      0.81     15870

    accuracy                           0.78     25976
   macro avg       0.77      0.78      0.77     25976
weighted avg       0.79      0.78      0.78     25976

Logitic Regression:
              precision    recall  f1-score   support

           0       0.69      0.78      0.73     10106
           1       0.85      0.78      0.81     15870

    accuracy                           0.78     25976
   macro avg       0.77      0.78      0.77     25976
weighted avg       0.79 

In [19]:
random_state = 42
classifier = [
    DecisionTreeClassifier(random_state=random_state),
    RandomForestClassifier(random_state=random_state),
    LogisticRegression(random_state=random_state),
    KNeighborsClassifier(),
]
dt_param_grid = {"min_samples_split": range(10, 500, 20), "max_depth": range(1, 20, 2)}
rf_param_grid = {
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [100, 300],
    "criterion": ["gini"],
}

logreg_param_grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}
knn_param_grid = {
    "n_neighbors": np.linspace(1, 19, 10, dtype=int).tolist(),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}
classifier_param = [dt_param_grid, rf_param_grid, logreg_param_grid, knn_param_grid]
