In [1]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the dataset
# The CSV location can be overridden through the TRAIN_CSV_PATH environment variable so the
# notebook is not tied to one machine's directory layout. When the variable is unset, the
# original absolute path is used unchanged.
import os

file_path = os.environ.get(
    "TRAIN_CSV_PATH",
    '/Users/shlokkamat/Documents/Documents - Shlok’s MacBook Pro/GitHub/NUS_Proj/SHAP/data/train.csv',
)
data = pd.read_csv(file_path)

In [3]:
# Data cleaning: discard the exported row index ("Unnamed: 0") and the "id" column.
# `drop` returns a new frame, so the raw `data` frame is left untouched.
data_cleaned = data.drop(["Unnamed: 0", "id"], axis="columns")

In [4]:
# 1. Encoding Categorical Variables
# One-hot encode the four categorical columns. drop_first=True omits the first level of
# each column, so every category is represented by (number of levels - 1) indicator columns.
data_encoded = pd.get_dummies(
    data_cleaned,
    columns=[
        "Gender",
        "Customer Type",
        "Type of Travel",
        "Class",
    ],
    drop_first=True,
)


In [5]:
# 2. Creating Interaction Features
# Interaction between flight distance and delays: total delay minutes (departure + arrival)
# divided by the flight distance.
#
# Rows with a zero flight distance are set to 0 explicitly. Masking on the denominator
# covers both x/0 (+/-inf) and 0/0 (NaN); replacing only +/-inf would let the 0/0 case
# through as NaN. NaN that comes from a missing delay or distance value is left as NaN.
data_encoded['Delay per Distance'] = (
    (data_encoded['Departure Delay in Minutes'] + data_encoded['Arrival Delay in Minutes']) /
    data_encoded['Flight Distance']
).where(data_encoded['Flight Distance'] != 0, 0)

In [6]:
# 3. Normalizing Continuous Features
# Min-max scale the continuous columns; the scaled values overwrite the originals in place.
# NOTE(review): the scaler is fitted on the full dataset, before the train/validation/test
# split performed later in this notebook, so validation/test statistics influence the
# scaling. Consider fitting on the training split only.
continuous_features = [
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]
scaler = MinMaxScaler().fit(data_encoded[continuous_features])
data_encoded[continuous_features] = scaler.transform(data_encoded[continuous_features])

In [7]:
# 4. Creating a Feature for Overall Service Quality
# Collapse the individual service ratings into a single score: the row-wise mean
# of the columns listed below.
service_features = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]
data_encoded['Service Quality Score'] = data_encoded[service_features].mean(axis="columns")

In [8]:
# 5. Dropping Redundant Features
# Remove the individual service rating columns now that they are summarized by
# 'Service Quality Score'.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError on columns that
# were already removed.
data_encoded = data_encoded.drop(columns=service_features, errors="ignore")

In [9]:
# Preview the first five rows of the engineered dataset
data_encoded.head(5)

Unnamed: 0,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,Delay per Distance,Service Quality Score
0,0.076923,0.086632,0.015704,0.011364,neutral or dissatisfied,1,0,1,0,1,0.093478,3.857143
1,0.230769,0.041195,0.000628,0.003788,neutral or dissatisfied,1,1,0,0,0,0.029787,2.285714
2,0.24359,0.224354,0.0,0.0,satisfied,0,0,0,0,0,0.0,3.714286
3,0.230769,0.107229,0.00691,0.005682,neutral or dissatisfied,0,0,0,0,0,0.035587,3.0
4,0.692308,0.036955,0.0,0.0,satisfied,1,0,0,0,0,0.0,3.5


In [13]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the label and binarize the label:
# "neutral or dissatisfied" -> 0, "satisfied" -> 1.
X = data_encoded.drop(columns=["satisfaction"])
y = data_encoded["satisfaction"].map({"neutral or dissatisfied": 0, "satisfied": 1})

# Two-stage stratified split: hold out 30% of the rows, then halve that hold-out
# into validation and test sets. Both stages use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Collect the shape of every split for a quick sanity check
dataset_split_info = {
    f"{split_name} shape": split.shape
    for split_name, split in [
        ("X_train", X_train),
        ("X_val", X_val),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_val", y_val),
        ("y_test", y_test),
    ]
}

dataset_split_info


{'X_train shape': (72732, 11),
 'X_val shape': (15586, 11),
 'X_test shape': (15586, 11),
 'y_train shape': (72732,),
 'y_val shape': (15586,),
 'y_test shape': (15586,)}

In [15]:
# Every estimator and metric used in this cell is imported here so the cell runs on a
# fresh kernel (Restart & Run All) instead of relying on names left over from other cells.
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Handle missing values in features
# The imputer learns the per-column medians from the training split only and then
# applies them to the validation and test splits.
imputer = SimpleImputer(strategy="median")
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)
X_test_imputed = imputer.transform(X_test)

# Re-train Logistic Regression (Glass Box Model)
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_imputed, y_train)
y_val_pred_logreg = logreg.predict(X_val_imputed)

# Re-train Random Forest Classifier (Black Box Model)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_imputed, y_train)
y_val_pred_rf = rf.predict(X_val_imputed)

# Re-evaluate Models on the validation split
# target_names follows the label encoding: 0 = Neutral/Dissatisfied, 1 = Satisfied.
logreg_report = classification_report(y_val, y_val_pred_logreg, target_names=["Neutral/Dissatisfied", "Satisfied"])
rf_report = classification_report(y_val, y_val_pred_rf, target_names=["Neutral/Dissatisfied", "Satisfied"])

logreg_accuracy = accuracy_score(y_val, y_val_pred_logreg)
rf_accuracy = accuracy_score(y_val, y_val_pred_rf)

# Updated results
updated_model_results = {
    "Logistic Regression Accuracy": logreg_accuracy,
    "Random Forest Accuracy": rf_accuracy,
    "Logistic Regression Report": logreg_report,
    "Random Forest Report": rf_report
}

updated_model_results


{'Logistic Regression Accuracy': 0.8471063775182857,
 'Random Forest Accuracy': 0.8523033491595021,
 'Logistic Regression Report': '                      precision    recall  f1-score   support\n\nNeutral/Dissatisfied       0.86      0.88      0.87      8832\n           Satisfied       0.84      0.81      0.82      6754\n\n            accuracy                           0.85     15586\n           macro avg       0.85      0.84      0.84     15586\n        weighted avg       0.85      0.85      0.85     15586\n',
 'Random Forest Report': '                      precision    recall  f1-score   support\n\nNeutral/Dissatisfied       0.85      0.90      0.87      8832\n           Satisfied       0.86      0.78      0.82      6754\n\n            accuracy                           0.85     15586\n           macro avg       0.85      0.84      0.85     15586\n        weighted avg       0.85      0.85      0.85     15586\n'}

In [16]:
# Held-out evaluation: score both fitted models on the imputed test split.

# Logistic Regression: predictions, accuracy and per-class report
y_test_pred_logreg = logreg.predict(X_test_imputed)
logreg_test_accuracy = accuracy_score(y_test, y_test_pred_logreg)
logreg_test_report = classification_report(
    y_test,
    y_test_pred_logreg,
    target_names=["Neutral/Dissatisfied", "Satisfied"],
)

# Random Forest: predictions, accuracy and per-class report
y_test_pred_rf = rf.predict(X_test_imputed)
rf_test_accuracy = accuracy_score(y_test, y_test_pred_rf)
rf_test_report = classification_report(
    y_test,
    y_test_pred_rf,
    target_names=["Neutral/Dissatisfied", "Satisfied"],
)

# Bundle the metrics: accuracies first, then the text reports
test_set_results = dict(
    [
        ("Logistic Regression Test Accuracy", logreg_test_accuracy),
        ("Random Forest Test Accuracy", rf_test_accuracy),
        ("Logistic Regression Test Report", logreg_test_report),
        ("Random Forest Test Report", rf_test_report),
    ]
)

test_set_results


{'Logistic Regression Test Accuracy': 0.8436417297574746,
 'Random Forest Test Accuracy': 0.850955986141409,
 'Logistic Regression Test Report': '                      precision    recall  f1-score   support\n\nNeutral/Dissatisfied       0.85      0.87      0.86      8832\n           Satisfied       0.83      0.81      0.82      6754\n\n            accuracy                           0.84     15586\n           macro avg       0.84      0.84      0.84     15586\n        weighted avg       0.84      0.84      0.84     15586\n',
 'Random Forest Test Report': '                      precision    recall  f1-score   support\n\nNeutral/Dissatisfied       0.85      0.90      0.87      8832\n           Satisfied       0.86      0.79      0.82      6754\n\n            accuracy                           0.85     15586\n           macro avg       0.85      0.84      0.85     15586\n        weighted avg       0.85      0.85      0.85     15586\n'}