In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [2]:
# Load dataset
file_path = '/Users/shlokkamat/Documents/Documents - Shlok’s MacBook Pro/GitHub/NUS_Proj/SHAP/data/train.csv'
data = pd.read_csv(file_path)

In [3]:
# Drop unnecessary columns
data = data.drop(columns=['Unnamed: 0', 'id', 'Arrival Delay in Minutes'])

In [4]:
# Encode categorical variables
label_encoders = {}
categorical_columns = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [5]:
# Split features and target
X = data.drop(columns=['satisfaction'])
y = data['satisfaction']

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42)
}

# Train and evaluate models
results = []

for model_name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    # Predictions
    y_pred = model.predict(X_test)
    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) if hasattr(model, "predict_proba") else None
    # Append results
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "ROC-AUC": roc_auc
    })

[LightGBM] [Info] Number of positive: 35957, number of negative: 47166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 679
[LightGBM] [Info] Number of data points in the train set: 83123, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.432576 -> initscore=-0.271350
[LightGBM] [Info] Start training from score -0.271350


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)

In [8]:

print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0             CatBoost  0.962754   0.972323  0.941442  0.956634  0.995069
1        Random Forest  0.963380   0.975283  0.939899  0.957264  0.994055
2        Decision Tree  0.946682   0.938906  0.938906  0.938906  0.945804
3             LightGBM  0.962995   0.974717  0.939568  0.956820  0.994846
4  Logistic Regression  0.869400   0.856886  0.841200  0.848971  0.923894


In [11]:


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models with more loss/objective functions
models = {
    "CatBoost (Logloss)": CatBoostClassifier(loss_function='Logloss', verbose=0, random_state=42),
    "CatBoost (CrossEntropy)": CatBoostClassifier(loss_function='CrossEntropy', verbose=0, random_state=42),
    "CatBoost (MultiClass)": CatBoostClassifier(loss_function='MultiClass', verbose=0, random_state=42),
    "LightGBM (Binary Logloss)": lgb.LGBMClassifier(objective='binary', random_state=42),
    "LightGBM (Cross-Entropy)": lgb.LGBMClassifier(objective='cross_entropy', random_state=42),
    "LightGBM (Focal Loss)": lgb.LGBMClassifier(objective='focal_loss', random_state=42),
    "LightGBM (MultiClass)": lgb.LGBMClassifier(objective='multiclass', random_state=42),
    "Random Forest (Default)": RandomForestClassifier(random_state=42),
    "Random Forest (Entropy)": RandomForestClassifier(random_state=42, criterion = 'entropy'),
    "Random Forest (Logloss)": RandomForestClassifier(random_state=42, criterion = 'log_loss'),
    "Decision Tree (Default)": DecisionTreeClassifier(random_state=42),
    "Decision Tree (Entropy)": DecisionTreeClassifier(random_state=42, criterion = 'entropy'),
    "Decision Tree (Logloss)": DecisionTreeClassifier(random_state=42, criterion = 'log_loss'),
    "Logistic Regression (Default)": LogisticRegression(max_iter=1000, random_state=42),
    "Logistic Regression (Balanced)": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    "Logistic Regression (L1 Penalty)": LogisticRegression(max_iter=1000, penalty='l1', solver='liblinear', random_state=42),
    "Logistic Regression (Elastic Net)": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=42)
}

# Train and evaluate models
results = []

for model_name, model in models.items():
    try:
        # Train model
        model.fit(X_train, y_train)
        # Predictions
        y_pred = model.predict(X_test)
        # Evaluate performance
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        # Append results
        results.append({
            "Model": model_name,
            "Accuracy": accuracy,
            "F1 Score": f1
        })
    except Exception as e:
        results.append({
            "Model": model_name,
            "Accuracy": None,
            "Error": str(e),
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display results
#import ace_tools as tools; tools.display_dataframe_to_user(name="Model Performance with Different Objectives", dataframe=results_df)

print(results_df)


[LightGBM] [Info] Number of positive: 35957, number of negative: 47166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 679
[LightGBM] [Info] Number of data points in the train set: 83123, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.432576 -> initscore=-0.271350
[LightGBM] [Info] Start training from score -0.271350
[LightGBM] [Info] [cross_entropy:Init]: (objective) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 83123.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can s

[LightGBM] [Fatal] Unknown objective type name: focal_loss
[LightGBM] [Fatal] Number of classes should be specified and greater than 1 for multiclass training
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                                Model  Accuracy  F1 Score  \
0                  CatBoost (Logloss)  0.962754  0.956634   
1             CatBoost (CrossEntropy)  0.963428  0.957399   
2               CatBoost (MultiClass)  0.961985  0.955757   
3           LightGBM (Binary Logloss)  0.962995  0.956820   
4            LightGBM (Cross-Entropy)  0.962995  0.956820   
5               LightGBM (Focal Loss)       NaN       NaN   
6               LightGBM (MultiClass)       NaN       NaN   
7             Random Forest (Default)  0.963380  0.957264   
8             Random Forest (Entropy)  0.962803  0.956575   
9             Random Forest (Logloss)  0.962803  0.956575   
10            Decision Tree (Default)  0.946682  0.938906   
11            Decision Tree (Entropy)  0.948126  0.940541   
12            Decision Tree (Logloss)  0.948126  0.940541   
13      Logistic Regression (Default)  0.869400  0.848971   
14     Logistic Regression (Balanced)  0.864877  0.847970   
15   Logistic Regression

