In [4]:
# Re-import necessary libraries and retrain the models from scratch
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
import joblib

# Reload the original prepared dataset
prepared_dataset_path = '/Users/shlokkamat/Documents/Documents - Shlok’s MacBook Pro/GitHub/NUS_Proj/SHAP/data/train.csv'
data = pd.read_csv(prepared_dataset_path)

# Target column transformation
data['satisfaction'] = data['satisfaction'].map({"neutral or dissatisfied": 0, "satisfied": 1})

# Prepare features and target
X = data.drop(columns=["satisfaction", "Unnamed: 0", "id"], errors='ignore')
y = data["satisfaction"]

# Encode categorical features
categorical_columns = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Split data into training and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Handle missing values using imputation
imputer = SimpleImputer(strategy="median")
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)
X_test_imputed = imputer.transform(X_test)

# Train Logistic Regression (Glass Box Model)
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_imputed, y_train)

# Train Random Forest Classifier (Black Box Model)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_imputed, y_train)

# Save the models and imputer
joblib.dump(logreg, 'logistic_regression_model.pkl')
joblib.dump(rf, 'random_forest_model.pkl')
joblib.dump(imputer, 'data_imputer.pkl')

# Models retrained and saved successfully. Now evaluating unseen test data.

# Reload the unseen test data
unseen_test_data = pd.read_csv('/Users/shlokkamat/Documents/Documents - Shlok’s MacBook Pro/GitHub/NUS_Proj/SHAP/data/test.csv')

# Extract ground truth labels if available (assumed target column is 'satisfaction')
# Align unseen test data with the training dataset's feature columns
required_columns = X_train.columns  # Retrieve feature names from the training dataset
for col in required_columns:
    if col not in unseen_test_data:
        unseen_test_data[col] = 0  # Add missing columns with default values
unseen_test_data = unseen_test_data[required_columns]

# Impute missing values in unseen test data
X_unseen = imputer.transform(unseen_test_data)

# Make predictions using both models
y_pred_logreg = logreg.predict(X_unseen)
y_pred_rf = rf.predict(X_unseen)

# Evaluate performance if ground truth is available
if true_labels is not None:
    logreg_report = classification_report(true_labels, y_pred_logreg, target_names=["Neutral/Dissatisfied", "Satisfied"])
    rf_report = classification_report(true_labels, y_pred_rf, target_names=["Neutral/Dissatisfied", "Satisfied"])
    logreg_accuracy = accuracy_score(true_labels, y_pred_logreg)
    rf_accuracy = accuracy_score(true_labels, y_pred_rf)
    
    evaluation_results = {
        "Logistic Regression Accuracy": logreg_accuracy,
        "Random Forest Accuracy": rf_accuracy,
        "Logistic Regression Report": logreg_report,
        "Random Forest Report": rf_report
    }
else:
    evaluation_results = {
        "Logistic Regression Predictions": y_pred_logreg.tolist(),
        "Random Forest Predictions": y_pred_rf.tolist()
    }

evaluation_results


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression Accuracy': 0.5505851555281799,
 'Random Forest Accuracy': 0.7946566060979365,
 'Logistic Regression Report': '                      precision    recall  f1-score   support\n\nNeutral/Dissatisfied       0.89      0.23      0.36     14573\n           Satisfied       0.49      0.96      0.65     11403\n\n            accuracy                           0.55     25976\n           macro avg       0.69      0.60      0.51     25976\n        weighted avg       0.72      0.55      0.49     25976\n',
 'Random Forest Report': '                      precision    recall  f1-score   support\n\nNeutral/Dissatisfied       0.96      0.66      0.78     14573\n           Satisfied       0.69      0.97      0.80     11403\n\n            accuracy                           0.79     25976\n           macro avg       0.83      0.81      0.79     25976\n        weighted avg       0.84      0.79      0.79     25976\n'}