In [9]:
import os

import pandas as pd
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [10]:
# Load the dataset
# The CSV location can be overridden with the TRAIN_CSV_PATH environment variable so the
# notebook can run on another machine; when the variable is unset, the original absolute
# path is used unchanged.
file_path = os.environ.get(
    'TRAIN_CSV_PATH',
    '/Users/shlokkamat/Documents/Documents - Shlok’s MacBook Pro/GitHub/NUS_Proj/SHAP/data/train.csv',
)
data = pd.read_csv(file_path)

In [11]:
# Preprocess data
DELAY_COL = 'Arrival Delay in Minutes'
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# Drop the 'Unnamed: 0' and 'id' columns so they are not used as model features.
data_cleaned = data.drop(columns=['Unnamed: 0', 'id'])

# Keep the fitted encoder (instead of discarding it) so the integer labels can be
# mapped back to the original class names through label_encoder.classes_.
label_encoder = LabelEncoder()
data_cleaned['satisfaction'] = label_encoder.fit_transform(data_cleaned['satisfaction'])

# One-hot encode the categorical columns; drop_first removes one level per column.
data_cleaned = pd.get_dummies(data_cleaned, columns=categorical_cols, drop_first=True)

X = data_cleaned.drop(columns=['satisfaction'])
y = data_cleaned['satisfaction']

# Split BEFORE imputing. The split is determined by y, the row count and random_state,
# so it is the same partition as before; missing values in X do not influence it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Impute missing arrival delays with the median of the TRAINING rows only, so that no
# statistic computed from the held-out rows leaks into the features.
delay_median = X_train[DELAY_COL].median()
X_train = X_train.fillna({DELAY_COL: delay_median})
X_test = X_test.fillna({DELAY_COL: delay_median})

# Keep the full frames NaN-free as well (same train-derived value) for any later cell
# that reads X or data_cleaned directly.
X = X.fillna({DELAY_COL: delay_median})
data_cleaned = data_cleaned.fillna({DELAY_COL: delay_median})

In [13]:
# Train Explainable Boosting Machine
# random_state is fixed so that re-running the notebook fits the model with the same seed.
# Prediction and evaluation are done in the next cell.
ebm = ExplainableBoostingClassifier(random_state=42)
ebm.fit(X_train, y_train)


In [15]:
# Predict labels for the held-out split with the fitted EBM.
y_pred = ebm.predict(X_test)

# Compute both summaries up front, then print them together.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.9521678456282181
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96     11776
           1       0.95      0.94      0.94      9005

    accuracy                           0.95     20781
   macro avg       0.95      0.95      0.95     20781
weighted avg       0.95      0.95      0.95     20781



In [14]:
from interpret import show

# Build the model-level (global) explanation from the fitted EBM and hand it to
# interpret's show() for rendering in the notebook.
ebm_global = ebm.explain_global()
show(ebm_global)

In [18]:
# Explain the first five held-out rows (selected by position) and open the view on
# the first of them (key 0).
local_rows = X_test.iloc[:5]
local_labels = y_test.iloc[:5]
ebm_local = ebm.explain_local(local_rows, local_labels)
show(ebm_local, 0)

In [None]:
from sklearn.model_selection import GridSearchCV
from interpret.glassbox import ExplainableBoostingClassifier

# Define the parameter grid (2 values per parameter -> 64 candidates, 192 fits with cv=3)
param_grid = {
    'max_bins': [128, 255],                     # Number of bins for numerical features
    'learning_rate': [0.01, 0.1],              # Learning rate for boosting
    'min_samples_leaf': [2, 10],               # Minimum samples per leaf
    'max_rounds': [200, 500],                  # Number of boosting rounds
    'interactions': [10, 20],                  # Number of pairwise interactions
    'max_interaction_bins': [32, 64]           # Maximum bins for interactions
}

# Template estimator for the search. It deliberately gets its own name: GridSearchCV
# clones the estimator it is given, so rebinding `ebm` here would replace the fitted
# model used by the prediction/explanation cells with an object that is never fitted.
ebm_search_base = ExplainableBoostingClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=ebm_search_base,
    param_grid=param_grid,
    cv=3,                          # 3-fold cross-validation
    scoring='accuracy',            # Optimize for accuracy
    verbose=1,                     # Display progress
    n_jobs=-1                      # Use all available CPU cores
)
grid_search.fit(X_train, y_train)

# Output the best parameters and score.
# With GridSearchCV's default refit=True, the model refitted with the best parameters
# is available afterwards as grid_search.best_estimator_.
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)


Fitting 3 folds for each of 64 candidates, totalling 192 fits
