In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [9]:
# load the dataset 
file_path = './heart.csv'
data = pd.read_csv(file_path)


### Data Preprocessing ###

In [10]:
# Separate features and target variable
X = data.drop('HeartDisease', axis=1)
y = data['HeartDisease']

In [11]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipelines for both numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')  # Drop first to avoid dummy variable trap

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [12]:
# Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = RandomForestClassifier(random_state=42)

In [13]:
# Create a pipeline that first preprocesses the data then applies the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

# Train the model
pipeline.fit(X_train, y_train)


In [14]:
# Hyperparameter Tuning using Grid Search
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [15]:
# Best parameters found by Grid Search
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Best parameters found:  {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [16]:
# Assess model performance
y_pred = grid_search.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.84      0.84        77
           1       0.89      0.88      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184

Confusion Matrix:
 [[65 12]
 [13 94]]
Accuracy Score:  0.8641304347826086


### LIME for Explainability ###

In [22]:
import lime
from lime.lime_tabular import LimeTabularExplainer

# LIME for explainability
# Fit the final model with best parameters
final_model = grid_search.best_estimator_

# Preprocess the training data for LIME
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

explainer = LimeTabularExplainer(training_data=X_train_transformed,
                                 training_labels=y_train,
                                 feature_names=X.columns,
                                 class_names=['No Disease', 'Disease'],
                                 mode='classification',
                                 discretize_continuous=True)

# Explain a prediction
sample_idx = 0  # Index of the sample to explain
exp = explainer.explain_instance(X_test_transformed[sample_idx], 
                                 final_model.predict_proba, 
                                 num_features=10)

# Display explanation
exp.show_in_notebook(show_table=True, show_all=False)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/syahrezapratama/.pyenv/versions/3.9.10/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/9k/wf1fkfzn6mxbf2xv7znk94r80000gn/T/ipykernel_61171/4018597483.py", line 12, in <module>
    explainer = LimeTabularExplainer(training_data=X_train_transformed,
  File "/Users/syahrezapratama/.pyenv/versions/3.9.10/lib/python3.9/site-packages/lime/lime_tabular.py", line 215, in __init__
    self.discretizer = QuartileDiscretizer(
  File "/Users/syahrezapratama/.pyenv/versions/3.9.10/lib/python3.9/site-packages/lime/discretize.py", line 178, in __init__
    BaseDiscretizer.__init__(self, data, categorical_features,
  File "/Users/syahrezapratama/.pyenv/versions/3.9.10/lib/python3.9/site-packages/lime/discretize.py", line 64, in __init__
    name = feature_names[feature]
IndexError: list index out of range

During handling of the above except