In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [3]:
# Read the heart-disease CSV (path is resolved against the current working directory)
file_path = './heart.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### Data Preprocessing ###

In [4]:
# Separate features and target variable
y = data['HeartDisease']

# pandas names a header-less CSV column 'Unnamed: <n>'; this is usually a row
# index written out by an earlier to_csv. The frame preview shows such a header.
# NOTE(review): confirm whether it is a real column in heart.csv. If it is, it
# must not be used as a predictor, so it is removed here; errors='ignore' makes
# the second drop a no-op when the column does not exist.
X = data.drop(columns='HeartDisease').drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipelines for both numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')  # Drop first to avoid dummy variable trap

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit the preprocessor to get the correct feature names after transformation
preprocessor.fit(X)
X_transformed = preprocessor.transform(X)

# Get feature names after transformation
encoded_feature_names = list(preprocessor.transformers_[0][1].get_feature_names_out(numerical_cols)) + \
                        list(preprocessor.transformers_[1][1].get_feature_names_out(categorical_cols))




In [6]:
# Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)


# Define the model
model = RandomForestClassifier(random_state=42)
model

In [7]:
# Create a pipeline that applies the model (preprocessing is already done)
pipeline = Pipeline(steps=[('classifier', model)])


# Train the model
pipeline.fit(X_train, y_train)


In [8]:
# Exhaustive grid search over the classifier step's hyperparameters.
# Keys use the '<step name>__<parameter>' form so they reach the 'classifier' step.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[10, 20, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validation scored on accuracy, using every available core
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [9]:
# Report the hyperparameter combination the grid search selected
best_params = grid_search.best_params_
print("Best parameters found: ", best_params, sep=' ')

Best parameters found:  {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [10]:
# Score the tuned model on the held-out test split
y_pred = grid_search.predict(X_test)

# Each entry pairs a printed heading with the metric computed on (y_test, y_pred)
for heading, result in (
    ("Classification Report:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Accuracy Score: ", accuracy_score(y_test, y_pred)),
):
    print(heading, result)

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.84      0.84        77
           1       0.89      0.88      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184

Confusion Matrix:
 [[65 12]
 [13 94]]
Accuracy Score:  0.8641304347826086


### Visualizing a Single Decision Tree from the Random Forest ###

In [11]:
from sklearn.tree import export_graphviz
import graphviz
# Pull the tuned Random Forest out of the refitted best pipeline
# (a Pipeline can be indexed by step name)
best_rf_model = grid_search.best_estimator_['classifier']

In [15]:
# Extract one tree from the Random Forest.
# This is only the first fitted estimator, not a summary of the whole ensemble.
single_tree = best_rf_model.estimators_[0]

# Serialize the tree as DOT source; out_file=None returns it as a string
# instead of writing a file
dot_data = export_graphviz(single_tree, out_file=None,
                           feature_names=encoded_feature_names,
                           class_names=['No Heart Disease', 'Heart Disease'],
                           filled=True, rounded=True,
                           special_characters=True)

# Render the tree once and open it in the default viewer. A separate
# graph.view() call would run Graphviz a second time, since view() is itself a
# render with view=True. The call returns the path of the rendered file.
graph = graphviz.Source(dot_data)
graph.render("heart_disease_tree", view=True)

'heart_disease_tree.pdf'