Importing All Necessary Libraries

In [None]:
import os
import tkinter as tk
from tkinter import ttk

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


Loading the CSV

In [None]:
# Location of the dataset. The default is the path this notebook was authored
# with; set the HEART_DISEASE_CSV environment variable to point at the file on
# another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "HEART_DISEASE_CSV",
    r"C:\Users\shanm\Downloads\heart disease prediction.csv",
)
heart_disease = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check the load.
heart_disease.head()

In [None]:
# Preview the last rows as well.
heart_disease.tail()

In [None]:
# Summarise column dtypes and non-null counts.
heart_disease.info()

The data types seem fine. Let's check for nulls.

In [None]:
# Flag, per column, whether at least one value is missing.
heart_disease.isnull().any()

In [None]:
# Share of missing values in the two columns of interest, as a percentage.
# isnull().mean() is the null count divided by the number of rows.
null_percentages = {
    column: heart_disease[column].isnull().mean() * 100
    for column in ('age', 'slope')
}

for column, null_percentage in null_percentages.items():
    print(f"The percentage of null values in '{column}' is: {null_percentage:.2f}%")


These percentages are very small, so we can drop those rows.

In [None]:
# Remove, in place, every row that has a missing value in any column,
# then display the resulting frame.
heart_disease.dropna(axis=0, how='any', inplace=True)
heart_disease

In [None]:
# Cast both columns to 64-bit integers in a single astype call.
heart_disease = heart_disease.astype({'age': 'int64', 'slope': 'int64'})

In [None]:
# Count the distinct values in each column.
heart_disease.nunique()

In [None]:
# Split the feature columns (everything except 'target') by cardinality:
# at most four distinct values -> categorical, otherwise numerical.
feature_cols = [name for name in heart_disease.columns if name != 'target']
distinct_counts = {name: heart_disease[name].nunique() for name in feature_cols}

categorical_cols = [name for name in feature_cols if distinct_counts[name] <= 4]
numerical_cols = [name for name in feature_cols if distinct_counts[name] > 4]


In [None]:
# Feature columns with more than four distinct values.
numerical_cols

In [None]:
# Feature columns with at most four distinct values.
categorical_cols

In [None]:
# Keep only the numerical feature columns and show their summary statistics.
numerical_df = heart_disease.loc[:, numerical_cols]
numerical_df.describe()

In [None]:
# Pairwise Pearson correlation between the numerical features.
correlation_matrix = numerical_df.corr(method='pearson')
correlation_matrix

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Numerical Columns')
plt.show()


In [None]:
# Box plot of every numerical feature, three panels per row. The number of
# rows is derived from the column count so the grid never overflows
# (IndexError) and never shows empty panels.
panels_per_row = 3
panel_rows = max(1, -(-len(numerical_cols) // panels_per_row))  # ceiling division
fig, axes = plt.subplots(nrows=panel_rows, ncols=panels_per_row, figsize=(15, 10), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    sns.boxplot(x=col, data=heart_disease, ax=ax, palette='rocket')

# Remove panels that have no feature to show.
for unused_ax in axes[len(numerical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.suptitle("Univariate analysis for Continuous Columns", y=1.02, fontsize=16)
plt.show()

In [None]:
# Count plot of every categorical feature, three panels per row. The number
# of rows is derived from the column count so that more than nine columns
# cannot index past the end of the grid.
panels_per_row = 3
panel_rows = max(1, -(-len(categorical_cols) // panels_per_row))  # ceiling division
fig, axes = plt.subplots(nrows=panel_rows, ncols=panels_per_row, figsize=(15, 10), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, categorical_cols):
    sns.countplot(x=col, data=heart_disease, ax=ax, palette='rocket')

# Remove panels that have no feature to show.
for unused_ax in axes[len(categorical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.suptitle("Univariate analysis for Categorical Columns", y=1.02, fontsize=16)
plt.show()

In [None]:
# Stacked histogram (with KDE) of every numerical feature, split by target,
# three panels per row. The number of rows is derived from the column count
# so the grid never overflows (IndexError) and never shows empty panels.
panels_per_row = 3
panel_rows = max(1, -(-len(numerical_cols) // panels_per_row))  # ceiling division
fig, axes = plt.subplots(nrows=panel_rows, ncols=panels_per_row, figsize=(15, 10), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    sns.histplot(data=heart_disease, x=col, hue='target', kde=True, multiple='stack', palette='rocket', ax=ax)

# Remove panels that have no feature to show.
for unused_ax in axes[len(numerical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.suptitle("Bivariate analysis of Continuous Columns Vs Target Variable", y=1.02, fontsize=16)
plt.show()

In [None]:
# Count plot of every categorical feature, split by target, three panels per
# row. The number of rows is derived from the column count so that more than
# nine columns cannot index past the end of the grid.
panels_per_row = 3
panel_rows = max(1, -(-len(categorical_cols) // panels_per_row))  # ceiling division
fig, axes = plt.subplots(nrows=panel_rows, ncols=panels_per_row, figsize=(15, 10), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, categorical_cols):
    sns.countplot(x=col, hue='target', data=heart_disease, ax=ax, palette='rocket')

# Remove panels that have no feature to show.
for unused_ax in axes[len(categorical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.suptitle("Bivariate analysis of Categorical Columns Vs Target Variable", y=1.02, fontsize=16)
plt.show()

In [None]:

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 10))
axes = axes.flatten()

# value_counts() orders classes by frequency, not by class value, so a fixed
# label list would mislabel the wedges whenever class 1 is the majority.
# Sort by class value and derive each label from the class it belongs to.
target_names = {0: 'No Disease', 1: 'Disease'}
target_counts = heart_disease['target'].value_counts().sort_index()
wedge_labels = [target_names.get(value, str(value)) for value in target_counts.index]

# Pie Chart
target_counts.plot.pie(autopct='%1.1f%%', ax=axes[0], labels=wedge_labels)
axes[0].set_title('Distribution of Heart Disease')

# Count Plot
sns.countplot(x='target', data=heart_disease, ax=axes[1])
axes[1].set_title('Distribution of Heart Disease')

plt.show()


In [None]:
# Separate the label column from the feature matrix.
y = heart_disease['target']
x = heart_disease.drop(columns='target')

In [None]:
# Hold out 30% of the rows for testing; stratify on the target so both
# splits keep the same class proportions, and fix the seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

In [None]:
# Learn the standardisation parameters from the training split only, then
# apply the same transformation to both splits.
scaler = StandardScaler().fit(x_train)

x_train_normalized = scaler.transform(x_train)
x_test_normalized = scaler.transform(x_test)

In [None]:
# Baseline (untuned) models. The stochastic estimators are seeded with the
# same random_state used for the tuned models further down, so this table
# is reproducible from run to run.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),  # probability=True enables predict_proba
    'KNN': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(random_state=42)
}

results = []

for name, model in models.items():
    # Train the model
    model.fit(x_train_normalized, y_train)

    # Hard labels for the threshold metrics, positive-class probability for ROC-AUC
    y_pred = model.predict(x_test_normalized)
    y_pred_prob = model.predict_proba(x_test_normalized)[:, 1]

    # One record per model; the DataFrame is built once after the loop
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_pred_prob)
    })

# Create DataFrame from the list of dictionaries
metrics_df = pd.DataFrame(results)

# Display the DataFrame
metrics_df



In [None]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Confusion Matrix for Each Model', fontsize=16)

# The estimators in `models` were already fitted when the metrics table was
# built; reuse them as-is so these matrices describe exactly the same models
# (refitting an unseeded estimator would produce a different one).
for (name, model), ax in zip(models.items(), axes.flatten()):
    # Make predictions
    y_pred = model.predict(x_test_normalized)

    # Calculate confusion matrix (rows: actual class, columns: predicted class)
    cm = confusion_matrix(y_test, y_pred)

    # Plot confusion matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                annot_kws={'size': 14}, linewidths=0.5, linecolor='black', ax=ax)

    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Adjust layout to prevent overlap (leave room at the top for the suptitle)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('ROC-AUC Curve for Each Model', fontsize=16)

# The estimators in `models` were already fitted when the metrics table was
# built; reuse them as-is so these curves describe exactly the same models
# (refitting an unseeded estimator would produce a different one).
for (name, model), ax in zip(models.items(), axes.flatten()):
    # Positive-class probabilities drive the ROC curve
    y_pred_prob = model.predict_proba(x_test_normalized)[:, 1]

    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve against the chance diagonal
    ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], '--', color='gray')

    ax.set_title(name)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend()

# Adjust layout to prevent overlap (leave room at the top for the suptitle)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

Hyperparameter Tuning

In [None]:
def _run_grid_search(estimator, param_grid):
    """Fit a 5-fold, accuracy-scored GridSearchCV on the scaled training split."""
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
    search.fit(x_train_normalized, y_train)
    return search


# --- Search spaces, one per model family ---------------------------------
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs'],
}

param_grid_dt = {
    'max_depth': [None, 3, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}

param_grid_svm = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# --- Base estimators (seeded wherever the estimator accepts a seed) -------
lr = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
svm = SVC(probability=True, random_state=42)
knn = KNeighborsClassifier()
xgb = XGBClassifier(random_state=42)

# --- Run the searches in the same order as the model list -----------------
grid_search_lr = _run_grid_search(lr, param_grid_lr)
grid_search_dt = _run_grid_search(dt, param_grid_dt)
grid_search_rf = _run_grid_search(rf, param_grid_rf)
grid_search_svm = _run_grid_search(svm, param_grid_svm)
grid_search_knn = _run_grid_search(knn, param_grid_knn)
grid_search_xgb = _run_grid_search(xgb, param_grid_xgb)

# --- Best hyperparameters per model ----------------------------------------
results = {
    'Logistic Regression': grid_search_lr.best_params_,
    'Decision Tree': grid_search_dt.best_params_,
    'Random Forest': grid_search_rf.best_params_,
    'SVM': grid_search_svm.best_params_,
    'KNN': grid_search_knn.best_params_,
    'XGBoost': grid_search_xgb.best_params_,
}

for model_name, best_params in results.items():
    print(f"Best Hyperparameters for {model_name}: {best_params}")


# Estimators refitted on the whole training split with the best parameters
# found by each grid search.
best_lr = grid_search_lr.best_estimator_
best_dt = grid_search_dt.best_estimator_
best_rf = grid_search_rf.best_estimator_
best_svm = grid_search_svm.best_estimator_
best_knn = grid_search_knn.best_estimator_
best_xgb = grid_search_xgb.best_estimator_

# Test-set predictions: each y_pred_* must come from its own estimator.
# (Previously the SVM, KNN and XGBoost predictions were all taken from
# best_rf, which made their test metrics copies of the Random Forest's.)
y_pred_lr = best_lr.predict(x_test_normalized)
y_pred_dt = best_dt.predict(x_test_normalized)
y_pred_rf = best_rf.predict(x_test_normalized)
y_pred_svm = best_svm.predict(x_test_normalized)
y_pred_knn = best_knn.predict(x_test_normalized)
y_pred_xgb = best_xgb.predict(x_test_normalized)


In [None]:
def _train_test_accuracy(estimator):
    """Return (training accuracy, testing accuracy) for a fitted estimator.

    Both numbers are computed from the estimator's own predictions on the
    scaled splits, so they cannot be mixed up with another model's output.
    """
    train_accuracy = accuracy_score(y_train, estimator.predict(x_train_normalized))
    test_accuracy = accuracy_score(y_test, estimator.predict(x_test_normalized))
    return train_accuracy, test_accuracy


# accuracy_score is already imported in the notebook's import cell.
tuned_estimators = {
    'Logistic Regression': best_lr,
    'Decision Tree': best_dt,
    'Random Forest': best_rf,
    'SVM': best_svm,
    'KNN': best_knn,
    'XGBoost': best_xgb,
}

# label -> (training accuracy, testing accuracy)
accuracy_by_model = {
    label: _train_test_accuracy(estimator)
    for label, estimator in tuned_estimators.items()
}

# Display scores
for label, (train_score, test_score) in accuracy_by_model.items():
    print(f"{label} - Training Score: {train_score:.4f}, Testing Score: {test_score:.4f}")


In [None]:
# Tuned estimators in the order they should appear in the table.
tuned_models = [
    ('Logistic Regression', best_lr),
    ('Decision Tree', best_dt),
    ('Random Forest', best_rf),
    ('Support Vector Machine', best_svm),
    ('K Nearest Neighbours', best_knn),
    ('XG Boost Classifier', best_xgb),
]

# Initialize a list to store the metrics
metrics_list = []

for model_name, estimator in tuned_models:
    # Take hard labels and positive-class probabilities from the same
    # estimator so every row of the table describes exactly one model.
    test_labels = estimator.predict(x_test_normalized)
    test_probabilities = estimator.predict_proba(x_test_normalized)[:, 1]

    metrics_list.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, test_labels),
        'Precision': precision_score(y_test, test_labels),
        'Recall': recall_score(y_test, test_labels),
        'F1 Score': f1_score(y_test, test_labels),
        'ROC-AUC': roc_auc_score(y_test, test_probabilities)
    })

# Create a DataFrame from the list of dictionaries
metrics_df = pd.DataFrame(metrics_list)

# Display the DataFrame
metrics_df


Since Random Forest appears to be the strongest performer, we choose it as our best model.

In [None]:
# Make predictions
# Test-set predictions of the tuned Random Forest and their confusion matrix.
y_pred_rf = best_rf.predict(x_test_normalized)
cm_rf = confusion_matrix(y_test, y_pred_rf)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 16}, ax=ax)
ax.set_title('Confusion Matrix - Random Forest')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
# Positive-class probabilities from the tuned Random Forest.
y_pred_rf_prob = best_rf.predict_proba(x_test_normalized)[:, 1]

# ROC curve points and the area under them.
fpr, tpr, _ = roc_curve(y_test, y_pred_rf_prob)
roc_auc = auc(fpr, tpr)

# Plot the curve against the chance diagonal on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic - Random Forest')
ax.legend(loc='lower right')
plt.show()

GUI using Tkinter to make predictions

In [None]:
class PredictionApp:
    """Tkinter form that collects one value per feature and shows the predicted class.

    The classifier in this notebook is trained on standardised features, so
    the raw values typed by the user are passed through the fitted scaler
    before being handed to the model.
    """

    def __init__(self, master, model=None, feature_scaler=None):
        """Build the form inside ``master``.

        Args:
            master: Tk root (or other container) that hosts the widgets.
            model: Fitted classifier exposing ``predict``. Defaults to the
                notebook's tuned Random Forest (``best_rf``).
            feature_scaler: Fitted transformer exposing ``transform``.
                Defaults to the notebook's ``scaler`` that was fitted on the
                training split.
        """
        self.master = master
        self.master.title("Machine Learning Prediction App")

        self.model = best_rf if model is None else model
        self.feature_scaler = scaler if feature_scaler is None else feature_scaler

        # Creating input fields for each feature.
        # NOTE(review): the entry order is assumed to match the column order
        # of the training features - confirm against the dataset.
        self.feature_labels = ['Age',
                                'Sex (0 or 1)',
                                'CP (0 to 3)',
                                'trestbps (90 - 200)',
                                'chol (120 - 600)',
                                'fbs (0 or 1)',
                                'restecg (0 to 2)',
                                'thalach (70 - 210)',
                                'exang (0 or 1)',
                                'oldpeak (0.0 to 10.0)',
                                'slope (0 to 2)',
                                'ca (0 to 4)',
                                'thal (0 to 3)']
        self.feature_entries = []

        for i, feature_label in enumerate(self.feature_labels):
            label = ttk.Label(master, text=feature_label)
            label.grid(row=i, column=0, padx=10, pady=10)

            entry = ttk.Entry(master)
            entry.grid(row=i, column=1, padx=10, pady=10)
            self.feature_entries.append(entry)

        # Create buttons for specific actions
        predict_button = ttk.Button(master, text="Predict", command=self.predict)
        predict_button.grid(row=len(self.feature_labels), column=1, pady=20)

        clear_button = ttk.Button(master, text="Clear", command=self.clear_inputs)
        clear_button.grid(row=len(self.feature_labels), column=0, pady=20)

        # Create a label to display predictions
        self.result_label = ttk.Label(master, text="")
        self.result_label.grid(row=len(self.feature_labels) + 1, columnspan=2)

    def predict(self):
        """Read the form, run the model and display the predicted label.

        An empty, non-numeric or non-finite entry is reported in the result
        label instead of raising inside the Tk callback.
        """
        user_inputs = []
        for feature_label, entry in zip(self.feature_labels, self.feature_entries):
            try:
                value = float(entry.get().strip())
                if not np.isfinite(value):
                    raise ValueError(value)
            except ValueError:
                self.result_label.config(
                    text=f"Invalid input for '{feature_label}': please enter a number."
                )
                return
            user_inputs.append(value)

        # Make predictions using the model
        prediction = self.make_prediction(user_inputs)

        # Map predicted values to labels
        predicted_label = "Disease" if prediction == 1 else "No Disease"

        # Display the prediction
        self.result_label.config(text=f"Predicted Target: {predicted_label}")

    def make_prediction(self, user_inputs):
        """Scale one row of raw feature values and return the predicted class.

        Args:
            user_inputs: Sequence of numeric feature values for one sample.

        Returns:
            The first (only) element of the model's prediction.
        """
        features = np.asarray([user_inputs], dtype=float)

        # If the scaler recorded column names when it was fitted, hand it a
        # frame with those names so it sees the same kind of input.
        column_names = getattr(self.feature_scaler, "feature_names_in_", None)
        if column_names is not None and len(column_names) == features.shape[1]:
            features = pd.DataFrame(features, columns=column_names)

        scaled_features = self.feature_scaler.transform(features)
        prediction = self.model.predict(scaled_features)
        return prediction[0]

    def clear_inputs(self):
        """Empty every entry field and reset the result label."""
        for entry in self.feature_entries:
            entry.delete(0, tk.END)
        self.result_label.config(text="")

def main():
    """Create the Tk root window, attach the prediction form and run the event loop."""
    window = tk.Tk()
    _app = PredictionApp(window)  # keep a reference for the lifetime of the loop
    window.mainloop()


if __name__ == "__main__":
    main()
