Data Preprocessing
First, let's load and preprocess the data:

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset.
# The CSV has no header row, so the column names are supplied explicitly.
# Missing values are stored as the literal string '?'; parsing them as NaN at
# read time keeps every column numeric (otherwise 'ca' and 'thal' load as
# object/string columns) and makes the null report below reflect reality.
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
df = pd.read_csv('processed.cleveland.csv', names=column_names, na_values='?')

# Quick inspection of the raw frame.
print(df.head())
df.info()  # info() prints by itself and returns None, so it is not wrapped in print()
print(df.describe())
print(df.dtypes)
print(df.isnull().sum())

# Convert target to binary (0 = no disease, 1 = disease):
# every non-zero value in the original column is collapsed to 1.
df['target'] = (df['target'] != 0).astype(int)

# Handle missing values: drop every row that still contains a NaN
# (i.e. rows that had a '?' in the source file).
df = df.dropna()
assert not df.isnull().values.any(), "unexpected missing values after dropna()"

# Split into features and target
X = df.drop('target', axis=1)
y = df['target']

# Split into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features.
# The scaler is fitted on the training split only and then applied to the
# test split, so test-set statistics never influence the transformation.
# Note: from here on X_train / X_test are NumPy arrays, not DataFrames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca thal  target  
0    3.0  0.0  6.0       0  
1    2.0  3.0  3.0       2  
2    2.0  2.0  7.0       1  
3    3.0  0.0  3.0       0  
4    1.0  0.0  3.0       0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-n

Model Training and Fine-Tuning
Now let's train and optimize each model:

1. Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Logistic Regression: exhaustive search over regularisation strength (C)
# and penalty type. The liblinear solver accepts both penalties in the grid.
lr_params = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# 5-fold cross-validated grid search, ranked by accuracy.
lr_grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=lr_params,
    cv=5,
    scoring='accuracy',
)
lr_grid.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split.
best_lr = lr_grid.best_estimator_
lr_pred = best_lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)

print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")
print(classification_report(y_test, lr_pred))

Logistic Regression Accuracy: 0.87
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        36
           1       0.83      0.83      0.83        24

    accuracy                           0.87        60
   macro avg       0.86      0.86      0.86        60
weighted avg       0.87      0.87      0.87        60



2. Support Vector Machine (SVM)

In [4]:
from sklearn.svm import SVC

# Support Vector Machine: search over C, gamma and kernel type.
# NOTE(review): gamma is listed for both kernels; per the scikit-learn docs it
# only affects the 'rbf' kernel here, so the linear candidates repeat per gamma.
svm_params = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'linear'],
)

# 5-fold cross-validated grid search, ranked by accuracy.
svm_grid = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_params,
    cv=5,
    scoring='accuracy',
)
svm_grid.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split.
best_svm = svm_grid.best_estimator_
svm_pred = best_svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)

print(f"SVM Accuracy: {svm_accuracy:.2f}")
print(classification_report(y_test, svm_pred))

SVM Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        36
           1       0.88      0.88      0.88        24

    accuracy                           0.90        60
   macro avg       0.90      0.90      0.90        60
weighted avg       0.90      0.90      0.90        60



3. XGBoost


In [5]:
# %pip install --upgrade xgboost scikit-learn

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# XGBoost: search over ensemble size, tree depth, learning rate and the
# row/column subsampling ratios (3 * 3 * 3 * 2 * 2 = 108 candidates).
xgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# 5-fold cross-validated grid search, ranked by accuracy; the classifier is
# seeded so repeated runs are comparable.
xgb_grid = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=xgb_params,
    cv=5,
    scoring='accuracy',
)
xgb_grid.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split.
best_xgb = xgb_grid.best_estimator_
xgb_pred = best_xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print(f"XGBoost Accuracy: {xgb_accuracy:.2f}")
print(classification_report(y_test, xgb_pred))

XGBoost Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.89      0.86      0.87        36
           1       0.80      0.83      0.82        24

    accuracy                           0.85        60
   macro avg       0.84      0.85      0.84        60
weighted avg       0.85      0.85      0.85        60



4. Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: search over forest size, maximum depth (None = unlimited)
# and the minimum sample counts required to split a node / form a leaf.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated grid search, ranked by accuracy; the forest is
# seeded so repeated runs are comparable.
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=5,
    scoring='accuracy',
)
rf_grid.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split.
best_rf = rf_grid.best_estimator_
rf_pred = best_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print(f"Random Forest Accuracy: {rf_accuracy:.2f}")
print(classification_report(y_test, rf_pred))

Random Forest Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.89      0.86      0.87        36
           1       0.80      0.83      0.82        24

    accuracy                           0.85        60
   macro avg       0.84      0.85      0.84        60
weighted avg       0.85      0.85      0.85        60



5. Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree: search over maximum depth (None = unlimited), the minimum
# sample counts required to split a node / form a leaf, and the split criterion.
dt_params = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# 5-fold cross-validated grid search, ranked by accuracy; the tree is
# seeded so repeated runs are comparable.
dt_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dt_params,
    cv=5,
    scoring='accuracy',
)
dt_grid.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split.
best_dt = dt_grid.best_estimator_
dt_pred = best_dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)

print(f"Decision Tree Accuracy: {dt_accuracy:.2f}")
print(classification_report(y_test, dt_pred))

Decision Tree Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.82      0.78      0.80        36
           1       0.69      0.75      0.72        24

    accuracy                           0.77        60
   macro avg       0.76      0.76      0.76        60
weighted avg       0.77      0.77      0.77        60



Final Model Deployment Recommendation
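For production deployment, I recommend using the XGBoost model. Note that on this particular test split the SVM scored the highest accuracy (0.90, versus 0.87 for Logistic Regression and 0.85 for XGBoost and Random Forest), so this recommendation is not based on test accuracy alone; the same save/load pattern below works for any of the fitted models. Here's how to save and load the model: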

In [8]:
import joblib

# Save the model and scaler.
# Both artefacts are needed at inference time: the model was trained on
# standardized features, so new inputs must pass through the same scaler.
joblib.dump(best_xgb, 'heart_disease_xgboost_model.pkl')
joblib.dump(scaler, 'heart_disease_scaler.pkl')

# Later, to load and use the model.
# NOTE: joblib files are pickles -- only load artefacts you created or trust.
loaded_model = joblib.load('heart_disease_xgboost_model.pkl')
loaded_scaler = joblib.load('heart_disease_scaler.pkl')

# Example prediction.
# The scaler was fitted on a DataFrame, so it records the training column
# names. Building the new sample as a DataFrame with those same names avoids
# scikit-learn's "X does not have valid feature names" warning and makes the
# expected column order explicit. If the scaler carries no names (fitted on a
# plain array), columns=None falls back to positional columns.
feature_names = getattr(loaded_scaler, 'feature_names_in_', None)

# Alternative sample (the first row of the dataset):
# [63, 1, 1, 145, 233, 1, 2, 150, 0, 2.3, 3, 0, 6]
# NOTE(review): cp=0 and thal=0 below do not appear among the codes visible in
# the data preview (cp 1-4, thal 3/6/7) -- confirm this example uses the same
# encoding as the training data.
new_data = pd.DataFrame(
    [[65, 1, 0, 138, 282, 1, 2, 174, 0, 1.4, 1, 1, 0]],
    columns=feature_names,
)
scaled_data = loaded_scaler.transform(new_data)
prediction = loaded_model.predict(scaled_data)
print("Heart Disease Prediction (0 = No, 1 = Yes):", prediction[0])

Heart Disease Prediction (0 = No, 1 = Yes): 1


