In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load
from scipy.stats import uniform, randint
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, label_binarize

In [2]:
# Seed NumPy's legacy global RNG so np.random-based sampling repeats across runs.
SEED = 42
np.random.seed(SEED)

In [3]:
# Root folder for every artefact this notebook reads or writes.
# The default keeps the original location; set the HEART_DISEASE_PROJECT_DIR
# environment variable to run the notebook on a machine without that path.
project_dir = os.environ.get('HEART_DISEASE_PROJECT_DIR', 'D:\\Heart_Disease_Project')

# Create the root and the output sub-folders up front (no error if they exist).
os.makedirs(project_dir, exist_ok=True)
for subdir in ('notebooks', 'results'):
    os.makedirs(os.path.join(project_dir, subdir), exist_ok=True)

In [4]:
# Read the train and test splits from <project_dir>/data.
data_dir = os.path.join(project_dir, 'data')
train_data_path = os.path.join(data_dir, 'heart_disease_train.csv')
test_data_path = os.path.join(data_dir, 'heart_disease_test.csv')
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [5]:
# Separate the label column from the feature columns in both splits.
target_col = 'target'
X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]

In [6]:
# Untuned estimators to optimise, keyed by the display name that the search
# spaces below also use. Every estimator shares the same seed.
seeded = {'random_state': 42}
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, **seeded),
    'Decision Tree': DecisionTreeClassifier(**seeded),
    'Random Forest': RandomForestClassifier(**seeded),
    # probability=True makes predict_proba available on the fitted SVC.
    'Support Vector Machine': SVC(probability=True, **seeded),
}
# Exhaustive candidate values for GridSearchCV, one grid per estimator.
logreg_grid = {
    'C': [0.001, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear'],
}
tree_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
}
forest_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
}
svm_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto', 0.1],
}

# Keys must match the display names used for the estimators.
param_grids = {
    'Logistic Regression': logreg_grid,
    'Decision Tree': tree_grid,
    'Random Forest': forest_grid,
    'Support Vector Machine': svm_grid,
}

# Sampling spaces for RandomizedSearchCV, one per estimator.
# scipy's uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
logreg_space = {
    'C': uniform(0.01, 100),
    'solver': ['lbfgs', 'liblinear'],
}
tree_space = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': randint(2, 11),
}
forest_space = {
    'n_estimators': randint(50, 201),
    'max_depth': [3, 5, 7, None],
    'min_samples_split': randint(2, 11),
}
svm_space = {
    'C': uniform(0.1, 10),
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto', 0.1],
}

# Keys must match the display names used for the estimators.
param_distributions = {
    'Logistic Regression': logreg_space,
    'Decision Tree': tree_space,
    'Random Forest': forest_space,
    'Support Vector Machine': svm_space,
}

In [7]:
# Tune every estimator twice (exhaustive grid, then random sampling) and keep
# the refitted winner of each search under "<name>_Grid" / "<name>_Random".
best_models = {}
search_kwargs = dict(cv=5, scoring='accuracy', n_jobs=-1)

for name, model in models.items():
    print(f'\nTuning {name}...')

    # (key suffix, label used in the printed report, configured search object)
    searches = (
        ('Grid', 'GridSearchCV',
         GridSearchCV(model, param_grids[name], **search_kwargs)),
        ('Random', 'RandomizedSearchCV',
         RandomizedSearchCV(model, param_distributions[name], n_iter=10,
                            random_state=42, **search_kwargs)),
    )
    for suffix, label, search in searches:
        search.fit(X_train, y_train)
        best_models[f'{name}_{suffix}'] = search.best_estimator_
        print(f'{label} Best Params: {search.best_params_}')
        print(f'{label} Best Cross-Val Score: {search.best_score_:.4f}')
    


Tuning Logistic Regression...




GridSearchCV Best Params: {'C': 1, 'solver': 'liblinear'}
GridSearchCV Best Cross-Val Score: 0.6036
RandomizedSearchCV Best Params: {'C': np.float64(5.818361216819946), 'solver': 'liblinear'}
RandomizedSearchCV Best Cross-Val Score: 0.5994

Tuning Decision Tree...
GridSearchCV Best Params: {'max_depth': 5, 'min_samples_split': 10}
GridSearchCV Best Cross-Val Score: 0.5783
RandomizedSearchCV Best Params: {'max_depth': 5, 'min_samples_split': 4}
RandomizedSearchCV Best Cross-Val Score: 0.5784

Tuning Random Forest...
GridSearchCV Best Params: {'max_depth': 7, 'min_samples_split': 10, 'n_estimators': 100}
GridSearchCV Best Cross-Val Score: 0.6241
RandomizedSearchCV Best Params: {'max_depth': 7, 'min_samples_split': 9, 'n_estimators': 70}
RandomizedSearchCV Best Cross-Val Score: 0.6241

Tuning Support Vector Machine...
GridSearchCV Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
GridSearchCV Best Cross-Val Score: 0.5992
RandomizedSearchCV Best Params: {'C': np.float64(0.6808361

In [8]:
metrics = {}
baseline_metrics = {}
baseline_path = os.path.join(project_dir, 'results', 'evaluation_metrics.txt')


def _parse_baseline_metrics(text):
    """Parse a baseline report into ``{model_name: {metric_name: value}}``.

    The report is read line by line. A line containing ``Metrics:`` opens a
    new model section (the model name is the line with `` Metrics:`` removed),
    and every following ``key: value`` line is stored under that model until
    the next header. Blank lines are ignored.

    Unlike a fixed five-line window, this never consumes the next model's
    header when a section is short. A missing or non-numeric value produces a
    warning instead of an exception.

    Args:
        text: Full contents of the baseline metrics file.

    Returns:
        Dict mapping each model name to a dict of float metric values.
    """
    parsed = {}
    current_model = None
    for line_no, raw_line in enumerate(text.splitlines(), start=1):
        line = raw_line.strip()
        if not line:
            continue
        if 'Metrics:' in line:
            current_model = line.replace(' Metrics:', '')
            parsed[current_model] = {}
            continue
        if current_model is None or ':' not in line:
            # Text before the first header, or a line that is not "key: value".
            continue
        # partition() never raises, even for a bare "Accuracy:" with no value.
        key, _, value = line.partition(':')
        key, value = key.strip(), value.strip()
        if not value:
            print(f"Warning: No value for {key} in {current_model} at line {line_no}")
            continue
        try:
            parsed[current_model][key] = float(value)
        except ValueError:
            print(f"Warning: Non-numeric value {value!r} for {key} in {current_model} at line {line_no}")
    return parsed


if os.path.exists(baseline_path):
    with open(baseline_path, 'r') as f:
        baseline_metrics = _parse_baseline_metrics(f.read())
else:
    print("Baseline metrics file not found. Skipping comparison.")
# Score every tuned estimator on the held-out test split.
# zero_division=0 is passed to all three averaged scores so that classes that
# are never predicted are handled the same way, without undefined-metric warnings.
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)  # per-class probabilities for the AUC

    metrics[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1-score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
        # One-vs-rest AUC, averaged with class-support weights.
        'AUC': roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted'),
    }

In [9]:
# Export the best model as a reusable pipeline.

# Define the export path
models_dir = os.path.join(project_dir, 'models')
model_path = os.path.join(models_dir, 'final_model.pkl')
os.makedirs(models_dir, exist_ok=True)  # Ensure models directory exists

# Pick the estimator with the highest test accuracy from `metrics` rather than
# hard-coding its name, so the exported model always follows the evaluation.
# max() returns the first entry on ties, i.e. the earliest-tuned candidate.
best_model_name = max(metrics, key=lambda candidate: metrics[candidate]['Accuracy'])
best_model = best_models[best_model_name]
print("Best model name:", best_model_name)
print("Best model type:", type(best_model))
print("Best model params:", best_model.get_params())

# Wrap the model in a pipeline. The scaler has centering and scaling turned
# off, so it passes features through unchanged (the CSV data is assumed to be
# pre-scaled -- confirm against the preprocessing notebook).
pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False, with_std=False)),  # No-op scaler
    ('model', best_model)
])
# Fitting marks the scaler step as fitted. It also refits `best_model` in place
# on the training split already in memory (X_train / y_train).
pipeline.fit(X_train, y_train)

# Save the pipeline
dump(pipeline, model_path)
print(f"Best model pipeline saved to {model_path}")

# Validate the saved pipeline by reloading it and scoring the test split
loaded_pipeline = load(model_path)
y_pred = loaded_pipeline.predict(X_test)
y_prob = loaded_pipeline.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')
print(f"Loaded Model Accuracy: {accuracy:.4f}")
print(f"Loaded Model AUC: {roc_auc:.4f}")

Best model type: <class 'sklearn.linear_model._logistic.LogisticRegression'>
Best model params: {'C': 1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Best model pipeline saved to D:\Heart_Disease_Project\models\final_model.pkl
Loaded Model Accuracy: 0.6230
Loaded Model AUC: 0.8648




In [10]:
# Select the winner by test accuracy (first entry wins ties) and keep its
# hyperparameters for the details file written later.
best_model_name = max(metrics, key=lambda candidate: metrics[candidate]['Accuracy'])
best_model = best_models[best_model_name]
best_params = best_model.get_params()

print('\nOptimized Model Performance:')
for name, m in metrics.items():
    # Tuned entries are keyed "<model>_Grid" / "<model>_Random", while the
    # baseline lookup uses the un-suffixed model name.
    base_name = name.split('_')[0]
    baseline = baseline_metrics.get(base_name, {})

    print(f'\n{name} Metrics:')
    for k, v in m.items():
        print(f'{k}: {v:.4f}')
        # Show the baseline next to each metric it actually provides.
        if k in baseline:
            print(f'Baseline {k}: {baseline[k]:.4f}')
print(f"\nBest Performing Model: {best_model_name}")



Optimized Model Performance:

Logistic Regression_Grid Metrics:
Accuracy: 0.6230
Precision: 0.5183
Recall: 0.6230
F1-score: 0.5524
AUC: 0.8648

Logistic Regression_Random Metrics:
Accuracy: 0.6066
Precision: 0.5060
Recall: 0.6066
F1-score: 0.5428
AUC: 0.8697

Decision Tree_Grid Metrics:
Accuracy: 0.5410
Precision: 0.4716
Recall: 0.5410
F1-score: 0.4994
AUC: 0.7178

Decision Tree_Random Metrics:
Accuracy: 0.5410
Precision: 0.4953
Recall: 0.5410
F1-score: 0.5156
AUC: 0.7259

Random Forest_Grid Metrics:
Accuracy: 0.5410
Precision: 0.4169
Recall: 0.5410
F1-score: 0.4709
AUC: 0.7906

Random Forest_Random Metrics:
Accuracy: 0.5410
Precision: 0.4249
Recall: 0.5410
F1-score: 0.4760
AUC: 0.7938

Support Vector Machine_Grid Metrics:
Accuracy: 0.6230
Precision: 0.5135
Recall: 0.6230
F1-score: 0.5486
AUC: 0.8789

Support Vector Machine_Random Metrics:
Accuracy: 0.6230
Precision: 0.5135
Recall: 0.6230
F1-score: 0.5486
AUC: 0.8757

Best Performing Model: Logistic Regression_Grid


In [24]:
# Write the winning model's name, hyperparameters and test metrics to a text report.
output_path = os.path.join(project_dir, 'results', 'best_model_details.txt')

report_lines = [f"Best Performing Model: {best_model_name}", "Best Hyperparameters:"]
report_lines.extend(f"{param}: {value}" for param, value in best_params.items())
report_lines.extend(["", "Performance Metrics:"])  # the empty entry yields the separating blank line
report_lines.extend(f"{k}: {v:.4f}" for k, v in metrics[best_model_name].items())

with open(output_path, 'w') as f:
    f.write("\n".join(report_lines) + "\n")
print(f"Best model details saved to {output_path}")

Best model details saved to D:\Heart_Disease_Project\results\best_model_details.txt
