# Import CSV to dataframe


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, log_loss
import numpy as np
import pickle

In [3]:
df=pd.read_csv("/content/drive/MyDrive/Project/Cardio_Cleaned.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 21 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Height_(cm)                   308854 non-null  float64
 1   Weight_(kg)                   308854 non-null  float64
 2   BMI                           308854 non-null  float64
 3   Alcohol_Consumption           308854 non-null  float64
 4   Fruit_Consumption             308854 non-null  float64
 5   Green_Vegetables_Consumption  308854 non-null  float64
 6   FriedPotato_Consumption       308854 non-null  float64
 7   Age                           308854 non-null  int64  
 8   Checkup_Encoded               308854 non-null  int64  
 9   General_Health_Encoded        308854 non-null  int64  
 10  Exercise_Encoded              308854 non-null  int64  
 11  Heart_Disease_Encoded         308854 non-null  int64  
 12  Skin_Cancer_Encoded           308854 non-nul

In [4]:
df.Heart_Disease_Encoded.value_counts()

Unnamed: 0_level_0,count
Heart_Disease_Encoded,Unnamed: 1_level_1
0,283883
1,24971


# Train Test Split

In [5]:
X = df.drop('Heart_Disease_Encoded', axis=1)
y = df['Heart_Disease_Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
print(y_train.value_counts())
print(y_test.value_counts())

Shape of X_train: (216197, 20)
Shape of X_test: (92657, 20)
Shape of y_train: (216197,)
Shape of y_test: (92657,)
Heart_Disease_Encoded
0    198717
1     17480
Name: count, dtype: int64
Heart_Disease_Encoded
0    85166
1     7491
Name: count, dtype: int64


# Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

base_estimator = DecisionTreeClassifier(random_state=1)

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1],
    'estimator__max_depth': [3,4,5],
    'estimator__class_weight': ['balanced']
}

# Define the AdaBoost classifier with a base estimator
adaboost = AdaBoostClassifier(estimator=base_estimator, random_state=1)

# GridSearchCV setup
grid_search = GridSearchCV(
    estimator=adaboost,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit to your training data
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
print("Best parameters:", grid_search.best_params_)
print("Best f1 score:", grid_search.best_score_)

Best parameters: {'estimator__class_weight': 'balanced', 'estimator__max_depth': 4, 'learning_rate': 0.1, 'n_estimators': 50}
Best recall score: 0.3178142022982561


In [None]:
best_adaboost_model = grid_search.best_estimator_

y_pred = best_adaboost_model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[61741 23425]
 [ 1622  5869]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.72      0.83     85166
           1       0.20      0.78      0.32      7491

    accuracy                           0.73     92657
   macro avg       0.59      0.75      0.58     92657
weighted avg       0.91      0.73      0.79     92657



In [None]:
print(best_adaboost_model)

AdaBoostClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                    max_depth=4,
                                                    random_state=42),
                   learning_rate=0.1, random_state=42)


# Saving Adaboost model

In [None]:
filename = 'best_adaboost_model.pkl'
pickle.dump(best_adaboost_model, open(filename, 'wb'))

print(f"Model saved to {filename}")

Model saved to best_adaboost_model.pkl


# Saving scaler to pickle file

In [None]:
filename = 'scaler.pkl'
pickle.dump(scaler, open(filename, 'wb'))

print(f"Scaler saved to {filename}")

Scaler saved to scaler.pkl


# Feature Importance

In [None]:
feature_names = [
    'height', 'weight', 'bmi', 'alcohol_consumption', 'fruit_consumption',
    'green_vegetables_consumption', 'fried_potato_consumption', 'age',
    'checkup', 'general_health', 'exercise', 'skin_cancer', 'other_cancer',
    'depression', 'arthritis', 'diabetes', 'smoking_history', 'female', 'male',
    'bmi_category'
]

importances = best_adaboost_model.feature_importances_

feature_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(feature_df)

                         Feature  Importance
7                            age    0.602878
9                 general_health    0.311735
18                          male    0.027912
14                     arthritis    0.020061
15                      diabetes    0.018055
17                        female    0.015258
16               smoking_history    0.002849
11                   skin_cancer    0.001140
8                        checkup    0.000066
13                    depression    0.000045
1                         weight    0.000000
0                         height    0.000000
3            alcohol_consumption    0.000000
2                            bmi    0.000000
5   green_vegetables_consumption    0.000000
6       fried_potato_consumption    0.000000
10                      exercise    0.000000
4              fruit_consumption    0.000000
12                  other_cancer    0.000000
19                  bmi_category    0.000000


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


# CatBoost Model Training & Fine Tuning using GridsearchCV

In [None]:
from catboost import CatBoostClassifier

# Define the parameter grid for GridSearchCV
param_grid = {
    'iterations': [100, 250, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5] # L2 regularization
}

# Initialize CatBoost Classifier
catboost_base = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='Accuracy',
    random_seed=1,
    verbose=0,
    auto_class_weights='Balanced'
)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=catboost_base,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)

print("Starting GridSearchCV for CatBoost hyperparameter tuning...")
# Fit GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters found: {best_params}")
print(f"Best cross-validation accuracy: {best_score:.4f}")

best_catboost_model = grid_search.best_estimator_
print(best_catboost_model)

Starting GridSearchCV for CatBoost hyperparameter tuning...
GridSearchCV complete.
Best parameters found: {'depth': 8, 'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.1}
Best cross-validation accuracy: 0.7769
<catboost.core.CatBoostClassifier object at 0x78d55578c5d0>


# CatBoost Report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions on the scaled test data
y_pred = best_catboost_model.predict(X_test_scaled)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[65149 20017]
 [ 2041  5450]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.76      0.86     85166
           1       0.21      0.73      0.33      7491

    accuracy                           0.76     92657
   macro avg       0.59      0.75      0.59     92657
weighted avg       0.91      0.76      0.81     92657



# Feature Importances

In [None]:
feature_names = [
    'height', 'weight', 'bmi', 'alcohol_consumption', 'fruit_consumption',
    'green_vegetables_consumption', 'fried_potato_consumption', 'age',
    'checkup', 'general_health', 'exercise', 'skin_cancer', 'other_cancer',
    'depression', 'arthritis', 'diabetes', 'smoking_history', 'female', 'male',
    'bmi_category'
]
importances = best_catboost_model.feature_importances_

feature_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(feature_df)

                         Feature  Importance
7                            age   16.162024
2                            bmi   10.400043
1                         weight    8.754322
9                 general_health    8.682649
6       fried_potato_consumption    7.561811
5   green_vegetables_consumption    7.514105
3            alcohol_consumption    6.945360
4              fruit_consumption    6.827687
0                         height    6.326867
8                        checkup    4.101227
16               smoking_history    2.891472
14                     arthritis    2.550504
18                          male    2.377869
13                    depression    1.813917
10                      exercise    1.687607
15                      diabetes    1.585604
19                  bmi_category    1.072013
17                        female    0.982421
12                  other_cancer    0.944639
11                   skin_cancer    0.817859


# Saving catboost model

In [None]:
filename = 'best_catboost_model.pkl'
pickle.dump(best_catboost_model, open(filename, 'wb'))

print(f"Model saved to {filename}")

Model saved to best_catboost_model.pkl


# XGBoost Model Training & Fine Tuning using GridsearchCV

In [None]:
from xgboost import XGBClassifier
param_grid_xgb = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_base = XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='recall')

grid_search_xgb = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid_xgb,
    cv=3,
    scoring='recall',
    n_jobs=-1
)

print("Starting GridSearchCV for XGBoost hyperparameter tuning...")
grid_search_xgb.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")

best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_

print(f"Best parameters found for XGBoost: {best_params_xgb}")
print(f"Best cross-validation accuracy for XGBoost: {best_score_xgb:.4f}")

best_xgboost_model = grid_search_xgb.best_estimator_
print(best_xgboost_model)

Starting GridSearchCV for XGBoost hyperparameter tuning...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


GridSearchCV complete.
Best parameters found for XGBoost: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.6}
Best cross-validation accuracy for XGBoost: 0.9196
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, ...)


# XGBoost Report & Feature Importances

In [None]:
y_pred_xg = best_xgboost_model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xg))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xg))

feature_names = [
    'height', 'weight', 'bmi', 'alcohol_consumption', 'fruit_consumption',
    'green_vegetables_consumption', 'fried_potato_consumption', 'age',
    'checkup', 'general_health', 'exercise', 'skin_cancer', 'other_cancer',
    'depression', 'arthritis', 'diabetes', 'smoking_history', 'female', 'male',
    'bmi_category'
]

importances = best_xgboost_model.feature_importances_

feature_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(feature_df)

Confusion Matrix:
 [[84945   221]
 [ 7215   276]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     85166
           1       0.56      0.04      0.07      7491

    accuracy                           0.92     92657
   macro avg       0.74      0.52      0.51     92657
weighted avg       0.89      0.92      0.89     92657

                         Feature  Importance
7                            age    0.192897
9                 general_health    0.184772
15                      diabetes    0.161239
14                     arthritis    0.093448
17                        female    0.074359
16               smoking_history    0.049046
8                        checkup    0.037969
18                          male    0.035696
11                   skin_cancer    0.033623
10                      exercise    0.023250
12                  other_cancer    0.017508
3            alcohol_consumption    0.016956
13           

# Saving xgboost model

In [None]:
filename = 'best_xgboost_model.pkl'
pickle.dump(best_xgboost_model, open(filename, 'wb'))

print(f"Model saved to {filename}")

Model saved to best_xgboost_model.pkl


# RandomForest training and Fine Tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4],

}
rf_base = RandomForestClassifier(random_state=1,class_weight='balanced')

grid_search_rf = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid_rf,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)

print("Starting GridSearchCV for Random Forest hyperparameter tuning...")
grid_search_rf.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")

best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

print(f"Best parameters found for Random Forest: {best_params_rf}")
print(f"Best cross-validation accuracy for Random Forest: {best_score_rf:.4f}")

best_rf_model = grid_search_rf.best_estimator_
print(best_rf_model)

Starting GridSearchCV for Random Forest hyperparameter tuning...
GridSearchCV complete.
Best parameters found for Random Forest: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation accuracy for Random Forest: 0.7520
RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=2, n_estimators=200, random_state=1)


# RandomForest Report & Feature Importances

In [None]:
y_pred_rf = best_rf_model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

feature_names = [
    'height', 'weight', 'bmi', 'alcohol_consumption', 'fruit_consumption',
    'green_vegetables_consumption', 'fried_potato_consumption', 'age',
    'checkup', 'general_health', 'exercise', 'skin_cancer', 'other_cancer',
    'depression', 'arthritis', 'diabetes', 'smoking_history', 'female', 'male',
    'bmi_category'
]

importances = best_rf_model.feature_importances_

feature_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(feature_df)

Confusion Matrix:
 [[63059 22107]
 [ 1725  5766]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.74      0.84     85166
           1       0.21      0.77      0.33      7491

    accuracy                           0.74     92657
   macro avg       0.59      0.76      0.58     92657
weighted avg       0.91      0.74      0.80     92657

                         Feature  Importance
7                            age    0.326447
9                 general_health    0.219256
15                      diabetes    0.081987
14                     arthritis    0.073772
16               smoking_history    0.040802
8                        checkup    0.036058
17                        female    0.027387
2                            bmi    0.023685
18                          male    0.022840
3            alcohol_consumption    0.021356
1                         weight    0.021273
0                         height    0.018321
10           

# Saving RandomForest model

In [None]:
filename = 'best_rf_model.pkl'
pickle.dump(best_rf_model, open(filename, 'wb'))

print(f"Model saved to {filename}")

Model saved to best_rf_model.pkl


# Decision Tree Model Fine Tuning

In [None]:
from sklearn.tree import DecisionTreeClassifier
param_grid_dt = {
    'max_depth': [3, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}
dt_base = DecisionTreeClassifier(random_state=1,class_weight='balanced')

grid_search_dt = GridSearchCV(
    estimator=dt_base,
    param_grid=param_grid_dt,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)

print("Starting GridSearchCV for Decision Tree hyperparameter tuning...")
grid_search_dt.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")

best_params_dt = grid_search_dt.best_params_
best_score_dt = grid_search_dt.best_score_

print(f"Best parameters found for Decision Tree: {best_params_dt}")
print(f"Best cross-validation accuracy for Decision Tree: {best_score_dt:.4f}")

best_dt_model = grid_search_dt.best_estimator_
print(best_dt_model)

Starting GridSearchCV for Decision Tree hyperparameter tuning...
GridSearchCV complete.
Best parameters found for Decision Tree: {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best cross-validation accuracy for Decision Tree: 0.7558
DecisionTreeClassifier(class_weight='balanced', max_depth=15, random_state=1)


# Decision Tree Report & Feature Importances

In [None]:
y_pred_rf = best_rf_model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

feature_names = [
    'height', 'weight', 'bmi', 'alcohol_consumption', 'fruit_consumption',
    'green_vegetables_consumption', 'fried_potato_consumption', 'age',
    'checkup', 'general_health', 'exercise', 'skin_cancer', 'other_cancer',
    'depression', 'arthritis', 'diabetes', 'smoking_history', 'female', 'male',
    'bmi_category'
]
importances = best_rf_model.feature_importances_

feature_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(feature_df)

Confusion Matrix:
 [[63059 22107]
 [ 1725  5766]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.74      0.84     85166
           1       0.21      0.77      0.33      7491

    accuracy                           0.74     92657
   macro avg       0.59      0.76      0.58     92657
weighted avg       0.91      0.74      0.80     92657

                         Feature  Importance
7                            age    0.326447
9                 general_health    0.219256
15                      diabetes    0.081987
14                     arthritis    0.073772
16               smoking_history    0.040802
8                        checkup    0.036058
17                        female    0.027387
2                            bmi    0.023685
18                          male    0.022840
3            alcohol_consumption    0.021356
1                         weight    0.021273
0                         height    0.018321
10           

# Saving Decision Tree Model

In [None]:
filename = 'best_dt_model.pkl'
pickle.dump(best_dt_model, open(filename, 'wb'))

print(f'model saved to {filename}')

model saved to best_dt_model.pkl


# Logistic Regression Model Training & Fine Tuning using GridsearchCV

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pickle

param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs'],
    'class_weight': ['balanced']
}

lr_base = LogisticRegression(random_state=1)

grid_search_lr = GridSearchCV(
    estimator=lr_base,
    param_grid=param_grid_lr,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)

print("Starting GridSearchCV for Logistic Regression hyperparameter tuning...")
grid_search_lr.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")

best_params_lr = grid_search_lr.best_params_
best_score_lr = grid_search_lr.best_score_

print(f"Best parameters found for Logistic Regression: {best_params_lr}")
print(f"Best cross-validation accuracy for Logistic Regression: {best_score_lr:.4f}")

best_lr_model = grid_search_lr.best_estimator_
print(best_lr_model)

Starting GridSearchCV for Logistic Regression hyperparameter tuning...
GridSearchCV complete.
Best parameters found for Logistic Regression: {'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'lbfgs'}
Best cross-validation accuracy for Logistic Regression: 0.7363
LogisticRegression(C=0.001, class_weight='balanced', random_state=1)


# Logistic Regression Report & feature importances

In [None]:
from sklearn.metrics import roc_auc_score

y_pred_lr = best_lr_model.predict(X_test_scaled)
roc_auc = roc_auc_score(y_test, best_lr_model.predict_proba(X_test_scaled)[:,1])

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print("ROC-AUC Score:", roc_auc)

importances = pd.Series(best_lr_model.coef_[0], index=X_train.columns).sort_values(ascending=False)
print("Feature Coefficients: ")
print(importances)

Confusion Matrix:
 [[62186 22980]
 [ 1633  5858]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.73      0.83     85166
           1       0.20      0.78      0.32      7491

    accuracy                           0.73     92657
   macro avg       0.59      0.76      0.58     92657
weighted avg       0.91      0.73      0.79     92657

ROC-AUC Score: 0.8326064906644363
Feature Coefficients: 
Age                             0.962858
Male                            0.226399
Smoking_History_Encoded         0.207286
Diabetes_Encoded                0.197190
Checkup_Encoded                 0.162950
Arthritis_Encoded               0.142796
Depression_Encoded              0.105307
Skin_Cancer_Encoded             0.038618
Other_Cancer_Encoded            0.034726
Green_Vegetables_Consumption    0.019137
BMI                             0.005974
Fruit_Consumption              -0.009111
Exercise_Encoded               -0.018065
FriedPot

# Saving Logistic Regression Model

In [None]:
filename = 'best_lr_model.pkl'
pickle.dump(best_lr_model, open(filename, 'wb'))

print(f'model saved to {filename}')

model saved to best_lr_model.pkl


# K-Nearest Neighbors (KNN) Model Training & Fine Tuning using GridsearchCV

In [None]:
from sklearn.neighbors import KNeighborsClassifier
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn_base = KNeighborsClassifier()

grid_search_knn = GridSearchCV(
    estimator=knn_base,
    param_grid=param_grid_knn,
    cv=3,
    scoring='recall',
    n_jobs=-1
)

print("Starting GridSearchCV for KNN hyperparameter tuning...")
grid_search_knn.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")
best_params_knn = grid_search_knn.best_params_
best_score_knn = grid_search_knn.best_score_

print(f"Best parameters found for KNN: {best_params_knn}")
print(f"Best cross-validation ROC AUC for KNN: {best_score_knn:.4f}")

best_knn_model = grid_search_knn.best_estimator_
print(best_knn_model)

Starting GridSearchCV for KNN hyperparameter tuning...
GridSearchCV complete.
Best parameters found for KNN: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best cross-validation ROC AUC for KNN: 0.1359
KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance')


# KNN Report

In [None]:
y_pred_knn = best_knn_model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_knn))
print("ROC-AUC Score:", roc_auc_score(y_test, best_knn_model.predict_proba(X_test_scaled)[:,1]))

Confusion Matrix:
 [[82402  2764]
 [ 6542   949]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95     85166
           1       0.26      0.13      0.17      7491

    accuracy                           0.90     92657
   macro avg       0.59      0.55      0.56     92657
weighted avg       0.87      0.90      0.88     92657

ROC-AUC Score: 0.6382877011847168


# Saving KNN Model

In [None]:
filename = 'best_knn_model.pkl'
pickle.dump(best_knn_model, open(filename, 'wb'))

print(f'model saved to {filename}')

model saved to best_knn_model.pkl


# Naive Bayes Model Training & Fine Tuning using GridsearchCV

In [None]:
from sklearn.naive_bayes import GaussianNB
import numpy as np

param_grid_nb = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

nb_base = GaussianNB()

grid_search_nb = GridSearchCV(
    estimator=nb_base,
    param_grid=param_grid_nb,
    cv=3,
    scoring='precision', # Using roc_auc for scoring
    n_jobs=-1
)

print("Starting GridSearchCV for Naive Bayes hyperparameter tuning...")
grid_search_nb.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")

best_params_nb = grid_search_nb.best_params_
best_score_nb = grid_search_nb.best_score_

print(f"Best parameters found for Naive Bayes: {best_params_nb}")
print(f"Best cross-validation ROC AUC for Naive Bayes: {best_score_nb:.4f}")

best_nb_model = grid_search_nb.best_estimator_
print(best_nb_model)

Starting GridSearchCV for Naive Bayes hyperparameter tuning...
GridSearchCV complete.
Best parameters found for Naive Bayes: {'var_smoothing': np.float64(1.0)}
Best cross-validation ROC AUC for Naive Bayes: 0.3521
GaussianNB(var_smoothing=np.float64(1.0))


# Naive Bayes Report

In [None]:
y_pred_nb = best_nb_model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_nb))
print("ROC-AUC Score:", roc_auc_score(y_test, best_nb_model.predict_proba(X_test_scaled)[:,1]))

Confusion Matrix:
 [[82888  2278]
 [ 6256  1235]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95     85166
           1       0.35      0.16      0.22      7491

    accuracy                           0.91     92657
   macro avg       0.64      0.57      0.59     92657
weighted avg       0.88      0.91      0.89     92657

ROC-AUC Score: 0.8053019594989302


# Saving Naive Bayes Model

In [None]:
filename = 'best_nb_model.pkl'
pickle.dump(best_nb_model, open(filename, 'wb'))

print(f'model saved to {filename}')

model saved to best_nb_model.pkl


# Support Vector Machine (SVM) Model Training

In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=1)
X_train_undersampled, y_train_undersampled = rus.fit_resample(X_train_scaled, y_train)

print("Shape of original X_train_scaled:", X_train_scaled.shape)
print("Shape of undersampled X_train_scaled:", X_train_undersampled.shape)
print("Shape of original y_train:", y_train.shape)
print("Shape of undersampled y_train:", y_train_undersampled.shape)
print("\nValue counts of original y_train:\n", y_train.value_counts())
print("\nValue counts of undersampled y_train:\n", y_train_undersampled.value_counts())

Shape of original X_train_scaled: (216197, 20)
Shape of undersampled X_train_scaled: (34960, 20)
Shape of original y_train: (216197,)
Shape of undersampled y_train: (34960,)

Value counts of original y_train:
 Heart_Disease_Encoded
0    198717
1     17480
Name: count, dtype: int64

Value counts of undersampled y_train:
 Heart_Disease_Encoded
0    17480
1    17480
Name: count, dtype: int64


In [None]:
from sklearn.svm import SVC

best_svm_model = SVC(C=1, gamma='scale', kernel='rbf', class_weight='balanced',random_state=1, probability=True)

best_svm_model.fit(X_train_undersampled, y_train_undersampled)
print(best_svm_model)

SVC(C=1, class_weight='balanced', probability=True, random_state=1)


# SVM Report

In [None]:
y_pred_svm = best_svm_model.predict(X_test_scaled)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("ROC-AUC Score:", roc_auc_score(y_test, best_svm_model.predict_proba(X_test_scaled)[:,1]))

Confusion Matrix:
 [[59492 25674]
 [ 1382  6109]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.70      0.81     85166
           1       0.19      0.82      0.31      7491

    accuracy                           0.71     92657
   macro avg       0.58      0.76      0.56     92657
weighted avg       0.91      0.71      0.77     92657

ROC-AUC Score: 0.8252058949459342


# Saving SVM Model

In [None]:
filename = 'best_svm_model.pkl'
pickle.dump(best_svm_model, open(filename, 'wb'))

print(f'model saved to {filename}')

model saved to best_svm_model.pkl


In [None]:
filename = '/content/drive/MyDrive/Project/best_adaboost_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

LogLoss : 0.5668
ROC AUC : 0.8147


In [None]:
!pip install catboost
import catboost
filename = '/content/drive/MyDrive/Project/best_catboost_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
LogLoss : 0.4476
ROC AUC : 0.8210


In [None]:
filename = '/content/drive/MyDrive/Project/best_xgboost_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

LogLoss : 0.4973
ROC AUC : 0.8353


In [None]:
filename = '/content/drive/MyDrive/Project/best_rf_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

LogLoss : 0.4767
ROC AUC : 0.8314


In [None]:
filename = '/content/drive/MyDrive/Project/best_dt_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

LogLoss : 1.3250
ROC AUC : 0.7192


In [None]:
filename = '/content/drive/MyDrive/Project/best_lr_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

LogLoss : 0.5095
ROC AUC : 0.8326


In [7]:
filename = '/content/drive/MyDrive/Project/best_nb_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

LogLoss : 0.2462
ROC AUC : 0.8053


In [8]:
filename = '/content/drive/MyDrive/Project/best_svm_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

LogLoss : 0.5211
ROC AUC : 0.8252


In [9]:
filename = '/content/drive/MyDrive/Project/best_knn_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred_proba = loaded_model.predict_proba(X_test_scaled)[:, 1]

logloss = log_loss(y_test, y_pred_proba)
print(f"LogLoss : {logloss:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC : {roc_auc:.4f}")

LogLoss : 1.8308
ROC AUC : 0.6383
