In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')


In [2]:
DATA_PATH = "heart_attack_youngsters_india.csv"

df = pd.read_csv(DATA_PATH)
df.columns = [c.strip() for c in df.columns]  

print("Columns:", df.columns.tolist())
print("Shape:", df.shape)

TARGET = "Heart Attack Likelihood"   

df[TARGET] = df[TARGET].astype(str).str.strip().str.capitalize()
y = df[TARGET].map({"No": 0, "Yes": 1}).values

X = df.drop(columns=[TARGET]).copy()

bp_col = None
for c in X.columns:
    if "Blood Pressure" in c or "systolic" in c.lower():
        bp_col = c
        break

if bp_col is not None:
    def split_bp(val):
        try:
            s = str(val)
            if "/" in s:
                a, b = s.split("/")
                return float(a), float(b)
            else:
                return np.nan, np.nan
        except:
            return np.nan, np.nan

    bp_parsed = X[bp_col].apply(split_bp)
    X["BP_systolic"] = bp_parsed.apply(lambda x: x[0])
    X["BP_diastolic"] = bp_parsed.apply(lambda x: x[1])
    X.drop(columns=[bp_col], inplace=True)

print("Final feature shape:", X.shape)
X.head()


Columns: ['Age', 'Gender', 'Region', 'Urban/Rural', 'SES', 'Smoking Status', 'Alcohol Consumption', 'Diet Type', 'Physical Activity Level', 'Screen Time (hrs/day)', 'Sleep Duration (hrs/day)', 'Family History of Heart Disease', 'Diabetes', 'Hypertension', 'Cholesterol Levels (mg/dL)', 'BMI (kg/m²)', 'Stress Level', 'Blood Pressure (systolic/diastolic mmHg)', 'Resting Heart Rate (bpm)', 'ECG Results', 'Chest Pain Type', 'Maximum Heart Rate Achieved', 'Exercise Induced Angina', 'Blood Oxygen Levels (SpO2%)', 'Triglyceride Levels (mg/dL)', 'Heart Attack Likelihood']
Shape: (10000, 26)
Final feature shape: (10000, 26)


Unnamed: 0,Age,Gender,Region,Urban/Rural,SES,Smoking Status,Alcohol Consumption,Diet Type,Physical Activity Level,Screen Time (hrs/day),...,Stress Level,Resting Heart Rate (bpm),ECG Results,Chest Pain Type,Maximum Heart Rate Achieved,Exercise Induced Angina,Blood Oxygen Levels (SpO2%),Triglyceride Levels (mg/dL),BP_systolic,BP_diastolic
0,30,Male,East,Urban,Middle,Never,Regularly,Non-Vegetarian,Sedentary,3,...,High,82,Normal,Non-anginal,183,No,94.1,58,177.0,63.1
1,24,Female,East,Urban,Low,Occasionally,Occasionally,Non-Vegetarian,Sedentary,15,...,High,76,Normal,Non-anginal,118,No,97.1,341,137.5,110.7
2,24,Female,North,Urban,Low,Occasionally,Occasionally,Vegan,High,15,...,Low,86,Normal,Typical,164,No,92.7,373,138.3,76.6
3,27,Male,East,Urban,Middle,Occasionally,Never,Vegetarian,Sedentary,6,...,Medium,106,Normal,Non-anginal,188,No,98.4,102,177.1,90.0
4,21,Female,West,Rural,Low,Occasionally,Occasionally,Vegetarian,Moderate,4,...,Low,73,Normal,Atypical,216,No,94.9,235,130.7,108.8


In [3]:
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, categorical_cols)
], remainder="drop")


Numeric columns: ['Age', 'Screen Time (hrs/day)', 'Sleep Duration (hrs/day)', 'Cholesterol Levels (mg/dL)', 'BMI (kg/m²)', 'Resting Heart Rate (bpm)', 'Maximum Heart Rate Achieved', 'Blood Oxygen Levels (SpO2%)', 'Triglyceride Levels (mg/dL)', 'BP_systolic', 'BP_diastolic']
Categorical columns: ['Gender', 'Region', 'Urban/Rural', 'SES', 'Smoking Status', 'Alcohol Consumption', 'Diet Type', 'Physical Activity Level', 'Family History of Heart Disease', 'Diabetes', 'Hypertension', 'Stress Level', 'ECG Results', 'Chest Pain Type', 'Exercise Induced Angina']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    class_weight="balanced",   
    random_state=42,
    n_jobs=-1
)

rf_pipeline = Pipeline([
    ("prep", preprocessor),
    ("rf", rf_clf)
])

rf_pipeline.fit(X_train, y_train)

y_pred = rf_pipeline.predict(X_test)

print("Baseline Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Baseline Random Forest Accuracy: 0.796

Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89      1592
           1       0.00      0.00      0.00       408

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.63      0.80      0.71      2000

Confusion Matrix:
 [[1592    0]
 [ 408    0]]


In [5]:
param_dist = {
    "rf__n_estimators": [200, 300, 400, 500],
    "rf__max_depth": [None, 5, 10, 15, 20],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4],
    "rf__max_features": ["sqrt", "log2", 0.5, 0.7],
    "rf__bootstrap": [True, False]
}

rand_search = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=param_dist,
    n_iter=30,                  
    scoring="accuracy",          
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

rand_search.fit(X_train, y_train)

print("Best Accuracy (CV):", rand_search.best_score_)
print("Best Params:", rand_search.best_params_)

best_model = rand_search.best_estimator_

y_pred_best = best_model.predict(X_test)

print("\nTest Accuracy (best RF):", accuracy_score(y_test, y_pred_best))
print("\nClassification Report (best RF):\n", classification_report(y_test, y_pred_best))
print("Confusion Matrix (best RF):\n", confusion_matrix(y_test, y_pred_best))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Accuracy (CV): 0.79625
Best Params: {'rf__n_estimators': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 4, 'rf__max_features': 'sqrt', 'rf__max_depth': 20, 'rf__bootstrap': True}

Test Accuracy (best RF): 0.796

Classification Report (best RF):
               precision    recall  f1-score   support

           0       0.80      1.00      0.89      1592
           1       0.00      0.00      0.00       408

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.63      0.80      0.71      2000

Confusion Matrix (best RF):
 [[1592    0]
 [ 408    0]]


In [6]:
ohe = best_model.named_steps["prep"].named_transformers_["cat"].named_steps["ohe"]
cat_feature_names = ohe.get_feature_names_out(categorical_cols)
feature_names = np.concatenate([numeric_cols, cat_feature_names])

rf_best = best_model.named_steps["rf"]

importances = rf_best.feature_importances_
idx = np.argsort(importances)[::-1]

print("Top 20 important features:\n")
for i in idx[:20]:
    print(f"{feature_names[i]}: {importances[i]:.4f}")


Top 20 important features:

BP_diastolic: 0.0722
BP_systolic: 0.0719
Cholesterol Levels (mg/dL): 0.0712
Triglyceride Levels (mg/dL): 0.0707
BMI (kg/m²): 0.0704
Blood Oxygen Levels (SpO2%): 0.0674
Maximum Heart Rate Achieved: 0.0665
Resting Heart Rate (bpm): 0.0631
Age: 0.0506
Screen Time (hrs/day): 0.0479
Sleep Duration (hrs/day): 0.0353
Diet Type_Non-Vegetarian: 0.0103
Smoking Status_Never: 0.0103
Physical Activity Level_Sedentary: 0.0100
SES_Middle: 0.0099
SES_Low: 0.0098
Physical Activity Level_Moderate: 0.0096
Diet Type_Vegetarian: 0.0096
Alcohol Consumption_Never: 0.0093
Stress Level_Medium: 0.0093


In [7]:
from collections import Counter

X_train_os = X_train.copy()
y_train_os = y_train.copy()

counter = Counter(y_train)
max_count = max(counter.values())

for cls in counter:
    cls_idx = np.where(y_train == cls)[0]
    reps = max_count - len(cls_idx)
    if reps > 0:
        extra_idx = np.random.choice(cls_idx, size=reps, replace=True)
        X_train_os = pd.concat([X_train_os, X_train.iloc[extra_idx]])
        y_train_os = np.concatenate([y_train_os, y_train[extra_idx]])

print("Before balancing:", Counter(y_train))
print("After balancing :", Counter(y_train_os))


Before balancing: Counter({0: 6370, 1: 1630})
After balancing : Counter({0: 6370, 1: 6370})


In [8]:
rf_balanced = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=4,
    max_features="sqrt",
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)

rf_balanced_pipeline = Pipeline([
    ("prep", preprocessor),
    ("rf", rf_balanced)
])

rf_balanced_pipeline.fit(X_train_os, y_train_os)

y_pred_bal = rf_balanced_pipeline.predict(X_test)

print("Balanced RF Accuracy:", accuracy_score(y_test, y_pred_bal))
print("\nClassification Report (Balanced RF):\n", classification_report(y_test, y_pred_bal))
print("Confusion Matrix (Balanced RF):\n", confusion_matrix(y_test, y_pred_bal))


Balanced RF Accuracy: 0.795

Classification Report (Balanced RF):
               precision    recall  f1-score   support

           0       0.80      1.00      0.89      1592
           1       0.00      0.00      0.00       408

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.63      0.80      0.71      2000

Confusion Matrix (Balanced RF):
 [[1590    2]
 [ 408    0]]


In [9]:
y_prob = rf_balanced_pipeline.predict_proba(X_test)[:,1]

threshold = 0.35   
y_pred_tuned = (y_prob >= threshold).astype(int)

print("Threshold:", threshold)
print("\nTuned Classification Report:\n", classification_report(y_test, y_pred_tuned))
print("Tuned Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tuned))


Threshold: 0.35

Tuned Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.53      0.64      1592
           1       0.22      0.52      0.31       408

    accuracy                           0.53      2000
   macro avg       0.52      0.52      0.47      2000
weighted avg       0.69      0.53      0.57      2000

Tuned Confusion Matrix:
 [[842 750]
 [196 212]]


In [12]:
import joblib

rf_pipeline.fit(X_train, y_train)
joblib.dump(rf_pipeline, "Heart_Disease_RF.pkl")


['Heart_Disease_RF.pkl']