In [1]:
# Basic imports & load
import warnings
warnings.filterwarnings('ignore')
import numpy as np, pandas as pd, os, joblib, json
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

DATA_PATH = r"heart_attack_youngsters_india.csv"

df = pd.read_csv(DATA_PATH)
df.columns = [c.strip() for c in df.columns]

print('Data shape:', df.shape)
df.head()

Data shape: (10000, 26)


Unnamed: 0,Age,Gender,Region,Urban/Rural,SES,Smoking Status,Alcohol Consumption,Diet Type,Physical Activity Level,Screen Time (hrs/day),...,Stress Level,Blood Pressure (systolic/diastolic mmHg),Resting Heart Rate (bpm),ECG Results,Chest Pain Type,Maximum Heart Rate Achieved,Exercise Induced Angina,Blood Oxygen Levels (SpO2%),Triglyceride Levels (mg/dL),Heart Attack Likelihood
0,30,Male,East,Urban,Middle,Never,Regularly,Non-Vegetarian,Sedentary,3,...,High,177.0/63.1,82,Normal,Non-anginal,183,No,94.1,58,No
1,24,Female,East,Urban,Low,Occasionally,Occasionally,Non-Vegetarian,Sedentary,15,...,High,137.5/110.7,76,Normal,Non-anginal,118,No,97.1,341,No
2,24,Female,North,Urban,Low,Occasionally,Occasionally,Vegan,High,15,...,Low,138.3/76.6,86,Normal,Typical,164,No,92.7,373,Yes
3,27,Male,East,Urban,Middle,Occasionally,Never,Vegetarian,Sedentary,6,...,Medium,177.1/90.0,106,Normal,Non-anginal,188,No,98.4,102,Yes
4,21,Female,West,Rural,Low,Occasionally,Occasionally,Vegetarian,Moderate,4,...,Low,130.7/108.8,73,Normal,Atypical,216,No,94.9,235,No


In [2]:
# Target handling & BP split
TARGET = 'Heart Attack Likelihood'
df[TARGET] = df[TARGET].astype(str).str.strip().str.capitalize()
le_target = LabelEncoder()
y = le_target.fit_transform(df[TARGET])  # 0/1 mapping
df = df.drop(columns=[TARGET])

# split blood pressure column if exists
bp_col = None
for c in df.columns:
    if 'Blood Pressure' in c or 'systolic' in c.lower():
        bp_col = c
        break
if bp_col is not None:
    def split_bp(val):
        try:
            s = str(val)
            if '/' in s:
                a,b = s.split('/')
                return float(a), float(b)
            else:
                return np.nan, np.nan
        except:
            return np.nan, np.nan
    bp_parsed = df[bp_col].apply(split_bp)
    df['BP_systolic'] = bp_parsed.apply(lambda x: x[0])
    df['BP_diastolic'] = bp_parsed.apply(lambda x: x[1])
    df = df.drop(columns=[bp_col])

numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object','category','bool']).columns.tolist()

# drop columns with >80% missing (if any)
high_missing = [c for c in df.columns if df[c].isna().mean() > 0.8]
if high_missing:
    df = df.drop(columns=high_missing)
    numeric_cols = [c for c in numeric_cols if c not in high_missing]
    categorical_cols = [c for c in categorical_cols if c not in high_missing]

print('Numeric cols:', numeric_cols)
print('Categorical cols:', categorical_cols)

Numeric cols: ['Age', 'Screen Time (hrs/day)', 'Sleep Duration (hrs/day)', 'Cholesterol Levels (mg/dL)', 'BMI (kg/m²)', 'Resting Heart Rate (bpm)', 'Maximum Heart Rate Achieved', 'Blood Oxygen Levels (SpO2%)', 'Triglyceride Levels (mg/dL)', 'BP_systolic', 'BP_diastolic']
Categorical cols: ['Gender', 'Region', 'Urban/Rural', 'SES', 'Smoking Status', 'Alcohol Consumption', 'Diet Type', 'Physical Activity Level', 'Family History of Heart Disease', 'Diabetes', 'Hypertension', 'Stress Level', 'ECG Results', 'Chest Pain Type', 'Exercise Induced Angina']


In [3]:
# Preprocessor: median impute + standard scale for numeric, most-frequent + one-hot for categorical

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, categorical_cols)
], remainder='drop')

# Fit and transform
X = df.copy()
preprocessor.fit(X)
X_pre = preprocessor.transform(X)
input_dim = X_pre.shape[1]

print("Transformed shape:", X_pre.shape)

Transformed shape: (10000, 54)


In [4]:
# Create out-of-fold probabilities for NB and MLP (sklearn) using StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
nb_oof_proba = np.zeros((X_pre.shape[0], len(np.unique(y))))
ann_oof_proba = np.zeros_like(nb_oof_proba)

fold = 1
for train_idx, val_idx in skf.split(X_pre, y):
    X_tr_f, X_val_f = X_pre[train_idx], X_pre[val_idx]
    y_tr_f, y_val_f = y[train_idx], y[val_idx]
    # GaussianNB
    nb = GaussianNB(); nb.fit(X_tr_f, y_tr_f)
    nb_oof_proba[val_idx] = nb.predict_proba(X_val_f)
    # MLPClassifier with simple random oversampling by duplication to balance classes in fold
    unique, counts = np.unique(y_tr_f, return_counts=True)
    max_count = max(counts)
    X_tr_bal = X_tr_f.copy(); y_tr_bal = y_tr_f.copy()
    for cls in unique:
        idxs = np.where(y_tr_f==cls)[0]
        reps = max_count - len(idxs)
        if reps > 0:
            choice = np.random.choice(idxs, size=reps, replace=True)
            X_tr_bal = np.vstack([X_tr_bal, X_tr_f[choice]])
            y_tr_bal = np.concatenate([y_tr_bal, y_tr_f[choice]])
    perm = np.random.permutation(len(y_tr_bal))
    X_tr_bal = X_tr_bal[perm]; y_tr_bal = y_tr_bal[perm]
    mlp = MLPClassifier(hidden_layer_sizes=(128,64), activation='relu', max_iter=300, early_stopping=True, random_state=42)
    mlp.fit(X_tr_bal, y_tr_bal)
    ann_oof_proba[val_idx] = mlp.predict_proba(X_val_f)
    print(f'Fold {fold} done.')
    fold += 1

# Stack features: use second-column probability (prob of positive class) for binary case
if nb_oof_proba.shape[1] == 2:
    nb_feat = nb_oof_proba[:,1].reshape(-1,1)
    ann_feat = ann_oof_proba[:,1].reshape(-1,1)
else:
    nb_feat = nb_oof_proba; ann_feat = ann_oof_proba
stack_features = np.hstack([nb_feat, ann_feat])

# Train meta classifier
meta_clf = LogisticRegression(max_iter=1000)
meta_clf.fit(stack_features, y)

# OOF performance
stack_oof_preds = meta_clf.predict(stack_features)
acc_oof = accuracy_score(y, stack_oof_preds)
f1_oof = f1_score(y, stack_oof_preds, average='weighted')
try:
    roc_oof = roc_auc_score(y, stack_features[:,1])
except:
    roc_oof = np.nan
print('\nOOF stacked results — Accuracy: {:.4f}, F1-weighted: {:.4f}, ROC AUC(approx): {:.4f}'.format(acc_oof, f1_oof, roc_oof))

Fold 1 done.
Fold 2 done.
Fold 3 done.
Fold 4 done.
Fold 5 done.

OOF stacked results — Accuracy: 0.7962, F1-weighted: 0.7059, ROC AUC(approx): 0.5018


In [5]:
# Held-out test evaluation (train final models on train split, evaluate on test split)
X_tr, X_val, y_tr, y_val = train_test_split(X_pre, y, test_size=0.2, stratify=y, random_state=42)
nb_final = GaussianNB(); nb_final.fit(X_tr, y_tr); p_nb_val = nb_final.predict_proba(X_val)
# ANN final train with oversampling
unique, counts = np.unique(y_tr, return_counts=True); max_count = max(counts)
X_tr_bal = X_tr.copy(); y_tr_bal = y_tr.copy()
for cls in unique:
    idxs = np.where(y_tr==cls)[0]
    reps = max_count - len(idxs)
    if reps > 0:
        choice = np.random.choice(idxs, size=reps, replace=True)
        X_tr_bal = np.vstack([X_tr_bal, X_tr[choice]])
        y_tr_bal = np.concatenate([y_tr_bal, y_tr[choice]])
perm = np.random.permutation(len(y_tr_bal)); X_tr_bal = X_tr_bal[perm]; y_tr_bal = y_tr_bal[perm]
mlp_final = MLPClassifier(hidden_layer_sizes=(128,64), activation='relu', max_iter=400, early_stopping=True, random_state=42)
mlp_final.fit(X_tr_bal, y_tr_bal)
p_ann_val = mlp_final.predict_proba(X_val)
if nb_oof_proba.shape[1]==2:
    f_nb_val = p_nb_val[:,1].reshape(-1,1)
    f_ann_val = p_ann_val[:,1].reshape(-1,1)
    fstack_val = np.hstack([f_nb_val, f_ann_val])
else:
    fstack_val = np.hstack([p_nb_val, p_ann_val])
y_meta_pred = meta_clf.predict(fstack_val)
y_meta_prob = meta_clf.predict_proba(fstack_val) if hasattr(meta_clf, 'predict_proba') else None

acc_final = accuracy_score(y_val, y_meta_pred)
f1_final = f1_score(y_val, y_meta_pred, average='weighted')
try:
    roc_final = roc_auc_score(y_val, y_meta_prob[:,1])
except:
    roc_final = np.nan

print('\nHeld-out stacked results — Accuracy: {:.4f}, F1-weighted: {:.4f}, ROC AUC: {:.4f}'.format(acc_final, f1_final, roc_final))
print('\nClassification Report (held-out):\n', classification_report(y_val, y_meta_pred, target_names=le_target.classes_))
print('\nConfusion Matrix (held-out):\n', confusion_matrix(y_val, y_meta_pred))


Held-out stacked results — Accuracy: 0.7960, F1-weighted: 0.7056, ROC AUC: 0.4863

Classification Report (held-out):
               precision    recall  f1-score   support

          No       0.80      1.00      0.89      1592
         Yes       0.00      0.00      0.00       408

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.63      0.80      0.71      2000


Confusion Matrix (held-out):
 [[1592    0]
 [ 408    0]]


In [6]:

# Save models & preprocessor
MODEL_DIR = '/mnt/data/models_for_user'
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(preprocessor, os.path.join(MODEL_DIR, 'preprocessor.joblib'))
joblib.dump(nb_final, os.path.join(MODEL_DIR, 'gaussiannb_final.joblib'))
joblib.dump(mlp_final, os.path.join(MODEL_DIR, 'mlp_final.joblib'))
joblib.dump(meta_clf, os.path.join(MODEL_DIR, 'meta_logistic.joblib'))
summary = {
    "oof_accuracy": float(acc_oof),
    "oof_f1": float(f1_oof),
    "heldout_accuracy": float(acc_final),
    "heldout_f1": float(f1_final),
    "heldout_roc": float(roc_final),
    "label_classes": list(le_target.classes_)
}
with open(os.path.join(MODEL_DIR, 'summary.json'), 'w') as f:
    json.dump(summary, f, indent=2)
print('\nSaved models and summary to', MODEL_DIR)


Saved models and summary to /mnt/data/models_for_user
