In [None]:
# 1. Load and Normalize All Datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")

# Load Framingham
df_fram = pd.read_csv("framingham.csv")
print("Framingham columns:", df_fram.columns.tolist())

# Rename known columns if they exist
rename_map = {
    'sysBP': 'trestbps',
    'glucose': 'chol',
    'TenYearCHD': 'target',
    'male': 'sex'
}
rename_map = {k: v for k, v in rename_map.items() if k in df_fram.columns}
df_fram.rename(columns=rename_map, inplace=True)

# Ensure all required columns exist
for col in ['age', 'sex', 'trestbps', 'chol', 'fbs', 'target']:
    if col not in df_fram.columns:
        if col == 'sex':
            df_fram[col] = 1
        elif col == 'fbs':
            df_fram[col] = 0
        else:
            df_fram[col] = np.nan

# Final column selection
df_fram = df_fram[['age', 'sex', 'trestbps', 'chol', 'fbs', 'target']]
df_fram['target'] = df_fram['target'].astype(float)

# Load Cardiovascular
df_cardio = pd.read_csv("cardio_train.csv", sep=';')
df_cardio['age'] = (df_cardio['age'] / 365).astype(int)
df_cardio.rename(columns={
    'ap_hi': 'trestbps',
    'cholesterol': 'chol',
    'gluc': 'fbs',
    'cardio': 'target'
}, inplace=True)

# Ensure all required columns exist
for col in ['age', 'sex', 'trestbps', 'chol', 'fbs', 'target']:
    if col not in df_cardio.columns:
        if col == 'sex':
            df_cardio[col] = 1  # Assume male if sex is missing
        elif col == 'fbs':
            df_cardio[col] = 0
        else:
            df_cardio[col] = np.nan

df_cardio = df_cardio[['age', 'sex', 'trestbps', 'chol', 'fbs', 'target']]

# Load Cleveland
cleveland_cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
df_cleveland = pd.read_csv("processed.cleveland.csv", header=None)
df_cleveland.columns = cleveland_cols
df_cleveland.replace("?", np.nan, inplace=True)
df_cleveland.dropna(inplace=True)
df_cleveland[['ca', 'thal']] = df_cleveland[['ca', 'thal']].astype(float)
df_cleveland['target'] = df_cleveland['target'].apply(lambda x: 1 if int(x) > 0 else 0)
df_cleveland = df_cleveland[['age', 'sex', 'trestbps', 'chol', 'fbs', 'target']]

# Load Statlog
df_statlog = pd.read_csv("heart.csv")

# Check and create missing 'target' if necessary
if 'target' not in df_statlog.columns:
    print("Warning: 'target' column not found in Statlog dataset. Creating a default target (0).")
    df_statlog['target'] = 0  # Default class for all rows; should be updated if true labels exist

# Ensure all required columns exist
for col in ['age', 'sex', 'trestbps', 'chol', 'fbs', 'target']:
    if col not in df_statlog.columns:
        if col == 'sex':
            df_statlog[col] = 1
        elif col == 'fbs':
            df_statlog[col] = 0
        else:
            df_statlog[col] = np.nan

# Ensure binary target
df_statlog['target'] = df_statlog['target'].apply(lambda x: 1 if int(x) > 0 else 0)
df_statlog = df_statlog[['age', 'sex', 'trestbps', 'chol', 'fbs', 'target']]

# Combine all datasets
df_combined = pd.concat([df_cleveland, df_statlog, df_fram, df_cardio], ignore_index=True)
df_combined.dropna(inplace=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Split features and target
X = df_combined.drop('target', axis=1)
y = df_combined['target']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# Model configurations
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": GridSearchCV(RandomForestClassifier(random_state=42),
                                   {'n_estimators': [100, 200], 'max_depth': [5, 10]},
                                   scoring='accuracy', cv=5),
    "XGBoost": GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                             {'n_estimators': [100, 200], 'max_depth': [3, 5]},
                             scoring='accuracy', cv=5),
    "Decision Tree": GridSearchCV(DecisionTreeClassifier(random_state=42),
                                   {'max_depth': [3, 5, 7]},
                                   scoring='accuracy', cv=5),
    "SVM": GridSearchCV(SVC(),
                         {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
                         scoring='accuracy', cv=5)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    best_model = model.best_estimator_ if hasattr(model, 'best_estimator_') else model
    y_pred = best_model.predict(X_test)
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Framingham columns: ['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD']

=== Logistic Regression ===
Accuracy: 0.7076871207012811
Confusion Matrix:
 [[5876 1812]
 [2523 4619]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.70      0.76      0.73      7688
         1.0       0.72      0.65      0.68      7142

    accuracy                           0.71     14830
   macro avg       0.71      0.71      0.71     14830
weighted avg       0.71      0.71      0.71     14830


=== Random Forest ===
Accuracy: 0.7314902225219151
Confusion Matrix:
 [[5897 1791]
 [2191 4951]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.73      0.77      0.75      7688
         1.0       0.73      0.69      0.71      7142

    accuracy                           0.73     