In [26]:
import pandas as pd
df_pima = pd.read_csv("pima_diabetes.csv")
df_health = pd.read_csv("diabetes_health_indicators.csv")
df_cardio = pd.read_csv("cardio_data.csv")
df_ckd = pd.read_csv("ckd.csv")
df_nhanes = pd.read_csv("nhanes.csv")


In [27]:
print(df_pima.head());
print(df_health.head());
print(df_cardio.head());
print(df_ckd.head());
print(df_nhanes.head());

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0

In [12]:
df_pima = df_pima.rename(columns={
    'Glucose': 'Fasting_Glucose',
    'BloodPressure': 'Systolic_BP',  # Assuming this is systolic
    'BMI': 'BMI',
    'Age': 'Age',
    'Outcome': 'Diabetes_Risk'
})

df_pima_clean = df_pima[['Fasting_Glucose', 'Systolic_BP', 'BMI', 'Age', 'Diabetes_Risk']]


In [23]:
# print(df_health.head())
print(list(df_health.columns))


['Diabetes_Risk', 'Hypertension_Risk', 'HighChol', 'CholCheck', 'BMI', 'Smoking', 'Stroke', 'HeartDiseaseorAttack', 'Physical_Activity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']


In [24]:
df_health_clean = df_health.rename(columns={
    'BMI': 'BMI',
    'Smoking': 'Smoking',
    'HvyAlcoholConsump': 'Alcohol',
    'Physical_Activity': 'Physical_Activity',
    'Hypertension_Risk': 'Hypertension_Risk',
    'Diabetes_Risk': 'Diabetes_Risk',
    'Age': 'Age'
})[['BMI', 'Smoking', 'Alcohol', 'Physical_Activity', 'Hypertension_Risk', 'Diabetes_Risk', 'Age']]



In [29]:
print(df_cardio.columns)

Index(['id;age;gender;height;weight;ap_hi;ap_lo;cholesterol;gluc;smoke;alco;active;cardio'], dtype='object')


In [31]:
df_cardio = pd.read_csv('cardio_data.csv', sep=';')  # Important!

df_cardio = df_cardio.rename(columns={
    'ap_hi': 'Systolic_BP',
    'ap_lo': 'Diastolic_BP',
    'cholesterol': 'Cholesterol_Level',
    'age': 'Age_Days',
    'cardio': 'Cardio_Risk',
    'smoke': 'Smoking',
    'alco': 'Alcohol',
    'active': 'Physical_Activity'
})

df_cardio['Age'] = df_cardio['Age_Days'] // 365
df_cardio_clean = df_cardio[['Systolic_BP', 'Diastolic_BP', 'Cholesterol_Level', 'Smoking', 'Alcohol', 'Physical_Activity', 'Age', 'Cardio_Risk']]


In [33]:
df_ckd = df_ckd.rename(columns={
    'bp': 'Systolic_BP',
    'age': 'Age',
    'bgr': 'Fasting_Glucose',
    'classification': 'CKD_Risk'
})

# Replace 'ckd'/'notckd' with 1/0
df_ckd['CKD_Risk'] = df_ckd['CKD_Risk'].map({'ckd': 1, 'notckd': 0})

df_ckd_clean = df_ckd[['Systolic_BP', 'Fasting_Glucose', 'Age', 'CKD_Risk']]


In [34]:
df_nhanes_clean = df_nhanes.rename(columns={
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'INDFMPIR': 'Income',
})

df_nhanes_clean = df_nhanes_clean[['Gender', 'Age', 'Income']]


In [52]:
from functools import reduce

# Add missing columns in each dataset with default value
all_columns = set(df_pima_clean.columns) | set(df_health_clean.columns) | set(df_cardio_clean.columns) | set(df_ckd_clean.columns) | set(df_nhanes_clean.columns)

def align_columns(df):
    for col in all_columns:
        if col not in df.columns:
            df[col] = pd.NA
    return df[sorted(all_columns)]

df_list = [df_pima_clean, df_health_clean, df_cardio_clean, df_ckd_clean, df_nhanes_clean]
df_aligned = [align_columns(df) for df in df_list]

df_final = pd.concat(df_aligned, ignore_index=True)


  df_final = pd.concat(df_aligned, ignore_index=True)


In [53]:
df_final.to_csv('chronic_disease.csv', index=False)
print("Saved as chronic_disease.csv with shape:", df_final.shape)


Saved as chronic_disease.csv with shape: (335023, 15)


In [57]:
print(df_final.columns)

Index(['Age', 'Alcohol', 'BMI', 'CKD_Risk', 'Cardio_Risk', 'Cholesterol_Level',
       'Diabetes_Risk', 'Diastolic_BP', 'Fasting_Glucose', 'Gender',
       'Hypertension_Risk', 'Income', 'Physical_Activity', 'Smoking',
       'Systolic_BP'],
      dtype='object')


In [67]:
df_final.isnull().sum()


Unnamed: 0,0
Age,9
Alcohol,11343
BMI,80575
CKD_Risk,335023
Cardio_Risk,265023
Cholesterol_Level,265023
Diabetes_Risk,80575
Diastolic_BP,265023
Fasting_Glucose,333899
Gender,324848


In [68]:
df_final.shape

(335023, 15)

In [73]:
df_final.head()

Unnamed: 0,Age,Alcohol,BMI,CKD_Risk,Cardio_Risk,Cholesterol_Level,Diabetes_Risk,Diastolic_BP,Fasting_Glucose,Gender,Hypertension_Risk,Income,Physical_Activity,Smoking,Systolic_BP
0,50.0,,33.6,,,,1.0,,148.0,,,,,,72.0
1,31.0,,26.6,,,,0.0,,85.0,,,,,,66.0
2,32.0,,23.3,,,,1.0,,183.0,,,,,,64.0
3,21.0,,28.1,,,,0.0,,89.0,,,,,,66.0
4,33.0,,43.1,,,,1.0,,137.0,,,,,,40.0


In [70]:
features = ['Age', 'BMI', 'Smoking', 'Alcohol', 'Physical_Activity',
            'Systolic_BP', 'Diastolic_BP', 'Fasting_Glucose', 'Income']

targets = ['Diabetes_Risk', 'Hypertension_Risk', 'CKD_Risk',
           'Cardio_Risk', 'Cholesterol_Level']


In [71]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

models = {}

for target in targets:
    print(f"\nTraining model for {target}...")

    # Drop NA values for the current target
    data = df_final[features + [target]].dropna()

    X = data[features]
    y = data[target]

    # Encode categorical targets if needed
    if y.dtype == 'object':
        y = y.astype('category').cat.codes

    # Train-Test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Model Training
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.2f}")
    print(classification_report(y_test, y_pred))

    # Save model
    models[target] = model



Training model for Diabetes_Risk...


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [74]:
print(f"{target} → {len(data)} rows after dropna")


Diabetes_Risk → 0 rows after dropna


In [79]:
df_final['Diabetes_Risk'].isna().sum()


np.int64(80575)

In [80]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

models = {}

for target in targets:
    print(f"\nTraining model for {target}...")

    # Drop rows with NA in features or current target
    data = df_final[features + [target]].dropna()

    if data.shape[0] < 100:  # Skip if too little data
        print(f"Skipping {target} due to insufficient data: {data.shape[0]} rows")
        continue

    X = data[features]
    y = data[target]

    # Encode categorical targets if needed
    if y.dtype == 'object':
        y = y.astype('category').cat.codes

    # Train-Test Split
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )
    except ValueError as e:
        print(f"Skipping {target} due to split error: {e}")
        continue

    # Model Training
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {target}: {acc:.2f}")
    print(classification_report(y_test, y_pred))

    # Save model
    models[target] = model



Training model for Diabetes_Risk...
Skipping Diabetes_Risk due to insufficient data: 0 rows

Training model for Hypertension_Risk...
Skipping Hypertension_Risk due to insufficient data: 0 rows

Training model for CKD_Risk...
Skipping CKD_Risk due to insufficient data: 0 rows

Training model for Cardio_Risk...
Skipping Cardio_Risk due to insufficient data: 0 rows

Training model for Cholesterol_Level...
Skipping Cholesterol_Level due to insufficient data: 0 rows


In [81]:
print(len(df_pima_clean))        # For Diabetes_Risk
print(len(df_health_clean))      # For Hypertension_Risk
# (Add more if you have others)


768
253680


In [82]:
# Fill Diabetes_Risk (first 768 rows)
df_final.loc[:767, 'Diabetes_Risk'] = df_pima_clean['Diabetes_Risk'].values

# Fill Hypertension_Risk (first 253680 rows)
df_final.loc[:253679, 'Hypertension_Risk'] = df_health_clean['Hypertension_Risk'].values


In [83]:
print("Diabetes_Risk filled:", df_final['Diabetes_Risk'].notna().sum())
print("Hypertension_Risk filled:", df_final['Hypertension_Risk'].notna().sum())


Diabetes_Risk filled: 254448
Hypertension_Risk filled: 254448


In [86]:
print(df_final['Diabetes_Risk'].notna().sum())
print(df_final['Hypertension_Risk'].notna().sum())


254448
254448


In [87]:
# Fill first 768 Diabetes rows
df_final.loc[:767, 'Diabetes_Risk'] = df_pima_clean['Diabetes_Risk'].values

# Fill first 253680 Hypertension rows
df_final.loc[:253679, 'Hypertension_Risk'] = df_health_clean['Hypertension_Risk'].values


In [88]:
print(df_final['Diabetes_Risk'].notna().sum())         # Should be 768
print(df_final['Hypertension_Risk'].notna().sum())     # Should be 253680


254448
254448


In [105]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import pandas as pd
import numpy as np

# -----------------------
# Features and Targets
# -----------------------
features = [
    'Age', 'BMI', 'Smoking', 'Alcohol', 'Physical_Activity',
    'Systolic_BP', 'Diastolic_BP', 'Fasting_Glucose', 'Income'
]

targets = ['Diabetes_Risk', 'Hypertension_Risk', 'CKD_Risk',
           'Cardio_Risk', 'Cholesterol_Level']

# -----------------------
# Clean and Fill Data
# -----------------------
# Assuming df_final is already loaded
df_final[features] = df_final[features].fillna(df_final[features].median())

models = {}
summary = []  # For final summary table

# -----------------------
# Model Training Loop
# -----------------------
for target in targets:
    print(f"\n📌 Training model for: {target}")

    # Drop rows where target is missing
    df_target = df_final[features + [target]].dropna(subset=[target])

    if df_target.shape[0] < 100:
        print(f"❌ Skipping {target} — too few samples: {df_target.shape[0]}")
        continue

    X = df_target[features]
    y = df_target[target]

    # Encode categorical targets
    if y.dtype == 'object' or str(y.dtype).startswith('category'):
        y = y.astype('category').cat.codes

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )
    except ValueError as e:
        print(f"⚠️ Skipping {target} due to stratify error: {e}")
        continue

    # Model training
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"✅ Accuracy for {target}: {acc:.2f}")
    print(classification_report(y_test, y_pred))

    # Store model and summary
    models[target] = model
    summary.append({
        "Target": target,
        "Samples": len(y),
        "Accuracy": round(acc, 4),
        "F1-Score (Macro)": round(f1, 4)
    })

# -----------------------
# Summary Table
# -----------------------
summary_df = pd.DataFrame(summary)
print("\n📊 Model Performance Summary:")
print(summary_df.to_string(index=False))



📌 Training model for: Diabetes_Risk
✅ Accuracy for Diabetes_Risk: 0.84
              precision    recall  f1-score   support

         0.0       0.85      0.99      0.91     42841
         1.0       0.70      0.03      0.06       980
         2.0       0.48      0.03      0.06      7069

    accuracy                           0.84     50890
   macro avg       0.68      0.35      0.35     50890
weighted avg       0.79      0.84      0.78     50890


📌 Training model for: Hypertension_Risk


Parameters: { "use_label_encoder" } are not used.



✅ Accuracy for Hypertension_Risk: 0.57
              precision    recall  f1-score   support

         0.0       0.57      0.99      0.73     29055
         1.0       0.47      0.01      0.02     21835

    accuracy                           0.57     50890
   macro avg       0.52      0.50      0.37     50890
weighted avg       0.53      0.57      0.42     50890


📌 Training model for: CKD_Risk
❌ Skipping CKD_Risk — too few samples: 0

📌 Training model for: Cardio_Risk


Parameters: { "use_label_encoder" } are not used.



✅ Accuracy for Cardio_Risk: 0.72
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      7004
           1       0.74      0.68      0.71      6996

    accuracy                           0.72     14000
   macro avg       0.72      0.72      0.72     14000
weighted avg       0.72      0.72      0.72     14000


📌 Training model for: Cholesterol_Level


Parameters: { "use_label_encoder" } are not used.



✅ Accuracy for Cholesterol_Level: 0.75
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     10477
           1       0.21      0.00      0.01      1910
           2       0.33      0.01      0.02      1613

    accuracy                           0.75     14000
   macro avg       0.43      0.34      0.29     14000
weighted avg       0.63      0.75      0.64     14000


📊 Model Performance Summary:
           Target  Samples  Accuracy  F1-Score (Macro)
    Diabetes_Risk   254448    0.8421            0.3470
Hypertension_Risk   254448    0.5705            0.3708
      Cardio_Risk    70000    0.7202            0.7199
Cholesterol_Level    70000    0.7468            0.2942


In [102]:
import joblib

# Saving
for disease, model in models.items():
    joblib.dump(model, f"{disease}_model.pkl")

# Loading later
models = {}
for disease in ['Diabetes_Risk', 'Hypertension_Risk', 'Cardio_Risk', 'Cholesterol_Level']:
    models[disease] = joblib.load(f"{disease}_model.pkl")
