In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GroupShuffleSplit

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from catboost import CatBoostClassifier

from lightgbm import LGBMClassifier



In [None]:
# Load the raw dataset. low_memory=False has pandas parse the file in one pass
# instead of in chunks, which avoids mixed-type inference within a column.
DATA_PATH = "Dementia Prediction Dataset.csv"
df = pd.read_csv(DATA_PATH, low_memory=False)

In [None]:
# Show the loaded frame as this cell's output.
df

In [None]:
# Row count per value of the DEMENTED label.
df["DEMENTED"].value_counts()

In [None]:
# Raw columns pulled from the source table, grouped by the form they come from.
# The order inside each list is the order the columns appear in df_sel.

# --- A1 Demographics ---
a1_cols = [
    "NACCAGE", "SEX", "EDUC",
    "RACE", "RACESEC", "RACETER",
    "PRIMLANG", "MARISTAT", "RESIDENC", "NACCLIVS",
    "INDEPEND", "HANDED",
    "NACCREFR", "NACCREAS",
]

# --- A5 Self-reported Health History ---
a5_cols = [
    "CVHATT", "HATTMULT", "CVAFIB", "CVANGIO",
    "CVBYPASS", "CVPACDEF", "CVPACE", "CVCHF",
    "CVANGINA", "CVHVALVE", "CVOTHR",
    "CBSTROKE", "STROKMUL", "CBTIA", "TIAMULT",
    "DIABETES", "DIABTYPE", "HYPERTEN", "HYPERCHO",
    "PD", "SEIZURES", "TBI",
    "APNEA", "INSOMN", "DEP2YRS", "PTSD", "BIPOLAR", "ANXIETY",
    "TOBAC30", "TOBAC100", "SMOKYRS", "PACKSPER", "QUITSMOK",
    "ALCOCCAS", "ALCFREQ", "ABUSOTHR",
    "NACCBMI", "HEIGHT", "WEIGHT",
    "THYROID", "B12DEF", "ARTHRIT", "INCONTU",
]

# --- A3 Family History (optional/borderline) ---
a3_cols = ["NACCFAM", "NACCMOM", "NACCDAD"]

# Name of the label column in the raw frame
target_col = "DEMENTED"

# Every raw input column, in A1 -> A5 -> A3 order
selected_cols = [*a1_cols, *a5_cols, *a3_cols]

# Independent copy restricted to the selected inputs
df_sel = df.loc[:, selected_cols].copy()

In [None]:
def clean_missing_age(x):
    """Return NaN when ``x`` is one of the placeholder codes below, else ``x`` unchanged."""
    placeholder_codes = (888, 999, 995, 996, 997)
    if x in placeholder_codes:
        return np.nan
    return x

def clean_educ(x):
    """Return NaN for the code 99; any other value is passed through unchanged."""
    if x == 99:
        return np.nan
    return x

def a5_to_flag(s):
    """
    Recode an A5-style series so that unknown answers become NaN.

    A5 coding:
    0 = No, 1 = Recent/Active, 2 = Remote/Inactive,
    9 = Unknown, -4 = Not available

    The answer codes 1, 2 and 0 are mapped to themselves; 9 and -4 are
    mapped to NaN. Values outside the mapping are left as they are.
    """
    keep_as_is = {code: code for code in (1, 2, 0)}
    treat_as_missing = dict.fromkeys((9, -4), np.nan)
    recode = {**keep_as_is, **treat_as_missing}
    return s.replace(recode)

In [None]:
# Build the model-ready feature table. It shares df_sel's index, so every
# column assigned below stays aligned row-for-row with the raw selection.
df_final = pd.DataFrame(index=df_sel.index)

# --- AGE ---
df_final["AGE"] = df_sel["NACCAGE"].apply(clean_missing_age)

# --- BASIC DEMO ---
df_final["SEX"] = df_sel["SEX"]
df_final["EDUC_YEARS"] = df_sel["EDUC"].apply(clean_educ)
df_final["HANDED"] = df_sel["HANDED"]
df_final["INDEPEND"] = df_sel["INDEPEND"]
df_final["NACCLIVS"] = df_sel["NACCLIVS"]

# --- LANGUAGE ---
df_final["PRIMLANG"] = df_sel["PRIMLANG"]
# NOTE(review): every value other than 1 (including any unknown / missing
# code the column may carry) is counted as non-English here — confirm intended.
df_final["IS_NON_ENGLISH_HOME"] = (df_sel["PRIMLANG"] != 1).astype(int)

# --- MARITAL (One-hot) ---
df_final = pd.concat(
    [df_final, pd.get_dummies(df_sel["MARISTAT"], prefix="MARITAL", dtype=int)],
    axis=1,
)

# --- RESIDENCE (One-hot) ---
df_final = pd.concat(
    [df_final, pd.get_dummies(df_sel["RESIDENC"], prefix="RESIDENCE", dtype=int)],
    axis=1,
)

# --- RACE ---
df_final["RACE"] = df_sel["RACE"]
race_codes = [1, 2, 3, 4, 5, 50]
df_final["IS_MULTIRACIAL"] = (
    df_sel["RACESEC"].isin(race_codes) | df_sel["RACETER"].isin(race_codes)
).astype(int)
df_final["RACE_OTHER"] = (df_sel["RACE"] == 50).astype(int)

# One-hot for primary race
df_final = pd.concat(
    [df_final, pd.get_dummies(df_sel["RACE"], prefix="RACE", dtype=int)],
    axis=1,
)

# --- CONTINUOUS BODY MEASURES ---
# NACCBMI / HEIGHT / WEIGHT are measurements, not 0/1/2-coded answers, and each
# has its own "unknown" sentinel. Cleaning them with the categorical A5 recoding
# would leave 888.8 / 88.8 / 888.0 in the features as if they were real values.
# They are cleaned here into local series; df_sel itself is left untouched.
body_missing_codes = {
    "NACCBMI": [888.8, -4],
    "HEIGHT": [88.8, -4],
    "WEIGHT": [888.0, -4],
}
body_measures = {
    col: df_sel[col].replace(dict.fromkeys(codes, np.nan))
    for col, codes in body_missing_codes.items()
}

# --- A5 VARIABLES → CODED FLAGS (0 / 1 / 2, unknown -> NaN) ---
# NOTE(review): SMOKYRS, PACKSPER, QUITSMOK and ALCFREQ also go through
# a5_to_flag. If any of them is a count / age / frequency rather than a 0/1/2
# item, 9 may be a legitimate value and its real unknown codes would survive —
# verify against the data dictionary.
for col in a5_cols:
    if col == "DIABTYPE":
        # keep DIABTYPE as numeric category, optional
        df_final["DIABTYPE"] = df_sel["DIABTYPE"].replace({9: np.nan, -4: np.nan})
    elif col in body_measures:
        # Column name keeps the historical "_FLAG" suffix so the feature set
        # is unchanged; the values are the sentinel-cleaned measurements.
        df_final[col + "_FLAG"] = body_measures[col]
    else:
        df_final[col + "_FLAG"] = a5_to_flag(df_sel[col])

# Smoking extras
# (same 9 / -4 -> NaN recoding as TOBAC100_FLAG / TOBAC30_FLAG created above)
df_final["SMOKED_100PLUS"] = df_sel["TOBAC100"].replace({0: 0, 1: 1, 9: np.nan, -4: np.nan})

df_final["SMOKED_LAST_30D"] = df_sel["TOBAC30"].replace({0: 0, 1: 1, 9: np.nan, -4: np.nan})

# --- BMI ---
# Start from the recorded BMI and fill the gaps from height and weight where
# both are present. Rows with a missing height or weight stay NaN.
## BMI = (weight in lbs × 703) / (height in inches)^2
bmi_from_height_weight = (body_measures["WEIGHT"] * 703) / (body_measures["HEIGHT"] ** 2)
df_final["BMI"] = body_measures["NACCBMI"].fillna(bmi_from_height_weight)

# A3 FAMILY HISTORY — OPTIONAL BUT INCLUDED

for col in a3_cols:
    df_final[col + "_FLAG"] = df_sel[col].replace({0: 0, 1: 1, 9: np.nan, -4: np.nan})




In [None]:
# Features: an independent copy of the engineered table.
X = df_final.copy()
# Label: read from the raw frame under the configured target column.
y = df[target_col]

In [None]:
# Report the dimensions of the feature matrix and the label vector.
print(f"X: {X.shape}")
print(f"y: {y.shape}")

df_final


In [None]:
# Number of distinct NACCID values, and how many of them occur on more than one row.
rows_per_participant = df["NACCID"].value_counts()

total = df["NACCID"].nunique()
duplicates = rows_per_participant.gt(1).sum()

print("Total unique participants:", total)
print("Participants with >1 visit:", duplicates)

In [None]:
# Participant-level train / validation / test split: rows that share a NACCID
# are always assigned to the same subset.

# 1. Participant ID per row, re-indexed 0..n-1 so positional slicing lines up
groups = df.loc[df.index, "NACCID"].reset_index(drop=True)

# 2. Train vs. held-out pool (validation + test); 30% of the groups are held out
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=42)
train_idx, temp_idx = next(gss1.split(X, y, groups=groups))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_temp, y_temp = X.iloc[temp_idx], y.iloc[temp_idx]
groups_temp = groups.iloc[temp_idx]

# 3. Split the held-out pool in half (by group) into validation and test
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.50, random_state=42)
val_idx, test_idx = next(gss2.split(X_temp, y_temp, groups=groups_temp))

X_val, y_val = X_temp.iloc[val_idx], y_temp.iloc[val_idx]
X_test, y_test = X_temp.iloc[test_idx], y_temp.iloc[test_idx]

# 4. Summaries: shapes, label balance and participant counts per subset
print("Train:", X_train.shape, y_train.shape)
print("Validation:", X_val.shape, y_val.shape)
print("Test:", X_test.shape, y_test.shape)

print("\nTarget distribution:")
print("Train:\n", y_train.value_counts(normalize=True))
print("Val:\n", y_val.value_counts(normalize=True))
print("Test:\n", y_test.value_counts(normalize=True))

train_participants = groups.iloc[train_idx].unique()
val_participants = groups_temp.iloc[val_idx].unique()
test_participants = groups_temp.iloc[test_idx].unique()

print("\nUnique participants:")
print("Train:", len(train_participants))
print("Val:", len(val_participants))
print("Test:", len(test_participants))


In [None]:
def model_evaluation(model, X_val, y_val, X_test, y_test, model_name=None, threshold=0.5):
    """Print ROC-AUC and accuracy of a fitted binary classifier on two data splits.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba``; column 1 of its output is used as the
        positive-class score.
    X_val, y_val : validation features and labels.
    X_test, y_test : test features and labels.
    model_name : str, optional
        Label shown in the report banner. Defaults to the class name of
        ``model`` so the banner always matches the model being evaluated.
    threshold : float, default 0.5
        Scores strictly greater than this value are predicted as class 1 when
        computing accuracy.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """

    def _auc_and_accuracy(features, labels):
        # Score one split: AUC on the raw probabilities, accuracy on thresholded labels.
        proba = model.predict_proba(features)[:, 1]
        pred = (proba > threshold).astype(int)
        return roc_auc_score(labels, proba), accuracy_score(labels, pred)

    val_auc, val_acc = _auc_and_accuracy(X_val, y_val)
    test_auc, test_acc = _auc_and_accuracy(X_test, y_test)

    title = f"{model_name or type(model).__name__} FINAL EVALUATION"
    # Widen the rules when the title is longer than the default banner.
    width = max(36, len(title) + 2)

    print("=" * width)
    print(title.center(width))
    print("=" * width)
    print(f"Validation AUC: {val_auc:.4f}")
    print(f"Validation ACC: {val_acc:.4f}")
    print("-" * width)
    print(f"Test AUC:       {test_auc:.4f}")
    print(f"Test ACC:       {test_acc:.4f}")
    print("=" * width)


    
   

In [None]:
# Random Forest grid search, disabled by wrapping it in a string literal so the
# expensive fit is skipped when the notebook is re-run.
# NOTE(review): as the only statement in this cell, the string below is also
# what the cell displays as its output.
# NOTE(review): the search scores with cv=3 and fit() receives no groups, so
# its folds are not participant-aware.
'''rf_grid = {
    "n_estimators": [200, 300],
    "max_depth": [15, 20, 25],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": ["balanced"]
    }

rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

grid_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_grid,
    cv=3,
    scoring="roc_auc",
    verbose=2,
    n_jobs=-1
)

grid_rf.fit(X_train, y_train)

print("\nBest RF Params:", grid_rf.best_params_)
print("Best RF CV AUC:", grid_rf.best_score_)'''

In [None]:
# 1. Random Forest settings carried over from the grid search
best_rf_params = dict(
    class_weight='balanced',
    max_depth=15,
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=300,
)

# 2. Fit the final forest on the training split only
rf_final = RandomForestClassifier(n_jobs=-1, random_state=42, **best_rf_params)

rf_final.fit(X_train, y_train)



In [None]:
# Random Forest: metrics on the validation and test splits
model_evaluation(rf_final, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test)

In [None]:
# Random Forest feature importances, largest first.
# The name column is labelled 'feature' (it was misspelled 'featura').
importance_df_of_rf = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_final.feature_importances_,
}).sort_values('importance', ascending=False)

In [None]:
# Ten highest-ranked features by Random Forest importance.
importance_df_of_rf.head(10)

In [None]:
# 1. XGBOOST PARAMETER SEARCH SPACE (Optimized for tabular data)
# The randomized search below is disabled: it is wrapped in a string literal so
# the expensive fit is skipped when the notebook is re-run.
# NOTE(review): as the last statement in this cell, the string is also what the
# cell displays as its output.
# NOTE(review): the search scores with cv=3 and fit() receives no groups, so
# its folds are not participant-aware.
'''

xgb_param_dist = {
    "n_estimators": [300, 400, 500],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "subsample": [0.6, 0.7, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 1.0],
    "gamma": [0, 1, 5],
    "min_child_weight": [1, 3, 5, 7],
    "reg_alpha": [0, 0.01, 0.1],
    "reg_lambda": [1, 1.5, 2.0]
}


# 2. Create base XGB model

xgb_base = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",     
    n_jobs=-1,
    random_state=42
)


# 3. Randomized Search 

rand_xgb = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=xgb_param_dist,
    n_iter=40,                 
    scoring="roc_auc",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)


# 4. Fit on the TRAIN set


rand_xgb.fit(X_train, y_train)

print("\nBest XGB Params:", rand_xgb.best_params_)
print("Best XGB CV AUC:", rand_xgb.best_score_)
'''

In [None]:
# 1. XGBoost settings recorded as the best parameters
best_xgb_params = dict(
    subsample=0.6,
    reg_lambda=2.0,
    reg_alpha=0.01,
    n_estimators=300,
    min_child_weight=1,
    max_depth=5,
    learning_rate=0.03,
    gamma=0,
    colsample_bytree=1.0,
)

# 2. Fit the final XGBoost model on the whole training split
xgb_final = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    **best_xgb_params,
)

xgb_final.fit(X_train, y_train)

In [None]:
# XGBoost: metrics on the validation and test splits
model_evaluation(xgb_final, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test)

In [None]:
# XGBoost feature importances, largest first.
# The name column is labelled 'feature' (it was misspelled 'featura').
importance_df_of_xgb = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_final.feature_importances_,
}).sort_values('importance', ascending=False)

importance_df_of_xgb.head(10)

In [None]:


# 1. Define Search Space for CatBoost

cat_param_dist = {
    "iterations": [300, 500, 700, 900],
    "depth": [4, 5, 6, 7, 8],
    "learning_rate": [0.01, 0.03, 0.05, 0.07],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "subsample": [0.6, 0.7, 0.8, 1.0],
    "border_count": [32, 64, 128, 254]
}


# 2. Base CatBoost Model

cat_base = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=0,
    random_state=42
)

# 3. Random Search
#
# The train split can hold several rows for the same NACCID. A plain cv=3
# would put rows of one participant into both the fitting and the scoring fold,
# which inflates the CV AUC. A group-aware splitter, together with the groups
# passed to fit() below, keeps every participant on one side of each split —
# the same rule the outer train/val/test split already follows.
cat_cv = GroupShuffleSplit(n_splits=3, test_size=1 / 3, random_state=42)

cat_search = RandomizedSearchCV(
    estimator=cat_base,
    param_distributions=cat_param_dist,
    n_iter=40,
    scoring="roc_auc",
    cv=cat_cv,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# 4. Fit (groups = participant ID of each training row, in X_train's row order)
cat_search.fit(X_train, y_train, groups=groups.iloc[train_idx])

print("Best CatBoost Params:", cat_search.best_params_)
print("Best CV AUC:", cat_search.best_score_)


In [None]:
# CatBoost settings recorded as the best parameters
best_cat_params = dict(
    subsample=1.0,
    learning_rate=0.03,
    l2_leaf_reg=9,
    iterations=500,
    depth=8,
    border_count=254,
)

# 1. CATBOOST FINAL MODEL TRAINING
cat_final = CatBoostClassifier(
    eval_metric='AUC',
    random_state=42,
    verbose=0,
    thread_count=-1,
    **best_cat_params,
)

cat_final.fit(X_train, y_train)


In [None]:
# CatBoost: metrics on the validation and test splits
model_evaluation(cat_final, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test)

In [None]:


# 1. LightGBM Search Space

lgb_param_dist = {
    "n_estimators": [300, 500, 800],
    "max_depth": [-1, 4, 6, 8, 10],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "num_leaves": [20, 30, 40, 50, 60],
    "subsample": [0.6, 0.7, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 1.0],
    "reg_alpha": [0, 0.01, 0.1],
    "reg_lambda": [0.1, 0.3, 0.5, 1.0]
}


# 2. Base Model

lgb_base = LGBMClassifier(
    objective="binary",
    class_weight="balanced",
    random_state=42
)


# 3. Randomized Search
#
# The train split can hold several rows for the same NACCID. A plain cv=3
# would put rows of one participant into both the fitting and the scoring fold,
# which inflates the CV AUC. A group-aware splitter, together with the groups
# passed to fit() below, keeps every participant on one side of each split —
# the same rule the outer train/val/test split already follows.
lgb_cv = GroupShuffleSplit(n_splits=3, test_size=1 / 3, random_state=42)

lgb_search = RandomizedSearchCV(
    estimator=lgb_base,
    param_distributions=lgb_param_dist,
    n_iter=40,
    scoring="roc_auc",
    cv=lgb_cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)


# 4. Fit (groups = participant ID of each training row, in X_train's row order)

lgb_search.fit(X_train, y_train, groups=groups.iloc[train_idx])

print("Best LGBM Params:", lgb_search.best_params_)
print("Best LGBM CV AUC:", lgb_search.best_score_)


In [None]:

# 2. LIGHTGBM (Fast, strong boosting)

# Settings recorded as the best parameters for LightGBM
best_lgb_params = {
    'subsample': 1.0,
    'reg_lambda': 0.5,
    'reg_alpha': 0,
    'num_leaves': 40,
    'n_estimators': 800,
    'max_depth': 8,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6,
}

# Build the final model from the tuned settings. Previously best_lgb_params was
# defined but never passed in, and the model was fitted with unrelated
# hand-typed values. This now mirrors how the RF / XGBoost / CatBoost final
# models consume their best-parameter dicts.
lgb_final = LGBMClassifier(
    **best_lgb_params,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

lgb_final.fit(X_train, y_train)


In [None]:
# LightGBM: metrics on the validation and test splits
model_evaluation(lgb_final, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test)

In [None]:
# Stacking: the four base models' positive-class probabilities are blended by a
# linear meta-model that is fitted on the validation split and scored on the
# test split.

def _positive_probas(features):
    """Return [rf, xgb, lgb, cat] positive-class probability arrays for ``features``."""
    return [
        base.predict_proba(features)[:, 1]
        for base in (rf_final, xgb_final, lgb_final, cat_final)
    ]


# 1. Get PREDICTIONS for VALIDATION SET

rf_val_proba, xgb_val_proba, lgb_val_proba, cat_val_proba = _positive_probas(X_val)

# Build meta-feature matrix (one column per base model)
stack_val_X = np.column_stack([
    rf_val_proba,
    xgb_val_proba,
    lgb_val_proba,
    cat_val_proba
])

# 2. Train META-MODEL (Linear Regression)

meta_model = LinearRegression()
meta_model.fit(stack_val_X, y_val)

print("\nMeta Model Weights:", meta_model.coef_)
print("Meta Model Bias:", meta_model.intercept_)


# 3. Score the meta-model on the VALIDATION SET
#
# These are the rows the meta-model was just fitted on, so this AUC is an
# in-sample (optimistic) figure, not a held-out validation score. The label
# says so explicitly; the test AUC below is the one measured on rows the
# meta-model never saw.

val_meta_pred = meta_model.predict(stack_val_X)
val_auc = roc_auc_score(y_val, val_meta_pred)

print("\n============================================")
print("Validation AUC (4-Model Stacking, in-sample):", round(val_auc, 4))
print("============================================")


# 4. Get predictions on TEST SET

rf_test_proba, xgb_test_proba, lgb_test_proba, cat_test_proba = _positive_probas(X_test)

# Build test meta-feature set (same column order as the validation matrix)
stack_test_X = np.column_stack([
    rf_test_proba,
    xgb_test_proba,
    lgb_test_proba,
    cat_test_proba
])

# 5. Final TEST SET prediction

test_meta_pred = meta_model.predict(stack_test_X)
test_auc = roc_auc_score(y_test, test_meta_pred)

print("\n============================================")
print("Test AUC (4-Model Stacking):", round(test_auc, 4))
print("============================================")
