<a href="https://colab.research.google.com/github/ullasbc02/obesity-risk-analytics/blob/main/05_MCDM_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import (
    KMeans, AgglomerativeClustering, SpectralClustering, DBSCAN
)
from sklearn.mixture import GaussianMixture
import hdbscan
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)
import umap
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
import joblib
from sklearn.model_selection import cross_val_score

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


In [None]:
# Load dataset
DATASET = "/content/drive/MyDrive/obesity-risk-analytics/data/processed_final/"
df_final = pd.read_csv(os.path.join(DATASET, "df_final_demographic.csv"))

df_final = pd.read_csv(
    os.path.join(DATASET, "df_final_demographic.csv"),
    dtype={"GEOID": str}
)

df_final["GEOID"] = df_final["GEOID"].str.zfill(5)


print("Dataset shape:", df_final.shape)
df_final.head()

In [None]:
economic_features = [
    "poverty_rate",
    "median_household_income",
    "unemployment_rate",
]

health_features = [
    "physical_inactivity_rate",
    "low_access_food_rate",
]

demo_features = [
    "pct_white",
    "pct_black",
    "pct_hispanic",
    "pct_asian",
    "pct_native"
]

all_features = economic_features + health_features + demo_features
target_col = "obesity_rate"

# Model Finder

In [None]:
best_model_all = joblib.load("/content/drive/My Drive/obesity-risk-analytics/data/processed_final/final_gradient_boosting_model.pkl")
best_model_health = joblib.load("/content/drive/My Drive/obesity-risk-analytics/data/processed_final/health_gradient_boosting_model.pkl")
print("Final model loaded successfully")

Final model loaded successfully


In [None]:
best_model_health

In [None]:
best_model_all

In [None]:
def prepare_xy(df, features, target="obesity_rate"):
    X = df[features].copy()
    y = df[target].values

    X = X.apply(pd.to_numeric, errors="coerce")
    X = X.fillna(X.median())

    return X, y


In [None]:
from sklearn.metrics import r2_score

def calculate_feature_weights(df, features, group_name):

    X, y = prepare_xy(df, features, target="obesity_rate")


    if group_name == "ALL FEATURES":
        model = best_model_all.named_steps['model'] # Access the regressor from the pipeline


    elif group_name == "HEALTH":
        model = best_model_health.named_steps['model'] # Access the regressor from the pipeline

    else:
        raise ValueError("Unknown feature group")

    # Note: The model is already fitted as it's loaded from a pre-trained file.
    # We don't need to call model.fit(X,y) here.

    # Calculate R2 score
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    print(f"R2 score for {group_name} model: {r2:.4f}")

    importances = model.feature_importances_
    weights = importances / importances.mean()

    print("Feature Importances:")
    display(pd.Series(importances, index=features).sort_values(ascending=False))

    print("Weights used for clustering:")
    display(pd.Series(weights, index=features).sort_values(ascending=False))

    return weights

In [None]:
# Compute weights for ALL features
weights_all = calculate_feature_weights(df_final, all_features, "ALL FEATURES")

X = df_final[all_features].copy()
X = X.apply(pd.to_numeric, errors="coerce")
X = X.fillna(X.median())

scaler = StandardScaler()
X_scaled_all = scaler.fit_transform(X)


# X_weighted_all = X_scaled_all * weights_all

X_unweighted_all = X_scaled_all

R2 score for ALL FEATURES model: 0.6494
Feature Importances:


Unnamed: 0,0
physical_inactivity_rate,0.497571
median_household_income,0.095862
low_access_food_rate,0.085443
pct_asian,0.074989
pct_hispanic,0.04779
pct_black,0.04489
pct_native,0.042406
unemployment_rate,0.040521
pct_white,0.035414
poverty_rate,0.035113


Weights used for clustering:


Unnamed: 0,0
physical_inactivity_rate,4.975713
median_household_income,0.958625
low_access_food_rate,0.854433
pct_asian,0.749887
pct_hispanic,0.477899
pct_black,0.448904
pct_native,0.424063
unemployment_rate,0.405206
pct_white,0.354143
poverty_rate,0.351128


In [None]:
#Health
weights_heal = calculate_feature_weights(df_final, health_features, "HEALTH")

scaler_heal = StandardScaler()
X_heal_scaled = scaler_heal.fit_transform(df_final[health_features])
# X_demo_weighted = X_demo_scaled * weights_demo
X_heal_unweighted = X_heal_scaled

R2 score for HEALTH model: 0.4086
Feature Importances:


Unnamed: 0,0
physical_inactivity_rate,0.82684
low_access_food_rate,0.17316


Weights used for clustering:


Unnamed: 0,0
physical_inactivity_rate,1.65368
low_access_food_rate,0.34632


# Risk Score

In [None]:
#All Features
df_final["risk_score_all"] = np.dot(X_unweighted_all, weights_all)
df_final["risk_rank_all"] = df_final["risk_score_all"].rank(ascending=False)

In [None]:
#Health Features
df_final["risk_score_heal"] = np.dot(X_heal_unweighted, weights_heal)
df_final["risk_rank_heal"] = df_final["risk_score_heal"].rank(ascending=False)

# Validate Risk Score

In [None]:
from scipy.stats import spearmanr

corr, pval = spearmanr(df_final["risk_score_all"], df_final["obesity_rate"])
print("Spearman correlation with obesity:", corr)
print("p-value:", pval)


Spearman correlation with obesity: 0.5449191581838733
p-value: 6.395367943235699e-241


In [None]:
corr, pval = spearmanr(df_final["risk_score_heal"], df_final["obesity_rate"])
print("Spearman correlation with obesity:", corr)
print("p-value:", pval)


Spearman correlation with obesity: 0.5318412486969577
p-value: 1.769021863842884e-227


# Final Spatial Risk Analytics


In [None]:

export_cols = [
    "GEOID",
    "CTYNAME",
    "STNAME",
    "obesity_rate",
    "physical_inactivity_rate",
    "poverty_rate",
    "low_access_food_rate",
    "median_household_income",
    "unemployment_rate",
    "risk_score_all",
    "risk_rank_all",
    "risk_score_heal",
    "risk_rank_heal",
    "pct_white",
    "pct_black",
    "pct_hispanic",
    "pct_asian",
    "pct_native"
]

df_final[export_cols].to_csv(
    "dashboard_risk_data.csv",
    index=False
)

print("Dashboard data exported as dashboard_risk_data.csv")


out = "/content/drive/MyDrive/obesity-risk-analytics/dashboard_data_final/"
os.makedirs(out, exist_ok=True)

print("Saving dashboard datasets to:", out)


df_final[export_cols].to_csv(out + "dashboard_risk_data.csv", index=False)

Dashboard data exported as dashboard_risk_data.csv
Saving dashboard datasets to: /content/drive/MyDrive/obesity-risk-analytics/dashboard_data_final/


In [None]:
top_10pct = df_final.sort_values(
    "risk_score_all", ascending=False
).head(int(0.01 * len(df_final)))

top_10pct_display = top_10pct[
    [
        "GEOID",
        "CTYNAME",      # county name
        "STNAME",       # state name
        "risk_score_all",
        "risk_rank_all",
        "obesity_rate",
        "physical_inactivity_rate",
        "poverty_rate",
        "median_household_income",
        "unemployment_rate",
        "low_access_food_rate"
    ]
]

top_10pct_display


Unnamed: 0,GEOID,CTYNAME,STNAME,risk_score_all,risk_rank_all,obesity_rate,physical_inactivity_rate,poverty_rate,median_household_income,unemployment_rate,low_access_food_rate
2149,28151,Washington County,Mississippi,18.97456,1.0,0.426012,0.387003,0.337001,31018.0,0.079487,0.627237
2123,28083,Leflore County,Mississippi,18.406386,2.0,0.391003,0.378006,0.357008,29687.0,0.076481,0.811554
663,31043,Dakota County,Nebraska,17.51179,3.0,0.397029,0.362022,0.106005,61010.0,0.039522,0.791441
2076,29201,Scott County,Missouri,16.304661,4.0,0.342005,0.382011,0.179004,44924.0,0.034921,0.56217
1369,12093,Okeechobee County,Florida,15.731323,5.0,0.319004,0.363011,0.184003,45225.0,0.035582,0.819957
2135,28113,Pike County,Mississippi,15.624108,6.0,0.431005,0.368009,0.262004,35044.0,0.066309,0.528834
2097,28011,Bolivar County,Mississippi,15.534241,7.0,0.403005,0.358,0.366004,30309.0,0.071387,0.692693
2092,28001,Adams County,Mississippi,15.409863,8.0,0.353005,0.364,0.279002,34583.0,0.071838,0.525188
2240,37155,Robeson County,North Carolina,15.164916,9.0,0.411003,0.361002,0.315,36366.0,0.052925,0.204356
2105,28027,Coahoma County,Mississippi,15.162073,10.0,0.398008,0.361005,0.382003,30242.0,0.077531,0.366372


In [None]:
state_risk = (
    df_final.groupby("STNAME")["risk_score_all"]
    .mean()
    .sort_values(ascending=False)
)

state_risk.head(10)


Unnamed: 0_level_0,risk_score_all
STNAME,Unnamed: 1_level_1
Mississippi,6.756126
Delaware,5.420129
Florida,4.764523
South Carolina,3.418604
Alabama,3.402314
Arkansas,3.310413
Oklahoma,3.243523
Ohio,2.785835
West Virginia,2.707726
Hawaii,2.162919


In [None]:
top_10pct = df_final.sort_values(
    "risk_score_heal", ascending=False
).head(int(0.10 * len(df_final)))

top_10pct_display = top_10pct[
    [
        "GEOID",
        "CTYNAME",      # county name
        "STNAME",       # state name
        "risk_score_heal",
        "risk_rank_heal",
        "obesity_rate",
        "physical_inactivity_rate",
        "low_access_food_rate"
    ]
]

top_10pct_display


Unnamed: 0,GEOID,CTYNAME,STNAME,risk_score_heal,risk_rank_heal,obesity_rate,physical_inactivity_rate,low_access_food_rate
2149,28151,Washington County,Mississippi,6.113887,1.0,0.426012,0.387003,0.627237
2123,28083,Leflore County,Mississippi,5.969495,2.0,0.391003,0.378006,0.811554
2076,29201,Scott County,Missouri,5.831795,3.0,0.342005,0.382011,0.562170
1369,12093,Okeechobee County,Florida,5.368228,4.0,0.319004,0.363011,0.819957
1363,12059,Holmes County,Florida,5.317130,5.0,0.426011,0.367022,0.642094
...,...,...,...,...,...,...,...,...
3111,56027,Niobrara County,Wyoming,2.286713,308.0,0.298107,0.282105,1.000000
201,13127,Glynn County,Georgia,2.278106,309.0,0.263001,0.290004,0.726032
1678,20001,Allen County,Kansas,2.277594,310.0,0.323008,0.307031,0.150400
208,13153,Houston County,Georgia,2.271031,311.0,0.314003,0.289000,0.754081


In [None]:
state_risk = (
    df_final.groupby("STNAME")["risk_score_heal"]
    .mean()
    .sort_values(ascending=False)
)

state_risk.head(10)


Unnamed: 0_level_0,risk_score_heal
STNAME,Unnamed: 1_level_1
Mississippi,2.241891
Florida,1.564974
Delaware,1.494754
Arkansas,1.363768
Alabama,1.337701
South Carolina,1.273603
Oklahoma,1.224209
West Virginia,1.071962
Ohio,0.987994
Nebraska,0.850775


# Merge


In [None]:
MCDM_DATASET = "/content/drive/My Drive/obesity-risk-analytics/dashboard_data_final/"

In [None]:

df_risk = pd.read_csv(
    os.path.join(MCDM_DATASET, "dashboard_risk_data.csv"),
    dtype={"GEOID": str}
)

df_risk["GEOID"] = df_risk["GEOID"].str.zfill(5)

# df_risk = pd.read_csv(MCDM_DATASET + "dashboard_risk_data.csv")

df_risk.columns

In [None]:
df_final_with_predictions = df_risk.merge(
    df_final[[
        "GEOID",
        "predicted_obesity",
        "residual",
        "high_obesity",
        "abs_error"
    ]],
    on="GEOID",
    how="left"
)

In [None]:
df_final_with_predictions.columns

In [None]:
df_final_with_predictions.to_csv(
    "/content/drive/My Drive/obesity-risk-analytics/data/processed_final/df_final_with_predictions.csv", index=False
)