# Diabetes Risk Prediction Using SDOH
This notebook performs data preprocessing, merging, correlation analysis, and visualization of diabetes risk using social determinants of health data.

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the clinical records and the district-level SDOH table.
df = pd.read_csv('synthetic_clinical_dataset_diabetes.csv')
sdoh = pd.read_csv('synthetic_sdoh_dataset.csv')

# The SDOH table names its district column 'dtname'; align it with the clinical data.
sdoh.rename(columns={'dtname':'district'}, inplace=True)

# Alternative spellings of the same district, mapped to one canonical key.
rename_map = {'garhwal':'pauri','pauri garhwal':'pauri','tehri garhwal':'tehri','hardwar':'haridwar'}


def normalize_district(names):
    """Return district names stripped, lower-cased and mapped to canonical spellings."""
    return names.str.strip().str.lower().replace(rename_map)


# Apply the SAME normalisation to both sides of the join; canonicalising only
# one frame leaves variant spellings in the other unmatched after the merge.
sdoh['district'] = normalize_district(sdoh['district'])
df['district'] = normalize_district(df['district'])

# Two SDOH spellings can collapse to one key (e.g. 'garhwal' and 'pauri garhwal'),
# which would duplicate clinical rows in the left merge below.
duplicated_keys = sorted(sdoh.loc[sdoh['district'].duplicated(), 'district'].dropna().unique())
if duplicated_keys:
    print(f'⚠️ SDOH table has more than one row for: {duplicated_keys}')

# Surface clinical districts that will receive no SDOH values (all-NaN columns).
unmatched = sorted(set(df['district'].dropna()) - set(sdoh['district'].dropna()))
if unmatched:
    print(f'⚠️ Clinical districts without SDOH data: {unmatched}')

# Merge: keep every clinical record, attach SDOH columns where the district matches.
merged = df.merge(sdoh, on='district', how='left')
merged.to_csv('merged_diabetes_sdoh.csv', index=False)
print('✅ Merged dataset saved as merged_diabetes_sdoh.csv')

In [ ]:
# Correlation analysis
# Lower-case the column names so the target can be addressed as 'high_risk_diabetes'.
merged.columns = merged.columns.str.lower()
num_cols = merged.select_dtypes(include=['number'])

# Pearson correlation of every numeric column with the target.
corr = num_cols.corr()['high_risk_diabetes']

# Drop the target's self-correlation (always 1.0) and undefined values
# (e.g. constant columns) so they do not crowd the ranking or the plot.
corr = (
    corr.drop('high_risk_diabetes', errors='ignore')
    .dropna()
    .sort_values(ascending=False)
)
print(corr.head(15))

# NOTE(review): this plots every numeric column, not only SDOH features,
# although the title mentions SDOH — confirm the intended feature set.
plt.figure(figsize=(8,4))
corr.plot(kind='bar')
plt.title('Correlation of SDOH with Diabetes Risk')
plt.tight_layout()
plt.show()

In [ ]:
# Choropleth map
geo = gpd.read_file('UTTARAKHAND_DISTRICTS.geojson')

# Normalise the geojson district names the same way as the tabular data
# (rename_map comes from the data-loading cell); lower-casing alone leaves
# variants such as 'garhwal' or 'hardwar' unmatched against the canonical keys.
geo['district'] = geo['dtname'].str.strip().str.lower().replace(rename_map)

# 'predicted_risk' is the mean of high_risk_diabetes per district in `merged`.
risk = (
    merged.groupby('district')['high_risk_diabetes']
    .mean()
    .reset_index()
    .rename(columns={'high_risk_diabetes':'predicted_risk'})
)

# Left merge keeps every district polygon, even those without clinical records.
map_data = geo.merge(risk, on='district', how='left')

fig, ax = plt.subplots(figsize=(8,8))
map_data.plot(
    column='predicted_risk',
    legend=True,
    ax=ax,
    edgecolor='black',
    # Draw districts with no data in grey instead of leaving holes in the map.
    missing_kwds={'color': 'lightgrey', 'edgecolor': 'black'},
)
ax.set_title('Predicted Diabetes Risk by District in Uttarakhand')
ax.axis('off')
plt.show()