In [41]:
import os
os.chdir("/content")



In [42]:
import os
print(os.getcwd())


/content


In [43]:
!ls /content



biometric_final_FINAL_STANDARDIZED.csv	enrollment_final_no_duplicates.csv
demographic_final_no_duplicates.csv	sample_data


In [44]:
import pandas as pd

bio = pd.read_csv("/content/biometric_final_FINAL_STANDARDIZED.csv")
demo = pd.read_csv("/content/demographic_final_no_duplicates.csv")
enr = pd.read_csv("/content/enrollment_final_no_duplicates.csv", encoding="latin1")

print("Loaded successfully")
print("Biometric:", bio.shape)
print("Demographic:", demo.shape)
print("Enrollment:", enr.shape)


Loaded successfully
Biometric: (1652822, 10)
Demographic: (1487725, 6)
Enrollment: (936238, 7)


In [68]:
print("Enrollment columns:", enr.columns.tolist())
enr.head()


Enrollment columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-09-2025,Andaman & Nicobar Islands,Nicobars,744303,1,0,0
1,17-10-2025,Andaman & Nicobar Islands,Nicobars,744301,3,1,0
2,02-09-2025,Andaman & Nicobar Islands,Nicobars,744301,3,1,0
3,04-09-2025,Andaman & Nicobar Islands,Nicobars,744301,4,0,0
4,24-10-2025,Andaman & Nicobar Islands,Nicobars,744301,1,2,0


In [69]:
df = bio.merge(
    demo,
    on=["date", "state", "district", "pincode"],
    how="inner"
).merge(
    enr,
    on=["date", "state", "district", "pincode"],
    how="inner"
)

print("Merged dataset shape:", df.shape)
df.head()


Merged dataset shape: (487568, 15)


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,district_norm,district_clean,district_regex,state_clean,demo_age_5_17,demo_age_17_,age_0_5,age_5_17,age_18_greater
0,12-09-2025,Andhra Pradesh,Adilabad,504001,4,38,Adilabad,Adilabad,Adilabad,andhra pradesh,0,29,2,0,0
1,02-09-2025,Andhra Pradesh,Adilabad,504001,4,21,Adilabad,Adilabad,Adilabad,andhra pradesh,0,13,4,1,0
2,13-09-2025,Andhra Pradesh,Adilabad,504001,4,19,Adilabad,Adilabad,Adilabad,andhra pradesh,2,5,1,0,0
3,25-10-2025,Andhra Pradesh,Adilabad,504001,19,43,Adilabad,Adilabad,Adilabad,andhra pradesh,4,8,1,1,0
4,24-10-2025,Andhra Pradesh,Adilabad,504001,9,16,Adilabad,Adilabad,Adilabad,andhra pradesh,1,10,3,0,0


In [70]:
df = df.rename(columns={
    "bio_age_17_": "bio_age_17_plus",
    "demo_age_17_": "demo_age_17_plus",
    "age_0_5": "enroll_age_0_5",
    "age_5_17": "enroll_age_5_17",
    "age_18_greater": "enroll_age_18_plus"
})


In [71]:
df["total_bio_updates"] = (
    df["bio_age_5_17"] + df["bio_age_17_plus"]
)

df["total_demo_updates"] = (
    df["demo_age_5_17"] + df["demo_age_17_plus"]
)

df["total_enrollment"] = (
    df["enroll_age_0_5"] +
    df["enroll_age_5_17"] +
    df["enroll_age_18_plus"]
)

# Prevent divide by zero
df["total_enrollment"] = df["total_enrollment"].replace(0, 1)


In [72]:
# Update pressure
df["bio_update_rate"] = df["total_bio_updates"] / df["total_enrollment"]
df["demo_update_rate"] = df["total_demo_updates"] / df["total_enrollment"]

# Age behaviour indicators
df["bio_youth_ratio"] = (
    df["bio_age_5_17"] / (df["bio_age_17_plus"] + 1)
)

df["demo_youth_ratio"] = (
    df["demo_age_5_17"] / (df["demo_age_17_plus"] + 1)
)

# Combined Aadhaar stress signal
df["update_to_enroll_ratio"] = (
    df["total_bio_updates"] + df["total_demo_updates"]
) / df["total_enrollment"]


In [73]:
features = [
    "bio_update_rate",
    "demo_update_rate",
    "bio_youth_ratio",
    "demo_youth_ratio",
    "update_to_enroll_ratio"
]

X = df[features]


In [74]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [75]:
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(
    n_estimators=200,
    contamination=0.05,   # top 5% stress zones
    random_state=42,
    n_jobs=-1
)

df["stress_label"] = iso_forest.fit_predict(X_scaled)


In [76]:
df["stress_status"] = df["stress_label"].map({
    -1: "High Stress",
     1: "Normal"
})

df["stress_status"].value_counts()


Unnamed: 0_level_0,count
stress_status,Unnamed: 1_level_1
Normal,463189
High Stress,24379


In [77]:
df[df["stress_status"] == "High Stress"][
    ["date", "state", "district", "pincode", "update_to_enroll_ratio"]
].sort_values(
    "update_to_enroll_ratio",
    ascending=False
).head(15)


Unnamed: 0,date,state,district,pincode,update_to_enroll_ratio
249633,12-11-2025,Maharashtra,Yavatmal,445204,1432.0
230247,12-11-2025,Maharashtra,Nanded,431712,1359.0
249660,13-11-2025,Maharashtra,Yavatmal,445204,942.0
219791,13-11-2025,Maharashtra,Hingoli,431513,932.0
219793,17-11-2025,Maharashtra,Hingoli,431513,909.0
250170,08-11-2025,Manipur,Bishnupur,795126,908.0
249816,12-11-2025,Maharashtra,Yavatmal,445215,907.0
56479,10-09-2025,Bihar,Kishanganj,855116,787.5
230676,12-11-2025,Maharashtra,Nanded,431801,779.0
235065,04-11-2025,Maharashtra,Parbhani,431509,769.0


In [78]:
district_stress = (
    df[df["stress_status"] == "High Stress"]
    .groupby(["state", "district"])
    .size()
    .reset_index(name="stress_days")
    .sort_values("stress_days", ascending=False)
)

district_stress.head(10)


Unnamed: 0,state,district,stress_days
311,Maharashtra,Ahilyanagar,551
342,Maharashtra,Solapur,456
332,Maharashtra,Nashik,386
335,Maharashtra,Pune,375
330,Maharashtra,Nanded,339
346,Maharashtra,Yavatmal,331
323,Maharashtra,Jalgaon,323
325,Maharashtra,Kolhapur,293
313,Maharashtra,Amravati,277
314,Maharashtra,Beed,276


In [79]:
df["date"] = pd.to_datetime(df["date"], dayfirst=True)

stress_over_time = (
    df[df["stress_status"] == "High Stress"]
    .groupby(df["date"].dt.to_period("M"))
    .size()
)

stress_over_time


Unnamed: 0_level_0,0
date,Unnamed: 1_level_1
2025-09,6952
2025-10,5581
2025-11,10124
2025-12,1722


In [80]:
from sklearn.cluster import KMeans

X_stress = stress_df[
    ["bio_update_rate", "demo_update_rate", "bio_youth_ratio", "demo_youth_ratio"]
]

kmeans = KMeans(n_clusters=3, random_state=42)
stress_df["stress_type"] = kmeans.fit_predict(X_stress)


In [81]:
stress_df.groupby("stress_type")[[
    "bio_update_rate",
    "demo_update_rate",
    "bio_youth_ratio",
    "demo_youth_ratio"
]].mean()
#This table tells you WHY each cluster exists.

Unnamed: 0_level_0,bio_update_rate,demo_update_rate,bio_youth_ratio,demo_youth_ratio
stress_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,136.352713,33.997658,2.386638,0.109027
1,40.399203,176.551046,0.595945,0.041253
2,29.457755,29.999633,2.829487,0.4134


In [59]:
stress_labels = {
    0: "Migration-driven stress",
    1: "Biometric-heavy stress",
    2: "Lifecycle-driven stress"
}

stress_df["stress_reason"] = stress_df["stress_type"].map(stress_labels)


In [82]:
stress_df[[
    "date", "state", "district", "pincode", "stress_reason"
]].head(10)


Unnamed: 0,date,state,district,pincode,stress_reason
8,2025-09-09,Andhra Pradesh,Adilabad,504001,Lifecycle-driven stress
12,2025-10-31,Andhra Pradesh,Adilabad,504001,Lifecycle-driven stress
13,2025-09-19,Andhra Pradesh,Adilabad,504001,Lifecycle-driven stress
15,2025-11-07,Andhra Pradesh,Adilabad,504001,Lifecycle-driven stress
17,2025-10-27,Andhra Pradesh,Adilabad,504001,Lifecycle-driven stress
55,2025-10-31,Andhra Pradesh,Adilabad,504103,Lifecycle-driven stress
260,2025-10-30,Andhra Pradesh,Adilabad,504219,Lifecycle-driven stress
317,2025-11-05,Andhra Pradesh,Adilabad,504272,Lifecycle-driven stress
459,2025-10-15,Andhra Pradesh,Adilabad,504299,Lifecycle-driven stress
460,2025-10-15,Andhra Pradesh,Adilabad,504299,Lifecycle-driven stress


In [83]:
df["stress_status"] = df["stress_label"].map({
    -1: "High Stress",
     1: "Normal"
})
df["stress_status"].value_counts()


Unnamed: 0_level_0,count
stress_status,Unnamed: 1_level_1
Normal,463189
High Stress,24379


In [85]:
#FINAL RESULT TABLE 1: Top Aadhaar Stress Zones
final_stress_table = (
    df[df["stress_status"] == "High Stress"]
    [["date", "state", "district", "pincode", "update_to_enroll_ratio"]]
    .sort_values("update_to_enroll_ratio", ascending=False)
    .head(15)
)

final_stress_table


Unnamed: 0,date,state,district,pincode,update_to_enroll_ratio
249633,2025-11-12,Maharashtra,Yavatmal,445204,1432.0
230247,2025-11-12,Maharashtra,Nanded,431712,1359.0
249660,2025-11-13,Maharashtra,Yavatmal,445204,942.0
219791,2025-11-13,Maharashtra,Hingoli,431513,932.0
219793,2025-11-17,Maharashtra,Hingoli,431513,909.0
250170,2025-11-08,Manipur,Bishnupur,795126,908.0
249816,2025-11-12,Maharashtra,Yavatmal,445215,907.0
56479,2025-09-10,Bihar,Kishanganj,855116,787.5
230676,2025-11-12,Maharashtra,Nanded,431801,779.0
235065,2025-11-04,Maharashtra,Parbhani,431509,769.0


In [86]:
#FINAL RESULT TABLE 2: Most Affected Districts (Repeated Stress)
district_level_stress = (
    df[df["stress_status"] == "High Stress"]
    .groupby(["state", "district"])
    .size()
    .reset_index(name="number_of_stress_days")
    .sort_values("number_of_stress_days", ascending=False)
    .head(10)
)

district_level_stress


Unnamed: 0,state,district,number_of_stress_days
311,Maharashtra,Ahilyanagar,551
342,Maharashtra,Solapur,456
332,Maharashtra,Nashik,386
335,Maharashtra,Pune,375
330,Maharashtra,Nanded,339
346,Maharashtra,Yavatmal,331
323,Maharashtra,Jalgaon,323
325,Maharashtra,Kolhapur,293
313,Maharashtra,Amravati,277
314,Maharashtra,Beed,276


In [87]:
#FINAL RESULT TABLE 3: WHY Stress Happened (Cluster Explanation)
stress_reason_summary = (
    stress_df.groupby("stress_reason")
    .size()
    .reset_index(name="number_of_cases")
)

stress_reason_summary


Unnamed: 0,stress_reason,number_of_cases
0,Biometric-heavy stress,3398
1,Lifecycle-driven stress,41563
2,Migration-driven stress,3040


In [88]:
#FINAL RESULT TABLE 4: Feature Validation (Proof Model Makes Sense)
df.groupby("stress_status")[
    ["bio_update_rate", "demo_update_rate", "update_to_enroll_ratio"]
].mean()


Unnamed: 0_level_0,bio_update_rate,demo_update_rate,update_to_enroll_ratio
stress_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High Stress,35.285333,37.954194,73.239526
Normal,6.07506,6.170436,12.245496


**We validated the model by comparing update-to-enrolment ratios across stress categories. High-stress zones consistently show disproportionately high update activity, confirming that the model captures genuine system pressure rather than noise**