## Section 1: Imports and loading clean data

In [1]:
import pandas as pd

# Display settings: never truncate the column list and allow 120-character-wide output.
pd.set_option("display.max_columns", None, "display.width", 120)

# Read the three cleaned monthly tables (enrolment, demographic updates,
# biometric updates) from the shared data directory.
enrol_agg, demo_agg, bio_agg = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/enrolment_clean_monthly.csv",
        "../data/demographic_clean_monthly.csv",
        "../data/biometric_clean_monthly.csv",
    )
)

## Section 2: Merge datasets on common keys

In [2]:
# Enrolment is the base table; demographic and biometric updates are attached
# with left joins so every enrolment row is kept. Rows without a matching
# update record end up with NaN in the update columns.
merge_keys = ["state", "year_month"]

df = enrol_agg
for update_table in (demo_agg, bio_agg):
    df = df.merge(update_table, on=merge_keys, how="left")

df.head()

Unnamed: 0,state,year_month,enrol_age_5_17,enrol_age_18_plus,demo_age_5_17,demo_age_18_plus,bio_age_5_17,bio_age_18_plus
0,ANDAMAN & NICOBAR ISLANDS,2025-01,1,0,0.0,46.0,98.0,933.0
1,ANDAMAN & NICOBAR ISLANDS,2025-02,0,0,0.0,44.0,17.0,51.0
2,ANDAMAN & NICOBAR ISLANDS,2025-03,0,0,2.0,71.0,27.0,50.0
3,ANDAMAN & NICOBAR ISLANDS,2025-04,0,0,0.0,74.0,11.0,36.0
4,ANDAMAN & NICOBAR ISLANDS,2025-05,0,0,1.0,43.0,13.0,45.0


## Section 3: Handle missing update values

In [3]:
# Update-count columns that may be missing after the left joins.
update_cols = [
    "demo_age_5_17",
    "demo_age_18_plus",
    "bio_age_5_17",
    "bio_age_18_plus",
]

# A missing update record is treated as "zero updates that month".
df = df.fillna({col: 0 for col in update_cols})

## Section 4: Compute update-to-enrolment ratios

### Step 4.1 Safe division helper

In [4]:
def safe_ratio(numerator, denominator):
    """Divide ``numerator`` by ``denominator``, returning 0 where the denominator is not positive.

    Works for plain numbers and, element-wise, for pandas objects, so it can be
    applied directly to DataFrame columns.

    Parameters
    ----------
    numerator : number, pandas.Series or pandas.DataFrame
        Value(s) to divide.
    denominator : number, pandas.Series or pandas.DataFrame
        Value(s) to divide by.

    Returns
    -------
    number, pandas.Series or pandas.DataFrame
        ``numerator / denominator`` where ``denominator > 0``, otherwise 0.
    """
    if isinstance(denominator, (pd.Series, pd.DataFrame)):
        # `denominator > 0` is element-wise here, so it cannot be used in a
        # plain `if`; mask the quotient instead. This also discards the
        # inf/NaN values that division by zero produces.
        return (numerator / denominator).where(denominator > 0, 0)
    return numerator / denominator if denominator > 0 else 0

### Step 4.2 Apply ratios

In [5]:
# Each ratio column maps to (update count column, enrolment count column)
# for the same age band. Dict order fixes the column order in the frame.
ratio_sources = {
    "demo_ratio_5_17": ("demo_age_5_17", "enrol_age_5_17"),
    "demo_ratio_18_plus": ("demo_age_18_plus", "enrol_age_18_plus"),
    "bio_ratio_5_17": ("bio_age_5_17", "enrol_age_5_17"),
    "bio_ratio_18_plus": ("bio_age_18_plus", "enrol_age_18_plus"),
}

for ratio_name, (update_col, enrol_col) in ratio_sources.items():
    df[ratio_name] = df[update_col] / df[enrol_col]

ratio_cols = list(ratio_sources)

# Division by a zero enrolment count gives inf (x/0) or NaN (0/0);
# both are mapped to 0.
df[ratio_cols] = df[ratio_cols].replace([float("inf")], 0).fillna(0)

## Section 5: Update intensity metric

In [6]:
# Total updates: demographic + biometric counts across both age bands.
df["total_updates"] = (
    df["demo_age_5_17"]
    .add(df["demo_age_18_plus"])
    .add(df["bio_age_5_17"])
    .add(df["bio_age_18_plus"])
)

# Total enrolment across both age bands.
df["total_enrolment"] = df["enrol_age_5_17"].add(df["enrol_age_18_plus"])

# Updates per enrolment. Months with zero enrolment yield inf or NaN,
# which are both recorded as an intensity of 0.
raw_intensity = df["total_updates"].div(df["total_enrolment"])
df["update_intensity"] = raw_intensity.replace([float("inf")], 0).fillna(0)

## Section 6: Update decay indicators

### Step 6.1 Sort by state and month

In [7]:
# Order rows by state, then month, so the per-state rolling window below runs
# in time order. NOTE(review): assumes year_month is a zero-padded "YYYY-MM"
# string (as in the displayed rows), so lexical order equals chronological order.
sort_keys = ["state", "year_month"]
df = df.sort_values(by=sort_keys)

### Step 6.2 Rolling average of update intensity

In [8]:
# Rolling mean of update intensity within each state over the current row and
# up to two preceding rows (min_periods=1 lets the first rows of a state use
# whatever history exists).
# NOTE(review): the window counts rows, not calendar months — confirm no state
# has gaps in year_month.
intensity_by_state = df.groupby("state")["update_intensity"]
rolling_mean = intensity_by_state.rolling(window=3, min_periods=1).mean()

# Drop the "state" index level added by groupby so the result aligns with
# df's original row index on assignment.
df["update_intensity_3m_avg"] = rolling_mean.droplevel(0)

### Step 6.3 Decay signal

In [9]:
# Current intensity minus its rolling average: negative values mean the month
# is below the state's recent level.
df["update_decay_signal"] = df["update_intensity"].sub(df["update_intensity_3m_avg"])

## Section 7: Consistency score (per-state standard deviation of update intensity; higher = less consistent)

In [10]:
# Per-state standard deviation of update intensity, broadcast to every row of
# that state. A larger value means more month-to-month variation. A state with
# a single row gets NaN, since the sample standard deviation is undefined there.
intensity_by_state = df.groupby("state")["update_intensity"]
df["update_consistency"] = intensity_by_state.transform("std")

## Section 8: Validation and inspection

In [11]:
# A notebook cell only auto-renders its final expression, so the preview of
# the engineered columns is shown explicitly with display() (made available
# by the IPython/Jupyter kernel). Without it, only describe() would appear.
display(
    df[[
        "state", "year_month",
        "update_intensity",
        "update_decay_signal",
        "update_consistency"
    ]].head()
)

# Summary statistics for every numeric column.
df.describe()

Unnamed: 0,enrol_age_5_17,enrol_age_18_plus,demo_age_5_17,demo_age_18_plus,bio_age_5_17,bio_age_18_plus,demo_ratio_5_17,demo_ratio_18_plus,bio_ratio_5_17,bio_ratio_18_plus,total_updates,total_enrolment,update_intensity,update_intensity_3m_avg,update_decay_signal,update_consistency
count,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,572.0
mean,3002.413613,293.431065,8487.554974,77541.24,59731.74,62015.34,10.098408,1054.109684,71.720185,686.66044,207775.9,3295.844677,265.275827,375.39734,-110.121513,499.830784
std,14771.082722,1561.373329,27438.548347,265028.5,257591.5,253689.1,25.60959,2441.81176,263.535984,1730.155461,757159.2,15686.613828,1404.349487,1630.522647,824.688145,1295.782684
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14448.9,2.648856
25%,4.0,0.0,53.0,525.0,234.0,290.0,1.773345,0.0,7.0,0.0,1222.0,5.0,35.768116,40.447813,-29.326653,25.832835
50%,150.0,7.0,1029.0,7849.0,6327.0,4504.0,4.26506,298.117162,21.927614,170.833333,20887.0,168.0,78.730697,96.220966,-1.333333,84.790541
75%,977.0,65.0,4666.0,36038.0,20781.0,22789.0,9.484018,1105.307692,51.908163,596.0,88558.0,1063.0,189.637799,224.164113,7.110439,354.640381
max,209320.0,26247.0,274604.0,3013244.0,3864019.0,3407643.0,458.130435,30956.5,3184.5,18461.173913,9192975.0,215690.0,29223.0,29223.0,553.740741,8008.959592


## Section 9: Save feature-engineered dataset

In [12]:
# Persist the engineered features; the row index is not written to the file.
output_path = "../data/feature_engineered_monthly.csv"
df.to_csv(output_path, index=False)