# Import Libraries

In [None]:
import pandas as pd
import os

# Function to Load All State Files
## 🔹 This function:

- Looks at all CSV files in a folder
- Reads each one
- Stacks them together

In [None]:
def load_all_states(folder_path):
    """Read every CSV file in a folder and stack them into one DataFrame.

    Args:
        folder_path: Directory containing the ``.csv`` files. Entries whose
            names do not end in ``.csv`` are ignored.

    Returns:
        A single DataFrame holding the rows of all CSV files, concatenated
        in file-name order with a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sort so the combined
    # row order is reproducible across machines and runs.
    csv_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".csv")
    )

    # pd.concat raises an opaque "No objects to concatenate" on an empty
    # list, so fail early with a message that names the folder.
    if not csv_files:
        raise ValueError(f"No CSV files found in {folder_path!r}")

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files]
    return pd.concat(frames, ignore_index=True)

# Load Each Dataset

In [None]:
# Load the enrolment, demographic-update and biometric-update folders,
# in that order, each into its own combined DataFrame.
enrol, demo, bio = map(
    load_all_states,
    (
        "data/enrolment-data/",
        "data/demographic-data/",
        "data/biometric-data/",
    ),
)

# Check Data Loaded Correctly

In [None]:
# Preview the first five rows of the combined enrolment data
enrol.head(n=5)

## Step 1: Inspect Dataset Columns

Before performing any calculations, we inspect the column names of each dataset
to ensure consistency and understand the available variables.

In [None]:
# A notebook cell only renders its final expression, so three bare
# `.columns` lines would show the bio columns alone. Print each dataset's
# column names explicitly so all three can be compared.
for label, frame in (("enrol", enrol), ("demo", demo), ("bio", bio)):
    print(f"{label}: {list(frame.columns)}")

The inspection confirms that the datasets contain spatial (State, District, Pincode),
temporal (Date), and age-wise enrolment/update information required for analysis.


## Step 2: Convert Date Column to Datetime Format

The Date column is converted into a datetime format to enable
time-based analysis and aggregation.


In [None]:
# Parse the 'date' column of every dataset in place. format='mixed' lets
# pandas infer the format per value, and dayfirst=True resolves ambiguous
# values as day/month rather than month/day.
for frame in (enrol, demo, bio):
    frame['date'] = pd.to_datetime(frame['date'], format='mixed', dayfirst=True)

# Show the resulting dtypes to confirm 'date' is now datetime
enrol.dtypes

## Step 3: Compute Total Aadhaar Enrolments

The enrolment dataset contains age-wise enrolments.
To enable unified analysis, we aggregate enrolments across
all age groups to compute total enrolments per record.

In [None]:
# Row-wise total across the three age-group columns
enrol['total_enrolments'] = (
    enrol['age_0_5']
    .add(enrol['age_5_17'])
    .add(enrol['age_18_greater'])
)

In [None]:
# Show the age-group inputs next to the computed total for a visual check
enrolment_columns = ['age_0_5', 'age_5_17', 'age_18_greater', 'total_enrolments']
enrol[enrolment_columns].head()

Check:

- Numbers should add correctly
- total_enrolments should be positive

## Step 4: Compute Total Aadhaar Updates

Demographic and biometric updates are provided separately
for different age groups. These are aggregated to compute
total updates per record.


In [None]:
# Row-wise total of demographic updates across both age groups
demo['total_demo_updates'] = demo['demo_age_5_17'].add(demo['demo_age_17_'])

In [None]:
# Show the age-group inputs next to the computed demographic total
demo_columns = ['demo_age_5_17', 'demo_age_17_', 'total_demo_updates']
demo[demo_columns].head()

In [None]:
# Row-wise total of biometric updates across both age groups
bio['total_bio_updates'] = bio['bio_age_5_17'].add(bio['bio_age_17_'])

In [None]:
# Show the age-group inputs next to the computed biometric total
bio_columns = ['bio_age_5_17', 'bio_age_17_', 'total_bio_updates']
bio[bio_columns].head()

## Step 5: Merge Enrolment and Update Datasets

To compute the Update Dependency Index (UDI), enrolment,
demographic update, and biometric update datasets are merged
using common spatial and temporal identifiers.

In [None]:
# Left-join demographic updates onto enrolments so every enrolment record
# is kept, whether or not a matching update record exists.
# NOTE(review): if demo holds several rows for one (date, state, district,
# pincode) key, the join repeats the enrolment row once per match —
# confirm the keys are unique in demo.
merged = pd.merge(
    enrol,
    demo,
    how='left',
    on=['date', 'state', 'district', 'pincode'],
)

In [None]:
# Left-join biometric updates onto the running table, keeping every row
# already in `merged`.
# NOTE(review): if bio holds several rows for one (date, state, district,
# pincode) key, the join repeats the existing row once per match —
# confirm the keys are unique in bio.
merged = pd.merge(
    merged,
    bio,
    how='left',
    on=['date', 'state', 'district', 'pincode'],
)

In [None]:
# Rows without a matching update record come out of the left joins as NaN;
# replace every NaN in the table with 0 so they count as "no updates".
merged = merged.fillna(0)
merged.head()

We will now see:

- age columns
- total_enrolments
- total_demo_updates
- total_bio_updates

all in one table

## Step 6: Remove Invalid Records

Records with zero total enrolments are removed to avoid
division-by-zero errors during index computation.

In [None]:
# Keep only records with at least one enrolment so the UDI ratio never
# divides by zero. The explicit .copy() makes the result an independent
# DataFrame, so the column assignments made to `merged` in later cells
# do not trigger pandas' SettingWithCopyWarning.
merged = merged[merged['total_enrolments'] > 0].copy()

## Step 7: Compute Update Dependency Index (UDI)

The Update Dependency Index (UDI) measures the dependence
of Aadhaar records on post-enrolment updates.


In [None]:
# UDI = (demographic updates + biometric updates) / enrolments, per record
total_updates = merged['total_demo_updates'].add(merged['total_bio_updates'])
merged['UDI'] = total_updates.div(merged['total_enrolments'])

In [None]:
# Show the UDI next to the totals it was computed from
udi_columns = ['total_enrolments', 'total_demo_updates', 'total_bio_updates', 'UDI']
merged[udi_columns].head()

UDI values:

- Mostly small
- Some higher values → interesting & useful

## Step 8: Visualisation of UDI Distribution

A histogram is used to examine the overall distribution
of the Update Dependency Index across regions.

In [None]:
import matplotlib.pyplot as plt

# Histogram of UDI over 40 bins; Series.hist draws on the current axes and
# returns them, so the labels are set on that axes object directly.
udi_axes = merged['UDI'].hist(bins=40)
udi_axes.set_title("Distribution of Update Dependency Index (UDI)")
udi_axes.set_xlabel("UDI")
udi_axes.set_ylabel("Frequency")
plt.show()

## Step 9: Anomaly Detection Using Z-Score

Anomalies are identified by measuring how far a region’s UDI
deviates from the overall average using statistical Z-scores.

In [None]:
from scipy.stats import zscore

# Standardise UDI: each value expressed as its distance from the overall
# mean, in units of standard deviation.
udi_values = merged['UDI']
merged['UDI_zscore'] = zscore(udi_values)

In [None]:
# Select records whose z-score magnitude exceeds 3, in either direction
is_extreme = merged['UDI_zscore'].abs().gt(3)
anomalies = merged[is_extreme]
anomalies.head()

What this means:

- `|Z| > 3` = very rare / abnormal (more than three standard deviations from the mean)
- These are high-risk records

## Step 10: Identification of High-Risk Regions

Anomalous UDI values are aggregated to identify districts
and pincodes with consistently high update dependency.

### Top risky districts

In [None]:
# Count anomalous records per (state, district), then keep the ten
# districts with the most anomalies.
district_counts = anomalies.groupby(['state', 'district']).size()
top_districts = district_counts.sort_values(ascending=False).head(10)

top_districts

### Top risky pincodes

In [None]:
# Count anomalous records per (state, district, pincode), then keep the
# ten pincodes with the most anomalies.
pincode_counts = anomalies.groupby(['state', 'district', 'pincode']).size()
top_pincodes = pincode_counts.sort_values(ascending=False).head(10)

top_pincodes

## Step 11: Comparison of Normal vs Anomalous UDI Values

This visual compares the distribution of UDI values
for normal and anomalous regions.

In [None]:
# Boolean flag: True where the z-score magnitude exceeds 3
outlier_mask = merged['UDI_zscore'].abs().gt(3)
merged['is_anomaly'] = outlier_mask

# One UDI box per flag value (False / True)
merged.boxplot(column='UDI', by='is_anomaly', figsize=(6, 4))
plt.title("UDI Distribution: Normal vs Anomalous")
plt.suptitle("")  # blank out the figure-level title, leaving only the one above
plt.xlabel("Anomaly Flag")
plt.ylabel("UDI")
plt.show()