In [None]:
#IMPORTING REQUIRED LIBRARIES
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

In [None]:
#LOADING THE DATASET
df = pd.read_csv("C:\\Users\\srith\\OneDrive\\Documents\\WONN\\My_phase\\api_data_aadhar_enrolment\\api_data_aadhar_enrolment.csv")
print("Dataset Shape:", df.shape)
df.head()

Dataset Shape: (1006029, 7)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [None]:
#DATE TIME CONVERSION
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')
df.tail()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
1006024,2025-12-31,West Bengal,West Midnapore,721149,2,0,0
1006025,2025-12-31,West Bengal,West Midnapore,721150,2,2,0
1006026,2025-12-31,West Bengal,West Midnapore,721305,0,1,0
1006027,2025-12-31,West Bengal,West Midnapore,721504,1,0,0
1006028,2025-12-31,West Bengal,West Midnapore,721517,2,1,0


In [None]:
#HANDLING MISSING VALUES FOR NUMERICAL COLUMNS
numeric_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
df[numeric_cols].isnull().sum()


age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

In [None]:
#HANDLING MISSING VALUES FOR CATEGORICAL COLUMNS
df['state'] = df['state'].fillna(df['state'].mode()[0])
df['district'] = df['district'].fillna(df['district'].mode()[0])

df.isnull().sum()

date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

In [None]:
#NORMALIZATION FUNCTION
def integer_min_max_normalize(series, scale=1000):
    min_val = series.min()
    max_val = series.max()
    return (((series - min_val) * scale) // (max_val - min_val)).astype(int)


In [None]:
#NORMALIZING FUNCTION CALLING
df['age_0_5_norm'] = integer_min_max_normalize(df['age_0_5'])
df['age_5_17_norm'] = integer_min_max_normalize(df['age_5_17'])
df['age_18_greater_norm'] = integer_min_max_normalize(df['age_18_greater'])
df.head(10)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,age_0_5_norm,age_5_17_norm,age_18_greater_norm
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,4,33,43
1,2025-03-09,Karnataka,Bengaluru Urban,560043,14,33,39,5,18,45
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,29,82,12,10,45,14
3,2025-03-09,Uttar Pradesh,Aligarh,202133,62,29,15,23,16,17
4,2025-03-09,Karnataka,Bengaluru Urban,560016,14,16,21,5,8,24
5,2025-03-09,Bihar,Sitamarhi,843331,20,49,12,7,27,14
6,2025-03-09,Bihar,Sitamarhi,843330,23,24,42,8,13,49
7,2025-03-09,Uttar Pradesh,Bahraich,271865,26,60,14,9,33,16
8,2025-03-09,Uttar Pradesh,Firozabad,283204,28,26,10,10,14,11
9,2025-03-09,Bihar,Purbi Champaran,845418,30,48,10,11,26,11


In [None]:
#DROPPING ORIGINAL NUMERICAL COLUMNS
df.drop(columns=numeric_cols, inplace=True)

In [None]:
# TOTAL POPULATION NORMALIZATION
df['total_population_norm'] = (
    df['age_0_5_norm'] +
    df['age_5_17_norm'] +
    df['age_18_greater_norm']
)
df.head(10)

Unnamed: 0,date,state,district,pincode,age_0_5_norm,age_5_17_norm,age_18_greater_norm,total_population_norm
0,2025-03-02,Meghalaya,East Khasi Hills,793121,4,33,43,80
1,2025-03-09,Karnataka,Bengaluru Urban,560043,5,18,45,68
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,10,45,14,69
3,2025-03-09,Uttar Pradesh,Aligarh,202133,23,16,17,56
4,2025-03-09,Karnataka,Bengaluru Urban,560016,5,8,24,37
5,2025-03-09,Bihar,Sitamarhi,843331,7,27,14,48
6,2025-03-09,Bihar,Sitamarhi,843330,8,13,49,70
7,2025-03-09,Uttar Pradesh,Bahraich,271865,9,33,16,58
8,2025-03-09,Uttar Pradesh,Firozabad,283204,10,14,11,35
9,2025-03-09,Bihar,Purbi Champaran,845418,11,26,11,48


In [None]:
# CHILD RATIO CALCULATION
df['child_ratio'] = (
    (df['age_0_5_norm'] + df['age_5_17_norm']) * 100
) // (df['total_population_norm'] + 1)

#ADULT RATIO CALCULATION
df['adult_ratio'] = (
    df['age_18_greater_norm'] * 100
) // (df['total_population_norm'] + 1)

df.head(10)

Unnamed: 0,date,state,district,pincode,age_0_5_norm,age_5_17_norm,age_18_greater_norm,total_population_norm,child_ratio,adult_ratio
0,2025-03-02,Meghalaya,East Khasi Hills,793121,4,33,43,80,45,53
1,2025-03-09,Karnataka,Bengaluru Urban,560043,5,18,45,68,33,65
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,10,45,14,69,78,20
3,2025-03-09,Uttar Pradesh,Aligarh,202133,23,16,17,56,68,29
4,2025-03-09,Karnataka,Bengaluru Urban,560016,5,8,24,37,34,63
5,2025-03-09,Bihar,Sitamarhi,843331,7,27,14,48,69,28
6,2025-03-09,Bihar,Sitamarhi,843330,8,13,49,70,29,69
7,2025-03-09,Uttar Pradesh,Bahraich,271865,9,33,16,58,71,27
8,2025-03-09,Uttar Pradesh,Firozabad,283204,10,14,11,35,66,30
9,2025-03-09,Bihar,Purbi Champaran,845418,11,26,11,48,75,22


In [None]:
#LABEL ENCODING FOR CATEGORICAL COLUMNS
state_encoder = LabelEncoder()
district_encoder = LabelEncoder()

df['state_enc'] = state_encoder.fit_transform(df['state'])
df['district_enc'] = district_encoder.fit_transform(df['district'])


In [None]:
#K-MEANS CLUSTERING FOR ENROLMENT PRESSURE CATEGORIZATION
features = [
    'total_population_norm',
    'child_ratio',
    'adult_ratio',
    'state_enc'
]

X = df[features]

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['enrolment_cluster'] = kmeans.fit_predict(X)

df['enrolment_pressure'] = df['enrolment_cluster'].map({
    0: 'HIGH',
    1: 'MEDIUM',
    2: 'LOW'
})


In [23]:
df

Unnamed: 0,date,state,district,pincode,age_0_5_norm,age_5_17_norm,age_18_greater_norm,total_population_norm,child_ratio,adult_ratio,state_enc,district_enc,enrolment_cluster,enrolment_pressure
0,2025-03-02,Meghalaya,East Khasi Hills,793121,4,33,43,80,45,53,30,263,1,MEDIUM
1,2025-03-09,Karnataka,Bengaluru Urban,560043,5,18,45,68,33,65,23,114,1,MEDIUM
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,10,45,14,69,78,20,45,435,1,MEDIUM
3,2025-03-09,Uttar Pradesh,Aligarh,202133,23,16,17,56,68,29,45,18,1,MEDIUM
4,2025-03-09,Karnataka,Bengaluru Urban,560016,5,8,24,37,34,63,23,114,1,MEDIUM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006024,2025-12-31,West Bengal,West Midnapore,721149,0,0,0,0,0,0,51,963,0,HIGH
1006025,2025-12-31,West Bengal,West Midnapore,721150,0,1,0,1,50,0,51,963,1,MEDIUM
1006026,2025-12-31,West Bengal,West Midnapore,721305,0,0,0,0,0,0,51,963,0,HIGH
1006027,2025-12-31,West Bengal,West Midnapore,721504,0,0,0,0,0,0,51,963,0,HIGH


In [None]:
#AGGREGATING DATA AT STATE LEVEL
state_level_df = (
    df.groupby('state')
      .agg(
          total_age_0_5=('age_0_5_norm', 'sum'),
          total_age_5_17=('age_5_17_norm', 'sum'),
          total_age_18_plus=('age_18_greater_norm', 'sum'),
          total_pincodes=('pincode', 'nunique')
      )
      .reset_index()
)

state_level_df.head()


Unnamed: 0,state,total_age_0_5,total_age_5_17,total_age_18_plus,total_pincodes
0,100000,0,0,245,1
1,Andaman & Nicobar Islands,0,0,0,9
2,Andaman and Nicobar Islands,18,3,0,21
3,Andhra Pradesh,14648,2270,1553,1786
4,Arunachal Pradesh,266,785,162,52


In [None]:
#AGGREGATING DATA AT DISTRICT LEVEL
district_level_df = (
    df.groupby(['state', 'district'])
      .agg(
          total_age_0_5=('age_0_5_norm', 'sum'),
          total_age_5_17=('age_5_17_norm', 'sum'),
          total_age_18_plus=('age_18_greater_norm', 'sum'),
          total_pincodes=('pincode', 'nunique')
      )
      .reset_index()
)

district_level_df.head()


Unnamed: 0,state,district,total_age_0_5,total_age_5_17,total_age_18_plus,total_pincodes
0,100000,100000,0,0,245,1
1,Andaman & Nicobar Islands,Andamans,0,0,0,7
2,Andaman & Nicobar Islands,Nicobars,0,0,0,1
3,Andaman & Nicobar Islands,South Andaman,0,0,0,6
4,Andaman and Nicobar Islands,Nicobar,8,3,0,4


In [None]:
#CALCULATING TOTAL ENROLMENT LOAD AT DISTRICT LEVEL
district_level_df['total_enrolment_load'] = (
    district_level_df['total_age_0_5'] +
    district_level_df['total_age_5_17'] +
    district_level_df['total_age_18_plus']
)

district_level_df.head()


Unnamed: 0,state,district,total_age_0_5,total_age_5_17,total_age_18_plus,total_pincodes,total_enrolment_load
0,100000,100000,0,0,245,1,245
1,Andaman & Nicobar Islands,Andamans,0,0,0,7,0
2,Andaman & Nicobar Islands,Nicobars,0,0,0,1,0
3,Andaman & Nicobar Islands,South Andaman,0,0,0,6,0
4,Andaman and Nicobar Islands,Nicobar,8,3,0,4,11


In [None]:
#EXPORTING THE CLEANED AND PREPROCESSED DATASET
df.to_csv(
    "cleaned_preprocessed_aadhaar_dataset.csv",
    index=False,
    encoding="utf-8"
)
