# Data Cleaning and Preprocessing 

## Step 1 : Load the dataset

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("dataset/aadhar_dataset.csv")
df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,01-09-2025,Madhya Pradesh,Agar Malwa,465447,3,0,0
1,01-09-2025,Madhya Pradesh,Alirajpur,457888,7,11,0
2,01-09-2025,Madhya Pradesh,Anuppur,484113,1,0,0
3,01-09-2025,Madhya Pradesh,Anuppur,484440,1,0,0
4,01-09-2025,Madhya Pradesh,Ashok Nagar,473330,19,4,0


## Step 2 : Renaming age group columns

In [2]:
df = df.rename(columns={
    'age_0_5' : 'age_group_0_to_5',
    'age_5_17' : 'age_group_5_to_17',
    'age_18_greater' : 'age_group_18_plus'
})
df.head()

Unnamed: 0,date,state,district,pincode,age_group_0_to_5,age_group_5_to_17,age_group_18_plus
0,01-09-2025,Madhya Pradesh,Agar Malwa,465447,3,0,0
1,01-09-2025,Madhya Pradesh,Alirajpur,457888,7,11,0
2,01-09-2025,Madhya Pradesh,Anuppur,484113,1,0,0
3,01-09-2025,Madhya Pradesh,Anuppur,484440,1,0,0
4,01-09-2025,Madhya Pradesh,Ashok Nagar,473330,19,4,0


## Step 3 : Remove unnecessary column

In [3]:
df = df.drop(columns=['pincode'])
df.head()

Unnamed: 0,date,state,district,age_group_0_to_5,age_group_5_to_17,age_group_18_plus
0,01-09-2025,Madhya Pradesh,Agar Malwa,3,0,0
1,01-09-2025,Madhya Pradesh,Alirajpur,7,11,0
2,01-09-2025,Madhya Pradesh,Anuppur,1,0,0
3,01-09-2025,Madhya Pradesh,Anuppur,1,0,0
4,01-09-2025,Madhya Pradesh,Ashok Nagar,19,4,0


## Step 4 : Convert date column to datetime (Changing Datatype) 

In [4]:
df['date'] = pd.to_datetime(df['date'] , format="%d-%m-%Y")
print(df.dtypes)

date                 datetime64[ns]
state                        object
district                     object
age_group_0_to_5              int64
age_group_5_to_17             int64
age_group_18_plus             int64
dtype: object


## Step 5 : Clean text fields (state and district)

In [6]:
df['state'] = df['state'].str.strip()
df['district'] = df['district'].str.strip()

## Step 6 : Checking Null Values

In [8]:
df.isnull().sum()

date                 0
state                0
district             0
age_group_0_to_5     0
age_group_5_to_17    0
age_group_18_plus    0
dtype: int64

#### Data is already cleaned 

## Step 7 : Summary of the dataset

In [11]:
df.describe()

Unnamed: 0,date,age_group_0_to_5,age_group_5_to_17,age_group_18_plus
count,59991,59991.0,59991.0,59991.0
mean,2025-10-10 02:11:28.063209472,6.959261,2.51416,0.182911
min,2025-03-09 00:00:00,0.0,0.0,0.0
25%,2025-09-12 00:00:00,1.0,0.0,0.0
50%,2025-10-20 00:00:00,3.0,1.0,0.0
75%,2025-11-14 00:00:00,7.0,2.0,0.0
max,2026-01-03 00:00:00,1400.0,1376.0,103.0
std,,22.971808,16.253868,1.737969


# Feature Engineering 

## Total enrolments

In [16]:
df['total_enrolments'] = (
    df['age_group_0_to_5'] + df['age_group_5_to_17'] + df['age_group_18_plus']
)
df['total_enrolments'].head()

0     3
1    18
2     1
3     1
4    23
Name: total_enrolments, dtype: int64

## Age group contribution ratios

In [17]:
df['pct_age_0_to_5'] = df['age_group_0_to_5'] / df['total_enrolments']
df['pct_age_5_to_17'] = df['age_group_5_to_17'] / df['total_enrolments']
df['pct_age_18_plus'] = df['age_group_18_plus'] / df['total_enrolments']

# Data Transformation

## State-level aggregation

In [22]:
state_level = df.groupby('state', as_index=False).agg({
    'age_group_0_to_5': 'sum',
    'age_group_5_to_17': 'sum',
    'age_group_18_plus': 'sum',
    'total_enrolments': 'sum'
})
state_level

Unnamed: 0,state,age_group_0_to_5,age_group_5_to_17,age_group_18_plus,total_enrolments
0,Madhya Pradesh,417493,150827,10973,579293


## District-level aggregation

In [25]:
district_level = df.groupby(
    ['state', 'district'], as_index=False).agg({'total_enrolments' : 'sum'})
district_level.head()

Unnamed: 0,state,district,total_enrolments
0,Madhya Pradesh,Agar Malwa,3195
1,Madhya Pradesh,Alirajpur,7705
2,Madhya Pradesh,Anuppur,3951
3,Madhya Pradesh,Ashok Nagar,6144
4,Madhya Pradesh,Ashoknagar,3011


## Saving our Cleaned dataset

#### Cleaned and preprocessed UIDAI Aadhaar dataset

In [27]:
df.to_csv("aadhaar_cleaned_dataset.csv" , index=False)

#### State-level aggregated dataset

In [28]:
state_level.to_csv("aadhaar_state_level_summary.csv" , index=False)

#### District-level aggregated dataset

In [29]:
district_level.to_csv("aadhaar_district_level_summary.csv", index=False)