In [1]:
import pandas as pd
import numpy as np
import glob
import os


In [2]:
def load_all_csvs(path):
    files = glob.glob(path)
    dfs = []
    for f in files:
        print("Loading:", f)
        dfs.append(pd.read_csv(f))
    return pd.concat(dfs, ignore_index=True)

enrol = load_all_csvs("../data/enrolment/*.csv")
demo  = load_all_csvs("../data/demographic/*.csv")
bio   = load_all_csvs("../data/biometric/*.csv")


Loading: ../data/enrolment\enrol_01.csv
Loading: ../data/enrolment\enrol_02.csv
Loading: ../data/enrolment\enrol_03.csv
Loading: ../data/demographic\demo_01.csv
Loading: ../data/demographic\demo_02.csv
Loading: ../data/demographic\demo_03.csv
Loading: ../data/demographic\demo_04.csv
Loading: ../data/demographic\demo_05.csv
Loading: ../data/biometric\bio_01.csv
Loading: ../data/biometric\bio_02.csv
Loading: ../data/biometric\bio_03.csv
Loading: ../data/biometric\bio_04.csv


In [3]:
print("ENROL:", enrol.shape)
print("DEMO :", demo.shape)
print("BIO  :", bio.shape)


ENROL: (1006029, 7)
DEMO : (2071700, 6)
BIO  : (1861108, 6)


In [4]:
print("ENROL columns:")
print(enrol.columns.tolist())

print("\nDEMO columns:")
print(demo.columns.tolist())

print("\nBIO columns:")
print(bio.columns.tolist())


ENROL columns:
['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

DEMO columns:
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

BIO columns:
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


In [5]:
enrol.head()


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [6]:
demo.head()


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,16-12-2025,Madhya Pradesh,Shajapur,465113,0,4
1,16-12-2025,Madhya Pradesh,Shajapur,465226,2,14
2,16-12-2025,Madhya Pradesh,Shajapur,465339,1,2
3,16-12-2025,Madhya Pradesh,Shajapur,465447,1,19
4,16-12-2025,Madhya Pradesh,Sheopur,476332,11,25


In [7]:
bio.head()


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [8]:
import re

def is_valid_state(x):
    if pd.isna(x):
        return False
    x = str(x).strip().lower()
    
    # invalid if purely numeric
    if x.isdigit():
        return False
    
    # invalid if mostly numeric (more digits than letters)
    letters = sum(c.isalpha() for c in x)
    digits = sum(c.isdigit() for c in x)
    if digits > letters:
        return False
    
    return True


In [9]:
invalid_enrol_states = enrol[~enrol["state"].apply(is_valid_state)]
invalid_demo_states  = demo[~demo["state"].apply(is_valid_state)]
invalid_bio_states   = bio[~bio["state"].apply(is_valid_state)]

print("Invalid ENROL rows:", invalid_enrol_states.shape)
print("Invalid DEMO rows :", invalid_demo_states.shape)
print("Invalid BIO rows  :", invalid_bio_states.shape)


Invalid ENROL rows: (22, 7)
Invalid DEMO rows : (2, 6)
Invalid BIO rows  : (0, 6)


In [10]:
invalid_enrol_states["state"].value_counts().head(20)


state
100000    22
Name: count, dtype: int64

In [11]:
invalid_demo_states["state"].value_counts().head(20)


state
100000    2
Name: count, dtype: int64

In [12]:
invalid_bio_states["state"].value_counts().head(20)


Series([], Name: count, dtype: int64)

In [13]:
enrol_clean = enrol[enrol["state"].apply(is_valid_state)].copy()
demo_clean  = demo[demo["state"].apply(is_valid_state)].copy()
bio_clean   = bio[ bio["state"].apply(is_valid_state)].copy()


In [14]:
print("ENROL  original:", enrol.shape, "→ cleaned:", enrol_clean.shape)
print("DEMO   original:", demo.shape,  "→ cleaned:", demo_clean.shape)
print("BIO    original:", bio.shape,   "→ cleaned:", bio_clean.shape)


ENROL  original: (1006029, 7) → cleaned: (1006007, 7)
DEMO   original: (2071700, 6) → cleaned: (2071698, 6)
BIO    original: (1861108, 6) → cleaned: (1861108, 6)


In [15]:
print(
    "Remaining invalid ENROL:",
    enrol_clean[~enrol_clean["state"].apply(is_valid_state)].shape
)

print(
    "Remaining invalid DEMO:",
    demo_clean[~demo_clean["state"].apply(is_valid_state)].shape
)

print(
    "Remaining invalid BIO:",
    bio_clean[~bio_clean["state"].apply(is_valid_state)].shape
)


Remaining invalid ENROL: (0, 7)
Remaining invalid DEMO: (0, 6)
Remaining invalid BIO: (0, 6)


In [16]:
sorted(enrol_clean["state"].str.lower().str.strip().unique())


['andaman & nicobar islands',
 'andaman and nicobar islands',
 'andhra pradesh',
 'arunachal pradesh',
 'assam',
 'bihar',
 'chandigarh',
 'chhattisgarh',
 'dadra & nagar haveli',
 'dadra and nagar haveli',
 'dadra and nagar haveli and daman and diu',
 'daman & diu',
 'daman and diu',
 'delhi',
 'goa',
 'gujarat',
 'haryana',
 'himachal pradesh',
 'jammu & kashmir',
 'jammu and kashmir',
 'jharkhand',
 'karnataka',
 'kerala',
 'ladakh',
 'lakshadweep',
 'madhya pradesh',
 'maharashtra',
 'manipur',
 'meghalaya',
 'mizoram',
 'nagaland',
 'odisha',
 'orissa',
 'pondicherry',
 'puducherry',
 'punjab',
 'rajasthan',
 'sikkim',
 'tamil nadu',
 'telangana',
 'the dadra and nagar haveli and daman and diu',
 'tripura',
 'uttar pradesh',
 'uttarakhand',
 'west  bengal',
 'west bangal',
 'west bengal',
 'westbengal']

In [17]:
sorted(demo_clean["state"].str.lower().str.strip().unique())


['andaman & nicobar islands',
 'andaman and nicobar islands',
 'andhra pradesh',
 'arunachal pradesh',
 'assam',
 'balanagar',
 'bihar',
 'chandigarh',
 'chhatisgarh',
 'chhattisgarh',
 'dadra & nagar haveli',
 'dadra and nagar haveli',
 'dadra and nagar haveli and daman and diu',
 'daman & diu',
 'daman and diu',
 'darbhanga',
 'delhi',
 'goa',
 'gujarat',
 'haryana',
 'himachal pradesh',
 'jaipur',
 'jammu & kashmir',
 'jammu and kashmir',
 'jharkhand',
 'karnataka',
 'kerala',
 'ladakh',
 'lakshadweep',
 'madanapalle',
 'madhya pradesh',
 'maharashtra',
 'manipur',
 'meghalaya',
 'mizoram',
 'nagaland',
 'nagpur',
 'odisha',
 'orissa',
 'pondicherry',
 'puducherry',
 'punjab',
 'puttenahalli',
 'raja annamalai puram',
 'rajasthan',
 'sikkim',
 'tamil nadu',
 'telangana',
 'tripura',
 'uttar pradesh',
 'uttarakhand',
 'uttaranchal',
 'west  bengal',
 'west bangal',
 'west bengal',
 'west bengli',
 'westbengal']

In [18]:
sorted(bio_clean["state"].str.lower().str.strip().unique())


['andaman & nicobar islands',
 'andaman and nicobar islands',
 'andhra pradesh',
 'arunachal pradesh',
 'assam',
 'bihar',
 'chandigarh',
 'chhatisgarh',
 'chhattisgarh',
 'dadra & nagar haveli',
 'dadra and nagar haveli',
 'dadra and nagar haveli and daman and diu',
 'daman & diu',
 'daman and diu',
 'delhi',
 'goa',
 'gujarat',
 'haryana',
 'himachal pradesh',
 'jammu & kashmir',
 'jammu and kashmir',
 'jharkhand',
 'karnataka',
 'kerala',
 'ladakh',
 'lakshadweep',
 'madhya pradesh',
 'maharashtra',
 'manipur',
 'meghalaya',
 'mizoram',
 'nagaland',
 'odisha',
 'orissa',
 'pondicherry',
 'puducherry',
 'punjab',
 'rajasthan',
 'sikkim',
 'tamil nadu',
 'tamilnadu',
 'telangana',
 'tripura',
 'uttar pradesh',
 'uttarakhand',
 'uttaranchal',
 'west  bengal',
 'west bangal',
 'west bengal',
 'westbengal']

In [19]:
STATE_CANONICAL_MAP = {
    # ANDAMAN & NICOBAR
    "andaman & nicobar islands": "andaman and nicobar islands",

    # DADRA / DAMAN
    "dadra & nagar haveli": "dadra and nagar haveli",
    "daman & diu": "daman and diu",
    "the dadra and nagar haveli and daman and diu": "dadra and nagar haveli and daman and diu",

    # JAMMU & KASHMIR
    "jammu & kashmir": "jammu and kashmir",

    # WEST BENGAL
    "west bangal": "west bengal",
    "west bengli": "west bengal",
    "westbengal": "west bengal",
    "west  bengal": "west bengal",

    # ODISHA
    "orissa": "odisha",

    # PUDUCHERRY
    "pondicherry": "puducherry",

    # UTTARAKHAND
    "uttaranchal": "uttarakhand",

    # TAMIL NADU
    "tamilnadu": "tamil nadu",

    # CHHATTISGARH
    "chhatisgarh": "chhattisgarh",
}


In [20]:
def normalize_state_text(x):
    if pd.isna(x):
        return x
    x = x.lower()
    x = x.replace("&", "and")
    x = " ".join(x.split())  # remove extra spaces
    return x


In [21]:
for df_raw in [enrol_clean, demo_clean, bio_clean]:
    df_raw["state"] = df_raw["state"].apply(normalize_state_text)
    df_raw["state"] = df_raw["state"].replace(STATE_CANONICAL_MAP)


In [22]:
sorted(enrol_clean["state"].unique())


['andaman and nicobar islands',
 'andhra pradesh',
 'arunachal pradesh',
 'assam',
 'bihar',
 'chandigarh',
 'chhattisgarh',
 'dadra and nagar haveli',
 'dadra and nagar haveli and daman and diu',
 'daman and diu',
 'delhi',
 'goa',
 'gujarat',
 'haryana',
 'himachal pradesh',
 'jammu and kashmir',
 'jharkhand',
 'karnataka',
 'kerala',
 'ladakh',
 'lakshadweep',
 'madhya pradesh',
 'maharashtra',
 'manipur',
 'meghalaya',
 'mizoram',
 'nagaland',
 'odisha',
 'puducherry',
 'punjab',
 'rajasthan',
 'sikkim',
 'tamil nadu',
 'telangana',
 'tripura',
 'uttar pradesh',
 'uttarakhand',
 'west bengal']

In [23]:
sorted(demo_clean["state"].unique())


['andaman and nicobar islands',
 'andhra pradesh',
 'arunachal pradesh',
 'assam',
 'balanagar',
 'bihar',
 'chandigarh',
 'chhattisgarh',
 'dadra and nagar haveli',
 'dadra and nagar haveli and daman and diu',
 'daman and diu',
 'darbhanga',
 'delhi',
 'goa',
 'gujarat',
 'haryana',
 'himachal pradesh',
 'jaipur',
 'jammu and kashmir',
 'jharkhand',
 'karnataka',
 'kerala',
 'ladakh',
 'lakshadweep',
 'madanapalle',
 'madhya pradesh',
 'maharashtra',
 'manipur',
 'meghalaya',
 'mizoram',
 'nagaland',
 'nagpur',
 'odisha',
 'puducherry',
 'punjab',
 'puttenahalli',
 'raja annamalai puram',
 'rajasthan',
 'sikkim',
 'tamil nadu',
 'telangana',
 'tripura',
 'uttar pradesh',
 'uttarakhand',
 'west bengal']

In [24]:
sorted(bio_clean["state"].unique())


['andaman and nicobar islands',
 'andhra pradesh',
 'arunachal pradesh',
 'assam',
 'bihar',
 'chandigarh',
 'chhattisgarh',
 'dadra and nagar haveli',
 'dadra and nagar haveli and daman and diu',
 'daman and diu',
 'delhi',
 'goa',
 'gujarat',
 'haryana',
 'himachal pradesh',
 'jammu and kashmir',
 'jharkhand',
 'karnataka',
 'kerala',
 'ladakh',
 'lakshadweep',
 'madhya pradesh',
 'maharashtra',
 'manipur',
 'meghalaya',
 'mizoram',
 'nagaland',
 'odisha',
 'puducherry',
 'punjab',
 'rajasthan',
 'sikkim',
 'tamil nadu',
 'telangana',
 'tripura',
 'uttar pradesh',
 'uttarakhand',
 'west bengal']

In [25]:
STATE_CANONICAL_MAP.update({
    "dadra and nagar haveli": "dadra and nagar haveli and daman and diu",
    "daman and diu": "dadra and nagar haveli and daman and diu",
})


In [26]:
for df_raw in [enrol_clean, demo_clean, bio_clean]:
    df_raw["state"] = df_raw["state"].replace(STATE_CANONICAL_MAP)


In [27]:
[state for state in sorted(enrol_clean["state"].unique()) if "dadra" in state or "daman" in state]


['andaman and nicobar islands', 'dadra and nagar haveli and daman and diu']

In [28]:
[state for state in sorted(demo_clean["state"].unique()) if "dadra" in state or "daman" in state]


['andaman and nicobar islands', 'dadra and nagar haveli and daman and diu']

In [29]:
[state for state in sorted(bio_clean["state"].unique()) if "dadra" in state or "daman" in state]


['andaman and nicobar islands', 'dadra and nagar haveli and daman and diu']

In [30]:
VALID_STATES = set(enrol_clean["state"].unique())

len(VALID_STATES), sorted(VALID_STATES)



(36,
 ['andaman and nicobar islands',
  'andhra pradesh',
  'arunachal pradesh',
  'assam',
  'bihar',
  'chandigarh',
  'chhattisgarh',
  'dadra and nagar haveli and daman and diu',
  'delhi',
  'goa',
  'gujarat',
  'haryana',
  'himachal pradesh',
  'jammu and kashmir',
  'jharkhand',
  'karnataka',
  'kerala',
  'ladakh',
  'lakshadweep',
  'madhya pradesh',
  'maharashtra',
  'manipur',
  'meghalaya',
  'mizoram',
  'nagaland',
  'odisha',
  'puducherry',
  'punjab',
  'rajasthan',
  'sikkim',
  'tamil nadu',
  'telangana',
  'tripura',
  'uttar pradesh',
  'uttarakhand',
  'west bengal'])

In [31]:
invalid_demo_state_rows = demo_clean[~demo_clean["state"].isin(VALID_STATES)]

print("Number of DEMO rows with non-state values:", invalid_demo_state_rows.shape)


Number of DEMO rows with non-state values: (11, 6)


In [32]:
invalid_demo_state_rows["state"].value_counts()


state
darbhanga               2
jaipur                  2
balanagar               2
madanapalle             2
nagpur                  1
raja annamalai puram    1
puttenahalli            1
Name: count, dtype: int64

In [33]:
invalid_demo_state_rows.head(10)


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
33255,22-12-2025,nagpur,Near Uday nagar NIT garden,440024,0,1
57984,26-12-2025,raja annamalai puram,Near Dhyana Ashram,600028,0,1
294631,16-12-2025,darbhanga,Near University Thana,846004,0,1
404084,16-12-2025,darbhanga,Near University Thana,846004,0,1
413049,19-12-2025,puttenahalli,5th cross,560078,0,1
803143,13-12-2025,balanagar,IDPL COLONY,500037,0,1
883975,26-12-2025,jaipur,Near meera hospital,302016,0,1
886706,27-12-2025,madanapalle,Kadiri Road,517325,0,1
1628706,13-12-2025,balanagar,IDPL COLONY,500037,0,1
1682463,26-12-2025,jaipur,Near meera hospital,302016,0,1


In [34]:
demo_clean_v2 = demo_clean[demo_clean["state"].isin(VALID_STATES)].copy()


In [35]:
print("DEMO before:", demo_clean.shape)
print("DEMO after :", demo_clean_v2.shape)


DEMO before: (2071698, 6)
DEMO after : (2071687, 6)


In [36]:
demo_clean_v2[~demo_clean_v2["state"].isin(VALID_STATES)].shape


(0, 6)

In [37]:
demo_clean = demo_clean_v2


In [38]:
import re

def normalize_district_text(x):
    if pd.isna(x):
        return x
    x = x.lower()
    x = x.replace("&", "and")
    x = re.sub(r"[^a-z0-9\s]", " ", x)
    x = " ".join(x.split())
    return x


In [39]:
for df_raw in [enrol_clean, demo_clean, bio_clean]:
    df_raw["district_norm"] = df_raw["district"].apply(normalize_district_text)


In [40]:
enrol_clean[["state", "district", "district_norm"]].head(10)


Unnamed: 0,state,district,district_norm
0,meghalaya,East Khasi Hills,east khasi hills
1,karnataka,Bengaluru Urban,bengaluru urban
2,uttar pradesh,Kanpur Nagar,kanpur nagar
3,uttar pradesh,Aligarh,aligarh
4,karnataka,Bengaluru Urban,bengaluru urban
5,bihar,Sitamarhi,sitamarhi
6,bihar,Sitamarhi,sitamarhi
7,uttar pradesh,Bahraich,bahraich
8,uttar pradesh,Firozabad,firozabad
9,bihar,Purbi Champaran,purbi champaran


In [41]:
cg_enrol = enrol_clean[enrol_clean["state"] == "chhattisgarh"].copy()
cg_enrol.shape


(18550, 8)

In [42]:
cg_enrol[["district", "district_norm"]].drop_duplicates().sort_values("district_norm")


Unnamed: 0,district,district_norm
6936,Balod,balod
1453,Baloda Bazar,baloda bazar
2162,Balrampur,balrampur
3757,Bastar,bastar
3758,Bemetara,bemetara
146,Bijapur,bijapur
3086,Bilaspur,bilaspur
11027,Dakshin Bastar Dantewada,dakshin bastar dantewada
14186,Dantewada,dantewada
6952,Dhamtari,dhamtari


In [43]:
cg_enrol["district_norm"].value_counts().sort_values(ascending=False)


district_norm
raipur                             1442
bilaspur                           1340
janjgir champa                     1146
durg                               1034
baloda bazar                       1023
korba                               891
raigarh                             820
rajnandgaon                         746
jashpur                             710
kanker                              601
surguja                             593
bastar                              589
surajpur                            574
gariyaband                          531
balod                               493
bemetara                            493
koriya                              487
dhamtari                            482
balrampur                           479
mahasamund                          461
uttar bastar kanker                 424
kondagaon                           369
mungeli                             352
bijapur                             341
dakshin bastar dantewada  

In [44]:
from collections import defaultdict

cg_district_pincodes = (
    cg_enrol
    .groupby("district_norm")["pincode"]
    .apply(lambda x: sorted(x.unique().tolist()))
)

cg_district_pincodes


district_norm
balod                              [491221, 491222, 491223, 491225, 491226, 49122...
baloda bazar                       [492112, 493101, 493113, 493114, 493118, 49319...
balrampur                          [497001, 497118, 497119, 497220, 497223, 49722...
bastar                             [494001, 494005, 494010, 494115, 494221, 49422...
bemetara                           [490036, 491331, 491332, 491335, 491337, 49133...
bijapur                             [494441, 494444, 494446, 494447, 494448, 494450]
bilaspur                           [495001, 495004, 495006, 495009, 495112, 49511...
dakshin bastar dantewada           [494111, 494115, 494441, 494442, 494449, 49455...
dantewada                          [494111, 494115, 494441, 494449, 494551, 49455...
dhamtari                           [492109, 493661, 493662, 493663, 493770, 49377...
durg                               [490001, 490006, 490009, 490011, 490020, 49002...
gariyaband                         [491558, 491661,

In [45]:
cg_district_names = cg_district_pincodes.index.tolist()
cg_district_names


['balod',
 'baloda bazar',
 'balrampur',
 'bastar',
 'bemetara',
 'bijapur',
 'bilaspur',
 'dakshin bastar dantewada',
 'dantewada',
 'dhamtari',
 'durg',
 'gariyaband',
 'gaurela pendra marwahi',
 'gaurella pendra marwahi',
 'janjgir champa',
 'jashpur',
 'kabeerdham',
 'kanker',
 'kawardha',
 'khairagarh chhuikhadan gandai',
 'kondagaon',
 'korba',
 'koriya',
 'mahasamund',
 'manendragarh chirmiri bharatpur',
 'mohalla manpur ambagarh chowki',
 'mohla manpur ambagarh chouki',
 'mungeli',
 'narayanpur',
 'raigarh',
 'raipur',
 'rajnandgaon',
 'sakti',
 'sarangarh bilaigarh',
 'sukma',
 'surajpur',
 'surguja',
 'uttar bastar kanker']

In [46]:
CG_DISTRICT_CANONICAL_MAP = {
    # spelling variants
    ("chhattisgarh", "gaurella pendra marwahi"): "gaurela pendra marwahi",
    ("chhattisgarh", "mohalla manpur ambagarh chowki"): "mohla manpur ambagarh chowki",
    ("chhattisgarh", "mohla manpur ambagarh chouki"): "mohla manpur ambagarh chowki",

    # official aliases
    ("chhattisgarh", "kabeerdham"): "kawardha",
    ("chhattisgarh", "uttar bastar kanker"): "kanker",
}


In [47]:
def apply_cg_district_fix(row):
    key = (row["state"], row["district_norm"])
    return CG_DISTRICT_CANONICAL_MAP.get(key, row["district_norm"])

for df_raw in [enrol_clean, demo_clean, bio_clean]:
    mask = df_raw["state"] == "chhattisgarh"
    df_raw.loc[mask, "district_final"] = df_raw.loc[mask].apply(apply_cg_district_fix, axis=1)


In [48]:
sorted(
    enrol_clean[enrol_clean["state"] == "chhattisgarh"]["district_final"].unique()
)


['balod',
 'baloda bazar',
 'balrampur',
 'bastar',
 'bemetara',
 'bijapur',
 'bilaspur',
 'dakshin bastar dantewada',
 'dantewada',
 'dhamtari',
 'durg',
 'gariyaband',
 'gaurela pendra marwahi',
 'janjgir champa',
 'jashpur',
 'kanker',
 'kawardha',
 'khairagarh chhuikhadan gandai',
 'kondagaon',
 'korba',
 'koriya',
 'mahasamund',
 'manendragarh chirmiri bharatpur',
 'mohla manpur ambagarh chowki',
 'mungeli',
 'narayanpur',
 'raigarh',
 'raipur',
 'rajnandgaon',
 'sakti',
 'sarangarh bilaigarh',
 'sukma',
 'surajpur',
 'surguja']

In [49]:
up_enrol = enrol_clean[enrol_clean["state"] == "uttar pradesh"].copy()
up_enrol.shape


(110369, 9)

In [50]:
up_enrol[["district", "district_norm"]].drop_duplicates().sort_values("district_norm")


Unnamed: 0,district,district_norm
35,Agra,agra
3,Aligarh,aligarh
5960,Allahabad,allahabad
577,Ambedkar Nagar,ambedkar nagar
5975,Amethi,amethi
...,...,...
445,Sitapur,sitapur
6236,Sonbhadra,sonbhadra
463,Sultanpur,sultanpur
58,Unnao,unnao


In [51]:
up_enrol["district_norm"].value_counts().sort_values(ascending=False)


district_norm
azamgarh           2967
lucknow            2933
jaunpur            2689
allahabad          2626
kanpur nagar       2601
                   ... 
siddharth nagar       6
shravasti             4
raebareli             2
kushi nagar           1
bagpat                1
Name: count, Length: 88, dtype: int64

In [52]:
up_district_pincodes = (
    up_enrol
    .groupby("district_norm")["pincode"]
    .apply(lambda x: sorted(x.unique().tolist()))
)

up_district_pincodes


district_norm
agra              [282001, 282002, 282003, 282004, 282005, 28200...
aligarh           [202001, 202002, 202121, 202122, 202123, 20212...
allahabad         [211001, 211002, 211003, 211004, 211005, 21100...
ambedkar nagar    [224122, 224123, 224125, 224126, 224129, 22413...
amethi            [227405, 227406, 227407, 227408, 227409, 22741...
                                        ...                        
sitapur           [261001, 261121, 261125, 261131, 261135, 26113...
sonbhadra         [231205, 231206, 231207, 231208, 231209, 23121...
sultanpur         [222301, 222302, 222303, 227304, 227405, 22740...
unnao             [209308, 209506, 209801, 209821, 209825, 20982...
varanasi          [221001, 221002, 221003, 221004, 221005, 22100...
Name: pincode, Length: 88, dtype: object

In [53]:
sorted(up_district_pincodes.index.tolist())


['agra',
 'aligarh',
 'allahabad',
 'ambedkar nagar',
 'amethi',
 'amroha',
 'auraiya',
 'ayodhya',
 'azamgarh',
 'baghpat',
 'bagpat',
 'bahraich',
 'ballia',
 'balrampur',
 'banda',
 'bara banki',
 'barabanki',
 'bareilly',
 'basti',
 'bhadohi',
 'bijnor',
 'budaun',
 'bulandshahar',
 'bulandshahr',
 'chandauli',
 'chitrakoot',
 'deoria',
 'etah',
 'etawah',
 'faizabad',
 'farrukhabad',
 'fatehpur',
 'firozabad',
 'gautam buddha nagar',
 'ghaziabad',
 'ghazipur',
 'gonda',
 'gorakhpur',
 'hamirpur',
 'hapur',
 'hardoi',
 'hathras',
 'jalaun',
 'jaunpur',
 'jhansi',
 'jyotiba phule nagar',
 'kannauj',
 'kanpur dehat',
 'kanpur nagar',
 'kasganj',
 'kaushambi',
 'kheri',
 'kushi nagar',
 'kushinagar',
 'lalitpur',
 'lucknow',
 'maharajganj',
 'mahoba',
 'mahrajganj',
 'mainpuri',
 'mathura',
 'mau',
 'meerut',
 'mirzapur',
 'moradabad',
 'muzaffarnagar',
 'pilibhit',
 'pratapgarh',
 'prayagraj',
 'rae bareli',
 'raebareli',
 'rampur',
 'saharanpur',
 'sambhal',
 'sant kabir nagar',
 's

In [54]:
UP_DISTRICT_CANONICAL_MAP = {
    # spelling variants
    ("uttar pradesh", "bagpat"): "baghpat",
    ("uttar pradesh", "bulandshahar"): "bulandshahr",
    ("uttar pradesh", "bara banki"): "barabanki",
    ("uttar pradesh", "kushi nagar"): "kushinagar",
    ("uttar pradesh", "mahrajganj"): "maharajganj",
    ("uttar pradesh", "rae bareli"): "raebareli",
    ("uttar pradesh", "shrawasti"): "shravasti",
    ("uttar pradesh", "siddharth nagar"): "siddharthnagar",

    # official renames / aliases
    ("uttar pradesh", "allahabad"): "prayagraj",
    ("uttar pradesh", "faizabad"): "ayodhya",
    ("uttar pradesh", "bhadohi"): "sant ravidas nagar bhadohi",
    ("uttar pradesh", "sant ravidas nagar"): "sant ravidas nagar bhadohi",
    ("uttar pradesh", "jyotiba phule nagar"): "amroha",
}


In [55]:
def apply_up_district_fix(row):
    key = (row["state"], row["district_norm"])
    return UP_DISTRICT_CANONICAL_MAP.get(key, row["district_norm"])

for df_raw in [enrol_clean, demo_clean, bio_clean]:
    mask = df_raw["state"] == "uttar pradesh"
    df_raw.loc[mask, "district_final"] = df_raw.loc[mask].apply(
        apply_up_district_fix, axis=1
    )


In [56]:
sorted(
    enrol_clean[enrol_clean["state"] == "uttar pradesh"]["district_final"].unique()
)


['agra',
 'aligarh',
 'ambedkar nagar',
 'amethi',
 'amroha',
 'auraiya',
 'ayodhya',
 'azamgarh',
 'baghpat',
 'bahraich',
 'ballia',
 'balrampur',
 'banda',
 'barabanki',
 'bareilly',
 'basti',
 'bijnor',
 'budaun',
 'bulandshahr',
 'chandauli',
 'chitrakoot',
 'deoria',
 'etah',
 'etawah',
 'farrukhabad',
 'fatehpur',
 'firozabad',
 'gautam buddha nagar',
 'ghaziabad',
 'ghazipur',
 'gonda',
 'gorakhpur',
 'hamirpur',
 'hapur',
 'hardoi',
 'hathras',
 'jalaun',
 'jaunpur',
 'jhansi',
 'kannauj',
 'kanpur dehat',
 'kanpur nagar',
 'kasganj',
 'kaushambi',
 'kheri',
 'kushinagar',
 'lalitpur',
 'lucknow',
 'maharajganj',
 'mahoba',
 'mainpuri',
 'mathura',
 'mau',
 'meerut',
 'mirzapur',
 'moradabad',
 'muzaffarnagar',
 'pilibhit',
 'pratapgarh',
 'prayagraj',
 'raebareli',
 'rampur',
 'saharanpur',
 'sambhal',
 'sant kabir nagar',
 'sant ravidas nagar bhadohi',
 'shahjahanpur',
 'shamli',
 'shravasti',
 'siddharthnagar',
 'sitapur',
 'sonbhadra',
 'sultanpur',
 'unnao',
 'varanasi']

In [57]:
mh_enrol = enrol_clean[enrol_clean["state"] == "maharashtra"].copy()
mh_enrol.shape


(77191, 9)

In [58]:
mh_enrol[["district", "district_norm"]].drop_duplicates().sort_values("district_norm")


Unnamed: 0,district,district_norm
869355,Ahilyanagar,ahilyanagar
204,Ahmadnagar,ahmadnagar
4680,Ahmed Nagar,ahmed nagar
2256,Ahmednagar,ahmednagar
2630,Akola,akola
1732,Amravati,amravati
12,Aurangabad,aurangabad
413,Beed,beed
4697,Bhandara,bhandara
12019,Bid,bid


In [59]:
mh_enrol["district_norm"].value_counts().sort_values(ascending=False)


district_norm
pune                         6663
thane                        4236
mumbai                       4183
nashik                       3739
ahmadnagar                   3723
kolhapur                     3101
nagpur                       3043
solapur                      2994
mumbai suburban              2911
jalgaon                      2836
satara                       2369
aurangabad                   2229
sangli                       2113
nanded                       2044
raigarh                      1920
amravati                     1902
palghar                      1646
yavatmal                     1591
buldhana                     1553
ratnagiri                    1530
dhule                        1484
beed                         1400
latur                        1361
chandrapur                   1275
nandurbar                    1273
mumbai city                  1250
akola                        1186
osmanabad                    1160
jalna                        1133


In [60]:
mh_district_pincodes = (
    mh_enrol
    .groupby("district_norm")["pincode"]
    .apply(lambda x: sorted(x.unique().tolist()))
)

mh_district_pincodes


district_norm
ahilyanagar                  [413725, 413726, 413738, 413739, 414001, 41411...
ahmadnagar                   [400037, 412210, 413201, 413204, 413205, 41370...
ahmed nagar                  [413201, 413205, 413703, 413706, 413709, 41371...
ahmednagar                                                    [413705, 414105]
akola                        [444001, 444002, 444003, 444004, 444005, 44400...
amravati                     [442302, 444601, 444602, 444603, 444604, 44460...
aurangabad                   [400012, 400033, 423701, 423702, 423703, 43100...
beed                         [413207, 413229, 413249, 414202, 414203, 41420...
bhandara                     [441104, 441106, 441802, 441803, 441804, 44180...
bid                          [413229, 413249, 414202, 414203, 414204, 41420...
buldana                      [443001, 443101, 443102, 443103, 443106, 44311...
buldhana                     [443001, 443101, 443102, 443103, 443104, 44310...
chandrapur                   [441205, 

In [61]:
sorted(mh_district_pincodes.index.tolist())


['ahilyanagar',
 'ahmadnagar',
 'ahmed nagar',
 'ahmednagar',
 'akola',
 'amravati',
 'aurangabad',
 'beed',
 'bhandara',
 'bid',
 'buldana',
 'buldhana',
 'chandrapur',
 'chatrapati sambhaji nagar',
 'chhatrapati sambhajinagar',
 'dharashiv',
 'dhule',
 'gadchiroli',
 'gondia',
 'gondiya',
 'hingoli',
 'jalgaon',
 'jalna',
 'kolhapur',
 'latur',
 'mumbai',
 'mumbai city',
 'mumbai sub urban',
 'mumbai suburban',
 'nagpur',
 'nanded',
 'nandurbar',
 'nashik',
 'osmanabad',
 'palghar',
 'parbhani',
 'pune',
 'raigad',
 'raigarh',
 'raigarh mh',
 'ratnagiri',
 'sangli',
 'satara',
 'sindhudurg',
 'solapur',
 'thane',
 'wardha',
 'washim',
 'yavatmal']

In [62]:
MH_DISTRICT_CANONICAL_MAP = {
    # Ahmednagar variants
    ("maharashtra", "ahmadnagar"): "ahmednagar",
    ("maharashtra", "ahmed nagar"): "ahmednagar",
    ("maharashtra", "ahilyanagar"): "ahmednagar",

    # Beed
    ("maharashtra", "bid"): "beed",

    # Buldhana
    ("maharashtra", "buldana"): "buldhana",

    # Gondia
    ("maharashtra", "gondiya"): "gondia",

    # Mumbai suburban
    ("maharashtra", "mumbai sub urban"): "mumbai suburban",

    # Aurangabad rename
    ("maharashtra", "aurangabad"): "chhatrapati sambhajinagar",
    ("maharashtra", "chatrapati sambhaji nagar"): "chhatrapati sambhajinagar",

    # Osmanabad rename
    ("maharashtra", "osmanabad"): "dharashiv",
}


In [63]:
def apply_mh_district_fix(row):
    key = (row["state"], row["district_norm"])
    return MH_DISTRICT_CANONICAL_MAP.get(key, row["district_norm"])

for df_raw in [enrol_clean, demo_clean, bio_clean]:
    mask = df_raw["state"] == "maharashtra"
    df_raw.loc[mask, "district_final"] = df_raw.loc[mask].apply(
        apply_mh_district_fix, axis=1
    )


In [64]:
sorted(
    enrol_clean[enrol_clean["state"] == "maharashtra"]["district_final"].unique()
)


['ahmednagar',
 'akola',
 'amravati',
 'beed',
 'bhandara',
 'buldhana',
 'chandrapur',
 'chhatrapati sambhajinagar',
 'dharashiv',
 'dhule',
 'gadchiroli',
 'gondia',
 'hingoli',
 'jalgaon',
 'jalna',
 'kolhapur',
 'latur',
 'mumbai',
 'mumbai city',
 'mumbai suburban',
 'nagpur',
 'nanded',
 'nandurbar',
 'nashik',
 'palghar',
 'parbhani',
 'pune',
 'raigad',
 'raigarh',
 'raigarh mh',
 'ratnagiri',
 'sangli',
 'satara',
 'sindhudurg',
 'solapur',
 'thane',
 'wardha',
 'washim',
 'yavatmal']

In [65]:
rj_enrol = enrol_clean[enrol_clean["state"] == "rajasthan"].copy()
rj_enrol.shape


(56159, 9)

In [66]:
rj_enrol[["district", "district_norm"]].drop_duplicates().sort_values("district_norm")


Unnamed: 0,district,district_norm
810,Ajmer,ajmer
571,Alwar,alwar
833180,Balotra,balotra
5292,Banswara,banswara
682,Baran,baran
1864,Barmer,barmer
990453,Beawar,beawar
519,Bharatpur,bharatpur
1778,Bhilwara,bhilwara
468,Bikaner,bikaner


In [67]:
rj_enrol["district_norm"].value_counts().sort_values(ascending=False)


district_norm
jaipur              4670
sikar               2861
alwar               2720
nagaur              2672
ajmer               2528
jodhpur             2471
jhunjhunun          2419
udaipur             2369
pali                2297
bharatpur           2081
bhilwara            1932
barmer              1742
kota                1681
bikaner             1539
churu               1519
dausa               1474
dungarpur           1464
karauli             1420
chittorgarh         1396
sawai madhopur      1381
jalor               1313
ganganagar          1311
rajsamand           1247
jhalawar            1132
sirohi              1076
banswara            1051
hanumangarh          935
bundi                934
tonk                 930
dholpur              865
baran                827
pratapgarh           704
jaisalmer            589
jhunjhunu            241
chittaurgarh         237
dhaulpur              78
deeg                  43
jalore                 5
didwana kuchaman       2
balotra    

In [68]:
rj_district_pincodes = (
    rj_enrol
    .groupby("district_norm")["pincode"]
    .apply(lambda x: sorted(x.unique().tolist()))
)

rj_district_pincodes


district_norm
ajmer               [305001, 305002, 305003, 305004, 305005, 30500...
alwar               [301001, 301002, 301018, 301019, 301020, 30102...
balotra                                                      [344022]
banswara            [327001, 327021, 327022, 327023, 327024, 32702...
baran               [325004, 325202, 325205, 325206, 325207, 32520...
barmer              [344001, 344011, 344012, 344021, 344022, 34402...
beawar                                                       [305901]
bharatpur           [321001, 321021, 321022, 321023, 321024, 32102...
bhilwara            [311001, 311011, 311013, 311021, 311022, 31102...
bikaner             [331801, 331803, 331811, 334001, 334003, 33400...
bundi               [323001, 323021, 323022, 323023, 323024, 32302...
chittaurgarh        [312001, 312021, 312022, 312024, 312025, 31202...
chittorgarh         [312001, 312021, 312022, 312023, 312024, 31202...
churu               [331001, 331021, 331022, 331023, 331029, 33103...
dausa 

In [69]:
sorted(rj_district_pincodes.index.tolist())


['ajmer',
 'alwar',
 'balotra',
 'banswara',
 'baran',
 'barmer',
 'beawar',
 'bharatpur',
 'bhilwara',
 'bikaner',
 'bundi',
 'chittaurgarh',
 'chittorgarh',
 'churu',
 'dausa',
 'deeg',
 'dhaulpur',
 'dholpur',
 'didwana kuchaman',
 'dungarpur',
 'ganganagar',
 'hanumangarh',
 'jaipur',
 'jaisalmer',
 'jalor',
 'jalore',
 'jhalawar',
 'jhunjhunu',
 'jhunjhunun',
 'jodhpur',
 'karauli',
 'kota',
 'nagaur',
 'pali',
 'pratapgarh',
 'rajsamand',
 'salumbar',
 'sawai madhopur',
 'sikar',
 'sirohi',
 'tonk',
 'udaipur']

In [70]:
RJ_DISTRICT_CANONICAL_MAP = {
    # spelling variants
    ("rajasthan", "chittaurgarh"): "chittorgarh",
    ("rajasthan", "dhaulpur"): "dholpur",
    ("rajasthan", "jalor"): "jalore",
    ("rajasthan", "jhunjhunun"): "jhunjhunu",
}


In [71]:
def apply_rj_district_fix(row):
    key = (row["state"], row["district_norm"])
    return RJ_DISTRICT_CANONICAL_MAP.get(key, row["district_norm"])

for df_raw in [enrol_clean, demo_clean, bio_clean]:
    mask = df_raw["state"] == "rajasthan"
    df_raw.loc[mask, "district_final"] = df_raw.loc[mask].apply(
        apply_rj_district_fix, axis=1
    )


In [72]:
sorted(
    enrol_clean[enrol_clean["state"] == "rajasthan"]["district_final"].unique()
)


['ajmer',
 'alwar',
 'balotra',
 'banswara',
 'baran',
 'barmer',
 'beawar',
 'bharatpur',
 'bhilwara',
 'bikaner',
 'bundi',
 'chittorgarh',
 'churu',
 'dausa',
 'deeg',
 'dholpur',
 'didwana kuchaman',
 'dungarpur',
 'ganganagar',
 'hanumangarh',
 'jaipur',
 'jaisalmer',
 'jalore',
 'jhalawar',
 'jhunjhunu',
 'jodhpur',
 'karauli',
 'kota',
 'nagaur',
 'pali',
 'pratapgarh',
 'rajsamand',
 'salumbar',
 'sawai madhopur',
 'sikar',
 'sirohi',
 'tonk',
 'udaipur']

In [73]:
tn_enrol = enrol_clean[enrol_clean["state"] == "tamil nadu"].copy()
tn_enrol.shape


(92552, 9)

In [74]:
tn_enrol[["district", "district_norm"]].drop_duplicates().sort_values("district_norm")


Unnamed: 0,district,district_norm
5453,Ariyalur,ariyalur
2227,Chengalpattu,chengalpattu
2318,Chennai,chennai
1498,Coimbatore,coimbatore
1388,Cuddalore,cuddalore
5495,Dharmapuri,dharmapuri
5499,Dindigul,dindigul
5508,Erode,erode
1652,Kallakurichi,kallakurichi
330,Kancheepuram,kancheepuram


In [75]:
tn_enrol["district_norm"].value_counts().sort_values(ascending=False)


district_norm
kancheepuram       4421
tirunelveli        4420
vellore            4102
coimbatore         4092
salem              4058
tiruchirappalli    3769
tiruvallur         3694
thanjavur          3674
chennai            3300
cuddalore          3245
villupuram         3226
tiruvannamalai     3108
madurai            2904
viluppuram         2818
thoothukkudi       2574
erode              2553
tiruppur           2454
pudukkottai        2450
nagapattinam       2448
dindigul           2426
thiruvarur         2338
virudhunagar       2180
namakkal           2161
sivaganga          2146
ramanathapuram     2027
krishnagiri        1756
dharmapuri         1644
karur              1535
kanniyakumari      1534
kanyakumari        1449
theni              1323
ariyalur           1084
the nilgiris       1031
perambalur         1009
tenkasi             888
chengalpattu        686
tirupattur          681
kallakurichi        593
mayiladuthurai      421
thiruvallur         223
ranipet              96
ka

In [76]:
tn_district_pincodes = (
    tn_enrol
    .groupby("district_norm")["pincode"]
    .apply(lambda x: sorted(x.unique().tolist()))
)

tn_district_pincodes


district_norm
ariyalur           [608703, 608901, 612901, 612902, 612903, 61290...
chengalpattu       [600048, 600064, 600070, 600073, 600100, 60012...
chennai            [600001, 600002, 600003, 600004, 600005, 60000...
coimbatore         [638401, 638459, 638462, 641001, 641002, 64100...
cuddalore          [605007, 605106, 605110, 606001, 606003, 60610...
dharmapuri         [635001, 635101, 635104, 635106, 635109, 63511...
dindigul           [624001, 624002, 624003, 624004, 624005, 62410...
erode              [638001, 638002, 638003, 638004, 638005, 63800...
kallakurichi       [605102, 605202, 605203, 605401, 605702, 60575...
kancheepuram       [600016, 600027, 600041, 600043, 600044, 60004...
kanchipuram                         [600069, 600073, 631501, 631502]
kanniyakumari      [629001, 629002, 629003, 629004, 629101, 62910...
kanyakumari        [629001, 629002, 629003, 629004, 629101, 62910...
karur              [621301, 621311, 621313, 621315, 638151, 63900...
krishnagiri        [

In [77]:
sorted(tn_district_pincodes.index.tolist())


['ariyalur',
 'chengalpattu',
 'chennai',
 'coimbatore',
 'cuddalore',
 'dharmapuri',
 'dindigul',
 'erode',
 'kallakurichi',
 'kancheepuram',
 'kanchipuram',
 'kanniyakumari',
 'kanyakumari',
 'karur',
 'krishnagiri',
 'madurai',
 'mayiladuthurai',
 'nagapattinam',
 'namakkal',
 'perambalur',
 'pudukkottai',
 'ramanathapuram',
 'ranipet',
 'salem',
 'sivaganga',
 'tenkasi',
 'thanjavur',
 'the nilgiris',
 'theni',
 'thiruvallur',
 'thiruvarur',
 'thoothukkudi',
 'tiruchirappalli',
 'tirunelveli',
 'tirupathur',
 'tirupattur',
 'tiruppur',
 'tiruvallur',
 'tiruvannamalai',
 'tiruvarur',
 'tuticorin',
 'vellore',
 'villupuram',
 'viluppuram',
 'virudhunagar']

In [78]:
TN_DISTRICT_CANONICAL_MAP = {
    # spelling variants
    ("tamil nadu", "kancheepuram"): "kanchipuram",
    ("tamil nadu", "kanniyakumari"): "kanyakumari",
    ("tamil nadu", "tirupathur"): "tirupattur",
    ("tamil nadu", "thiruvallur"): "tiruvallur",
    ("tamil nadu", "thiruvarur"): "tiruvarur",
    ("tamil nadu", "tuticorin"): "thoothukkudi",
}


In [79]:
def apply_tn_district_fix(row):
    key = (row["state"], row["district_norm"])
    return TN_DISTRICT_CANONICAL_MAP.get(key, row["district_norm"])

for df_raw in [enrol_clean, demo_clean, bio_clean]:
    mask = df_raw["state"] == "tamil nadu"
    df_raw.loc[mask, "district_final"] = df_raw.loc[mask].apply(
        apply_tn_district_fix, axis=1
    )


In [80]:
sorted(
    enrol_clean[enrol_clean["state"] == "tamil nadu"]["district_final"].unique()
)


['ariyalur',
 'chengalpattu',
 'chennai',
 'coimbatore',
 'cuddalore',
 'dharmapuri',
 'dindigul',
 'erode',
 'kallakurichi',
 'kanchipuram',
 'kanyakumari',
 'karur',
 'krishnagiri',
 'madurai',
 'mayiladuthurai',
 'nagapattinam',
 'namakkal',
 'perambalur',
 'pudukkottai',
 'ramanathapuram',
 'ranipet',
 'salem',
 'sivaganga',
 'tenkasi',
 'thanjavur',
 'the nilgiris',
 'theni',
 'thoothukkudi',
 'tiruchirappalli',
 'tirunelveli',
 'tirupattur',
 'tiruppur',
 'tiruvallur',
 'tiruvannamalai',
 'tiruvarur',
 'vellore',
 'villupuram',
 'viluppuram',
 'virudhunagar']

In [81]:
STATE_DISTRICT_SELECTION = {
    "uttar pradesh": [
        "lucknow",
        "kanpur nagar",
        "gorakhpur",
        "prayagraj",
        "varanasi",
    ],
    "maharashtra": [
        "mumbai suburban",
        "pune",
        "nagpur",
        "thane",
        "nashik",
    ],
    "tamil nadu": [
        "chennai",
        "coimbatore",
        "madurai",
        "tiruchirappalli",
        "salem",
    ],
    "rajasthan": [
        "jaipur",
        "jodhpur",
        "kota",
        "udaipur",
        "bikaner",
    ],
    "chhattisgarh": [
        "raipur",
        "bilaspur",
        "durg",
        "korba",
        "raigarh",
    ],
}


In [82]:
def filter_states_districts(df, selection_map):
    return df[
        df["state"].isin(selection_map.keys())
        & df.apply(
            lambda r: r["district_final"] in selection_map[r["state"]],
            axis=1
        )
    ].copy()


In [85]:
def filter_states_districts(df, selection_map):
    df = df[df["state"].isin(selection_map.keys())].copy()
    
    mask = df.apply(
        lambda r: r["district_final"] in selection_map[r["state"]],
        axis=1
    )
    return df[mask].copy()


In [88]:
enrol_subset = filter_states_districts(enrol_clean, STATE_DISTRICT_SELECTION)
demo_subset  = filter_states_districts(demo_clean, STATE_DISTRICT_SELECTION)
bio_subset   = filter_states_districts(bio_clean, STATE_DISTRICT_SELECTION)

print("ENROL:", enrol_subset.shape)
print("DEMO :", demo_subset.shape)
print("BIO  :", bio_subset.shape)


ENROL: (71177, 9)
DEMO : (125035, 8)
BIO  : (114727, 8)


In [89]:
enrol_subset.groupby(["state", "district_final"]).size()


state          district_final 
chhattisgarh   bilaspur           1340
               durg               1034
               korba               891
               raigarh             820
               raipur             1442
maharashtra    mumbai suburban    3251
               nagpur             3043
               nashik             3739
               pune               6663
               thane              4236
rajasthan      bikaner            1539
               jaipur             4670
               jodhpur            2471
               kota               1681
               udaipur            2369
tamil nadu     chennai            3300
               coimbatore         4092
               madurai            2904
               salem              4058
               tiruchirappalli    3769
uttar pradesh  gorakhpur          2555
               kanpur nagar       2601
               lucknow            2933
               prayagraj          3651
               varanasi          

In [90]:
import os

output_dir = "../output/tables"
os.makedirs(output_dir, exist_ok=True)


In [91]:
enrol_subset.to_csv(
    f"{output_dir}/enrolment_clean_5states_25districts_2025.csv",
    index=False
)

demo_subset.to_csv(
    f"{output_dir}/demographic_clean_5states_25districts_2025.csv",
    index=False
)

bio_subset.to_csv(
    f"{output_dir}/biometric_clean_5states_25districts_2025.csv",
    index=False
)


In [92]:
for state in STATE_DISTRICT_SELECTION.keys():
    enrol_subset[enrol_subset["state"] == state].to_csv(
        f"{output_dir}/enrolment_{state.replace(' ', '_')}_2025.csv",
        index=False
    )
    demo_subset[demo_subset["state"] == state].to_csv(
        f"{output_dir}/demographic_{state.replace(' ', '_')}_2025.csv",
        index=False
    )
    bio_subset[bio_subset["state"] == state].to_csv(
        f"{output_dir}/biometric_{state.replace(' ', '_')}_2025.csv",
        index=False
    )
