In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
def load_all_csvs(path):
    files = glob.glob(path)
    dfs = []
    for f in files:
        print("Loading:", f)
        dfs.append(pd.read_csv(f))
    return pd.concat(dfs, ignore_index=True)

enrol = load_all_csvs("../data/enrolment/*.csv")
demo  = load_all_csvs("../data/demographic/*.csv")
bio   = load_all_csvs("../data/biometric/*.csv")


Loading: ../data/enrolment\enrol_01.csv
Loading: ../data/enrolment\enrol_02.csv
Loading: ../data/enrolment\enrol_03.csv
Loading: ../data/demographic\demo_01.csv
Loading: ../data/demographic\demo_02.csv
Loading: ../data/demographic\demo_03.csv
Loading: ../data/demographic\demo_04.csv
Loading: ../data/demographic\demo_05.csv
Loading: ../data/biometric\bio_01.csv
Loading: ../data/biometric\bio_02.csv
Loading: ../data/biometric\bio_03.csv
Loading: ../data/biometric\bio_04.csv


In [3]:
print("Enrolment shape:", enrol.shape)
print("Demographic shape:", demo.shape)
print("Biometric shape:", bio.shape)


Enrolment shape: (1006029, 7)
Demographic shape: (2071700, 6)
Biometric shape: (1861108, 6)


In [4]:
print("Enrolment columns:")
print(enrol.columns)

print("\nDemographic columns:")
print(demo.columns)

print("\nBiometric columns:")
print(bio.columns)


Enrolment columns:
Index(['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17',
       'age_18_greater'],
      dtype='object')

Demographic columns:
Index(['date', 'state', 'district', 'pincode', 'demo_age_5_17',
       'demo_age_17_'],
      dtype='object')

Biometric columns:
Index(['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_'], dtype='object')


In [7]:
enrol['date'] = pd.to_datetime(enrol['date'], errors='coerce')
demo['date']  = pd.to_datetime(demo['date'], errors='coerce')
bio['date']   = pd.to_datetime(bio['date'], errors='coerce')

enrol['year'] = enrol['date'].dt.year
demo['year']  = demo['date'].dt.year
bio['year']   = bio['date'].dt.year



In [8]:
print("Enrolment years:", enrol['year'].value_counts())
print("Demographic years:", demo['year'].value_counts())
print("Biometric years:", bio['year'].value_counts())


Enrolment years: year
2025.0    323791
Name: count, dtype: int64
Demographic years: year
2025    2071700
Name: count, dtype: int64
Biometric years: year
2025.0    917008
Name: count, dtype: int64


In [9]:
for df in [enrol, demo, bio]:
    df['state'] = df['state'].astype(str).str.strip().str.lower()
    df['district'] = df['district'].astype(str).str.strip().str.lower()


In [10]:
def print_state_summary(df, name):
    print("\n" + name)
    print("-" * 40)
    vc = df['state'].value_counts().sort_index()
    print("Unique states:", vc.shape[0])
    print(vc)

print_state_summary(enrol, "ENROLMENT")
print_state_summary(demo,  "DEMOGRAPHIC")
print_state_summary(bio,   "BIOMETRIC")



ENROLMENT
----------------------------------------
Unique states: 49
state
100000                                              22
andaman & nicobar islands                          103
andaman and nicobar islands                        289
andhra pradesh                                   65663
arunachal pradesh                                 1601
assam                                            31827
bihar                                            60567
chandigarh                                         859
chhattisgarh                                     18550
dadra & nagar haveli                                24
dadra and nagar haveli                             162
dadra and nagar haveli and daman and diu           116
daman & diu                                         20
daman and diu                                       92
delhi                                             6804
goa                                               1527
gujarat                                     

In [11]:
CANONICAL_STATES = {
    'andaman and nicobar islands',
    'andhra pradesh',
    'arunachal pradesh',
    'assam',
    'bihar',
    'chandigarh',
    'chhattisgarh',
    'dadra and nagar haveli and daman and diu',
    'delhi',
    'goa',
    'gujarat',
    'haryana',
    'himachal pradesh',
    'jammu and kashmir',
    'jharkhand',
    'karnataka',
    'kerala',
    'ladakh',
    'lakshadweep',
    'madhya pradesh',
    'maharashtra',
    'manipur',
    'meghalaya',
    'mizoram',
    'nagaland',
    'odisha',
    'puducherry',
    'punjab',
    'rajasthan',
    'sikkim',
    'tamil nadu',
    'telangana',
    'tripura',
    'uttar pradesh',
    'uttarakhand',
    'west bengal'
}

len(CANONICAL_STATES)



36

In [12]:
STATE_ALIAS_MAP = {
    # Andaman
    'andaman & nicobar islands': 'andaman and nicobar islands',

    # J&K
    'jammu & kashmir': 'jammu and kashmir',

    # Odisha
    'orissa': 'odisha',

    # West Bengal
    'west bangal': 'west bengal',
    'westbengal': 'west bengal',
    'west  bengal': 'west bengal',
    'west bengli': 'west bengal',

    # UT merger
    'dadra & nagar haveli': 'dadra and nagar haveli and daman and diu',
    'daman & diu': 'dadra and nagar haveli and daman and diu',
    'daman and diu': 'dadra and nagar haveli and daman and diu',

    # Puducherry
    'pondicherry': 'puducherry',

    # Uttarakhand old name
    'uttaranchal': 'uttarakhand'
}


In [13]:
def normalize_and_map_state(s):
    if not isinstance(s, str):
        return None
    s = s.strip().lower()
    s = STATE_ALIAS_MAP.get(s, s)
    return s


In [14]:
for df in [enrol, demo, bio]:
    df['state'] = df['state'].apply(normalize_and_map_state)


In [15]:
for df in [enrol, demo, bio]:
    df.dropna(subset=['state'], inplace=True)
    df = df[df['state'].isin(CANONICAL_STATES)]


In [16]:
enrol = enrol[enrol['state'].isin(CANONICAL_STATES)]
demo  = demo[demo['state'].isin(CANONICAL_STATES)]
bio   = bio[bio['state'].isin(CANONICAL_STATES)]


In [17]:
print("ENROLMENT STATES:", sorted(enrol['state'].unique()))
print("DEMOGRAPHIC STATES:", sorted(demo['state'].unique()))
print("BIOMETRIC STATES:", sorted(bio['state'].unique()))

print(
    enrol['state'].nunique(),
    demo['state'].nunique(),
    bio['state'].nunique()
)


ENROLMENT STATES: ['andaman and nicobar islands', 'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chandigarh', 'chhattisgarh', 'dadra and nagar haveli and daman and diu', 'delhi', 'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jammu and kashmir', 'jharkhand', 'karnataka', 'kerala', 'ladakh', 'lakshadweep', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland', 'odisha', 'puducherry', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu', 'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'west bengal']
DEMOGRAPHIC STATES: ['andaman and nicobar islands', 'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chandigarh', 'chhattisgarh', 'dadra and nagar haveli and daman and diu', 'delhi', 'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jammu and kashmir', 'jharkhand', 'karnataka', 'kerala', 'ladakh', 'lakshadweep', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland', 'odisha', 'puducherry', 'punjab', 'rajasthan', 'sikkim

In [18]:
STATE_TO_ANALYZE = 'uttar pradesh'


In [19]:
enrol_up_districts = (
    enrol[enrol['state'] == STATE_TO_ANALYZE]['district']
    .value_counts()
    .sort_index()
)

print("ENROLMENT – Uttar Pradesh districts")
print("Count:", enrol_up_districts.shape[0])
print(enrol_up_districts)


ENROLMENT – Uttar Pradesh districts
Count: 89
district
agra              1979
aligarh           2055
allahabad         2626
ambedkar nagar    1743
amethi            1806
                  ... 
sitapur           1720
sonbhadra          982
sultanpur         1801
unnao             1501
varanasi          2125
Name: count, Length: 89, dtype: int64


In [20]:
demo_up_districts = (
    demo[demo['state'] == STATE_TO_ANALYZE]['district']
    .value_counts()
    .sort_index()
)

print("DEMOGRAPHIC – Uttar Pradesh districts")
print("Count:", demo_up_districts.shape[0])
print(demo_up_districts)


DEMOGRAPHIC – Uttar Pradesh districts
Count: 90
district
agra              2422
aligarh           3104
allahabad         3704
ambedkar nagar    2615
amethi            3047
                  ... 
sitapur           2196
sonbhadra         1818
sultanpur         2808
unnao             2473
varanasi          3222
Name: count, Length: 90, dtype: int64


In [21]:
bio_up_districts = (
    bio[bio['state'] == STATE_TO_ANALYZE]['district']
    .value_counts()
    .sort_index()
)

print("BIOMETRIC – Uttar Pradesh districts")
print("Count:", bio_up_districts.shape[0])
print(bio_up_districts)


BIOMETRIC – Uttar Pradesh districts
Count: 90
district
agra              2358
aligarh           2852
allahabad         3511
ambedkar nagar    2467
amethi            2919
                  ... 
sitapur           2146
sonbhadra         1617
sultanpur         2597
unnao             2183
varanasi          3027
Name: count, Length: 90, dtype: int64


In [22]:
enrol_set = set(enrol_up_districts.index)
demo_set  = set(demo_up_districts.index)
bio_set   = set(bio_up_districts.index)

print("Enrolment only:", sorted(enrol_set - demo_set - bio_set))
print("Demographic only:", sorted(demo_set - enrol_set - bio_set))
print("Biometric only:", sorted(bio_set - enrol_set - demo_set))

print("Common districts:", len(enrol_set & demo_set & bio_set))


Enrolment only: ['kushi nagar', 'shravasti', 'siddharth nagar']
Demographic only: ['baghpat *', 'chitrakoot *', 'jyotiba phule nagar *']
Biometric only: ['auraiya *', 'gautam buddha nagar *', 'mahoba *']
Common districts: 86


In [23]:
import re

def normalize_district(d):
    if not isinstance(d, str):
        return None
    d = d.lower().strip()
    d = re.sub(r'[^a-z\s]', '', d)   # remove *, numbers, symbols
    d = re.sub(r'\s+', ' ', d)       # collapse spaces
    return d


In [24]:
for df in [enrol, demo, bio]:
    df['district_norm'] = df['district'].apply(normalize_district)


In [25]:
enrol_up = enrol[enrol['state'] == 'uttar pradesh']
demo_up  = demo[demo['state'] == 'uttar pradesh']
bio_up   = bio[bio['state'] == 'uttar pradesh']

enrol_set = set(enrol_up['district_norm'])
demo_set  = set(demo_up['district_norm'])
bio_set   = set(bio_up['district_norm'])

print("Common districts after normalization:",
      len(enrol_set & demo_set & bio_set))

print("Still mismatched:",
      (enrol_set | demo_set | bio_set) - (enrol_set & demo_set & bio_set))


Common districts after normalization: 86
Still mismatched: {'mahoba ', 'kushi nagar', 'chitrakoot ', 'chandauli ', 'auraiya ', 'baghpat ', 'siddharth nagar', 'jyotiba phule nagar ', 'shravasti', 'gautam buddha nagar '}


In [26]:
# Build common (state, district_norm) pairs across all datasets

enrol_pairs = set(zip(enrol['state'], enrol['district_norm']))
demo_pairs  = set(zip(demo['state'], demo['district_norm']))
bio_pairs   = set(zip(bio['state'], bio['district_norm']))

COMMON_STATE_DISTRICTS = enrol_pairs & demo_pairs & bio_pairs

print("Total common state-district pairs:", len(COMMON_STATE_DISTRICTS))


Total common state-district pairs: 945


In [30]:
enrol = enrol[enrol.apply(
    lambda r: (r['state'], r['district_norm']) in COMMON_STATE_DISTRICTS,
    axis=1
)]

demo = demo[demo.apply(
    lambda r: (r['state'], r['district_norm']) in COMMON_STATE_DISTRICTS,
    axis=1
)]

bio = bio[bio.apply(
    lambda r: (r['state'], r['district_norm']) in COMMON_STATE_DISTRICTS,
    axis=1
)]


In [32]:
print(
    enrol[['state','district_norm']].drop_duplicates().shape,
    demo[['state','district_norm']].drop_duplicates().shape,
    bio[['state','district_norm']].drop_duplicates().shape
)


(945, 2) (945, 2) (945, 2)


In [33]:
enrol_agg = (
    enrol
    .groupby(['state', 'district_norm'], as_index=False)
    .agg({
        'age_0_5': 'sum',
        'age_5_17': 'sum',
        'age_18_greater': 'sum'
    })
)

enrol_agg.shape


(945, 5)

In [34]:
demo_agg = (
    demo
    .groupby(['state', 'district_norm'], as_index=False)
    .agg({
        'demo_age_5_17': 'sum',
        'demo_age_17_': 'sum'
    })
)

demo_agg.shape


(945, 4)

In [35]:
bio_agg = (
    bio
    .groupby(['state', 'district_norm'], as_index=False)
    .agg({
        'bio_age_5_17': 'sum',
        'bio_age_17_': 'sum'
    })
)

bio_agg.shape


(945, 4)

In [36]:
df = (
    enrol_agg
    .merge(demo_agg, on=['state', 'district_norm'], how='inner')
    .merge(bio_agg, on=['state', 'district_norm'], how='inner')
)

df.shape


(945, 9)

In [38]:
df.head()

Unnamed: 0,state,district_norm,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,andaman and nicobar islands,andamans,70,5,0,7,743,343,1672
1,andaman and nicobar islands,nicobar,64,11,0,58,729,992,819
2,andaman and nicobar islands,nicobars,1,0,0,0,4,1,1
3,andaman and nicobar islands,north and middle andaman,128,4,0,112,1897,3780,2603
4,andaman and nicobar islands,south andaman,216,12,0,440,3256,6690,3797


In [39]:
# Population that SHOULD need updates
df['expected_update_population'] = (
    df['age_5_17'] + df['age_18_greater']
)

# Updates that ACTUALLY happened
df['observed_updates'] = (
    df['bio_age_5_17'] +
    df['bio_age_17_'] +
    df['demo_age_5_17'] +
    df['demo_age_17_']
)

# Core coverage metric
df['update_coverage_ratio'] = (
    df['observed_updates'] / (df['expected_update_population'] + 1)
)

# Child biometric gap (high-impact metric)
df['child_biometric_gap'] = (
    (df['age_5_17'] - df['bio_age_5_17']) / (df['age_5_17'] + 1)
)


In [40]:
df[['update_coverage_ratio', 'child_biometric_gap']].describe()


Unnamed: 0,update_coverage_ratio,child_biometric_gap
count,945.0,945.0
mean,186.628082,-55.754565
std,265.846453,93.245483
min,0.065828,-919.433824
25%,43.528302,-63.676522
50%,101.189661,-24.781609
75%,214.612527,-9.486753
max,2279.902778,0.996182


In [41]:
df.sort_values('update_coverage_ratio').head(15)[
    ['state', 'district_norm', 'update_coverage_ratio']
]


Unnamed: 0,state,district_norm,update_coverage_ratio
361,karnataka,bengaluru rural,0.065828
140,bihar,pashchim champaran,0.130354
223,gujarat,banas kantha,0.135964
720,tamil nadu,kanchipuram,0.153153
550,meghalaya,eastern west khasi hills,0.384804
248,gujarat,sabar kantha,0.929889
559,meghalaya,west khasi hills,1.230484
563,mizoram,khawzawl,1.34375
243,gujarat,panch mahals,1.365138
554,meghalaya,south garo hills,1.365256


In [42]:
df.sort_values('child_biometric_gap', ascending=False).head(15)[
    ['state', 'district_norm', 'child_biometric_gap']
]


Unnamed: 0,state,district_norm,child_biometric_gap
140,bihar,pashchim champaran,0.996182
361,karnataka,bengaluru rural,0.991792
223,gujarat,banas kantha,0.991071
550,meghalaya,eastern west khasi hills,0.986784
684,rajasthan,jalore,0.978261
720,tamil nadu,kanchipuram,0.977778
248,gujarat,sabar kantha,0.949917
243,gujarat,panch mahals,0.948315
252,gujarat,surendranagar,0.920875
87,assam,dima hasao,0.884615


In [43]:
state_summary = (
    df
    .groupby('state', as_index=False)
    .agg({
        'expected_update_population': 'sum',
        'observed_updates': 'sum',
        'age_5_17': 'sum',
        'bio_age_5_17': 'sum'
    })
)

state_summary['state_update_coverage'] = (
    state_summary['observed_updates'] /
    (state_summary['expected_update_population'] + 1)
)

state_summary['state_child_gap'] = (
    (state_summary['age_5_17'] - state_summary['bio_age_5_17']) /
    (state_summary['age_5_17'] + 1)
)

state_summary.sort_values('state_update_coverage').head(10)


Unnamed: 0,state,expected_update_population,observed_updates,age_5_17,bio_age_5_17,state_update_coverage,state_child_gap
22,meghalaya,88462,175004,53234,36531,1.978273,0.31376
24,nagaland,11075,146384,9953,32561,13.216324,-2.271248
3,assam,88744,1995300,65972,596624,22.48352,-8.043472
4,bihar,335839,9711937,324731,2208141,28.918345,-5.79989
33,uttar pradesh,492387,18120052,474749,6207104,36.800353,-12.074471
29,sikkim,1150,43093,1045,11932,37.439618,-10.408222
2,arunachal pradesh,2387,108837,2236,42894,45.576633,-18.175235
28,rajasthan,118678,6810756,113123,2066742,57.388047,-17.269713
10,gujarat,87177,5020841,71003,1460655,57.592982,-19.571461
14,jharkhand,58389,3427483,56993,876612,58.699829,-14.380794


In [44]:
all_districts = (
    df[['state', 'district_norm']]
    .drop_duplicates()
    .sort_values(['state', 'district_norm'])
)

print("Total unique state–district pairs:", all_districts.shape[0])
all_districts.head(20)


Total unique state–district pairs: 945


Unnamed: 0,state,district_norm
0,andaman and nicobar islands,andamans
1,andaman and nicobar islands,nicobar
2,andaman and nicobar islands,nicobars
3,andaman and nicobar islands,north and middle andaman
4,andaman and nicobar islands,south andaman
5,andhra pradesh,adilabad
6,andhra pradesh,alluri sitharama raju
7,andhra pradesh,anakapalli
8,andhra pradesh,anantapur
9,andhra pradesh,ananthapur


In [45]:
district_count_per_state = (
    all_districts
    .groupby('state')
    .size()
    .sort_values(ascending=False)
)

district_count_per_state


state
uttar pradesh                               86
madhya pradesh                              60
karnataka                                   53
maharashtra                                 52
bihar                                       46
tamil nadu                                  45
andhra pradesh                              45
west bengal                                 43
rajasthan                                   42
odisha                                      39
gujarat                                     39
chhattisgarh                                38
assam                                       37
telangana                                   37
jharkhand                                   32
punjab                                      27
jammu and kashmir                           26
arunachal pradesh                           25
haryana                                     23
nagaland                                    17
kerala                                      15
uttarak

In [46]:
for state, group in all_districts.groupby('state'):
    print("\nSTATE:", state.upper())
    print("-" * 60)
    print(sorted(group['district_norm'].tolist()))



STATE: ANDAMAN AND NICOBAR ISLANDS
------------------------------------------------------------
['andamans', 'nicobar', 'nicobars', 'north and middle andaman', 'south andaman']

STATE: ANDHRA PRADESH
------------------------------------------------------------
['adilabad', 'alluri sitharama raju', 'anakapalli', 'anantapur', 'ananthapur', 'ananthapuramu', 'annamayya', 'bapatla', 'chittoor', 'cuddapah', 'dr b r ambedkar konaseema', 'east godavari', 'eluru', 'guntur', 'hyderabad', 'kakinada', 'karim nagar', 'karimnagar', 'khammam', 'krishna', 'kurnool', 'kv rangareddy', 'kvrangareddy', 'mahabub nagar', 'mahabubnagar', 'mahbubnagar', 'medak', 'n t r', 'nalgonda', 'nandyal', 'nellore', 'nizamabad', 'palnadu', 'parvathipuram manyam', 'prakasam', 'rangareddi', 'sri potti sriramulu nellore', 'sri sathya sai', 'srikakulam', 'tirupati', 'visakhapatnam', 'vizianagaram', 'warangal', 'west godavari', 'y s r']

STATE: ARUNACHAL PRADESH
------------------------------------------------------------
['

In [49]:
df.columns


Index(['state', 'district_norm', 'age_0_5', 'age_5_17', 'age_18_greater',
       'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_',
       'expected_update_population', 'observed_updates',
       'update_coverage_ratio', 'child_biometric_gap'],
      dtype='object')

In [51]:
!pip install rapidfuzz


Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp313-cp313-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp313-cp313-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 3.3 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 3.5 MB/s  0:00:00
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [56]:
from rapidfuzz import process, fuzz

def match_district(row, ref_state_map, threshold=90):
    state = row['state_norm']
    district = row['district_norm']

    if state not in ref_state_map:
        return None

    choices = ref_state_map[state]

    match, score, _ = process.extractOne(
        district,
        choices,
        scorer=fuzz.token_sort_ratio
    )

    if score >= threshold:
        return match
    else:
        return None


In [62]:
print(df.columns.tolist())


['state', 'district_norm', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'expected_update_population', 'observed_updates', 'update_coverage_ratio', 'child_biometric_gap']


In [65]:
from collections import defaultdict

state_district_map = defaultdict(list)

for state, grp in df.groupby('state'):
    state_district_map[state] = sorted(grp['district_norm'].unique())


In [66]:
district_canonical_map = {}


In [67]:
from rapidfuzz import process, fuzz
import re


In [68]:
def normalize_text(s):
    if not isinstance(s, str):
        return s
    s = s.lower()
    s = s.strip()
    s = re.sub(r'\s+', ' ', s)          # multiple spaces
    s = s.replace('&', 'and')
    s = re.sub(r'\bdistrict\b', '', s) # remove word district
    s = re.sub(r'\s+', ' ', s)
    return s.strip()


In [69]:
df['district_norm'] = df['district_norm'].apply(normalize_text)
df['state'] = df['state'].apply(normalize_text)


In [70]:
state_districts = (
    df.groupby('state')['district_norm']
      .unique()
      .apply(list)
      .to_dict()
)


In [71]:
district_canonical_map = {}

for state, districts in state_districts.items():
    remaining = set(districts)
    
    while remaining:
        base = remaining.pop()
        cluster = [base]
        
        matches = process.extract(
            base,
            list(remaining),
            scorer=fuzz.token_sort_ratio,
            score_cutoff=90
        )
        
        for match, score, _ in matches:
            cluster.append(match)
            remaining.remove(match)
        
        # canonical = longest name (usually most official)
        canonical = max(cluster, key=len)
        
        for d in cluster:
            district_canonical_map[(state, d)] = canonical


In [72]:
df['district_final'] = df.apply(
    lambda r: district_canonical_map.get(
        (r['state'], r['district_norm']),
        r['district_norm']
    ),
    axis=1
)


In [73]:
df.groupby('state')['district_norm'].nunique().sort_values(ascending=False)


state
uttar pradesh                               85
madhya pradesh                              59
karnataka                                   48
maharashtra                                 48
bihar                                       46
tamil nadu                                  45
andhra pradesh                              45
west bengal                                 43
rajasthan                                   42
odisha                                      39
gujarat                                     39
chhattisgarh                                38
assam                                       37
telangana                                   37
jharkhand                                   31
punjab                                      27
jammu and kashmir                           26
arunachal pradesh                           25
haryana                                     23
nagaland                                    17
kerala                                      15
uttarak

In [74]:
df.groupby('state')['district_final'].nunique().sort_values(ascending=False)


state
uttar pradesh                               82
madhya pradesh                              59
maharashtra                                 45
karnataka                                   43
bihar                                       42
rajasthan                                   40
tamil nadu                                  40
andhra pradesh                              40
west bengal                                 39
gujarat                                     38
assam                                       37
chhattisgarh                                37
telangana                                   36
odisha                                      33
jharkhand                                   28
punjab                                      27
jammu and kashmir                           26
arunachal pradesh                           25
haryana                                     23
nagaland                                    17
kerala                                      14
uttarak

In [76]:
sorted(df[df['state']=='uttarakhand']['district_final'].unique())
sorted(df[df['state']=='west bengal']['district_final'].unique())
sorted(df[df['state']=='andaman and nicobar islands']['district_final'].unique())


['andamans', 'nicobars', 'north and middle andaman', 'south andaman']

In [77]:
df.groupby('state')['district_final'].nunique()


state
andaman and nicobar islands                  4
andhra pradesh                              40
arunachal pradesh                           25
assam                                       37
bihar                                       42
chandigarh                                   2
chhattisgarh                                37
dadra and nagar haveli and daman and diu     3
delhi                                       12
goa                                          3
gujarat                                     38
haryana                                     23
himachal pradesh                            13
jammu and kashmir                           26
jharkhand                                   28
karnataka                                   43
kerala                                      14
ladakh                                       2
lakshadweep                                  1
madhya pradesh                              59
maharashtra                                 45
manipur

In [78]:
for state, subdf in df.groupby('state'):
    districts = sorted(subdf['district_final'].unique())
    print(f"\nSTATE: {state.upper()}")
    print("-" * 60)
    print(districts)
    print(f"Count: {len(districts)}")



STATE: ANDAMAN AND NICOBAR ISLANDS
------------------------------------------------------------
['andamans', 'nicobars', 'north and middle andaman', 'south andaman']
Count: 4

STATE: ANDHRA PRADESH
------------------------------------------------------------
['adilabad', 'alluri sitharama raju', 'anakapalli', 'ananthapur', 'ananthapuramu', 'annamayya', 'bapatla', 'chittoor', 'cuddapah', 'dr b r ambedkar konaseema', 'east godavari', 'eluru', 'guntur', 'hyderabad', 'kakinada', 'karim nagar', 'khammam', 'krishna', 'kurnool', 'kv rangareddy', 'mahabub nagar', 'medak', 'n t r', 'nalgonda', 'nandyal', 'nellore', 'nizamabad', 'palnadu', 'parvathipuram manyam', 'prakasam', 'rangareddi', 'sri potti sriramulu nellore', 'sri sathya sai', 'srikakulam', 'tirupati', 'visakhapatnam', 'vizianagaram', 'warangal', 'west godavari', 'y s r']
Count: 40

STATE: ARUNACHAL PRADESH
------------------------------------------------------------
['anjaw', 'changlang', 'dibang valley', 'east kameng', 'east siang',