In [1]:
import pandas as pd
import glob

# Read all CSVs in folder
files = glob.glob("*.csv")   # or give path like "data/*.csv"

df = pd.concat(
    (pd.read_csv(f) for f in files),
    ignore_index=True
)

df.to_csv("Concat.csv", index=False)
print("Merged rows:", len(df))


Merged rows: 2071700


In [2]:
df = pd.read_csv("Concat.csv")

df['state'] = (
    df['state']
    .astype(str)
    .str.strip()
    .str.lower()
)


In [3]:
state_mapping = {
    'andhra pradesh': 'Andhra Pradesh',
    'andhra pradesh ': 'Andhra Pradesh',
    'andhra pradesh.': 'Andhra Pradesh',
    'arunachal pradesh': 'Arunachal Pradesh',
    'assam': 'Assam',
    'bihar': 'Bihar',
    'chandigarh': 'Chandigarh',
    'chhattisgarh': 'Chhattisgarh',
    'delhi': 'Delhi',
    'goa': 'Goa',
    'gujarat': 'Gujarat',
    'haryana': 'Haryana',
    'himachal pradesh': 'Himachal Pradesh',
    'jharkhand': 'Jharkhand',
    'karnataka': 'Karnataka',
    'kerala': 'Kerala',
    'madhya pradesh': 'Madhya Pradesh',
    'maharashtra': 'Maharashtra',
    'manipur': 'Manipur',
    'meghalaya': 'Meghalaya',
    'mizoram': 'Mizoram',
    'nagaland': 'Nagaland',
    'odisha': 'Odisha',
    'orissa': 'Odisha',
    'punjab': 'Punjab',
    'rajasthan': 'Rajasthan',
    'sikkim': 'Sikkim',
    'tamil nadu': 'Tamil Nadu',
    'telangana': 'Telangana',
    'tripura': 'Tripura',
    'uttar pradesh': 'Uttar Pradesh',
    'uttarakhand': 'Uttarakhand',
    'west bengal': 'West Bengal',
    'jammu and kashmir': 'Jammu & Kashmir',
    'jammu & kashmir': 'Jammu & Kashmir',
    'ladakh': 'Ladakh',
    'puducherry': 'Puducherry',
    'andaman and nicobar islands': 'Andaman & Nicobar Islands',
    'dadra and nagar haveli and daman and diu': 'Dadra and Nagar Haveli and Daman and Diu'
}

df['state'] = df['state'].replace(state_mapping)
df['state'] = df['state'].str.title()

# Drop garbage states like 100000
df = df[~df['state'].astype(str).eq('100000')]

df.to_csv("state_cleaned.csv", index=False)
print("Unique states:", df['state'].nunique())

unique_states = sorted(df['state'].dropna().unique())

print("Unique States Count:", len(unique_states))
print("\nUnique States:")
for s in unique_states:
    print(s)



Unique states: 54
Unique States Count: 54

Unique States:
Andaman & Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
Balanagar
Bihar
Chandigarh
Chhatisgarh
Chhattisgarh
Dadra & Nagar Haveli
Dadra And Nagar Haveli
Dadra And Nagar Haveli And Daman And Diu
Daman & Diu
Daman And Diu
Darbhanga
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jaipur
Jammu & Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madanapalle
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
Nagpur
Odisha
Pondicherry
Puducherry
Punjab
Puttenahalli
Raja Annamalai Puram
Rajasthan
Sikkim
Tamil Nadu
Telangana
Tripura
Uttar Pradesh
Uttarakhand
Uttaranchal
West  Bengal
West Bangal
West Bengal
West Bengli
Westbengal


In [4]:
# Normalize state column first
df['state'] = (
    df['state']
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
)

state_fix = {
    # Chhattisgarh
    "chhatisgarh": "Chhattisgarh",
    "chhattisgarh": "Chhattisgarh",

    # Dadra / Daman merge (official UT)
    "dadra & nagar haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "dadra and nagar haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "daman & diu": "Dadra and Nagar Haveli and Daman and Diu",
    "daman and diu": "Dadra and Nagar Haveli and Daman and Diu",
    "dadra and nagar haveli and daman and diu": "Dadra and Nagar Haveli and Daman and Diu",

    # Pondicherry
    "pondicherry": "Puducherry",
    "puducherry": "Puducherry",

    # Tamil Nadu
    "tamil nadu": "Tamil Nadu",
    "tamilnadu": "Tamil Nadu",

    # Uttaranchal
    "uttaranchal": "Uttarakhand",
    "uttarakhand": "Uttarakhand",

    # West Bengal
    "west bengal": "West Bengal",
    "west  bengal": "West Bengal",
    "west bangal": "West Bengal",
    "westbengal": "West Bengal",

    # Jammu & Kashmir
    "jammu & kashmir": "Jammu & Kashmir",
    "jammu and kashmir": "Jammu & Kashmir",
}

df['state'] = df['state'].replace(state_fix)

# Title-case final state names
df['state'] = df['state'].str.title()
df['state'] = df['state'].replace({
    "Dadra And Nagar Haveli And Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu"
})


# Final verification
unique_states = sorted(df['state'].unique())
print("Final Unique States Count:", len(unique_states))
for s in unique_states:
    print(s)
df.to_csv("state_cleaned.csv", index=False)


Final Unique States Count: 44
Andaman & Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
Balanagar
Bihar
Chandigarh
Chhattisgarh
Dadra and Nagar Haveli and Daman and Diu
Darbhanga
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jaipur
Jammu & Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madanapalle
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
Nagpur
Odisha
Puducherry
Punjab
Puttenahalli
Raja Annamalai Puram
Rajasthan
Sikkim
Tamil Nadu
Telangana
Tripura
Uttar Pradesh
Uttarakhand
West Bengal
West Bengli


In [11]:
official_states = {
    "Andaman & Nicobar Islands",
    "Andhra Pradesh",
    "Arunachal Pradesh",
    "Assam",
    "Bihar",
    "Chandigarh",
    "Chhattisgarh",
    "Dadra and Nagar Haveli and Daman and Diu",
    "Delhi",
    "Goa",
    "Gujarat",
    "Haryana",
    "Himachal Pradesh",
    "Jammu & Kashmir",
    "Jharkhand",
    "Karnataka",
    "Kerala",
    "Ladakh",
    "Lakshadweep",
    "Madhya Pradesh",
    "Maharashtra",
    "Manipur",
    "Meghalaya",
    "Mizoram",
    "Nagaland",
    "Odisha",
    "Puducherry",
    "Punjab",
    "Rajasthan",
    "Sikkim",
    "Tamil Nadu",
    "Telangana",
    "Tripura",
    "Uttar Pradesh",
    "Uttarakhand",
    "West Bengal"
}
df['state'] = df['state'].replace({
    "West Bengli": "West Bengal"
})

bad_state_rows = df[~df['state'].isin(official_states)]

print("Rows with invalid state values:", len(bad_state_rows))
print(bad_state_rows[['state', 'district']].head(20))

df = df[df['state'].isin(official_states)]
df.loc[df['state'] == 'Darbhanga', 'state'] = 'Bihar'
df.loc[df['state'] == 'Jaipur', 'state'] = 'Rajasthan'
df.loc[df['state'] == 'Nagpur', 'state'] = 'Maharashtra'

final_states = sorted(df['state'].unique())

print("Final Unique States Count:", len(final_states))
for s in final_states:
    print(s)
df.to_csv("state_cleaned.csv", index=False)
print("✅ Saved as aadhar_state_cleaned_final.csv")



Rows with invalid state values: 0
Empty DataFrame
Columns: [state, district]
Index: []
Final Unique States Count: 36
Andaman & Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
Bihar
Chandigarh
Chhattisgarh
Dadra and Nagar Haveli and Daman and Diu
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jammu & Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
Odisha
Puducherry
Punjab
Rajasthan
Sikkim
Tamil Nadu
Telangana
Tripura
Uttar Pradesh
Uttarakhand
West Bengal
✅ Saved as aadhar_state_cleaned_final.csv


In [14]:
official_states = sorted([
    "Andaman & Nicobar Islands",
    "Andhra Pradesh",
    "Arunachal Pradesh",
    "Assam",
    "Bihar",
    "Chandigarh",
    "Chhattisgarh",
    "Dadra and Nagar Haveli and Daman and Diu",
    "Delhi",
    "Goa",
    "Gujarat",
    "Haryana",
    "Himachal Pradesh",
    "Jammu & Kashmir",
    "Jharkhand",
    "Karnataka",
    "Kerala",
    "Ladakh",
    "Lakshadweep",
    "Madhya Pradesh",
    "Maharashtra",
    "Manipur",
    "Meghalaya",
    "Mizoram",
    "Nagaland",
    "Odisha",
    "Puducherry",
    "Punjab",
    "Rajasthan",
    "Sikkim",
    "Tamil Nadu",
    "Telangana",
    "Tripura",
    "Uttar Pradesh",
    "Uttarakhand",
    "West Bengal"
])

print("\nMissing states:")
print(sorted(set(official_states) - set(unique_states)))

print("\nExtra / invalid states:")
print(sorted(set(unique_states) - set(official_states)))



Missing states:
[]

Extra / invalid states:
[]


In [15]:
len(df["state"].unique())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 85.2+ MB


In [None]:
df.duplicated()
df.info()
df.describe()
df.isna()
df.head()
df.tail()
df.shape
df.columns
df.info()
df.duplicated()
df.to_csv("final_merged_demo.csv",index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 85.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 85.2+ MB


In [14]:
import pandas as pd


df = pd.read_csv("state_cleaned.csv")
df['district'] = (
    df['district']
    .astype(str)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)
districts = sorted(df['district'].unique())

print("Raw Unique Districts Count:", len(districts))
for d in districts:
    print(d)



Raw Unique Districts Count: 973
ANGUL
ANUGUL
Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmadabad
Ahmadnagar
Ahmed Nagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Allahabad
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapur
Ananthapur
Ananthapuramu
Anantnag
Andamans
Angul
Anjaw
Annamayya
Anugal
Anugul
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Aurangabad
Aurangabad(BH)
Aurangabad(bh)
Ayodhya
Azamgarh
Badgam
Bagalkot
Bagalkot *
Bageshwar
Baghpat
Baghpat *
Bagpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Baleshwar
Baleswar
Balianta
Ballari
Ballia
Bally Jagachha
Balod
Baloda Bazar
Balotra
Balrampur
Banas Kantha
Banaskantha
Banda
Bandipore
Bandipur
Bangalore
Bangalore Rural
Banka
Bankura
Banswara
Bapatla
Bara Banki
Barabanki
Baramula
Baran
Barddhaman
Bardez
Bardhaman
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Baudh
Beawar
Beed
Begusarai
Belagavi
Belgaum
Bellar

In [13]:
df['district_norm'] = df['district'].str.lower()

dup_districts = (
    df.groupby('district_norm')['district']
    .nunique()
    .sort_values(ascending=False)
)

print(dup_districts[dup_districts > 1])


district_norm
jajpur                 3
nadia                  3
east midnapore         3
hooghly                3
kolkata                2
howrah                 2
south 24 pargana       2
south 24 parganas      2
nuapada                2
aurangabad(bh)         2
rangareddi             2
malda                  2
angul                  2
yadgir                 2
anugul                 2
seraikela-kharsawan    2
chittoor               2
Name: district, dtype: int64


In [15]:
df['district_raw'] = df['district']

df['district'] = (
    df['district']
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r'[*()]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)


In [16]:
invalid_exact = {
    "east", "west", "north", "south",
    "north east", "north west",
    "south east", "south west"
}

df = df[~df['district'].isin(invalid_exact)]


In [58]:
district_fix = {

# ================= ODISHA =================
"anugul": "angul",
"anugal": "angul",
"baleshwar": "balasore",
"baleswar": "balasore",
"baudh": "boudh",
"sonapur": "subarnapur",

# ================= WEST BENGAL =================
"hooghiy": "hooghly",
"hugli": "hooghly",
"haora": "howrah",
"hawrah": "howrah",
"koch bihar": "cooch behar",
"east midnapore": "purba medinipur",
"east midnapur": "purba medinipur",
"west midnapore": "paschim medinipur",
"west medinipur": "paschim medinipur",
"bardhaman": "purba bardhaman",
"barddhaman": "purba bardhaman",
"burdwan": "purba bardhaman",
"maldah": "malda",
"puruliya": "purulia",

# ================= KARNATAKA =================
"bellary": "ballari",
"gulbarga": "kalaburagi",
"bijapur": "vijayapura",
"bijapur kar": "vijayapura",
"chickmagalur": "chikkamagaluru",
"chikmagalur": "chikkamagaluru",
"shimoga": "shivamogga",
"tumkur": "tumakuru",

# ================= ANDHRA / TELANGANA =================
"cuddapah": "ysr",
"y s r": "ysr",
"ysr kadapa": "ysr",
"rangareddi": "rangareddy",
"k v rangareddy": "rangareddy",
"medchal?malkajgiri": "medchal-malkajgiri",
"medchal-malkajgiri": "medchal-malkajgiri",

# ================= MAHARASHTRA =================
"ahmadnagar": "ahilyanagar",
"ahmednagar": "ahilyanagar",
"ahmed nagar": "ahilyanagar",
"osmanabad": "dharashiv",
"aurangabad": "chhatrapati sambhajinagar",

# ================= TAMIL NADU =================
"tuticorin": "thoothukudi",
"thoothukkudi": "thoothukudi",
"tirupattur": "tirupathur",
"villupuram": "viluppuram",

# ================= UP / BIHAR =================
"bara banki": "barabanki",
"raebareli": "rae bareli",
"jyotiba phule nagar": "amroha",

# ================= GENERAL =================
"bid": "beed",
"gondiya": "gondia",
"samstipur": "samastipur",
"monghyr": "munger",
"pakaur": "pakur",
"kodarma": "koderma",
"north twenty four parganas": "north 24 parganas",
"south twenty four parganas": "south 24 parganas",
"Mumbai City": "Mumbai City",
"Mumbai Suburban" : "Mumbai Suburban",
"Mumbai Sub Urban": "Mumbai Suburban"

}
df['district'] = df['district'].replace(district_fix)
df.loc[
    (df['state'] == 'Bihar') &
    (df['district'] == 'chhatrapati sambhajinagar'),
    'district'
] = 'aurangabad'
df['district'] = df['district'].str.title()


In [59]:
df['district_norm'] = df['district'].str.lower()

dup_check = (
    df.groupby('district_norm')['district']
    .nunique()
    .sort_values(ascending=False)
)

print(dup_check[dup_check > 1])


Series([], Name: district, dtype: int64)


In [62]:
print("Final Unique Districts:", df['district'].nunique())


Final Unique Districts: 798


In [63]:
# districts that appear only once (usually garbage)
rare = df['district'].value_counts()
problem_districts = rare[rare == 1].index.tolist()

print("Suspicious districts:", len(problem_districts))
for d in problem_districts[:50]:
    print(d)


Suspicious districts: 0


In [64]:
df['district'] = (
    df['district']
    .astype(str)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)


In [65]:
district_fix_final = {
    "Udupi ": "Udupi",
    "Jyotiba Phule Nagar ": "Amroha",          # official rename
    "Bhadrakr": "Bhadrak",
    "Chitrakoot ": "Chitrakoot",
    "Chandauli ": "Chandauli",
    "Dist : Thane": "Thane",
    "Bijapurkar": "Vijayapura",
    "MedchalâMalkajgiri": "Medchal-Malkajgiri"
}

df['district'] = df['district'].replace(district_fix_final)
df['district'] = df['district'].replace({
    "Bandipur": "Bandipore",
    "bandipur": "bandipore"
})





In [66]:
df['district'] = df['district'].str.title()



In [67]:
print("Final Unique Districts:", df['district'].nunique())
df_sorted = df.sort_values(by=['state', 'district'])



Final Unique Districts: 798


In [69]:
df_sorted.to_csv("aadhar_state_dist.csv", index=False)
print("✅ Saved as aadhar_state_district_sorted_final.csv")


✅ Saved as aadhar_state_district_sorted_final.csv


In [35]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("aadhar_state_district_sorted_final.csv")

# Count unique districts
district_count = (
    df['district']
    .value_counts()
    .reset_index()
)

district_count.columns = ['district', 'count']

print(district_count)


                district  count
0        Purba Bardhaman  22374
1      North 24 Parganas  17435
2             Viluppuram  13124
3                   Pune  12450
4        Purba Medinipur  12371
..                   ...    ...
875  Naihati Anandabazar      2
876            Tiruvarur      2
877             Balianta      1
878  Jyotiba Phule Nagar      1
879       Bally Jagachha      1

[880 rows x 2 columns]


In [38]:
import pandas as pd

# Load file
df = pd.read_csv("aadhar_state_district_sorted_final.csv")

# Clean data (important)
df = df.dropna(subset=['state', 'district'])
df['state'] = df['state'].str.strip().str.title()
df['district'] = df['district'].str.strip().str.title()

# Count unique districts per state
state_district_count = (
    df.groupby('state')['district']
    .nunique()
    .reset_index(name='total_districts')
)

print(state_district_count)


                                       state  total_districts
0                  Andaman & Nicobar Islands                5
1                             Andhra Pradesh               45
2                          Arunachal Pradesh               25
3                                      Assam               37
4                                      Bihar               44
5                                 Chandigarh                3
6                               Chhattisgarh               39
7   Dadra And Nagar Haveli And Daman And Diu                4
8                                      Delhi               12
9                                        Goa                5
10                                   Gujarat               39
11                                   Haryana               23
12                          Himachal Pradesh               14
13                           Jammu & Kashmir               26
14                                 Jharkhand               29
15      

In [39]:
import pandas as pd

# Load CSV
df = pd.read_csv("aadhar_state_district_sorted_final.csv")

# Clean district names
df['district'] = df['district'].astype(str).str.strip().str.title()

# Get unique districts
unique_districts = sorted(df['district'].unique())

print(unique_districts)
print("Total unique districts:", len(unique_districts))


['Adilabad', 'Agar Malwa', 'Agra', 'Ahilyanagar', 'Ahmadabad', 'Ahmedabad', 'Aizawl', 'Ajmer', 'Akola', 'Alappuzha', 'Aligarh', 'Alipurduar', 'Alirajpur', 'Allahabad', 'Alluri Sitharama Raju', 'Almora', 'Alwar', 'Ambala', 'Ambedkar Nagar', 'Amethi', 'Amravati', 'Amreli', 'Amritsar', 'Amroha', 'Anakapalli', 'Anand', 'Anantapur', 'Ananthapur', 'Ananthapuramu', 'Anantnag', 'Andamans', 'Angul', 'Anjaw', 'Annamayya', 'Anuppur', 'Araria', 'Ariyalur', 'Arvalli', 'Arwal', 'Ashok Nagar', 'Auraiya', 'Aurangabad', 'Aurangabadbh', 'Ayodhya', 'Azamgarh', 'Badgam', 'Bagalkot', 'Bageshwar', 'Baghpat', 'Bagpat', 'Bahraich', 'Bajali', 'Baksa', 'Balaghat', 'Balangir', 'Balasore', 'Balianta', 'Ballari', 'Ballia', 'Bally Jagachha', 'Balod', 'Baloda Bazar', 'Balotra', 'Balrampur', 'Banas Kantha', 'Banaskantha', 'Banda', 'Bandipore', 'Bangalore', 'Bangalore Rural', 'Banka', 'Bankura', 'Banswara', 'Bapatla', 'Barabanki', 'Baramula', 'Baran', 'Bardez', 'Bareilly', 'Bargarh', 'Barmer', 'Barnala', 'Barpeta', 'B

In [71]:
import pandas as pd

# Load file
df = pd.read_csv("aadhar_state_dist.csv")

# Clean district names
df['district'] = df['district'].astype(str).str.strip().str.title()

# Get unique districts
unique_districts = sorted(df['district'].unique())

# Print ALL districts (no truncation)
for d in unique_districts:
    print(d)

print("\nTotal unique districts:", len(unique_districts))


Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapuramu
Anantnag
Angul
Anjaw
Annamayya
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Ayodhya
Azamgarh
Bagalkot
Bageshwar
Baghpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Ballari
Ballia
Balod
Baloda Bazar
Balotra
Balrampur
Banaskantha
Banda
Bandipore
Banka
Bankura
Banswara
Bapatla
Barabanki
Baramulla
Baran
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Beawar
Beed
Begusarai
Belagavi
Bemetara
Bengaluru
Betul
Bhadohi
Bhadradri Kothagudem
Bhadrak
Bhagalpur
Bhandara
Bharatpur
Bharuch
Bhavnagar
Bhilwara
Bhind
Bhiwani
Bhojpur
Bhopal
Bidar
Bijnor
Bikaner
Bilaspur
Birbhum
Bishnupur
Biswanath
Bokaro
Bongaigaon
Botad
Boudh
Budaun
Budgam
Bulandshahr
Buldhana
Bundi
Burhanpur
Buxar
Cachar
Central Delhi
Chamarajanagar
Chamba
Chamoli
Champa

In [45]:
import pandas as pd

# Load your file
df = pd.read_csv("aadhar_state_district_sorted_final.csv")

# Basic normalization
df['state'] = (
    df['state']
    .astype(str)
    .str.strip()
    .str.title()
)

df['district'] = (
    df['district']
    .astype(str)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
    .str.title()
)


In [46]:
district_fix_map = {
    # Renamed districts
    "Allahabad": "Prayagraj",
    "Faizabad": "Ayodhya",
    "Ahmednagar": "Ahilyanagar",
    "Hoshangabad": "Narmadapuram",
    "Osmanabad": "Dharashiv",
    "Bellary": "Ballari",
    "Bijapur": "Vijayapura",

    # Spelling variants
    "Ahmadabad": "Ahmedabad",
    "Banas Kantha": "Banaskantha",
    "Bagpat": "Baghpat",
    "Bulandshahar": "Bulandshahr",
    "Buldana": "Buldhana",
    "Chamrajanagar": "Chamarajanagar",
    "Chamrajnagar": "Chamarajanagar",
    "Davangere": "Davanagere",
    "Darjiling": "Darjeeling",
    "Dhaulpur": "Dholpur",
    "Gondiya": "Gondia",
    "Jhunjhunun": "Jhunjhunu",
    "Jalor": "Jalore",
    "Kanyakumari": "Kanniyakumari",
    "Kasargod": "Kasaragod",
    "Khorda": "Khordha",
    "Purnea": "Purnia",
    "Sundergarh": "Sundargarh",
    "Surendra Nagar": "Surendranagar",
    "Yamuna Nagar": "Yamunanagar",

    # J&K / Ladakh
    "Badgam": "Budgam",
    "Baramula": "Baramulla",
    "Shupiyan": "Shopian",
    "Punch": "Poonch",
    "Leh Ladakh": "Leh",

    # Telangana / AP
    "Karim Nagar": "Karimnagar",
    "Mahabub Nagar": "Mahabubnagar",
    "Mahbubnagar": "Mahabubnagar",
    "Medchal−Malkajgiri": "Medchal-Malkajgiri",
    "Jagitial": "Jagtial",
    "Jangoan": "Jangaon",
    
}

df['district'] = df['district'].replace(district_fix_map)


In [47]:
df.loc[
    (df['state'] == "Maharashtra") &
    (df['district'] == "Aurangabad"),
    'district'
] = "Chhatrapati Sambhajinagar"


In [49]:
df.to_csv(
    "aadhar_state_district_standardized.csv",
    index=False
)

print("✅ District names standardized — no rows removed")
print("Total unique districts:", df['district'].nunique())


✅ District names standardized — no rows removed
Total unique districts: 849


In [50]:
from collections import Counter

district_words = df['district'].str.split().explode()
print(district_words.value_counts().head(30))


district
Purba           35323
24              29324
Parganas        29302
North           27066
Bardhaman       26703
West            25888
East            25128
South           25090
Medinipur       22798
Godavari        22193
Nagar           18383
Mumbai          17569
Paschim         14608
Viluppuram      13124
Warangal        12609
Pune            12450
Kannada         12396
Rangareddy      12309
Mahabubnagar    12100
Thrissur        12097
Bengaluru       10948
Nellore         10738
Karimnagar      10683
Ahilyanagar     10330
Hyderabad       10010
Ernakulam        9895
Hooghly          9810
Delhi            9755
Palakkad         9726
Ahmedabad        9710
Name: count, dtype: int64


In [51]:
import pandas as pd

df = pd.read_csv("aadhar_state_district_standardized.csv")
districts = sorted(df['district'].dropna().unique())

for d in districts:
    print(d)

print("\nTotal unique districts:", len(districts))

Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapur
Ananthapur
Ananthapuramu
Anantnag
Andamans
Angul
Anjaw
Annamayya
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Aurangabad
Aurangabadbh
Ayodhya
Azamgarh
Bagalkot
Bageshwar
Baghpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Balianta
Ballari
Ballia
Bally Jagachha
Balod
Baloda Bazar
Balotra
Balrampur
Banaskantha
Banda
Bandipore
Bangalore
Bangalore Rural
Banka
Bankura
Banswara
Bapatla
Barabanki
Baramulla
Baran
Bardez
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Beawar
Beed
Begusarai
Belagavi
Belgaum
Bemetara
Bengaluru
Bengaluru Rural
Bengaluru South
Betul
Bhabua
Bhadohi
Bhadradri Kothagudem
Bhadrak
Bhagalpur
Bhandara
Bharatpur
Bharuch
Bhavnagar
Bhilwara
Bhind
Bhiwani
Bhojpur
Bhopal
Bicholim
Bidar
Bijnor
Bikaner
Bilaspur
Birb

In [52]:
mapping = {

# ===== Andhra Pradesh + Telangana (heavy collapse) =====
"Anantapur": "Anantapuramu",
"Ananthapur": "Anantapuramu",
"Ananthapuramu": "Anantapuramu",
"Cuddapah": "YSR",
"Y. S. R": "YSR",
"Ysr": "YSR",
"N. T. R": "NTR",
"Sri Potti Sriramulu Nellore": "Spsr Nellore",
"Nellore": "Spsr Nellore",
"Warangal": "Warangal",
"Warangal Rural": "Warangal",
"Warangal Urban": "Warangal",
"Hanumakonda": "Warangal",
"K.V. Rangareddy": "Rangareddy",
"K.V.Rangareddy": "Rangareddy",
"Ranga Reddy": "Rangareddy",
"Rangareddi": "Rangareddy",
"Medchal-Malkajgiri": "Rangareddy",
"Medchal Malkajgiri": "Rangareddy",
"Yadadri.": "Yadadri Bhuvanagiri",
"Mahabubnagar": "Mahabubnagar",
"Mahabub Nagar": "Mahabubnagar",
"Jyotiba Phule Nagar": "Amroha",

# ===== Maharashtra (collapse old + new) =====
"Ahmednagar": "Ahilyanagar",
"Ahmadnagar": "Ahilyanagar",
"Ahmed Nagar": "Ahilyanagar",
"Osmanabad": "Osmanabad",
"Dharashiv": "Osmanabad",
"Aurangabad": "Chhatrapati Sambhajinagar",
"Aurangabadbh": "Chhatrapati Sambhajinagar",
"Chatrapati Sambhaji Nagar": "Chhatrapati Sambhajinagar",
"Chhatrapati Sambhajinagar": "Chhatrapati Sambhajinagar",
"Mumbai": "Mumbai",
"Mumbai City": "Mumbai",
"Mumbai Sub Urban": "Mumbai",
"Mumbai Suburban": "Mumbai",

# ===== Karnataka =====
"Bangalore": "Bengaluru",
"Bengaluru": "Bengaluru",
"Bengaluru South": "Bengaluru",
"Bangalore Rural": "Bengaluru",
"Bengaluru Rural": "Bengaluru",
"Belgaum": "Belagavi",
"Bellary": "Ballari",
"Gulbarga": "Kalaburagi",
"Mysore": "Mysuru",
"Shimoga": "Shivamogga",
"Tumkur": "Tumakuru",
"Chickmagalur": "Chikkamagaluru",
"Chikmagalur": "Chikkamagaluru",
"Chamrajanagar": "Chamarajanagar",
"Chamrajnagar": "Chamarajanagar",

# ===== Gujarat =====
"Kachchh": "Kutch",
"Mahesana": "Mehsana",
"Panchmahals": "Panchmahal",
"Panch Mahals": "Panchmahal",
"Surendra Nagar": "Surendranagar",
"The Dangs": "Dang",

# ===== Odisha =====
"Anugul": "Angul",
"Baleshwar": "Balasore",
"Baleswar": "Balasore",
"Baudh": "Boudh",
"Debagarh": "Deogarh",
"Khorda": "Khordha",
"Jagatsinghapur": "Jagatsinghpur",
"Subarnapur": "Sonepur",
"Balianta": "Khordha",

# ===== West Bengal (aggressive) =====
"Coochbehar": "Cooch Behar",
"Hooghiy": "Hooghly",
"Hugli": "Hooghly",
"Haora": "Howrah",
"Hawrah": "Howrah",
"Bally Jagachha": "Howrah",
"Domjur": "Howrah",
"East Midnapur": "Purba Medinipur",
"East Midnapore": "Purba Medinipur",
"West Midnapore": "Paschim Medinipur",
"Medinipur": "Paschim Medinipur",
"Bardhaman": "Purba Bardhaman",
"Burdwan": "Purba Bardhaman",
"Barddhaman": "Purba Bardhaman",
"South 24 Pargana": "South 24 Parganas",
"North Dinajpur": "Uttar Dinajpur",
"South Dinajpur": "Dakshin Dinajpur",
"Purbi Singhbhum": "East Singhbhum",
"Purbi Singhbum": "East Singhbhum",
"Pashchimi Singhbhum": "West Singhbhum",
"Naihati Anandabazar": "North 24 Parganas",
"South Dumdumm": "North 24 Parganas",

# ===== Bihar + UP =====
"Bara Banki": "Barabanki",
"Bhabua": "Kaimur",
"Kaimur Bhabua": "Kaimur",
"Purbi Champaran": "East Champaran",
"Pashchim Champaran": "West Champaran",
"Shrawasti": "Shravasti",
"Raebareli": "Rae Bareli",

# ===== Punjab / Haryana =====
"Mewat": "Nuh",
"S.A.S Nagarmohali": "Mohali",
"Sas Nagar Mohali": "Mohali",
"Sahibzada Ajit Singh Nagar": "Mohali",
"Shaheed Bhagat Singh Nagar": "Nawanshahr",

# ===== J&K / Ladakh =====
"Badgam": "Budgam",
"Punch": "Poonch",
"Shupiyan": "Shopian",
"Leh Ladakh": "Leh",

# ===== North East =====
"North Cachar Hills": "Dima Hasao",
"Sibsagar": "Sivasagar",
"Marigaon": "Morigaon",
"Chumukedima": "Dimapur",

# ===== Goa =====
"Bardez": "North Goa",
"Tiswadi": "North Goa",
"Bicholim": "North Goa",

# ===== Andaman & Nicobar =====
"Andamans": "South Andaman",
"Nicobar": "Nicobars",

# ===== Cities / locality noise =====
"Najafgarh": "South West Delhi",
"Garhwal": "Pauri Garhwal",
"Hardwar": "Haridwar",
"Tuticorin": "Thoothukudi"
}


In [54]:
df['district'] = (
    df['district']
    .str.replace(r'[*()]', '', regex=True)
    .str.strip()
)

df['district'] = df['district'].replace(mapping)

len(df['district'].unique())


798

In [56]:
districts = sorted(df['district'].dropna().unique())

for d in districts:
    print(d)

print("Total districts:", len(districts))


Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapuramu
Anantnag
Angul
Anjaw
Annamayya
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Ayodhya
Azamgarh
Bagalkot
Bageshwar
Baghpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Ballari
Ballia
Balod
Baloda Bazar
Balotra
Balrampur
Banaskantha
Banda
Bandipore
Banka
Bankura
Banswara
Bapatla
Barabanki
Baramulla
Baran
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Beawar
Beed
Begusarai
Belagavi
Bemetara
Bengaluru
Betul
Bhadohi
Bhadradri Kothagudem
Bhadrak
Bhagalpur
Bhandara
Bharatpur
Bharuch
Bhavnagar
Bhilwara
Bhind
Bhiwani
Bhojpur
Bhopal
Bidar
Bijnor
Bikaner
Bilaspur
Birbhum
Bishnupur
Biswanath
Bokaro
Bongaigaon
Botad
Boudh
Budaun
Budgam
Bulandshahr
Buldhana
Bundi
Burhanpur
Buxar
Cachar
Central Delhi
Chamarajanagar
Chamba
Chamoli
Champa

In [72]:
import pandas as pd

# ===============================
# 1. LOAD DATASET
# ===============================
input_path = "state_cleaned.csv"
df = pd.read_csv(input_path)

print("Loaded rows:", len(df))
print("Columns:", df.columns.tolist())


# ===============================
# 2. DISTRICT CLEANING (BASIC)
# ===============================
df['district'] = (
    df['district']
    .astype(str)
    .str.replace(r'[*()]', '', regex=True)
    .str.strip()
)


# ===============================
# 3. DISTRICT MAPPING (SAFE)
# ===============================
mapping = {

    # --- Mumbai (DO NOT MERGE) ---
    "Mumbai Sub Urban": "Mumbai Suburban",
    "Mumbai Sub-Urban": "Mumbai Suburban",
    "Mumbai Suburb": "Mumbai Suburban",

    # --- Maharashtra ---
    "Ahmednagar": "Ahilyanagar",
    "Ahmadnagar": "Ahilyanagar",
    "Ahmed Nagar": "Ahilyanagar",
    "Aurangabad": "Chhatrapati Sambhajinagar",
    "Chatrapati Sambhaji Nagar": "Chhatrapati Sambhajinagar",
    "Osmanabad": "Dharashiv",
    "Bid": "Beed",
    "Buldana": "Buldhana",
    "Gondiya": "Gondia",

    # --- Andhra Pradesh ---
    "Anantapur": "Anantapuramu",
    "Ananthapur": "Anantapuramu",
    "Cuddapah": "YSR",
    "Y. S. R": "YSR",
    "Ysr": "YSR",
    "Nellore": "Spsr Nellore",
    "Sri Potti Sriramulu Nellore": "Spsr Nellore",
    "N. T. R": "NTR",

    # --- Telangana ---
    "K.V. Rangareddy": "Rangareddy",
    "K.V.Rangareddy": "Rangareddy",
    "Rangareddi": "Rangareddy",
    "Karim Nagar": "Karimnagar",
    "Mahbub Nagar": "Mahabubnagar",

    # --- Karnataka ---
    "Bangalore": "Bengaluru",
    "Belgaum": "Belagavi",
    "Gulbarga": "Kalaburagi",
    "Bellary": "Ballari",
    "Shimoga": "Shivamogga",
    "Tumkur": "Tumakuru",
    "Chickmagalur": "Chikkamagaluru",
    "Chikmagalur": "Chikkamagaluru",

    # --- Tamil Nadu ---
    "Tuticorin": "Thoothukudi",
    "Thoothukkudi": "Thoothukudi",
    "The Nilgiris": "Nilgiris",
    "Villupuram": "Viluppuram",
    "Tirupattur": "Tirupathur",

    # --- Uttar Pradesh ---
    "Allahabad": "Prayagraj",
    "Faizabad": "Ayodhya",
    "Jyotiba Phule Nagar": "Amroha",
    "Sant Ravidas Nagar": "Bhadohi",
    "Sant Ravidas Nagar Bhadohi": "Bhadohi",
    "Shrawasti": "Shravasti",

    # --- Bihar ---
    "Monghyr": "Munger",
    "Bhabua": "Kaimur",
    "Purbi Champaran": "East Champaran",
    "Pashchim Champaran": "West Champaran",

    # --- Odisha ---
    "Baleshwar": "Balasore",
    "Baleswar": "Balasore",
    "Anugul": "Angul",
    "Subarnapur": "Sonepur",

    # --- West Bengal ---
    "Hooghiy": "Hooghly",
    "Hugli": "Hooghly",
    "Haora": "Howrah",
    "Hawrah": "Howrah",
    "Koch Bihar": "Cooch Behar",
    "South Dinajpur": "Dakshin Dinajpur",
    "North Dinajpur": "Uttar Dinajpur",
    "Medinipur": "Paschim Medinipur",
    "East Midnapore": "Purba Medinipur",
    "West Midnapore": "Paschim Medinipur",

    # --- Assam / NE ---
    "North Cachar Hills": "Dima Hasao",
    "Sibsagar": "Sivasagar",
    "Marigaon": "Morigaon",

    # --- Delhi ---
    "East": "East Delhi",
    "West": "West Delhi",
    "North": "North Delhi",
    "South": "South Delhi",
    "North East": "North East Delhi",

    # --- Islands ---
    "Andamans": "South Andaman",
    "Nicobar": "Nicobars",

    # --- Goa ---
    "Bardez": "North Goa",
    "Tiswadi": "North Goa",
    "Bicholim": "North Goa",
}

df['district'] = df['district'].replace(mapping)


# ===============================
# 4. VALIDATION
# ===============================
print("\nMumbai check:")
print(df['district'].value_counts().loc[
    lambda x: x.index.str.contains("Mumbai", case=False)
])

print("\nTotal unique districts:", df['district'].nunique())


# ===============================
# 5. SAVE CLEAN DATASET
# ===============================
output_path = "aadhaar_district_cleaned.csv"
df.to_csv(output_path, index=False)

print("\nSaved cleaned file to:", output_path)


Loaded rows: 2071687
Columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

Mumbai check:
district
Mumbai             8039
Mumbai Suburban    6672
Mumbai City        2858
Name: count, dtype: int64

Total unique districts: 899

Saved cleaned file to: aadhaar_district_cleaned.csv


In [73]:
# get unique districts
unique_districts = sorted(df['district'].dropna().unique())

print("Total unique districts:", len(unique_districts))
print("\n--- UNIQUE DISTRICTS ---\n")

for d in unique_districts:
    print(d)


Total unique districts: 899

--- UNIQUE DISTRICTS ---

ANGUL
ANUGUL
Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmadabad
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapuramu
Ananthapuramu
Anantnag
Angul
Anjaw
Annamayya
Anugal
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
AurangabadBH
Aurangabadbh
Ayodhya
Azamgarh
Badgam
Bagalkot
Bageshwar
Baghpat
Bagpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Balianta
Ballari
Ballia
Bally Jagachha
Balod
Baloda Bazar
Balotra
Balrampur
Banas Kantha
Banaskantha
Banda
Bandipore
Bandipur
Bangalore Rural
Banka
Bankura
Banswara
Bapatla
Bara Banki
Barabanki
Baramula
Baran
Barddhaman
Bardhaman
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Baudh
Beawar
Beed
Begusarai
Belagavi
Bemetara
Bengaluru
Bengaluru Rural
Bengaluru South
Betul
Bhadohi
Bhadradri Kothagudem
Bhadrak
BhadrakR
Bhagalpu

In [74]:
clean_map = {

    # -----------------
    # CASE / DUPLICATES
    # -----------------
    "ANGUL": "Angul",
    "ANUGUL": "Angul",
    "Anugal": "Angul",

    "JAJPUR": "Jajpur",
    "jajpur": "Jajpur",

    "HOOGHLY": "Hooghly",
    "hooghly": "Hooghly",

    "HOWRAH": "Howrah",

    "KOLKATA": "Kolkata",
    "MALDA": "Malda",
    "Maldah": "Malda",

    "NADIA": "Nadia",
    "nadia": "Nadia",

    "NUAPADA": "Nuapada",

    # -----------------
    # SPELLING VARIANTS
    # -----------------
    "Ahmadabad": "Ahmedabad",
    "Bagpat": "Baghpat",
    "Banas Kantha": "Banaskantha",
    "Bara Banki": "Barabanki",
    "Baramula": "Baramulla",
    "Baudh": "Boudh",
    "BhadrakR": "Bhadrak",
    "Bulandshahar": "Bulandshahr",
    "Chamrajanagar": "Chamarajanagar",
    "Chamrajnagar": "Chamarajanagar",
    "Darjiling": "Darjeeling",
    "Davangere": "Davanagere",
    "Dhaulpur": "Dholpur",
    "Hazaribag": "Hazaribagh",
    "Kasargod": "Kasaragod",
    "Kodarma": "Koderma",
    "Purnea": "Purnia",
    "Puruliya": "Purulia",
    "Raebareli": "Rae Bareli",
    "RaigarhMH": "Raigad",
    "Samstipur": "Samastipur",
    "Seraikela-kharsawan": "Seraikela-Kharsawan",
    "Shi-yomi": "Shi-Yomi",
    "Shupiyan": "Shopian",
    "Surendra Nagar": "Surendranagar",
    "Yamuna Nagar": "Yamunanagar",

    # -----------------
    # ADMIN / OLD NAMES
    # -----------------
    "Ahmednagar": "Ahilyanagar",
    "AurangabadBH": "Aurangabad",
    "Aurangabadbh": "Aurangabad",
    "Badgam": "Budgam",
    "Bijapur": "Vijayapura",
    "BijapurKAR": "Vijayapura",
    "Ganganagar": "Sri Ganganagar",
    "Hoshangabad": "Narmadapuram",
    "Mahabub Nagar": "Mahabubnagar",
    "Mahbubnagar": "Mahabubnagar",
    "Rangareddi": "Rangareddy",
    "Yadadri.": "Yadadri Bhuvanagiri",

    # -----------------
    # ANDHRA / TELANGANA
    # -----------------
    "Ananthapuramu": "Anantapuramu",
    "Cuddapah": "YSR",
    "Y. S. R": "YSR",
    "Nellore": "Spsr Nellore",
    "Jagitial": "Jagtial",
    "Jangoan": "Jangaon",
    "Medchal-malkajgiri": "Medchal-Malkajgiri",
    "Medchal?malkajgiri": "Medchal-Malkajgiri",
    "Medchalâmalkajgiri": "Medchal-Malkajgiri",
    "Medchal−malkajgiri": "Medchal-Malkajgiri",

    # -----------------
    # WEST BENGAL
    # -----------------
    "East Midnapur": "Purba Medinipur",
    "East midnapore": "Purba Medinipur",
    "West Medinipur": "Paschim Medinipur",
    "South 24 Pargana": "South 24 Parganas",
    "South 24 pargana": "South 24 Parganas",
    "South 24 parganas": "South 24 Parganas",
    "South Twenty Four Parganas": "South 24 Parganas",
    "North Twenty Four Parganas": "North 24 Parganas",

    # -----------------
    # JUNK LOCATION TEXT
    # -----------------
    "Dist : Thane": "Thane",
    "South DumDumM": "North 24 Parganas",
    "Sonapur": "Kamrup Metropolitan",

    # -----------------
    # ISLANDS
    # -----------------
    "Leh ladakh": "Leh",

}

df['district'] = df['district'].replace(clean_map)
final = sorted(df['district'].unique())
print("Final district count:", len(final))

for d in final:
    print(d)


Final district count: 839
Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapuramu
Anantnag
Angul
Anjaw
Annamayya
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Aurangabad
Ayodhya
Azamgarh
Bagalkot
Bageshwar
Baghpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Balianta
Ballari
Ballia
Bally Jagachha
Balod
Baloda Bazar
Balotra
Balrampur
Banaskantha
Banda
Bandipore
Bandipur
Bangalore Rural
Banka
Bankura
Banswara
Bapatla
Barabanki
Baramulla
Baran
Barddhaman
Bardhaman
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Beawar
Beed
Begusarai
Belagavi
Bemetara
Bengaluru
Bengaluru Rural
Bengaluru South
Betul
Bhadohi
Bhadradri Kothagudem
Bhadrak
Bhagalpur
Bhandara
Bharatpur
Bharuch
Bhavnagar
Bhilwara
Bhind
Bhiwani
Bhojpur
Bhopal
Bidar
Bijnor
Bikaner
Bilaspur
Birbhum
Bishnupur
Biswanath
Bokar

In [75]:
reduce_map = {

    # -----------------
    # CASE CLEANUP
    # -----------------
    "chittoor": "Chittoor",
    "east midnapore": "Purba Medinipur",
    "rangareddi": "Rangareddy",
    "yadgir": "Yadgir",
    "Warangal urban": "Warangal Urban",

    # -----------------
    # CLEAR DUPLICATES
    # -----------------
    "Bandipur": "Bandipore",
    "Barddhaman": "Purba Bardhaman",
    "Bardhaman": "Purba Bardhaman",
    "Gaurela-pendra-marwahi": "Gaurela-Pendra-Marwahi",

    # -----------------
    # ODIA
    # -----------------
    "Jajapur": "Jajpur",
    "Khorda": "Khordha",
    "Sundergarh": "Sundargarh",

    # -----------------
    # GUJARAT
    # -----------------
    "Panch Mahals": "Panchmahals",
    "Sabar Kantha": "Sabarkantha",

    # -----------------
    # BIHAR
    # -----------------
    "Kaimur Bhabua": "Kaimur",
    "Purnea": "Purnia",

    # -----------------
    # WEST BENGAL
    # -----------------
    "Puruliya": "Purulia",

    # -----------------
    # RAJASTHAN
    # -----------------
    "Jhunjhunun": "Jhunjhunu",
    "Jalor": "Jalore",

    # -----------------
    # KARNATAKA
    # -----------------
    "Bijapur": "Vijayapura",
    "Mysore": "Mysuru",

    # -----------------
    # TELANGANA / AP
    # -----------------
    "Mahbubnagar": "Mahabubnagar",

    # -----------------
    # DELHI
    # -----------------
    "South  Twenty Four Parganas": "South 24 Parganas",

    # -----------------
    # JANJGIR
    # -----------------
    "Janjgir - Champa": "Janjgir-Champa",
    "Janjgir Champa": "Janjgir-Champa",

    # -----------------
    # ANDAMAN
    # -----------------
    "North And Middle Andaman": "North and Middle Andaman",

    # -----------------
    # DUPLICATE HILLS
    # -----------------
    "Lahul & Spiti": "Lahaul and Spiti",
    "Lahul and Spiti": "Lahaul and Spiti",

}


In [76]:
df['district'] = df['district'].replace(reduce_map)


In [77]:
final = sorted(df['district'].unique())
print("Final district count:", len(final))


Final district count: 818


In [78]:
final_reduce_map = {

    # -----------------
    # WEST BENGAL
    # -----------------
    "Barddhaman": "Purba Bardhaman",
    "Bardhaman": "Purba Bardhaman",
    "South  Twenty Four Parganas": "South 24 Parganas",
    "South 24 pargana": "South 24 Parganas",
    "South 24 parganas": "South 24 Parganas",
    "North Twenty Four Parganas": "North 24 Parganas",
    "West Medinipur": "Paschim Medinipur",
    "East Midnapur": "Purba Medinipur",
    "East midnapore": "Purba Medinipur",

    # -----------------
    # ODISHA
    # -----------------
    "Sonapur": "Subarnapur",
    "Baloda Bazar": "Balodabazar-Bhatapara",

    # -----------------
    # ASSAM
    # -----------------
    "Kamrup Metro": "Kamrup Metropolitan",

    # -----------------
    # TELANGANA
    # -----------------
    "Warangal urban": "Warangal Urban",
    "Warangal Rural": "Warangal",
    "K.v. Rangareddy": "Rangareddy",
    "rangareddi": "Rangareddy",

    # -----------------
    # ANDHRA PRADESH
    # -----------------
    "YSR": "Y.S.R. Kadapa",
    "Y. S. R": "Y.S.R. Kadapa",

    # -----------------
    # BIHAR
    # -----------------
    "Kaimur Bhabua": "Kaimur",
    "Purnea": "Purnia",

    # -----------------
    # RAJASTHAN
    # -----------------
    "Jhunjhunun": "Jhunjhunu",
    "Jalor": "Jalore",

    # -----------------
    # KARNATAKA
    # -----------------
    "Bijapur": "Vijayapura",
    "Mysore": "Mysuru",

    # -----------------
    # HILLS & HYPHENS
    # -----------------
    "Lahul & Spiti": "Lahaul and Spiti",
    "Lahul and Spiti": "Lahaul and Spiti",

    # -----------------
    # CAPITALIZATION CLEANUP
    # -----------------
    "hooghly": "Hooghly",
    "nadia": "Nadia",
    "jajpur": "Jajpur",

}


In [79]:
df['district'] = df['district'].replace(final_reduce_map)


In [80]:
print("Final district count:", df['district'].nunique())


Final district count: 815


In [81]:
# get unique districts
districts = sorted(df['district'].dropna().unique())

print(f"Final district count: {len(districts)}\n")

for d in districts:
    print(d)


Final district count: 815

Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapuramu
Anantnag
Angul
Anjaw
Annamayya
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Aurangabad
Ayodhya
Azamgarh
Bagalkot
Bageshwar
Baghpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Balianta
Ballari
Ballia
Bally Jagachha
Balod
Balodabazar-Bhatapara
Balotra
Balrampur
Banaskantha
Banda
Bandipore
Bangalore Rural
Banka
Bankura
Banswara
Bapatla
Barabanki
Baramulla
Baran
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Beawar
Beed
Begusarai
Belagavi
Bemetara
Bengaluru
Bengaluru Rural
Bengaluru South
Betul
Bhadohi
Bhadradri Kothagudem
Bhadrak
Bhagalpur
Bhandara
Bharatpur
Bharuch
Bhavnagar
Bhilwara
Bhind
Bhiwani
Bhojpur
Bhopal
Bidar
Bijnor
Bikaner
Bilaspur
Birbhum
Bishnupur
Biswanath
Bokaro
Bongaigaon
Botad
B

In [83]:
mapping_final = {
    "Barddhaman": "Purba Bardhaman",
    "Bardhaman": "Purba Bardhaman",
    "Balodabazar-Bhatapara": "Baloda Bazar",
    "Gaurela-pendra-marwahi": "Gaurela-Pendra-Marwahi",
    "Janjgir-champa": "Janjgir-Champa",
    "Kamrup Metropolitan": "Kamrup Metro",
    "Sundergarh": "Sundargarh",
    "Y.S.R. Kadapa": "YSR",
    "chittoor": "Chittoor",
    "rangareddi": "Rangareddy",
    "yadgir": "Yadgir",

    "Aurangabad": "Chhatrapati Sambhajinagar",
    "Dharashiv": "Osmanabad",
    "Mahesana": "Mehsana",
    "Kachchh": "Kutch",
    "Narmadapuram": "Narmada",
    "East Nimar": "Khandwa",
    "West Nimar": "Khargone",

    "North and Middle Andaman": "North And Middle Andaman",
    "South  Twenty Four Parganas": "South 24 Parganas",
    "Tamulpur District": "Tamulpur",

    "Bally Jagachha": "Howrah",
    "Domjur": "Howrah",
    "Balianta": "Khordha",
    "Najafgarh": "South West Delhi",
    "Naihati Anandabazar": "North 24 Parganas",

    "Dadra & Nagar Haveli": "Dadra and Nagar Haveli"
}

df['district'] = df['district'].replace(mapping_final)
df['district'] = df['district'].str.strip().str.title()

len(df['district'].dropna().unique())


804

In [84]:
districts = sorted(df['district'].dropna().unique())
print("Final district count:", len(districts))
print("\n--- DISTRICTS ---\n")
for d in districts:
    print(d)


Final district count: 804

--- DISTRICTS ---

Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapuramu
Anantnag
Angul
Anjaw
Annamayya
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Ayodhya
Azamgarh
Bagalkot
Bageshwar
Baghpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Ballari
Ballia
Balod
Baloda Bazar
Balotra
Balrampur
Banaskantha
Banda
Bandipore
Bangalore Rural
Banka
Bankura
Banswara
Bapatla
Barabanki
Baramulla
Baran
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Beawar
Beed
Begusarai
Belagavi
Bemetara
Bengaluru
Bengaluru Rural
Bengaluru South
Betul
Bhadohi
Bhadradri Kothagudem
Bhadrak
Bhagalpur
Bhandara
Bharatpur
Bharuch
Bhavnagar
Bhilwara
Bhind
Bhiwani
Bhojpur
Bhopal
Bidar
Bijnor
Bikaner
Bilaspur
Birbhum
Bishnupur
Biswanath
Bokaro
Bongaigaon
Botad
Boudh
Budaun
Budgam
Buland

In [85]:
df['district'] = (
    df['district']
    .astype(str)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)


In [86]:
df['district'] = (
    df['district']
    .str.replace('−', '-', regex=False)
    .str.replace('–', '-', regex=False)
    .str.replace('â', '-', regex=False)
)


In [87]:
final_mapping = {
    # Andhra Pradesh
    "Y. S. R": "YSR",
    "Ysr": "YSR",

    "N. T. R": "NTR",
    "Ntr": "NTR",

    "Spsr Nellore": "Sri Potti Sriramulu Nellore",

    # Telangana
    "K.V.Rangareddy": "Rangareddy",
    "K.V. Rangareddy": "Rangareddy",

    # Karnataka
    "Bangalore": "Bengaluru",

    # West Bengal
    "South 24 Pargana": "South 24 Parganas",
    "North Twenty Four Parganas": "North 24 Parganas",
    "Hooghiy": "Hooghly",
    "Hugli": "Hooghly",

    # Odisha
    "ANGUL": "Angul",
    "ANUGUL": "Angul",
    "Anugul": "Angul",

    # Maharashtra
    "Ahmadnagar": "Ahilyanagar",
    "Ahmed Nagar": "Ahilyanagar",

    # Jammu & Kashmir
    "Badgam": "Budgam",
    "Baramula": "Baramulla",

    # Misc
    "Pakaur": "Pakur",
    "Kaimur (Bhabua)": "Kaimur",
}

df['district'] = df['district'].replace(final_mapping)

final_districts = sorted(df['district'].dropna().unique())
print("✅ Final unique districts:", len(final_districts))


✅ Final unique districts: 803


In [88]:
final_fix = {
    # Andhra Pradesh
    "Y. S. R": "YSR",
    "Ysr": "YSR",

    "N. T. R": "NTR",
    "Ntr": "NTR",

    # Telangana
    "K.V. Rangareddy": "Rangareddy",
    "K.V.Rangareddy": "Rangareddy",

    # West Bengal
    "South 24 Pargana": "South 24 Parganas",
    "North Twenty Four Parganas": "North 24 Parganas",
    "Hooghiy": "Hooghly",
    "Hugli": "Hooghly",

    # Odisha
    "ANGUL": "Angul",
    "ANUGUL": "Angul",
    "Anugul": "Angul",

    # Maharashtra
    "Ahmed Nagar": "Ahilyanagar",
    "Ahmadnagar": "Ahilyanagar",

    # Karnataka
    "Bangalore": "Bengaluru",
    "Belgaum": "Belagavi",
    "Bellary": "Ballari",

    # J&K
    "Badgam": "Budgam",
    "Baramula": "Baramulla",

    # Bihar
    "Aurangabadbh": "Aurangabad (Bihar)",  # KEEP DISTINCT

    # Misc
    "Pakaur": "Pakur",
    "Kaimur (Bhabua)": "Kaimur",
}


df['district'] = df['district'].replace(final_fix)


final_districts = sorted(df['district'].dropna().unique())
print("✅ FINAL DISTRICT COUNT:", len(final_districts))


✅ FINAL DISTRICT COUNT: 803


In [89]:
from collections import defaultdict
import re

groups = defaultdict(list)

for d in districts:
    key = re.sub(r'[^a-z0-9]', '', d.lower())
    groups[key].append(d)

# print only suspicious groups
for k, v in groups.items():
    if len(v) > 1:
        print(v)


['S.A.S Nagarmohali', 'Sas Nagar Mohali']


In [90]:
district_mapping = {
    "S.A.S Nagarmohali": "SAS Nagar Mohali",
    "Sas Nagar Mohali": "SAS Nagar Mohali",
}

df['district'] = df['district'].replace(district_mapping)


In [91]:
df[df['district'].str.contains("Mohali", case=False, na=False)]['district'].value_counts()


district
SAS Nagar Mohali    3089
Mohali                 5
Name: count, dtype: int64

In [92]:
district_mapping = {
    "Mohali": "SAS Nagar Mohali"
}
df['district'] = df['district'].replace(district_mapping)


In [93]:
df[df['district'].str.contains("Mohali", case=False, na=False)]['district'].value_counts()


district
SAS Nagar Mohali    3094
Name: count, dtype: int64

In [94]:
final_districts = sorted(df['district'].dropna().unique())
print("✅ FINAL DISTRICT COUNT:", len(final_districts))

✅ FINAL DISTRICT COUNT: 801


In [95]:
districts = sorted(df['district'].dropna().unique())
print("Final district count:", len(districts))
print("\n--- DISTRICTS ---\n")
for d in districts:
    print(d)

Final district count: 801

--- DISTRICTS ---

Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapuramu
Anantnag
Angul
Anjaw
Annamayya
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Ayodhya
Azamgarh
Bagalkot
Bageshwar
Baghpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Ballari
Ballia
Balod
Baloda Bazar
Balotra
Balrampur
Banaskantha
Banda
Bandipore
Bangalore Rural
Banka
Bankura
Banswara
Bapatla
Barabanki
Baramulla
Baran
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Beawar
Beed
Begusarai
Belagavi
Bemetara
Bengaluru
Bengaluru Rural
Bengaluru South
Betul
Bhadohi
Bhadradri Kothagudem
Bhadrak
Bhagalpur
Bhandara
Bharatpur
Bharuch
Bhavnagar
Bhilwara
Bhind
Bhiwani
Bhojpur
Bhopal
Bidar
Bijnor
Bikaner
Bilaspur
Birbhum
Bishnupur
Biswanath
Bokaro
Bongaigaon
Botad
Boudh
Budaun
Budgam
Buland

In [101]:
districts = sorted(df['district'].dropna().unique())
print("Final district count:", len(districts))
print("\n--- DISTRICTS ---\n")
for d in districts:
    print(d)

Final district count: 797

--- DISTRICTS ---

Adilabad
Agar Malwa
Agra
Ahilyanagar
Ahmedabad
Aizawl
Ajmer
Akola
Alappuzha
Aligarh
Alipurduar
Alirajpur
Alluri Sitharama Raju
Almora
Alwar
Ambala
Ambedkar Nagar
Amethi
Amravati
Amreli
Amritsar
Amroha
Anakapalli
Anand
Anantapuramu
Anantnag
Angul
Anjaw
Annamayya
Anuppur
Araria
Ariyalur
Arvalli
Arwal
Ashok Nagar
Auraiya
Ayodhya
Azamgarh
Bagalkot
Bageshwar
Baghpat
Bahraich
Bajali
Baksa
Balaghat
Balangir
Balasore
Ballari
Ballia
Balod
Baloda Bazar
Balotra
Balrampur
Banaskantha
Banda
Bandipore
Bangalore Rural
Banka
Bankura
Banswara
Bapatla
Barabanki
Baramulla
Baran
Bareilly
Bargarh
Barmer
Barnala
Barpeta
Barwani
Bastar
Basti
Bathinda
Beawar
Beed
Begusarai
Belagavi
Bemetara
Bengaluru
Bengaluru Rural
Betul
Bhadohi
Bhadradri Kothagudem
Bhadrak
Bhagalpur
Bhandara
Bharatpur
Bharuch
Bhavnagar
Bhilwara
Bhind
Bhiwani
Bhojpur
Bhopal
Bidar
Bijnor
Bikaner
Bilaspur
Birbhum
Bishnupur
Biswanath
Bokaro
Bongaigaon
Botad
Boudh
Budaun
Budgam
Bulandshahr
Buldhana
B

In [100]:
df['district'] = df['district'].replace({
    "Jagatsinghapur": "Jagatsinghpur"
})


In [98]:
df['district'] = df['district'].replace({
    "Chittaurgarh": "Chittorgarh"
})


In [96]:
final_mapping = {
    "Bengaluru South": "Bengaluru",

    "Manendragarh-Chirmiri-Bharatpur": "Manendragarh–Chirmiri–Bharatpur",

    "Ysr": "YSR",
    "Y. S. R": "YSR",

    "The Nilgiris": "Nilgiris",

    "Sonepur": "Subarnapur",

    "N. T. R": "NTR",

    "Warangal Urban": "Hanumakonda"
}

df['district'] = df['district'].replace(final_mapping)


In [102]:
output_file = "aadhaar_district_FINAL_VERIFIED.csv"
df.to_csv(output_file, index=False)

print(f"✅ Saved final cleaned file as: {output_file}")

✅ Saved final cleaned file as: aadhaar_district_FINAL_VERIFIED.csv
