In [None]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Folder holding the raw enrolment CSVs; update this to your data folder path if needed.
DATA_DIR = r"d:\uidai-data-hackathon\data\enrolment"
# Input files, looked up inside DATA_DIR.
FILES = ['enrol_01.csv', 'enrol_02.csv', 'enrol_03.csv']
# Derived from DATA_DIR so the cleaned file follows the data folder when the
# path above is changed, instead of pointing at a second hard-coded copy.
OUTPUT_FILE = os.path.join(DATA_DIR, "enrol_cleaned.csv")

## Step 1: Load Data

In [None]:
# --- STEP 1: LOAD DATA ---
print("Loading datasets...")

# Collect one DataFrame per input file that is actually present on disk.
dfs = []
for f in FILES:
    path = os.path.join(DATA_DIR, f)
    if not os.path.exists(path):
        # A missing input is reported but does not stop the run.
        print(f"Warning: {f} not found!")
        continue
    print(f"Reading {f}...")
    dfs.append(pd.read_csv(path))

# Nothing to work with if every listed file was absent.
if len(dfs) == 0:
    raise FileNotFoundError("No data files found.")

# Stack all loaded files into a single frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
print(f"Total records loaded: {len(df)}")

## Step 2: Normalization

In [None]:
# --- STEP 2: ROBUST TEXT NORMALIZATION ---
print("Cleaning text fields...")


def normalize_text(text):
    """Return ``text`` in Title Case with whitespace runs collapsed.

    Values that ``pd.isna`` reports as missing are returned unchanged. Any
    other value is converted with ``str()``, split on whitespace, re-joined
    with single spaces and title-cased.
    """
    if pd.isna(text):
        return text
    # split() with no argument also discards leading/trailing whitespace.
    words = str(text).split()
    return " ".join(words).title()

# Bring both name columns to the normalized Title Case form.
for col in ['state', 'district']:
    df[col] = df[col].apply(normalize_text)

# Remove rows with obviously bad state names (e.g., numbers).
# na=False makes a missing state count as "not numeric": without it str.match
# yields NaN for those rows and the `~` negation fails on the resulting
# object-dtype mask. Rows whose state is missing are therefore kept.
numeric_state = df['state'].str.match(r'^\d+$', na=False)
# .copy() gives the filtered frame its own data, so later column assignments
# do not operate on a slice of the unfiltered frame.
df = df[~numeric_state].copy()
print(f"Records after removing invalid states: {len(df)}")

## Step 3: Mapping Definitions

In [None]:
# --- STEP 3: MAPPING DICTIONARIES ---

# STATE MAPPING
# Each target state name is listed once, followed by the spellings that are
# rewritten to it. Spellings should be in Title Case, matching the normalized data.
_state_aliases = [
    ('Andaman and Nicobar Islands', [
        'Andaman & Nicobar Islands',
        'Andaman & Nicobar Islan',
    ]),
    ('Dadra and Nagar Haveli and Daman and Diu', [
        'Dadra & Nagar Haveli',
        'Dadra And Nagar Haveli',
        'The Dadra And Nagar Haveli And Daman And Diu',
        'Daman & Diu',
        'Daman And Diu',
    ]),
    ('Jammu and Kashmir', [
        'Jammu & Kashmir',
        'Jammu And Kashmir',
    ]),
    ('Odisha', ['Orissa']),
    ('Puducherry', ['Pondicherry']),
    ('West Bengal', [
        'West Bangal',
        'Westbengal',
    ]),
    ('Telangana', ['Telangana']),
    ('Chhattisgarh', ['Chattisgarh']),
]

# Flatten the table into the {spelling: target name} lookup.
state_map = {
    alias: canonical
    for canonical, aliases in _state_aliases
    for alias in aliases
}

# DISTRICT MAPPING (State -> {Old Name -> New Name})
# Note: Ensure keys match the Title Case format.
# A target name must not itself appear as a key that points somewhere else:
# each district is looked up once, so such a pair would swap two spellings
# instead of merging them.
district_map = {
    'Andaman And Nicobar Islands': {
        'Andamans': 'South Andaman',
        'Nicobars': 'Nicobar',
        'North And Middle Andaman': 'North and Middle Andaman'
    },
    'Andhra Pradesh': {
        'Ananthapur': 'Anantapuramu',
        'Anantapur': 'Anantapuramu',
        'Ananthapuramu': 'Anantapuramu',
        'Cuddapah': 'Kadapa (YSR Kadapa)',
        'Y. S. R': 'Kadapa (YSR Kadapa)',
        'Dr. B. R. Ambedkar Konaseema': 'Konaseema',
        'Visakhapatanam': 'Visakhapatnam',
        'N. T. R': 'NTR',
        'Nellore': 'Spsr Nellore',
        'Sri Potti Sriramulu Nellore': 'Spsr Nellore',
    },
    'Arunachal Pradesh': {
        'Leparada': 'Lower Siang',
        'Pakke Kessang': 'Pakke-Kessang',
        'Shi-Yomi': 'Shi Yomi',
        'Siang': 'Siang',
    },
    'Assam': {
        'Bajali': 'Barpeta',
        'Kamrup Metro': 'Kamrup Metropolitan',
        'Marigaon': 'Morigaon',
        'North Cachar Hills': 'Dima Hasao',
        'Sibsagar': 'Sivasagar',
        'Sribhumi': 'Karimganj',
        'South Salmara Mankachar': 'South Salmara-Mankachar',
        'Tamulpur District': 'Tamulpur',
        'West Karbi Anglong': 'Karbi Anglong',
    },
    'Bihar': {
        'Aurangabad(Bh)': 'Aurangabad',
        'Bhabua': 'Kaimur (Bhabua)',
        'East Champaran': 'East Champaran (Purvi Champaran)',
        'Monghyr': 'Munger',
        'Pashchim Champaran': 'West Champaran',
        'Purba Champaran': 'East Champaran (Purvi Champaran)',
        'Purbi Champaran': 'East Champaran (Purvi Champaran)',
        'Purnea': 'Purnia',
        'Samstipur': 'Samastipur',
        'Sheikpura': 'Sheikhpura',
    },
    'Chhattisgarh': {
        'Dantewada': 'Dakshin Bastar Dantewada',
        'Janjgir - Champa': 'Janjgir-Champa',
        'Janjgir Champa': 'Janjgir-Champa',
        'Kabeerdham': 'Kabirdham',
        'Kanker': 'Uttar Bastar Kanker',
        'Mohalla-Manpur-Ambagarh Chowki': 'Mohla-Manpur-Ambagarh Chouki'
    },
    'Delhi': {
        'North East': 'North East Delhi',
        'North East *': 'North East Delhi'
    },
    'Gujarat': {
        'Ahmadabad': 'Ahmedabad',
        'Banas Kantha': 'Banaskantha',
        'Dohad': 'Dahod',
        'Kachchh': 'Kutch',
        'Mahesana': 'Mehsana',
        'Panch Mahals': 'Panchmahal',
        'Panchmahals': 'Panchmahal',
        'Sabar Kantha': 'Sabarkantha',
        'Surendra Nagar': 'Surendranagar',
        'The Dangs': 'Dang'
    },
    'Haryana': {
        'Gurgaon': 'Gurugram',
        'Jhajjar *': 'Jhajjar',
        'Mewat': 'Nuh',
        'Yamuna Nagar': 'Yamunanagar'
    },
    'Himachal Pradesh': {
        'Lahul & Spiti': 'Lahaul and Spiti',
        'Lahul And Spiti': 'Lahaul and Spiti'
    },
    'Jammu And Kashmir': {
        'Badgam': 'Budgam',
        'Bandipore': 'Bandipora',
        'Baramula': 'Baramulla',
        'Leh (Ladakh)': 'Leh',
        'Punch': 'Poonch',
        'Shupiyan': 'Shopian'
    },
    'Jharkhand': {
        'Bokaro *': 'Bokaro',
        'East Singhbum': 'East Singhbhum',
        'Garhwa *': 'Garhwa',
        'Hazaribag': 'Hazaribagh',
        'Kodarma': 'Koderma',
        'Palamau': 'Palamu',
        'Pashchimi Singhbhum': 'West Singhbhum',
        'Purbi Singhbhum': 'East Singhbhum',
        'Sahibganj': 'Sahebganj',
        'Seraikela-Kharsawan': 'Seraikela-Kharsawan'
    },
    'Karnataka': {
        'Bagalkot *': 'Bagalkot',
        'Bangalore': 'Bengaluru',
        'Bangalore Rural': 'Bengaluru Rural',
        'Belgaum': 'Belagavi',
        'Bellary': 'Ballari',
        'Bijapur': 'Vijayapura',
        'Chamarajanagar': 'Chamarajanagara',
        'Chamarajanagar *': 'Chamarajanagara',
        'Chamrajanagar': 'Chamarajanagara',
        'Chamrajnagar': 'Chamarajanagara',
        'Chickmagalur': 'Chikkamagaluru',
        'Chikmagalur': 'Chikkamagaluru',
        'Davanagere': 'Davangere',
        'Gadag *': 'Gadag',
        'Gulbarga': 'Kalaburagi',
        'Hasan': 'Hassan',
        'Haveri *': 'Haveri',
        'Hosapete': 'Vijayanagara',
        'Mysore': 'Mysuru',
        'Shimoga': 'Shivamogga',
        'Tumkur': 'Tumakuru',
        'Yadgir': 'Yadgir'
    },
    'Kerala': {
        'Kasargod': 'Kasaragod'
    },
    'Maharashtra': {
        'Ahmed Nagar': 'Ahmednagar',
        'Bid': 'Beed',
        'Buldana': 'Buldhana',
        'Chatrapati Sambhaji Nagar': 'Chhatrapati Sambhajinagar',
        'Aurangabad': 'Chhatrapati Sambhajinagar',
        'Osmanabad': 'Dharashiv',
        'Gondia': 'Gondiya',
        'Gondiya *': 'Gondiya',
        'Hingoli *': 'Hingoli',
        'Mumbai City': 'Mumbai',
        'Mumbai( Sub Urban )': 'Mumbai Suburban',
        'Nandurbar *': 'Nandurbar',
        'Raigarh': 'Raigad',
        'Raigarh(Mh)': 'Raigad',
        'Washim *': 'Washim'
    },
    'Odisha': {
        'Anugul': 'Angul',
        'Baleshwar': 'Balasore',
        'Baleswar': 'Balasore',
        'Baudh': 'Boudh',
        'Jajapur': 'Jajpur',
        'Jagatsinghapur': 'Jagatsinghpur',
        'Khorda': 'Khordha',
        'Nabarangapur': 'Nabarangpur',
        'Nuapada': 'Nuapada',
        'Sonapur': 'Subarnapur',
        'Sundergarh': 'Sundargarh'
    },
    'Punjab': {
        'Ferozepur': 'Firozpur',
        'S.A.S Nagar': 'Sahibzada Ajit Singh Nagar',
        'S.A.S Nagar(Mohali)': 'Sahibzada Ajit Singh Nagar',
        'Sas Nagar (Mohali)': 'Sahibzada Ajit Singh Nagar',
        'Sri Muktsar Sahib': 'Muktsar'
    },
    'Rajasthan': {
        'Chittaurgarh': 'Chittorgarh',
        'Dhaulpur': 'Dholpur',
        'Jalore': 'Jalor'
    },
    'Tamil Nadu': {
        'Kancheepuram': 'Kanchipuram',
        'Kanniyakumari': 'Kanyakumari',
        'Thiruvallur': 'Tiruvallur',
        'Thoothukkudi': 'Thoothukudi',
        'Villupuram': 'Viluppuram'
    },
    'Telangana': {
        'Mahabub Nagar': 'Mahabubnagar',
        'Rangareddi': 'Rangareddy',
        'Warangal Urban': 'Hanumakonda',
        'Warangal Rural': 'Warangal'
    },
    'Uttar Pradesh': {
        'Allahabad': 'Prayagraj',
        'Bagpat': 'Baghpat',
        'Bara Banki': 'Barabanki',
        'Bulandshahr': 'Bulandshahar',
        'Faizabad': 'Ayodhya',
        'Gautam Buddha Nagar': 'Gautam Buddh Nagar',
        'Jyotiba Phule Nagar': 'Amroha',
        'Kanshiram Nagar': 'Kasganj',
        'Kheri': 'Lakhimpur Kheri',
        'Mahamaya Nagar': 'Hathras',
        'Mahrajganj': 'Maharajganj',
        'Ramabai Nagar': 'Kanpur Dehat',
        'Sant Ravidas Nagar': 'Bhadohi',
        'Shravasti': 'Shrawasti',
        'Siddharth Nagar': 'Siddharthnagar'
    },
    # NOTE(review): 'Nainital' -> 'Naini Tal' and 'Udham Singh Nagar' ->
    # 'Udam Singh Nagar' look like they point away from the usual spellings;
    # confirm against the reference list these names are meant to match.
    'Uttarakhand': {
        'Garhwal': 'Pauri Garhwal',
        'Hardwar': 'Haridwar',
        'Nainital': 'Naini Tal',
        'Udham Singh Nagar': 'Udam Singh Nagar'
    },
    'West Bengal': {
        'Burdwan': 'Purba Bardhaman',
        'Darjiling': 'Darjeeling',
        'Hooghly': 'Hugli',
        'Howrah': 'Haora',
        'Koch Bihar': 'Cooch Behar',
        'Coochbehar': 'Cooch Behar',
        'Maldah': 'Malda',
        'North 24 Parganas': 'North Twenty Four Parganas',
        'North 24 Pargana': 'North Twenty Four Parganas',
        'Puruliya': 'Purulia',
        # Both 24-Parganas districts use the spelled-out "Twenty Four" form as
        # the target. 'South Twenty Four Parganas' is intentionally not a key:
        # mapping it back to 'South 24 Parganas' would swap the two spellings
        # rather than merge them.
        'South 24 Parganas': 'South Twenty Four Parganas',
        'West Midnapore': 'Paschim Medinipur',
        'West Medinipur': 'Paschim Medinipur'
    }
}

## Step 4: Apply Cleaning Logic and Export

In [None]:
# --- STEP 4: APPLY MAPPINGS ---
print("Applying mappings...")

# --- PRE-CHECK: Unique States BEFORE Mapping ---
# dropna() keeps a missing state out of sorted(), which cannot order NaN
# against strings.
print("\n--- STATES BEFORE MAPPING ---")
print(sorted(df['state'].dropna().unique()))
print("-"*30 + "\n")

# 1. Map States
# Values equal to a key of state_map are rewritten; all others are left as-is.
df['state'] = df['state'].replace(state_map)

# --- INTERMEDIATE CHECK: Unique States After Mapping ---
print("\n--- STATES AFTER MAPPING ---")
print(sorted(df['state'].dropna().unique()))
print("-"*30 + "\n")

# 2. Map Districts (State-wise)
def clean_district(row, mapping=None):
    """Return the replacement district name for one row, or the original name.

    Args:
        row: Mapping-like object with 'state' and 'district' entries
            (a DataFrame row when used with ``df.apply(..., axis=1)``).
        mapping: Optional ``{state: {old_district: new_district}}`` table.
            Defaults to the module-level ``district_map``.

    Returns:
        The mapped district name when the row's state and district are both
        found in the table; otherwise the row's district unchanged.
    """
    table = district_map if mapping is None else mapping
    state = row['state']
    dist = row['district']

    # A missing or non-text state cannot match any table entry.
    if not isinstance(state, str):
        return dist

    per_state = table.get(state)
    if per_state is None:
        # The table is keyed in Title Case, while state_map produces names such
        # as 'Jammu and Kashmir' with a lower-case "and". Retry with the
        # Title Case form so those rows still get their district fixes.
        per_state = table.get(state.title(), {})

    return per_state.get(dist, dist)

# Run the district lookup once per row and store the result.
df['district'] = df.apply(clean_district, axis=1)

# --- STEP 5: FINAL CLEANUP & EXPORT ---
# Re-apply Title Case to both name columns (state first, then district).
for name_col in ('state', 'district'):
    df[name_col] = df[name_col].str.title()

# Write the cleaned frame without the index column.
print("Saving cleaned data...")
df.to_csv(OUTPUT_FILE, index=False)
print(f"Done! Saved to {OUTPUT_FILE}")

## Step 5: Verification

In [None]:
# --- RESULTS: FINAL STATE & DISTRICT LIST ---
print("\n--- FINAL DATA STRUCTURE ---\n")

# Missing values are left out of the listing: sorted() cannot order NaN
# against strings, and ", ".join() only accepts strings.
states = sorted(df['state'].dropna().unique())
print(f"Total Unique States: {len(states)}\n")

for state in states:
    # Distinct, non-missing district names recorded for this state.
    districts = sorted(df.loc[df['state'] == state, 'district'].dropna().unique())
    print(f"[{state}] ({len(districts)} Districts):")
    # Print formatted list
    print(", ".join(districts))
    print("") # Empty line for readability