# Aadhaar UIDAI – Combined Pipeline Notebook

This notebook is a **direct combination** of existing Python scripts.

❗ **No logic has been changed, added, or removed.**

Run cells **top to bottom**.


## 1️⃣ Merge CSV files (merge_csv.py)

In [None]:
import pandas as pd
import glob
import os

# Name of the merged file this step writes.
OUTPUT_FILE = "combined_aadhar_biometric.csv"

# Files written by this notebook itself. They land in the same folder as the
# inputs, so they must be skipped: otherwise a re-run would merge earlier
# results back into the data and duplicate rows.
PIPELINE_OUTPUTS = frozenset({
    OUTPUT_FILE,
    "aadhar_state_cleaned_SAFE.csv",
    "aadhar_SORTED_ORIGINAL_COLUMNS.csv",
    "aadhar_FINAL_STATE_DISTRICT_SORTED.csv",
    "duplicate_rows.csv",
})


def find_input_csvs(folder, exclude=PIPELINE_OUTPUTS):
    """Return the paths of the CSV files in *folder* that should be merged.

    Args:
        folder: Directory to scan (non-recursive) for ``*.csv`` files.
        exclude: File names (not paths) to leave out. Defaults to the files
            this notebook writes.

    Returns:
        A list of paths, sorted so the merged row order does not depend on
        the order in which the operating system lists the directory.
    """
    candidates = glob.glob(os.path.join(folder, "*.csv"))
    return sorted(
        path for path in candidates
        if os.path.basename(path) not in exclude
    )


def merge_csv_files(files):
    """Read every CSV in *files* and stack them into one DataFrame.

    Args:
        files: Paths of the CSV files to read, in the order to stack them.

    Returns:
        The concatenated DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If *files* is empty (same exception type ``pd.concat``
            raises for an empty list, but with an actionable message).
    """
    if not files:
        raise ValueError(
            "No CSV files found to combine - "
            "place the input CSV files in the working folder"
        )
    frames = [pd.read_csv(path) for path in files]
    return pd.concat(frames, ignore_index=True)


# True both when this runs as a notebook cell and as a script; lets the
# helpers above be imported without touching the file system.
if __name__ == "__main__":
    # Get current folder path
    current_folder = os.getcwd()

    # Get all input CSV files in this folder
    files = find_input_csvs(current_folder)
    print("Files found:", files)

    # Read and concatenate all files
    final_df = merge_csv_files(files)

    # Save combined file
    final_df.to_csv(OUTPUT_FILE, index=False)

    print("All CSV files combined successfully!")


## 2️⃣ State cleaning & validation (clean_state.py)

In [None]:
import pandas as pd

# Canonical list of Indian States & UTs (lower-case, as compared below)
VALID_STATES = {
    'andhra pradesh',
    'arunachal pradesh',
    'assam',
    'bihar',
    'chhattisgarh',
    'goa',
    'gujarat',
    'haryana',
    'himachal pradesh',
    'jharkhand',
    'karnataka',
    'kerala',
    'madhya pradesh',
    'maharashtra',
    'manipur',
    'meghalaya',
    'mizoram',
    'nagaland',
    'odisha',
    'punjab',
    'rajasthan',
    'sikkim',
    'tamil nadu',
    'telangana',
    'tripura',
    'uttar pradesh',
    'uttarakhand',
    'west bengal',

    # UTs
    'andaman and nicobar islands',
    'chandigarh',
    'dadra and nagar haveli and daman and diu',
    'delhi',
    'jammu and kashmir',
    'ladakh',
    'lakshadweep',
    'puducherry'
}

# Known misspellings / old names -> canonical lower-case name.
# Matching is exact, on the stripped, lower-cased, whitespace-collapsed value.
STATE_FIXES = {
    # West Bengal
    'west bengal': 'west bengal',
    'west  bengal': 'west bengal',
    'westbengal': 'west bengal',
    'west bangal': 'west bengal',

    # Odisha
    'orissa': 'odisha',

    # Puducherry
    'pondicherry': 'puducherry',

    # Andaman & Nicobar
    'andaman & nicobar islands': 'andaman and nicobar islands',

    # Jammu & Kashmir
    'jammu & kashmir': 'jammu and kashmir',

    # Chhattisgarh
    'chhatisgarh': 'chhattisgarh',

    # Uttarakhand
    'uttaranchal': 'uttarakhand',

    # Tamil Nadu
    'tamilnadu': 'tamil nadu',

    # Dadra & Nagar Haveli and Daman & Diu
    'daman and diu': 'dadra and nagar haveli and daman and diu',
    'daman & diu': 'dadra and nagar haveli and daman and diu',
    'dadra and nagar haveli': 'dadra and nagar haveli and daman and diu',
    'dadra & nagar haveli': 'dadra and nagar haveli and daman and diu'
}


def clean_state_column(df):
    """Add the state audit columns to *df* and return it.

    Columns added (no rows are dropped, the ``state`` column is untouched):
        state_raw:    ``state`` as text, stripped and lower-cased.
        state_fixed:  ``state_raw`` with runs of whitespace collapsed to one
                      space and the ``STATE_FIXES`` corrections applied.
        state_final:  ``state_fixed`` in title case; left missing where the
                      original ``state`` was missing.
        state_status: 'VALID' if ``state_fixed`` is in ``VALID_STATES``,
                      otherwise 'UNKNOWN'.

    Args:
        df: DataFrame with a ``state`` column. Modified in place to avoid
            copying a large frame.

    Returns:
        The same DataFrame object, with the four columns added.
    """
    # Remember real gaps before astype(str) turns them into text.
    missing = df['state'].isna()

    # Normalize
    df['state_raw'] = (
        df['state']
        .astype(str)
        .str.strip()
        .str.lower()
    )

    # Collapse repeated inner whitespace so e.g. "andhra  pradesh" matches
    # just like the hand-listed "west  bengal" variant does.
    collapsed = df['state_raw'].str.replace(r'\s+', ' ', regex=True)
    df['state_fixed'] = collapsed.replace(STATE_FIXES)

    # Final state column. Without the mask a missing state would be emitted
    # as the literal text "Nan", which pandas does not read back as missing.
    # NOTE: str.title() also capitalizes "and" (e.g. "Jammu And Kashmir").
    df['state_final'] = df['state_fixed'].str.title().mask(missing)

    # Flag invalid / unknown states (DO NOT DROP)
    is_valid = df['state_fixed'].isin(VALID_STATES)
    df['state_status'] = is_valid.map({True: 'VALID', False: 'UNKNOWN'})

    return df


# True both when this runs as a notebook cell and as a script; lets the
# helper above be imported without touching the file system.
if __name__ == "__main__":
    # Load data
    df = pd.read_csv("combined_aadhar_biometric.csv")
    print("Total rows:", len(df))

    df = clean_state_column(df)
    unknown_rows = df['state_status'] == 'UNKNOWN'

    # Report
    print("\nVALID states:", (df['state_status'] == 'VALID').sum())
    print("UNKNOWN states:", unknown_rows.sum())

    # Save safely
    df.to_csv("aadhar_state_cleaned_SAFE.csv", index=False)
    print("Cleaned file saved safely")
    print("\nUnknown state values:")
    print(df.loc[unknown_rows, 'state_raw'].value_counts().head(30))

## 3️⃣ Keep original columns & sort (state_sort.py)

In [None]:
import pandas as pd

# Read the audited output of the state-cleaning step; it still carries the
# helper columns added there.
df = pd.read_csv("aadhar_state_cleaned_SAFE.csv")

# The six columns carried forward. 'state_final' (the cleaned name) stands in
# for the raw 'state' column and is renamed back to 'state' below.
kept_columns = [
    'date',
    'state_final',
    'district',
    'pincode',
    'bio_age_5_17',
    'bio_age_17_',
]

# Select -> rename -> sort states A to Z, as one pipeline.
df_final = (
    df.loc[:, kept_columns]
    .rename(columns={'state_final': 'state'})
    .sort_values(by='state', ascending=True)
)

# Write the result without the index column.
df_final.to_csv("aadhar_SORTED_ORIGINAL_COLUMNS.csv", index=False)

print("Sorted file created with ONLY original columns")
print("Columns:", list(df_final.columns))


## 4️⃣ District cleaning & standardization

In [None]:
import pandas as pd

# ==================================================
# DISTRICT NAME CORRECTIONS
# Exact-match, case-sensitive lookup applied AFTER the text normalization in
# clean_districts(): the characters * ( ) are removed and whitespace is
# collapsed before a name is compared with these keys.
# ==================================================
mapping = {
    # --- West Bengal ---
    'Coochbehar': 'Cooch Behar',
    'Hooghiy': 'Hooghly',
    'Hugli': 'Hooghly',
    'Haora': 'Howrah',
    'Hawrah': 'Howrah',
    'Koch Bihar': 'Cooch Behar',
    'Maldah': 'Malda',
    'Puruliya': 'Purulia',
    'Barddhaman': 'Purba Bardhaman',
    'Bardhaman': 'Purba Bardhaman',
    'Burdwan': 'Purba Bardhaman',
    'East Midnapur': 'Purba Medinipur',
    'East Midnapore': 'Purba Medinipur',
    'West Midnapore': 'Paschim Medinipur',
    'West Medinipur': 'Paschim Medinipur',
    'Medinipur': 'Paschim Medinipur',
    'Medinipur West': 'Paschim Medinipur',
    'North Twenty Four Parganas': 'North 24 Parganas',
    'South Twenty Four Parganas': 'South 24 Parganas',
    'South 24 Pargana': 'South 24 Parganas',
    '24 Paraganas North': 'North 24 Parganas',
    '24 Paraganas South': 'South 24 Parganas',
    'Dinajpur Uttar': 'Uttar Dinajpur',
    'Dinajpur Dakshin': 'Dakshin Dinajpur',
    'North Dinajpur': 'Uttar Dinajpur',
    'South Dinajpur': 'Dakshin Dinajpur',
    'Darjiling': 'Darjeeling',

    # --- Odisha ---
    'Debagarh': 'Deogarh',
    'Jajapur': 'Jajpur',
    'Khorda': 'Khordha',
    'Anugul': 'Angul',
    'Anugal': 'Angul',
    'Baleshwar': 'Balasore',
    'Baleswar': 'Balasore',
    'Sundergarh': 'Sundargarh',
    'Jagatsinghapur': 'Jagatsinghpur',
    'Baudh': 'Boudh',
    'Nabarangapu': 'Nabarangpur',
    'Nabarangapur': 'Nabarangpur',

    # --- Bihar ---
    'Purbi Champaran': 'East Champaran',
    'Pashchim Champaran': 'West Champaran',
    'Sheikpura': 'Sheikhpura',
    'Samstipur': 'Samastipur',
    'Monghyr': 'Munger',
    'Bhabua': 'Kaimur',
    'Bara Banki': 'Barabanki',

    # --- Jharkhand ---
    'Purbi Singhbhum': 'East Singhbhum',
    'Purbi Singhbum': 'East Singhbhum',
    'East Singhbum': 'East Singhbhum',
    'Pashchimi Singhbhum': 'West Singhbhum',
    'Hazaribag': 'Hazaribagh',
    'Palamau': 'Palamu',
    'Sahebganj': 'Sahibganj',
    'Kodarma': 'Koderma',
    'Pakaur': 'Pakur',

    # --- Karnataka ---
    'Bellary': 'Ballari',
    'Gulbarga': 'Kalaburagi',
    'BijapurKar': 'Vijayapura',
    'Bagalkot': 'Bagalkote',
    'Belgaum': 'Belagavi',
    'Chamrajanagar': 'Chamarajanagar',
    'Chamrajnagar': 'Chamarajanagar',
    'Chickmagalur': 'Chikkamagaluru',
    'Chikmagalur': 'Chikkamagaluru',
    'Davangere': 'Davanagere',
    'Shimoga': 'Shivamogga',
    'Tumkur': 'Tumakuru',
    'Hasan': 'Hassan',
    'Bangalore': 'Bengaluru Urban',
    'Bengaluru': 'Bengaluru Urban',
    'Bangalore Rural': 'Bengaluru Rural',
    'Mysore': 'Mysuru',

    # --- Maharashtra ---
    'Ahmednagar': 'Ahilyanagar',
    'Ahmadnagar': 'Ahilyanagar',
    'Ahmed Nagar': 'Ahilyanagar',
    'Bid': 'Beed',
    'Buldana': 'Buldhana',
    'Gondiya': 'Gondia',
    'RaigarhMh': 'Raigad',
    'Osmanabad': 'Osmanabad (Dharashiv)',
    'Dharashiv': 'Osmanabad (Dharashiv)',
    'Mumbai': 'Mumbai City',
    'Mumbai Sub Urban': 'Mumbai Suburban',
    'Aurangabad (Chhatrapati Sambhajinagar)': 'Chhatrapati Sambhajinagar',
    'Aurangabad Chhatrapati Sambhajinagar': 'Chhatrapati Sambhajinagar',
    'Chatrapati Sambhaji Nagar': 'Chhatrapati Sambhajinagar',

    # --- Telangana ---
    'K.V.Rangareddy': 'Ranga Reddy',
    'K.V. Rangareddy': 'Ranga Reddy',
    'Rangareddi': 'Ranga Reddy',
    'Yadadri.': 'Yadadri Bhuvanagiri',
    'Jagitial': 'Jagtial',
    'Jangoan': 'Jangaon',
    'Komaram Bheem': 'Kumuram Bheem',
    'Hanumakonda': 'Warangal Urban',

    # --- Andhra Pradesh ---
    'Anantapur': 'Anantapuramu',
    'Ananthapur': 'Anantapuramu',
    'Ananthapuramu': 'Anantapuramu',
    'Cuddapah': 'YSR',
    'Y. S. R': 'YSR',
    'Sri Potti Sriramulu Nellore': 'Spsr Nellore',

    # --- Punjab / Haryana ---
    'Firozpur': 'Ferozepur',
    'Nawanshahr': 'Shahid Bhagat Singh Nagar',
    'Sas Nagar Mohali': 'Mohali',
    'S.A.S Nagar': 'Mohali',
    'Sahibzada Ajit Singh Nagar': 'Mohali',

    # --- Goa ---
    'Bardez': 'North Goa',
    'Tiswadi': 'North Goa',
    'Bicholim': 'North Goa',

    # --- Jammu & Kashmir ---
    'Bandipore': 'Bandipora',
    'Bandipur': 'Bandipora',
    'Shupiyan': 'Shopian',
    'Rajauri': 'Rajouri',

    # --- Others ---
    'Tuticorin': 'Thoothukudi',
    'Kanniyakumari': 'Kanyakumari',
    'Thiruvallur': 'Tiruvallur',
    'The Nilgiris': 'Nilgiris',
    'Raebareli': 'Rae Bareli',
}


def clean_districts(df):
    """Standardize the ``district`` column and return the frame sorted.

    Steps, in order:
        1. Normalize the text: drop the characters ``*``, ``(`` and ``)``,
           collapse whitespace runs to one space, strip the ends.
        2. In Maharashtra only, any district whose name contains
           "Aurangabad" or "Sambhajinagar" (case-insensitive) becomes
           'Chhatrapati Sambhajinagar'. Other states are left alone.
        3. Apply the exact-match corrections in ``mapping``.
        4. Sort by state, then district, then pincode (ascending).

    A missing district stays missing instead of becoming the text "nan".

    Args:
        df: DataFrame with ``state``, ``district`` and ``pincode`` columns.
            Its ``district`` column is overwritten in place.

    Returns:
        A new DataFrame with the same columns, sorted as described.
    """
    # Remember real gaps before astype(str) turns them into text.
    missing = df['district'].isna()

    # Normalize first, so the substring test below runs on text even when
    # the raw column is not string-typed (e.g. entirely missing).
    text = (
        df['district']
        .astype(str)
        .str.replace(r'[*()]', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Special case: Maharashtra - Aurangabad rename
    renamed = (df['state'] == 'Maharashtra') & text.str.contains(
        'Aurangabad|Sambhajinagar', case=False, na=False
    )
    text = text.mask(renamed, 'Chhatrapati Sambhajinagar')

    # Apply mapping, then restore the original gaps.
    df['district'] = text.replace(mapping).mask(missing)

    # Final sort: state -> district -> pincode
    return df.sort_values(by=['state', 'district', 'pincode'])


# True both when this runs as a notebook cell and as a script; lets the
# helper above be imported without touching the file system.
if __name__ == "__main__":
    # Load final state-cleaned file (original columns)
    df = pd.read_csv("aadhar_SORTED_ORIGINAL_COLUMNS.csv")

    df = clean_districts(df)

    # Save final file (no extra columns)
    df.to_csv("aadhar_FINAL_STATE_DISTRICT_SORTED.csv", index=False)

    print("FULL EXTENDED DISTRICT CLEAN COMPLETED")
    print("Columns:", df.columns.tolist())
    print("Total rows:", len(df))


## 5️⃣ Duplicate detection (duplicate.py)

In [None]:
import pandas as pd

# Read the enrollment file. The path is relative, so it is resolved against
# the current working directory.
# NOTE(review): no earlier cell visible in this notebook writes this file -
# it presumably has to be supplied separately; confirm.
df1 = pd.read_csv("enrollment_final_no_duplicates.csv", encoding="latin1")
print("✅ File loaded successfully")

# Rows count as duplicates when all four of these columns match.
colss = ['date', 'state', 'district', 'pincode']

# keep=False flags every member of a duplicate group, not just the repeats.
is_repeated = df1.duplicated(subset=colss, keep=False)
duplicates = df1.loc[is_repeated]

# Write the flagged rows out for inspection.
duplicates.to_csv("duplicate_rows.csv", index=False)
print(f"✅ Saved {len(duplicates)} duplicate rows to CSV")
