In [1]:
# Smoke-test cell: prints a fixed greeting and does nothing else.
print("HI")

HI


In [5]:
import numpy as np
import pandas as pd
import random
import json
import faker
import datetime

# Faker instance configured for the Indian English locale; used for names,
# addresses and random dates throughout the script.
fake = faker.Faker("en_IN")

# Config: number of synthetic citizens and the reference year used for ages.
NUM_USERS = 2000
CURRENT_YEAR = 2025

# Aadhaar IDs: consecutive 12-digit strings starting at 500000000001.
AadharIds = np.array([f"{500000000001 + offset:012d}" for offset in range(NUM_USERS)])

# DOBs: one date per user, sampled (with replacement) from every calendar day
# between 1960-01-01 and 2015-12-31 inclusive.
date_range = pd.date_range(start='1960-01-01', end='2015-12-31').date
dobs = np.random.choice(date_range, size=NUM_USERS)

# Gender: one label per user, both labels equally likely.
genders = np.random.choice(["Male", "Female"], size=NUM_USERS)

# Addresses: Faker returns multi-line addresses; flatten each onto one line.
addresses = [", ".join(fake.address().split("\n")) for _ in range(NUM_USERS)]

# PAN generator
def random_pan(name):
    """Return a synthetic PAN shaped like a real individual's PAN.

    Layout (10 characters): three random letters, the holder-status letter
    ``P`` (individual), the first letter of the holder's surname, a four-digit
    serial (0001-9999, zero padded) and a final random check letter.

    Args:
        name: Full name of the holder. The first letter of its last
            whitespace-separated word is used as the surname initial. When the
            name is empty/None or that letter is not A-Z, a random letter is
            used instead.

    Returns:
        A 10-character upper-case PAN string. The check letter is random, not
        computed, so the value is only structurally valid.
    """
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    words = name.split() if name else []
    surname_initial = words[-1][0].upper() if words else ""
    # len() guard: "" is "in" every string, and some characters upper-case to
    # more than one character.
    if len(surname_initial) != 1 or surname_initial not in letters:
        surname_initial = random.choice(letters)

    series = ''.join(random.choices(letters, k=3))
    serial = f"{random.randint(1, 9999):04d}"
    check_letter = random.choice(letters)
    return f"{series}P{surname_initial}{serial}{check_letter}"

# License generator
def random_dl():
    """Return a synthetic driving-licence number.

    The result has the form ``<state>-<NN>-<NNNNNNN>``: a two-letter state
    code picked from a fixed list, a two-digit number (10-99) and a
    seven-digit number (1000000-9999999), joined by hyphens.
    """
    states = ("DL", "HR", "UP", "MH", "RJ", "PB", "TN", "KA")
    prefix = random.choice(states)
    office = random.randint(10, 99)
    serial = random.randint(1000000, 9999999)
    return "-".join((prefix, str(office), str(serial)))

# Socio-economic distribution
def socio_class(dob, current_year=None):
    """Assign a socio-economic label based on a date of birth.

    Age is the plain difference between the reference year and the birth
    year (month and day are ignored).

    Args:
        dob: Object with a ``year`` attribute (e.g. ``datetime.date``).
        current_year: Reference year for the age calculation. Defaults to the
            module-level ``CURRENT_YEAR`` when omitted.

    Returns:
        ``"child"`` when the age is below 18; otherwise ``"middle"`` with
        probability 0.6 and ``"lower"`` with probability 0.4.
    """
    if current_year is None:
        current_year = CURRENT_YEAR

    age = current_year - dob.year
    if age < 18:
        return "child"
    # Everyone reaching this point is an adult, so only the random split remains.
    return "middle" if random.random() < 0.6 else "lower"

# Generate unique phone numbers (10-digit, realistic Indian start digits)
# Generate unique 10-digit phone numbers. Indian mobile numbers start with
# 6, 7, 8 or 9, so draw from [6000000000, 9999999999] (randint's upper bound
# is exclusive).
phones = []
seen_phones = set()  # string form of every number issued so far, for O(1) duplicate checks
while len(phones) < NUM_USERS:
    # Convert before the membership test: comparing the raw numpy integer
    # against stored strings would never match, letting duplicates through.
    num = str(np.random.randint(6000000000, 10000000000, dtype='int64'))
    if num not in seen_phones:
        seen_phones.add(num)
        phones.append(num)

# Build DigiLocker DB
# Build DigiLocker DB: one synthetic record per Aadhaar ID, keyed by that ID.
#
# Cross-document consistency rules enforced for every record:
# - A person has one PAN: the PAN card and the ITR carry the same number, and
#   every ITR filer has a PAN card.
# - A marksheet is only issued to someone old enough to have sat that exam,
#   and the exam year is derived from the date of birth.
# - A driving licence is never issued before the year the holder turns 18.
digilocker_data = {}

# Modelling assumption (ages are year-based, i.e. CURRENT_YEAR - birth year):
# class 10 boards are sat in the year a person turns 16 and class 12 boards
# in the year they turn 18.
AGE_AT_10TH = 16
AGE_AT_12TH = 18

for i, aid in enumerate(AadharIds):
    gender = genders[i]
    name = fake.name_male() if gender == "Male" else fake.name_female()
    dob = dobs[i]
    age = CURRENT_YEAR - dob.year
    addr = addresses[i]
    socio = socio_class(dob)
    phone = phones[i]

    # Decide ITR presence up front because PAN and marksheet logic depend on it.
    # 75% of middle-class adults file an ITR (tunable); nobody else does.
    itr_exists = socio == "middle" and age >= 18 and random.random() < 0.75

    # Decide ration_card first because marksheet presence may depend on its type (PHH => at least 10th)
    ration_card = None
    if socio == "lower":
        card_type_choice = random.choice(["PHH", "APL", "BPL", "AAY"])
        ration_card = {
            "card_number": f"RC{200000+i}",
            "card_type": card_type_choice,
            "family_head": name,
            "family_size": random.randint(2,7),
            "address": addr,
            "phone_number": phone,
            "document_path": f"docs/ration/{aid}_rc.pdf"
        }

    # Family ID presence (30% of all records)
    family_id = None
    if random.random() < 0.3:
        family_id = {
            "id_number": f"FID{100000+i}",
            "head_of_family": name,
            "members": [
                {"name": fake.first_name(), "age": random.randint(1,70), "relation": random.choice(["Spouse","Child","Parent"])}
                for _ in range(random.randint(2,5))
            ],
            "address": addr,
            "phone_number": phone,
            "document_path": f"docs/family_id/{aid}_fid.pdf"
        }

    # PAN presence (adults): 90% of adults, and always for ITR filers since a
    # return cannot be filed without a PAN. The number is generated once and
    # reused by the ITR below so both documents agree.
    pan_number = None
    pan_card = None
    if age >= 18 and (itr_exists or random.random() < 0.9):
        pan_number = random_pan(name)
        pan_card = {
            "pan_number": pan_number,
            "name_on_card": name,
            "dob": str(dob),
            "father_name": fake.name_male(),
            "phone_number": phone,
            "document_path": f"docs/pan/{aid}_pan.pdf"
        }

    # Driver's license presence (adults): 60% of adults hold one.
    driver_license = None
    if age >= 18 and random.random() < 0.6:
        # Earliest possible issue date is the start of the year the holder
        # turns 18 (never before 2000); age >= 18 guarantees this is not
        # after the start of CURRENT_YEAR.
        earliest_issue = max(datetime.date(2000, 1, 1), datetime.date(dob.year + 18, 1, 1))
        year_start = datetime.date(CURRENT_YEAR, 1, 1)
        driver_license = {
            "license_number": random_dl(),
            "name": name,
            "dob": str(dob),
            "issue_date": str(fake.date_between(start_date=earliest_issue, end_date=year_start)),
            # Licence is still valid: expires within ten years from the start of CURRENT_YEAR.
            "expiry_date": str(fake.date_between(start_date=year_start, end_date=datetime.date(CURRENT_YEAR + 10, 1, 1))),
            "vehicle_classes": random.sample(["MCWG", "LMV", "HMV"], random.randint(1,2)),
            "phone_number": phone,
            "document_path": f"docs/dl/{aid}_dl.pdf"
        }

    # ITR object if itr_exists (pan_number is always set for filers, see above)
    itr = None
    if itr_exists:
        itr = {
            "pan_number": pan_number,
            "assessment_year": random.choice([2021,2022,2023,2024]),
            "gross_income": random.randint(300000,2000000),
            "tax_paid": random.randint(10000,500000),
            "filing_date": str(fake.date_between(start_date=datetime.date(2021, 4, 1), end_date=datetime.date(2024, 7, 31))),
            "document_path": f"docs/itr/{aid}_itr.pdf"
        }

    # Decide marksheet presence:
    # - Too young to have sat the exam => no marksheet
    # - Recent school leavers (up to 20 for 10th, up to 22 for 12th) => always
    # - If itr_exists => very likely have 10th and 12th
    # - If ration_card exists and card_type == "PHH" => ensure at least 10th
    # - Otherwise use base probabilities
    if age < AGE_AT_10TH:
        has_10 = False
    elif age <= 20:
        has_10 = True
    elif itr_exists:
        has_10 = (random.random() < 0.95)  # very likely
    elif ration_card and ration_card.get("card_type") == "PHH":
        has_10 = True  # guarantee at least 10th for PHH ration
    else:
        has_10 = (random.random() < 0.55)  # base chance for 10th among adults

    if age < AGE_AT_12TH:
        has_12 = False
    elif age <= 22:
        has_12 = True
    elif itr_exists:
        has_12 = (random.random() < 0.85)  # quite likely for ITR filers
    else:
        has_12 = (random.random() < 0.35)  # moderate base chance for 12th among others

    marksheet_10 = None
    marksheet_12 = None

    if has_10:
        marksheet_10 = {
            "roll_number": f"10TH{300000+i}",
            "candidate_name": name,
            # Exam year follows from the DOB; the age gate keeps it <= CURRENT_YEAR.
            "year": dob.year + AGE_AT_10TH,
            "school_name": fake.company(),
            "subjects": {"Maths": random.randint(40,100), "Science": random.randint(40,100), "English": random.randint(40,100)},
            "grade": random.choice(["A1","A2","B1","B2","C"]),
            "document_path": f"docs/marksheets/{aid}_10.pdf"
        }

    if has_12:
        marksheet_12 = {
            "roll_number": f"12TH{400000+i}",
            "candidate_name": name,
            # Exam year follows from the DOB; the age gate keeps it <= CURRENT_YEAR.
            "year": dob.year + AGE_AT_12TH,
            "school_name": fake.company(),
            "subjects": {"Physics": random.randint(40,100), "Chemistry": random.randint(40,100), "Maths": random.randint(40,100)},
            "grade": random.choice(["A1","A2","B1","B2","C"]),
            "document_path": f"docs/marksheets/{aid}_12.pdf"
        }

    # Build the entry; documents the person does not hold stay None (JSON null).
    entry = {
        "persona_description": f"Synthetic citizen record {i+1}",
        "user_info": {
            "name": name,
            "dob": str(dob),
            "gender": gender,
            "address": addr,
            "phone_number": phone
        },
        "linked_documents": {
            "aadhaar": {
                "aadhaar_number": aid,
                "name": name,
                "dob": str(dob),
                "gender": gender,
                "address": addr,
                "phone_number": phone,
                "document_path": f"docs/aadhaar/{aid}.pdf"
            },
            "pan_card": pan_card,
            "driver_license": driver_license,
            "family_id": family_id,
            "ration_card": ration_card,
            "marksheet_10": marksheet_10,
            "marksheet_12": marksheet_12,
            "itr": itr
        }
    }

    digilocker_data[aid] = entry

# Save JSON
# Save JSON: serialise the whole database with 4-space indentation and write
# it to the output file in one go.
serialized = json.dumps(digilocker_data, indent=4)
with open("digilocker_synthetic_2000.json", mode="w") as out_file:
    out_file.write(serialized)
