# In [3]:
import numpy as np
import pandas as pd
import random
import json
import faker
import datetime

# Faker instance using the Indian-English locale; supplies names and addresses.
fake = faker.Faker("en_IN")

# Config: number of synthetic citizens to generate.
NUM_USERS = 2000

# Aadhaar IDs: consecutive 12-character numeric strings, one per user.
_AADHAAR_START = 500000000001
AadharIds = np.array(
    [str(number).zfill(12) for number in range(_AADHAAR_START, _AADHAAR_START + NUM_USERS)]
)

# DOBs: one date per user, drawn (with replacement) from every calendar day
# between 1960-01-01 and 2015-12-31 inclusive.
date_range = pd.date_range(start='1960-01-01', end='2015-12-31').date
dobs = np.random.choice(date_range, size=NUM_USERS)

# Gender: one of two labels per user, each equally likely.
genders = np.random.choice(["Male", "Female"], size=NUM_USERS)

# Addresses: Faker returns multi-line addresses; flatten each onto one line.
addresses = []
for _ in range(NUM_USERS):
    address_lines = fake.address().split("\n")
    addresses.append(", ".join(address_lines))

# PAN generator
def random_pan(name):
    """Return a random PAN-shaped string.

    The result is ten characters long: five uppercase letters, a number
    drawn from 1000-9999 (so the numeric part never starts with 0), and
    one trailing uppercase letter.

    Args:
        name: Accepted for the caller's convenience but not used; the
            value is purely random and is not derived from the holder.

    Returns:
        The generated string.
    """
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # Draw order (prefix, number, suffix) is kept so a seeded run is stable.
    prefix = ''.join(random.choices(letters, k=5))
    number = random.randint(1000, 9999)
    suffix = random.choice(letters)
    return prefix + str(number) + suffix

# License generator
def random_dl():
    """Return a random driving-licence number shaped ``SS-NN-NNNNNNN``.

    ``SS`` is one of eight fixed two-letter state codes, ``NN`` is a number
    from 10 to 99 and the final part is a seven-digit number from 1000000
    to 9999999.
    """
    state_codes = ["DL", "HR", "UP", "MH", "RJ", "PB", "TN", "KA"]
    # Tuple items are evaluated left to right, preserving the draw order.
    parts = (
        random.choice(state_codes),
        str(random.randint(10, 99)),
        str(random.randint(1000000, 9999999)),
    )
    return "-".join(parts)

# Scheme for socio-economic distribution
def socio_class(dob, reference_year=2025):
    """Assign a socio-economic label to a person from their date of birth.

    Age is approximated as ``reference_year - dob.year``; month and day are
    ignored, so everyone born in the same year gets the same age.

    Args:
        dob: Any object with a ``year`` attribute (e.g. ``datetime.date``).
        reference_year: Year against which age is computed. Defaults to
            2025, the value previously hard-coded here.

    Returns:
        ``"child"`` when the age is under 18 (no random draw is made);
        otherwise ``"middle"`` with probability 0.6 or ``"lower"`` with
        probability 0.4.
    """
    age = reference_year - dob.year
    if age < 18:
        return "child"
    # Only adults reach this point, so no second age check is needed.
    return "middle" if random.random() < 0.6 else "lower"

# Generate unique phone numbers
# `phones` stores the numbers as strings, so testing a freshly drawn integer
# against that list can never match. Uniqueness is therefore tracked on the
# integer values in a set, which also makes each membership test O(1).
seen_phone_numbers = set()
phones = []
while len(phones) < NUM_USERS:
    # Upper bound is exclusive: values fall in [2000000000, 4999999998].
    num = int(np.random.randint(2000000000, 4999999999, dtype='int64'))
    if num in seen_phone_numbers:
        continue  # duplicate draw; try again
    seen_phone_numbers.add(num)
    phones.append(str(num))

# Build DigiLocker DB
# Maps each Aadhaar number to one synthetic citizen record made of a short
# description, the person's profile, and a set of linked documents. A document
# the person does not hold is stored as None.
digilocker_data = {}

for i, aid in enumerate(AadharIds):
    name = fake.name_male() if genders[i] == "Male" else fake.name_female()
    dob = dobs[i]
    addr = addresses[i]
    socio = socio_class(dob)
    phone = phones[i]

    # Age is approximated by calendar year with 2025 as the reference year,
    # computed once so every document below applies the same value.
    age = 2025 - dob.year
    is_adult = age >= 18

    # One PAN per person. It is generated once so the PAN card and the ITR
    # always carry the same number. 90% of adults hold a PAN.
    pan_number = random_pan(name) if is_adult and random.random() < 0.9 else None

    entry = {
        "persona_description": f"Synthetic citizen record {i+1}",
        "user_info": {
            "name": name,
            "dob": str(dob),
            "gender": genders[i],
            "address": addr,
            "phone_number": phone
        },
        "linked_documents": {
            # Aadhaar (everyone has)
            "aadhaar": {
                "aadhaar_number": aid,
                "name": name,
                "dob": str(dob),
                "gender": genders[i],
                "address": addr,
                "photo_path": f"photos/{aid}.jpg",
                "phone_number": phone
            },

            # PAN (90% adults)
            "pan_card": {
                "pan_number": pan_number,
                "name_on_card": name,
                "dob": str(dob),
                "father_name": fake.name_male(),
                "photo_path": f"photos/{aid}_pan.jpg",
                "phone_number": phone
            } if pan_number is not None else None,

            # Driver's License (only adults)
            # NOTE(review): issue_date is drawn from 2000-2018 regardless of
            # dob, so it can precede the holder's 18th birthday.
            "driver_license": {
                "license_number": random_dl(),
                "name": name,
                "dob": str(dob),
                "issue_date": str(fake.date_between(start_date=datetime.date(2000, 1, 1), end_date=datetime.date(2018, 1, 1))),
                "expiry_date": str(fake.date_between(start_date=datetime.date(2025, 1, 1), end_date=datetime.date(2035, 1, 1))),
                "vehicle_classes": random.sample(["MCWG", "LMV", "HMV"], random.randint(1,2)),
                "photo_path": f"photos/{aid}_dl.jpg",
                "phone_number": phone
            } if is_adult else None,

            # Family ID (30% people)
            "family_id": {
                "id_number": f"FID{100000+i}",
                "head_of_family": name,
                "members": [
                    {"name": fake.first_name(), "age": random.randint(1,70), "relation": random.choice(["Spouse","Child","Parent"])}
                    for _ in range(random.randint(2,5))
                ],
                "address": addr
            } if random.random() < 0.3 else None,

            # Ration Card (only lower class adults; socio_class never returns
            # "lower" for a child)
            "ration_card": {
                "card_number": f"RC{200000+i}",
                "card_type": random.choice(["PHH", "APL", "BPL", "AAY"]),
                "family_head": name,
                "family_size": random.randint(2,7),
                "address": addr
            } if socio == "lower" else None,

            # Marksheets
            # Class 10: ages 15-20. The lower bound mirrors the one on the
            # class-12 marksheet (17) so that young children are not given a
            # board-exam result.
            # NOTE(review): the exam year is random and not tied to dob.
            "marksheet_10": {
                "roll_number": f"10TH{300000+i}",
                "candidate_name": name,
                "year": random.choice([2018,2019,2020,2021,2022,2023]),
                "school_name": fake.company(),
                "subjects": {"Maths": random.randint(40,100), "Science": random.randint(40,100), "English": random.randint(40,100)},
                "grade": random.choice(["A1","A2","B1","B2","C"])
            } if 15 <= age <= 20 else None,

            # Class 12: ages 17-22.
            "marksheet_12": {
                "roll_number": f"12TH{400000+i}",
                "candidate_name": name,
                "year": random.choice([2020,2021,2022,2023,2024]),
                "school_name": fake.company(),
                "subjects": {"Physics": random.randint(40,100), "Chemistry": random.randint(40,100), "Maths": random.randint(40,100)},
                "grade": random.choice(["A1","A2","B1","B2","C"])
            } if 17 <= age <= 22 else None,

            # ITR (middle class only). A return is filed against a PAN, so it
            # is emitted only for PAN holders and reuses the PAN card's number.
            "itr": {
                "pan_number": pan_number,
                "assessment_year": random.choice([2021,2022,2023,2024]),
                "gross_income": random.randint(300000,2000000),
                "tax_paid": random.randint(10000,500000),
                "filing_date": str(fake.date_between(start_date=datetime.date(2021, 4, 1), end_date=datetime.date(2024, 7, 31)))
            } if socio == "middle" and pan_number is not None else None
        }
    }

    digilocker_data[aid] = entry

# Save JSON
# Write every record to a single JSON file, pretty-printed with a 4-space indent.
with open("digilocker_synthetic_2000.json", "w") as out_file:
    out_file.write(json.dumps(digilocker_data, indent=4))
