In [7]:
import json
import random
import re

# 1. Expanded placeholder pools (20+ items each)
#
# Layout of PLACEHOLDERS:
#   * a top-level key mapped to a *list* is a global pool: "{key}" in a
#     template may be filled from it regardless of the template's domain;
#   * a top-level key mapped to a *dict* is a domain name; its inner keys are
#     pools that are only consulted when filling a template for that domain.
# Every pool entry is a plain string that is inserted verbatim into a query.
PLACEHOLDERS = {
    # Global pools
    "country": [
        "France", "Japan", "India", "Germany", "Brazil", "Canada", "Kenya",
        "Australia", "China", "Russia", "Mexico", "Egypt", "South Korea",
        "Italy", "Spain", "Netherlands", "Nigeria", "Argentina", "Sweden", "Norway"
    ],
    # Every fifth year from 1900 through 2020, stored as strings.
    # NOTE(review): no template visible in this file references {year}.
    "year": [str(y) for y in range(1900, 2025, 5)],
    # NOTE(review): no template visible in this file references {units}.
    "units": [
        "seconds", "centimeters", "ounces", "feet", "minutes", "hours",
        "kilometers", "miles", "liters", "gallons", "grams", "kilograms",
        "meters", "yards", "acres", "bytes", "bits", "watts", "volts", "newtons"
    ],
    # Singular unit names; used by the Simple/Science template containing
    # {measurement}.
    "measurement": [
        "hour", "meter", "pound", "yard", "day", "liter", "ounce",
        "degree", "percent", "mole", "candela", "kelvin", "second",
        "minute", "hectare", "byte", "bit", "ampere", "watt", "joule"
    ],
    "event": [
        "World War II", "Olympics 2020", "Moon Landing", "Independence Day",
        "UN Summit", "FIFA World Cup", "Super Bowl", "Fall of Berlin Wall",
        "French Revolution", "Industrial Revolution", "Renaissance",
        "Y2K transition", "Spanish Flu pandemic", "Apollo 11 landing",
        "Woodstock Festival", "Expo 67", "Berlin Olympics",
        "Coronation of Queen Elizabeth II", "Launch of Sputnik",
        "Discovery of penicillin"
    ],
    
    # New placeholders for Shallow and Deep templates
    # {condition} and {treatment} appear in the Shallow "Healthcare" templates.
    "condition": [
        "diabetes", "hypertension", "asthma", "depression", "arthritis",
        "migraine", "cancer", "heart disease", "Alzheimer's disease", "COPD",
        "multiple sclerosis", "Parkinson's disease", "osteoporosis", "ADHD",
        "autism", "schizophrenia", "obesity", "psoriasis", "celiac disease", "epilepsy"
    ],
    "treatment": [
        "cognitive behavioral therapy", "immunotherapy", "chemotherapy", "radiation therapy",
        "physical therapy", "occupational therapy", "dialysis", "hormone replacement therapy",
        "stem cell therapy", "gene therapy", "surgery", "laser treatment", "acupuncture",
        "antidepressants", "insulin therapy", "antiretroviral therapy", "antibiotics",
        "antihistamines", "statins", "anti-inflammatory drugs"
    ],
    # {complex_issue} and {solution} appear in the "Environmental Issues"
    # templates (both Shallow and Deep).
    "complex_issue": [
        "climate change", "water scarcity", "plastic pollution", "deforestation",
        "food insecurity", "air pollution", "biodiversity loss", "soil degradation",
        "ocean acidification", "urbanization", "electronic waste", "desertification",
        "invasive species", "habitat fragmentation", "coral bleaching", "overfishing",
        "toxic waste disposal", "groundwater contamination", "carbon emissions", "ozone depletion"
    ],
    "solution": [
        "renewable energy", "carbon capture", "circular economy", "precision agriculture",
        "smart city infrastructure", "electric vehicles", "green building standards",
        "waste-to-energy conversion", "regenerative farming", "water recycling systems",
        "biodegradable materials", "ecosystem restoration", "carbon offsetting",
        "green taxation", "emissions trading", "sustainable transport", "forest conservation",
        "energy efficiency standards", "wildlife corridors", "plastic alternatives"
    ],
    # {policy_area} and {sector} appear in the Deep "Policy" templates.
    "policy_area": [
        "healthcare", "education", "transportation", "immigration", "housing",
        "environmental protection", "criminal justice", "taxation", "digital privacy",
        "economic development", "energy policy", "foreign policy", "defense",
        "social welfare", "labor policy", "consumer protection", "banking regulation",
        "telecommunications", "agriculture", "public health"
    ],
    "sector": [
        "healthcare", "education", "energy", "transportation", "agriculture",
        "financial services", "manufacturing", "telecommunications", "retail",
        "hospitality", "construction", "pharmaceutical", "automotive", "aerospace",
        "information technology", "entertainment", "media", "public sector",
        "logistics", "real estate"
    ],

    # Domain-specific pools
    # Each key below is a domain name that also appears as a domain key in
    # TEMPLATES; the nested keys are the placeholder names for that domain.
    "General Knowledge": {
        "object": [
            "the Eiffel Tower", "Mount Everest", "Great Wall of China",
            "Pyramids of Giza", "Stonehenge", "Statue of Liberty",
            "Machu Picchu", "Taj Mahal", "Colosseum",
            "Leaning Tower of Pisa", "Sydney Opera House", "Uluru",
            "Christ the Redeemer", "Angkor Wat", "Mount Kilimanjaro",
            "Niagara Falls", "Sahara Desert", "Amazon River",
            "Grand Canyon", "Kremlin"
        ],
        # Some entries (e.g. "Machu Picchu", "Angkor Wat", "Stonehenge") also
        # occur in the "object" pool above.
        "landmark": [
            "Times Square", "Big Ben", "Golden Gate Bridge",
            "Table Mountain", "Buckingham Palace", "CN Tower",
            "Mount Fuji", "Burj Khalifa", "Petra", "Edinburgh Castle",
            "Acropolis of Athens", "St. Peter's Basilica",
            "Notre-Dame Cathedral", "Hagia Sophia",
            "Forbidden City", "Tower of London",
            "Machu Picchu", "Angkor Wat", "Great Barrier Reef",
            "Stonehenge"
        ]
    },
    "Science": {
        "element": [
            "hydrogen", "helium", "lithium", "beryllium", "boron", "carbon",
            "nitrogen", "oxygen", "fluorine", "neon", "sodium", "magnesium",
            "aluminum", "silicon", "phosphorus", "sulfur", "chlorine", "argon",
            "potassium", "calcium"
        ],
        "compound": [
            "water", "ammonia", "salt", "glucose", "ethanol", "carbon dioxide",
            "methane", "benzene", "acetic acid", "sulfuric acid",
            "hydrogen peroxide", "sodium bicarbonate", "calcium carbonate",
            "ozone", "silicon dioxide", "potassium permanganate",
            "sodium hydroxide", "formaldehyde", "caffeine", "urea"
        ],
        "phenomenon": [
            "gravity", "photosynthesis", "radioactivity", "refraction",
            "evaporation", "condensation", "diffraction", "osmosis",
            "magnetism", "inertia", "capacitance", "resonance",
            "superconductivity", "fluorescence", "piezoelectricity",
            "Doppler effect", "Brownian motion", "thermal expansion",
            "surface tension", "nuclear fission"
        ]
    },
    "Technology": {
        "innovation": [
            "cloud computing", "blockchain", "quantum computing",
            "Internet of Things", "5G networks", "artificial intelligence",
            "virtual reality", "augmented reality", "edge computing",
            "3D printing", "autonomous vehicles", "drones",
            "wearable technology", "nanotechnology", "gene editing",
            "renewable energy tech", "CRISPR", "neural interfaces",
            "sensor networks", "radar imaging"
        ],
        "device": [
            "smartphone", "drone", "3D printer", "smartwatch", "tablet",
            "laptop", "router", "server", "GPU", "VR headset",
            "RFID scanner", "electric car", "robot vacuum",
            "fitness tracker", "home assistant", "digital camera",
            "oscilloscope", "microwave oven", "satellite", "solar panel"
        ],
        "protocol": [
            "HTTP", "TCP/IP", "Bluetooth", "FTP", "SMTP", "SSH", "DNS",
            "UDP", "SSL/TLS", "MQTT", "Zigbee", "NFC", "IPv4", "IPv6",
            "HTTP/2", "WebSocket", "P2P", "RESTful API", "SOAP", "RTP"
        ]
    },
    "Business": {
        "strategy": [
            "market segmentation", "cost leadership", "differentiation",
            "diversification", "vertical integration", "horizontal integration",
            "outsourcing", "crowdsourcing", "blue ocean strategy",
            "disruptive innovation", "lean startup", "customer intimacy",
            "product leadership", "competitive pricing", "brand extension",
            "franchising", "joint venture", "risk management",
            "corporate social responsibility", "digital transformation"
        ],
        # "metric" is also a pool name under "AI/ML"; which one is used
        # depends on the domain the template is filled for.
        "metric": [
            "ROI", "market share", "EBITDA", "net profit margin", "churn rate",
            "customer acquisition cost", "lifetime value", "conversion rate",
            "gross margin", "debt-to-equity ratio", "return on assets",
            "quick ratio", "inventory turnover", "earnings per share",
            "price-to-earnings ratio", "cost of goods sold",
            "operating margin", "revenue growth rate", "employee turnover",
            "customer satisfaction score"
        ],
        "trend": [
            "gig economy", "remote work", "e-commerce", "subscription models",
            "digital marketing", "influencer marketing", "AI automation",
            "sustainability focus", "flexible workspace",
            "cross-border e-commerce", "digital nomadism", "circular economy",
            "personalization", "blockchain adoption", "mobile commerce",
            "sharing economy", "cloud migration", "omnichannel retail",
            "decentralized finance", "climate tech"
        ]
    },
    "Economics": {
        "indicator": [
            "GDP", "inflation rate", "unemployment rate",
            "consumer price index", "purchasing power parity",
            "balance of trade", "interest rate", "gross national income",
            "human development index", "Gini coefficient",
            "trade deficit", "budget deficit", "labor force participation",
            "money supply", "exchange rate", "fiscal deficit",
            "national debt", "producer price index", "poverty rate",
            "economic growth rate"
        ],
        "model": [
            "supply and demand", "IS-LM", "game theory",
            "Keynesian model", "Solow growth model", "Heckscher-Ohlin",
            "comparative advantage", "behavioral economics model",
            "rational choice theory", "principal-agent model",
            "cobweb model", "input-output model",
            "overlapping generations model", "Lewis model",
            "AD-AS model", "Ricardian model", "Neoclassical model",
            "endogenous growth model", "gravity model", "auction theory"
        ]
    },
    # The "AI/ML" domain appears under all three labels in TEMPLATES
    # (Simple, Shallow and Deep).
    "AI/ML": {
        # Also the source for the {algorithm1}/{algorithm2} pair used by the
        # Shallow comparison template.
        "algorithm": [
            "random forest", "neural network", "SVM", "k-means",
            "decision tree", "logistic regression", "gradient boosting",
            "XGBoost", "LSTM", "convolutional neural network", "PCA",
            "t-SNE", "KNN", "naive Bayes", "reinforcement learning",
            "transformer", "BERT", "GPT", "autoencoder", "GAN"
        ],
        "metric": [
            "accuracy", "precision", "recall", "F1 score", "ROC AUC",
            "mean squared error", "mean absolute error", "R² score",
            "perplexity", "log loss", "BLEU score", "ROUGE score",
            "silhouette score", "Jaccard index", "confusion matrix",
            "Cohen's kappa", "lift", "Gini coefficient", "Spearman correlation",
            "Pearson correlation"
        ],
        "application": [
            "image classification", "speech recognition", "recommendation systems",
            "natural language processing", "anomaly detection", "autonomous driving",
            "fraud detection", "sentiment analysis", "language translation",
            "drug discovery", "robotic control", "time series forecasting",
            "chatbots", "computer vision", "generative art", "predictive maintenance",
            "healthcare diagnostics", "financial modeling", "social network analysis",
            "bioinformatics"
        ]
    }
}

# 2. Template definitions: complexity label -> domain -> list of 15 templates
#
# Each template is a query string containing "{name}" placeholders. A name
# must match either a global pool key or a pool key of the template's own
# domain in PLACEHOLDERS; "{algorithm1}"/"{algorithm2}" are the one numbered
# pair and are filled with two different algorithms.
#
# Wording rules followed below so that every filled query reads as valid
# English: pool values are inserted verbatim (no article or plural handling),
# so templates must supply any article themselves and must not assume the
# value is a plural or a verb phrase.
TEMPLATES = {
    "Simple": {
        "General Knowledge": [
            "What is the capital of {country}?",
            "Who first discovered {object}?",
            "Where is {landmark} located?",
            # Events such as wars or launches "take place"; they are not
            # "celebrated".
            "When did {event} take place?",
            "What is the population of {country}?",
            "What is the official language of {country}?",
            "What currency is used in {country}?",
            "What is the highest mountain in {country}?",
            "What is the national animal of {country}?",
            "What is the national flower of {country}?",
            "What is the time zone of {country}?",
            "Which continent is {country} part of?",
            "When did {country} gain independence?",
            "What is the largest city in {country}?",
            "What is the longest river in {country}?"
        ],
        "Science": [
            "What is the atomic number of {element}?",
            "What is the chemical formula of {compound}?",
            "What does the term {phenomenon} refer to?",
            # The {measurement} pool holds unit names, so ask for the
            # quantity rather than for the unit.
            "What quantity is measured by the unit {measurement}?",
            "What is the melting point of {compound} in degrees Celsius?",
            "What is the molar mass of {compound}?",
            "What is the symbol for {element}?",
            "Which group is {element} in the periodic table?",
            "What state of matter is {compound} at room temperature?",
            "What is the density of {compound}?",
            "What is the pH value of a solution of {compound}?",
            "What is the atomic weight of {element}?",
            "What is the common name for {compound}?",
            "What property does {phenomenon} describe?",
            "What is the boiling point of {compound} in degrees Celsius?"
        ],
        "Technology": [
            "What does the innovation {innovation} enable?",
            # {device} is a device name, not a task, so ask about its use.
            "What is the {device} primarily used for?",
            "Define the protocol {protocol}.",
            "What is a common application of {device}?",
            "What does {innovation} rely on?",
            "When was {innovation} first introduced?",
            "Who invented the first {device}?",
            "What standard replaced {protocol}?",
            "Which company developed the {device}?",
            "What frequency range does {protocol} use?",
            "What bandwidth does {protocol} support?",
            "What operating principle does {device} follow?",
            "What interface does {device} commonly use?",
            "What power source does the {device} typically use?",
            "What component is essential for {innovation}?"
        ],
        "Business": [
            "What does the strategy {strategy} focus on?",
            "How is {metric} calculated?",
            "What characterizes the trend {trend}?",
            "What is meant by {strategy}?",
            "Which industry commonly uses {strategy}?",
            "What does a high {metric} indicate?",
            "What year did the trend {trend} begin?",
            "Who coined the term {strategy}?",
            "What factor influences {metric} the most?",
            "What software is used to track {metric}?",
            "What role does {strategy} play in startups?",
            "What challenge does {trend} address?",
            "What sector benefits from {trend}?",
            "What is a limitation of {strategy}?",
            "What is the target of {metric} in planning?"
        ],
        "Economics": [
            "What does the economic indicator {indicator} measure?",
            "What is the focus of the model {model}?",
            "In which context is {indicator} reported?",
            "What does the model {model} analyze?",
            "Which country has the highest {indicator} recently?",
            "Who developed the model {model}?",
            "What is the formula for {indicator}?",
            "What data is needed for {model}?",
            "What does a rising {indicator} imply?",
            "What policy uses {indicator} as a benchmark?",
            "What assumption underlies {model}?",
            "What does {model} predict?",
            "What is a limitation of {indicator}?",
            "What sector is affected by {indicator} changes?",
            "What validates the model {model}?"
        ],
        "AI/ML": [
            "What is the primary function of {algorithm}?",
            "What role does {metric} play in evaluation?",
            "Define the application {application}.",
            "What is {algorithm} used for?",
            "Who developed the algorithm {algorithm}?",
            "What is the time complexity of {algorithm}?",
            "What type of data suits {algorithm} best?",
            "What problem does {algorithm} solve?",
            "What is a real-world use of {application}?",
            "What parameter affects {metric} most?",
            "What dataset is used for {application}?",
            "What is the output of {algorithm}?",
            "What library implements {algorithm}?",
            "What is the default hyperparameter of {algorithm}?",
            "What is the training objective of {algorithm}?"
        ]
    },

    # Expanded shallow and deep templates
    "Shallow": {
        "Environmental Issues": [
            "Explain why {complex_issue} is important.",
            "List benefits of implementing {solution}.",
            "What are the causes of {complex_issue}?",
            "Describe the process of {solution} rollout.",
            "What are the effects of {complex_issue} on ecosystems?",
            "How does {solution} contribute to sustainability?",
            "What industries are most affected by {complex_issue}?",
            "What are the key components of {solution}?",
            "How is {complex_issue} measured or monitored?",
            "What are the economic impacts of {complex_issue}?",
            "Compare {solution} with traditional approaches.",
            "What regions are most vulnerable to {complex_issue}?",
            "Describe how {solution} implementation works.",
            "What are common misconceptions about {complex_issue}?",
            "How has {complex_issue} changed over the past decade?"
        ],
        "AI/ML": [
            # The only template using the numbered pair; the two names are
            # filled with two different algorithms.
            "Compare {algorithm1} and {algorithm2} for {application}.",
            "Explain advantages of using {metric}.",
            "List challenges in deploying {application}.",
            "Describe how {algorithm} handles large datasets.",
            "What are the limitations of {algorithm} in practice?",
            "How does {application} benefit from recent advances?",
            "What are the key preprocessing steps for {application}?",
            "How can {metric} be optimized in model evaluation?",
            "What computational resources does {algorithm} require?",
            "What are common error sources in {application}?",
            "Describe the output format of {algorithm}.",
            "How does feature selection affect {algorithm} performance?",
            "What industries commonly use {application}?",
            "How does {algorithm} compare to baseline methods?",
            "What are basic tuning approaches for {algorithm}?"
        ],
        "Healthcare": [
            "Explain why {treatment} is used for {condition}.",
            "List the benefits of early detection of {condition}.",
            "Compare {treatment} with alternative therapies.",
            "What are the risk factors for {condition}?",
            "How is {condition} typically diagnosed?",
            "What are the side effects of {treatment}?",
            "How has the prevalence of {condition} changed over time?",
            "What lifestyle changes help manage {condition}?",
            "What are the symptoms of {condition}?",
            "How effective is {treatment} for different stages of {condition}?",
            "What specialties are involved in treating {condition}?",
            "How does {treatment} interact with other medications?",
            "What preventive measures reduce risk of {condition}?",
            "What age groups are most affected by {condition}?",
            "What are emerging therapies for {condition}?"
        ]
    },
    "Deep": {
        "Policy": [
            "Formulate a policy for {policy_area} and analyze its trade-offs.",
            "Design a regulatory framework for the {sector} sector.",
            "Evaluate long-term economic impacts of {policy_area}.",
            "Propose a strategy to reform {sector} and discuss ethical implications.",
            "Develop a comprehensive policy approach to improve equity in {policy_area}.",
            "Analyze how international agreements affect domestic {policy_area} policies.",
            "Design an evidence-based approach to measure policy effectiveness in {policy_area}.",
            "Propose a framework for balancing stakeholder interests in {sector} regulation.",
            "Evaluate how technological changes will reshape {policy_area} in the next decade.",
            "Develop a crisis response plan for {sector} with multiple contingencies.",
            "Create a cross-jurisdictional coordination mechanism for {policy_area}.",
            "Design a public-private partnership model for addressing challenges in {sector}.",
            "Analyze different policy instruments for encouraging innovation in {sector}.",
            "Propose a transition strategy for policy reform in {policy_area} that minimizes disruption.",
            "Develop a multi-generational approach to addressing challenges in {policy_area}."
        ],
        "AI/ML": [
            "Design a scalable architecture for {application} and justify components.",
            "Evaluate ethical considerations of deploying {algorithm} in healthcare.",
            "Propose improvements to {algorithm} for real-time processing.",
            "Discuss long-term impacts of {application} on privacy.",
            "Design a comprehensive testing framework for ensuring fairness in {algorithm}.",
            "Analyze trade-offs between accuracy and explainability in {application}.",
            "Develop a strategy for responsible deployment of {algorithm} in high-stakes scenarios.",
            "Propose a novel hybrid approach combining {algorithm} with complementary methods.",
            "Design an integration architecture for embedding {application} into legacy systems.",
            "Evaluate potential regulatory approaches for governing {application} development.",
            "Propose a framework for continuous validation of {algorithm} in production.",
            "Analyze the environmental impact of large-scale {application} deployment.",
            "Design a cross-organizational governance model for {application} development.",
            "Evaluate computational and ethical trade-offs in federated learning for {application}.",
            "Propose a comprehensive approach to addressing bias in {algorithm} training."
        ],
        "Environmental Issues": [
            "Propose a comprehensive plan to tackle {complex_issue} nationally.",
            "Analyze trade-offs of implementing {solution} versus regulation.",
            "Evaluate global economic effects of {complex_issue} mitigation.",
            "Formulate a multi-sector strategy to reduce {complex_issue}.",
            "Design an international framework for addressing {complex_issue} equitably.",
            "Develop a transition strategy for industries most affected by {complex_issue}.",
            "Analyze how socioeconomic factors influence vulnerability to {complex_issue}.",
            "Propose a financing mechanism for scaling {solution} in developing regions.",
            "Evaluate how climate scenarios affect long-term planning for {complex_issue}.",
            "Design an integrated approach combining multiple solutions for {complex_issue}.",
            "Analyze the distributional impacts of policies addressing {complex_issue}.",
            "Develop a monitoring framework for measuring progress on {complex_issue}.",
            "Propose a model for international technology transfer to address {complex_issue}.",
            "Design a strategy for building community resilience to {complex_issue}.",
            "Evaluate trade-offs between adaptation and mitigation for {complex_issue}."
        ]
    }
}

# 3. Placeholder-filling function
def fill_placeholders(template, domain, pools=None):
    """Fill the ``{name}`` placeholders of *template* with random pool values.

    Every distinct placeholder name is resolved once, so repeated occurrences
    of the same name within one template receive the same value. A name is
    looked up in this order:

    1. a top-level list in *pools* (global pool);
    2. an entry of ``pools[domain]`` (domain-specific pool).

    A name that matches no pool but consists of a pool name followed by
    digits (for example ``{algorithm1}`` / ``{algorithm2}``) is treated as a
    numbered placeholder. All numbered placeholders sharing a base name are
    filled by sampling that base pool without replacement, so they receive
    different pool entries. The base pool is searched globally, then in
    *domain*, then in any other domain group.

    Placeholders that cannot be resolved are left in the result unchanged.
    Substitution is a single pass, so inserted values are never rescanned
    for placeholders.

    Args:
        template: Query template containing ``{name}`` placeholders.
        domain: Name of the domain whose pools should be consulted in
            addition to the global ones.
        pools: Optional pool mapping with the same layout as
            ``PLACEHOLDERS``; defaults to the module-level ``PLACEHOLDERS``.

    Returns:
        The template with every resolvable placeholder replaced.

    Raises:
        IndexError: If a pool selected for a plain placeholder is empty.
        ValueError: If a numbered group needs more entries than its pool has.
    """
    if pools is None:
        pools = PLACEHOLDERS
    domain_pools = pools.get(domain, {})
    if not isinstance(domain_pools, dict):
        # `domain` named a global list pool rather than a domain group.
        domain_pools = {}

    def find_pool(name, search_all_domains=False):
        """Return the pool for *name*, or None when there is none."""
        shared = pools.get(name)
        if isinstance(shared, list):
            return shared
        if name in domain_pools:
            return domain_pools[name]
        if search_all_domains:
            for group in pools.values():
                if isinstance(group, dict) and name in group:
                    return group[name]
        return None

    pattern = r"\{([^{}]+)\}"
    chosen = {}    # placeholder name -> value picked for it
    numbered = {}  # base pool name -> numbered placeholder names using it

    # dict.fromkeys de-duplicates while keeping first-appearance order.
    for name in dict.fromkeys(re.findall(pattern, template)):
        pool = find_pool(name)
        if pool is not None:
            chosen[name] = random.choice(pool)
            continue
        match = re.fullmatch(r"(.+?)\d+", name)
        if match and find_pool(match.group(1), search_all_domains=True) is not None:
            numbered.setdefault(match.group(1), []).append(name)

    # Draw each numbered group in one sample so its members differ.
    for base, names in numbered.items():
        picks = random.sample(find_pool(base, search_all_domains=True), len(names))
        chosen.update(zip(names, picks))

    # Unknown placeholders fall through to their original "{name}" text.
    return re.sub(pattern, lambda m: chosen.get(m.group(1), m.group(0)), template)

# 4. One-sentence rationale per complexity label (plain ASCII hyphens only);
#    the keys are the same labels used as top-level keys of TEMPLATES.
REASONING = dict(
    Simple="Direct factual recall; no multi-step reasoning required.",
    Shallow="Requires brief explanation or simple comparison without deep trade-offs.",
    Deep="Involves multi-step reasoning, trade-offs, or ethical/long-term analysis.",
)

# 5. Dataset generator
def generate_dataset(per_label=1000):
    """Build a shuffled list of synthetic query records.

    For every label in ``TEMPLATES`` this draws *per_label* records. Each
    record picks a random domain of that label, then a random template of
    that domain, and fills it via ``fill_placeholders``.

    Args:
        per_label: Number of records to generate for each label.

    Returns:
        A list of dicts with the keys ``"query"``, ``"label"`` and
        ``"reasoning"`` (the ``REASONING`` text of the record's label), in
        random order.
    """

    def build_record(label, templates_by_domain):
        # Order matters for the random stream: domain, then template, then
        # the placeholder values.
        picked_domain = random.choice(list(templates_by_domain.keys()))
        picked_template = random.choice(templates_by_domain[picked_domain])
        return {
            "query": fill_placeholders(picked_template, picked_domain),
            "label": label,
            "reasoning": REASONING[label],
        }

    dataset = [
        build_record(label, templates_by_domain)
        for label, templates_by_domain in TEMPLATES.items()
        for _ in range(per_label)
    ]
    random.shuffle(dataset)
    return dataset

# 6. Script entry point: generate the dataset, write it as JSONL, preview it
if __name__ == "__main__":
    out_path = "reasoning_complexity_dataset.jsonl"
    data = generate_dataset(1000)

    # JSONL: one JSON-encoded record per line.
    with open(out_path, "w") as fw:
        fw.writelines(json.dumps(item) + "\n" for item in data)
    print(f"Dataset of {len(data)} records saved to: {out_path}")

    # Print the first five queries so leftover "{...}" placeholders are easy
    # to spot by eye.
    print("\nExample queries:")
    for position, item in enumerate(data[:5], start=1):
        print(f"{position}. {item['query']} (Label: {item['label']})")

Dataset of 3000 records saved to: reasoning_complexity_dataset.jsonl

Example queries:
1. What is the symbol for beryllium? (Label: Simple)
2. What preventive measures reduce risk of epilepsy? (Label: Shallow)
3. Compare circular economy with traditional approaches. (Label: Shallow)
4. Design a public-private partnership model for addressing challenges in hospitality. (Label: Deep)
5. List the benefits of early detection of Parkinson's disease. (Label: Shallow)
