In [2]:
import random

def generate_diabetes_dataset(num_samples=200, seed=None):
    """Generate a synthetic dataset of diabetes-related questions and answers.

    For every sample a condition and a question template are picked at random,
    the template's placeholder is filled in, and a context/answer pair is
    produced according to the topic of the template.

    Args:
        num_samples: Number of records to generate. Zero or a negative value
            produces an empty list.
        seed: Optional seed for reproducible output. When given, a private
            ``random.Random(seed)`` is used and the global random state is left
            untouched. When ``None`` (the default) the module-level ``random``
            generator is used.

    Returns:
        A list of dicts, each with the keys ``"question"``, ``"context"`` and
        ``"answer"`` mapped to strings.
    """

    conditions = ["type 1 diabetes", "type 2 diabetes", "gestational diabetes", "prediabetes"]
    symptoms = ["frequent urination", "excessive thirst", "unexplained weight loss", "increased hunger", "blurred vision", "slow-healing sores", "frequent infections", "tingling or numbness in hands or feet", "fatigue"]
    complications = ["heart disease", "stroke", "kidney disease", "nerve damage", "eye damage", "foot problems", "skin conditions", "hearing impairment", "Alzheimer's disease"]
    treatments = ["insulin therapy", "oral medications", "lifestyle changes (diet, exercise)", "blood sugar monitoring", "weight management"]
    risk_factors = ["family history", "obesity", "inactivity", "age", "high blood pressure", "high cholesterol", "gestational diabetes history", "polycystic ovary syndrome (PCOS)", "certain ethnicities"]
    tests = ["A1C test", "fasting blood sugar test", "oral glucose tolerance test", "random blood sugar test"]
    food_groups = ["carbohydrates", "proteins", "fats", "fiber"]
    questions = [
        "What are the symptoms of [condition]?",
        "What are the risk factors for [condition]?",
        "How is [condition] diagnosed?",
        "What are the complications of [condition]?",
        "What are the treatment options for [condition]?",
        "How can I manage my [condition]?",
        "What foods should I avoid with [condition]?",
        "What is a good diet for [condition]?",
        "What is the [test]?",
        "How often should I get a [test]?",
        "What are the long term effects of [condition]?",
        "How does exercise help with [condition]?",
        "What are healthy [food_group] for diabetics?",
    ]

    # A private generator makes seeded runs reproducible without disturbing the
    # global random state; unseeded calls keep using the module-level generator.
    rng = random if seed is None else random.Random(seed)

    data = []
    for _ in range(num_samples):
        condition = rng.choice(conditions)
        question_template = rng.choice(questions)
        question = question_template.replace("[condition]", condition)

        # Dispatch on the template rather than the filled-in question so that
        # substituted text can never steer a record into the wrong branch.
        # NOTE(review): in the sampled branches below, context and answer are
        # drawn independently, so an answer may list items that are absent
        # from its context -- confirm this is intended for the dataset.
        if "[test]" in question_template:
            test = rng.choice(tests)
            question = question.replace("[test]", test)
            context = f"Information about the {test}:..."  # TODO: replace with more realistic context if needed.
            answer = f"The {test} is used to diagnose..."  # TODO: replace with more realistic answer.
        elif "[food_group]" in question_template:
            food_group = rng.choice(food_groups)
            question = question.replace("[food_group]", food_group)
            context = f"Information about healthy {food_group} for diabetics:..."  # TODO: replace with more realistic context if needed.
            answer = f"Healthy {food_group} options include..."  # TODO: replace with more realistic answer.
        elif "symptoms" in question_template:
            context = f"Symptoms of {condition}: " + ", ".join(rng.sample(symptoms, rng.randint(3, 5)))
            answer = ", ".join(rng.sample(symptoms, rng.randint(2, 4))) + ". Consult a doctor if you experience these."
        elif "risk factors" in question_template:
            context = f"Risk factors for {condition}: " + ", ".join(rng.sample(risk_factors, rng.randint(3, 5)))
            answer = ", ".join(rng.sample(risk_factors, rng.randint(2, 4))) + ". Consult your doctor for a risk assessment."
        elif "diagnosed" in question_template:
            context = f"Diagnosis of {condition}: " + ", ".join(rng.sample(tests, rng.randint(1, 2)))
            answer = "Diagnosis typically involves " + ", ".join(rng.sample(tests, rng.randint(1, 2))) + "."
        elif "complications" in question_template:
            context = f"Potential complications of {condition}: " + ", ".join(rng.sample(complications, rng.randint(3, 5)))
            answer = ", ".join(rng.sample(complications, rng.randint(2, 4))) + ". Manage your blood sugar to reduce risk."
        elif "treatment" in question_template or "manage" in question_template:
            context = f"Treatment options for {condition}: " + ", ".join(rng.sample(treatments, rng.randint(2, 4)))
            answer = ", ".join(rng.sample(treatments, rng.randint(1, 3))) + ". Consult your doctor for a personalized plan."
        elif "foods should I avoid" in question_template:
            # Must match the wording of the template above; the previous
            # "foods to avoid" check never matched, leaving this branch dead.
            context = "Foods to avoid: sugary drinks, processed foods, high-carb meals."
            answer = "Limit sugary drinks, processed foods, and high-carb meals."
        elif "good diet" in question_template:
            context = "A good diet for diabetics includes: lean proteins, whole grains, non-starchy vegetables."
            answer = "Focus on lean proteins, whole grains, and non-starchy vegetables."
        elif "long term effects" in question_template:
            context = "Long term effects include: " + ", ".join(rng.sample(complications, rng.randint(2, 4)))
            answer = ", ".join(rng.sample(complications, rng.randint(1, 3))) + ". Regular checkups are important."
        elif "exercise help" in question_template:
            context = "Exercise helps lower blood sugar and improve insulin sensitivity."
            answer = "Regular exercise helps manage blood sugar levels."
        else:
            # Fallback for any template without a dedicated branch.
            context = "General information about diabetes."
            answer = "Diabetes is a chronic condition..."

        data.append({"question": question, "context": context, "answer": answer})
    return data

# Build the dataset with the generator's default number of samples.
diabetes_data = generate_diabetes_dataset()

# Example output: print the first generated record. No random seed is set in
# this cell, so the record shown may differ between runs unless the generator
# was seeded earlier in the session.
print(diabetes_data[0])

{'question': 'What are the treatment options for prediabetes?', 'context': 'Treatment options for prediabetes: blood sugar monitoring, insulin therapy', 'answer': 'insulin therapy. Consult your doctor for a personalized plan.'}


In [3]:
import json
def save_to_text_file(data, filename="diabetes_dataset.json"):
    """Write every record in ``data`` to ``filename`` as its own line of JSON.

    The file is opened in write mode, so any existing content at that path is
    replaced. Each record is serialized with ``json.dumps`` and terminated by
    a newline, giving one JSON object per line rather than a single JSON
    document.
    """
    with open(filename, "w") as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in data)

save_to_text_file(diabetes_data)