<a href="https://colab.research.google.com/github/yashikart/syntheticore-agent/blob/main/faker_datasets_Education_Finance_Health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faker pandas numpy scikit-learn
!pip install openai transformers stable-baselines3
!pip install openai
!pip install langchain openai

In [29]:
from faker import Faker
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta


In [30]:
# Single seed shared by every random source used in this notebook.
SEED = 42

fake = Faker()

# Faker.seed() seeds only Faker's own generator (names, cities, UUIDs, dates).
# The data generators below also draw incomes, scores, diagnoses, etc. from the
# stdlib `random` module, so it must be seeded too for runs to be repeatable.
Faker.seed(SEED)
random.seed(SEED)
# NumPy is imported by this notebook; seed its legacy global generator as well
# so any NumPy-based sampling is covered.
np.random.seed(SEED)


In [44]:

def _build_finance_record(category, income_bounds):
    """Create one synthetic finance row for the given income category.

    Args:
        category: Income category label stored in the row.
        income_bounds: ``(low, high)`` tuple the income is drawn from.

    Returns:
        dict with the same keys as the columns of ``generate_finance_data``.
    """
    low, high = income_bounds
    income = round(random.uniform(low, high), 2)

    # Expense: 30–90% of income
    expense = round(random.uniform(income * 0.3, income * 0.9), 2)
    surplus = income - expense

    # Savings Goal: 50–100% of surplus
    savings_goal = round(random.uniform(surplus * 0.5, surplus), 2)

    # Share of income left after expenses, in percent
    savings_pct = round((surplus / income) * 100, 2)

    return {
        "Name": fake.name(),
        "Region": fake.city(),
        "Income": income,
        "Expense": expense,
        "Savings Goal": savings_goal,
        "Savings (%)": savings_pct,
        "Income Category": category,
        # Label (simple logic): more than 20% of income left over is a saver
        "Label": "Good Saver" if savings_pct > 20 else "Over-Spender",
        "Timestamp": fake.date_time_this_year(),
    }


def generate_finance_data(n=90):
    """Generate a shuffled synthetic personal-finance dataset.

    Rows are split evenly across the 'Low', 'Medium' and 'High' income
    categories; when ``n`` is not divisible by 3 the leftover rows get a
    randomly chosen category.

    Args:
        n: Number of rows to generate. Must be non-negative.

    Returns:
        pandas.DataFrame with columns Name, Region, Income, Expense,
        Savings Goal, Savings (%), Income Category, Label and Timestamp.
        For ``n == 0`` the frame is empty but still carries these columns.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        # Floor division on a negative n used to leave a positive remainder,
        # silently producing rows nobody asked for.
        raise ValueError("n must be non-negative")

    categories = ['Low', 'Medium', 'High']

    # Define income ranges per category
    income_ranges = {
        "Low": (30000, 49999),
        "Medium": (50000, 79999),
        "High": (80000, 100000)
    }

    columns = [
        "Name", "Region", "Income", "Expense", "Savings Goal",
        "Savings (%)", "Income Category", "Label", "Timestamp",
    ]

    per_category = n // len(categories)

    # Balanced part: the same number of rows for every category
    data = [
        _build_finance_record(category, income_ranges[category])
        for category in categories
        for _ in range(per_category)
    ]

    # If n is not divisible by 3, fill remaining randomly
    remaining = n - per_category * len(categories)
    for _ in range(remaining):
        category = random.choice(categories)
        data.append(_build_finance_record(category, income_ranges[category]))

    random.shuffle(data)
    # Passing the columns explicitly keeps the schema even when data is empty.
    return pd.DataFrame(data, columns=columns)

# Build a 90-row sample, then print the per-category row counts and a preview.
finance_df = generate_finance_data(n=90)
print(finance_df.loc[:, "Income Category"].value_counts())
print(finance_df.head(n=5))

Income Category
High      30
Low       30
Medium    30
Name: count, dtype: int64
             Name             Region    Income   Expense  Savings Goal  \
0  Jennifer Costa   Lake Jessicaport  90376.75  37948.98      50872.33   
1   Brandy Porter  New Savannahshire  42458.24  17800.80      16170.25   
2     Tiffany Cox  Port Jeffreymouth  36673.45  16152.42      10859.25   
3  James Guerrero          Moorebury  53727.15  38947.33      14175.29   
4     Victor Hill        South Sarah  87690.42  72182.58      14494.75   

   Savings (%) Income Category         Label                  Timestamp  
0        58.01            High    Good Saver 2025-06-27 21:56:04.865403  
1        58.07             Low    Good Saver 2025-04-01 16:54:41.785719  
2        55.96             Low    Good Saver 2025-03-04 08:25:56.823331  
3        27.51          Medium    Good Saver 2025-01-26 14:45:59.333130  
4        17.68            High  Over-Spender 2025-04-18 23:30:21.857169  


In [45]:
def _build_student_record(progress, score_bounds, subjects):
    """Create one synthetic student row for the given progress category.

    Args:
        progress: Progress label stored in the row.
        score_bounds: ``(min_score, max_score)`` inclusive range for scores.
        subjects: Subject names; one integer score is drawn per subject.

    Returns:
        dict with Student ID, Name, one key per subject, Progress, Timestamp.
    """
    min_score, max_score = score_bounds
    student_id = fake.uuid4()
    name = fake.name()

    # Generate scores within range for this progress level
    scores = {subject: random.randint(min_score, max_score) for subject in subjects}

    return {
        "Student ID": student_id,
        "Name": name,
        **scores,
        "Progress": progress,
        "Timestamp": fake.date_time_this_year(),
    }


def generate_student_subject_scores_balanced(n=9):
    """Generate a shuffled synthetic dataset of per-subject student scores.

    Rows are split evenly across the 'Improving', 'Stable' and 'Declining'
    progress categories; each category draws every subject score from its own
    score range. When ``n`` is not divisible by 3 the leftover rows get a
    randomly chosen category.

    Args:
        n: Number of rows to generate. Must be non-negative.

    Returns:
        pandas.DataFrame with columns Student ID, Name, Math, Science,
        English, History, Geography, Progress and Timestamp. For ``n == 0``
        the frame is empty but still carries these columns.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        # Floor division on a negative n used to leave a positive remainder,
        # silently producing rows nobody asked for.
        raise ValueError("n must be non-negative")

    subjects = ['Math', 'Science', 'English', 'History', 'Geography']
    progress_categories = ['Improving', 'Stable', 'Declining']

    # Define score ranges for each progress category
    score_ranges = {
        'Improving': (85, 100),
        'Stable': (60, 84),
        'Declining': (40, 59)
    }

    columns = ["Student ID", "Name", *subjects, "Progress", "Timestamp"]

    per_category = n // len(progress_categories)

    # Balanced part: the same number of rows for every progress category
    data = [
        _build_student_record(progress, score_ranges[progress], subjects)
        for progress in progress_categories
        for _ in range(per_category)
    ]

    # If n is not divisible by 3, add remaining randomly
    remaining = n - per_category * len(progress_categories)
    for _ in range(remaining):
        progress = random.choice(progress_categories)
        data.append(_build_student_record(progress, score_ranges[progress], subjects))

    random.shuffle(data)  # Shuffle for natural mix
    # Passing the columns explicitly keeps the schema even when data is empty.
    return pd.DataFrame(data, columns=columns)

# Build a 9-row sample, then print the per-category row counts and a preview.
df = generate_student_subject_scores_balanced(n=9)
print(df.loc[:, "Progress"].value_counts())
print(df.head(n=5))


Progress
Improving    3
Declining    3
Stable       3
Name: count, dtype: int64
                             Student ID               Name  Math  Science  \
0  8198d1f5-6839-498f-accf-147242d7f112  Benjamin Phillips    95       92   
1  2db13deb-ee6f-4bea-8df7-164713769ff9     Hayden Shannon    42       56   
2  ee25356a-aee0-4ec9-bf04-18a9f2765513   Kathryn Gonzalez    54       49   
3  7bcf2b31-902b-4154-a088-5550f74f6900           Nina Ali    90       98   
4  dcad8ec2-963a-4333-a582-95dc75f1c654        Brian Jones    77       73   

   English  History  Geography   Progress                  Timestamp  
0       85       97         99  Improving 2025-06-27 20:51:39.910542  
1       59       41         56  Declining 2025-02-20 12:54:45.918533  
2       51       50         53  Declining 2025-05-19 03:05:35.479764  
3       96       90         92  Improving 2025-03-28 04:21:02.915977  
4       82       62         78     Stable 2025-04-30 00:21:24.958147  


In [46]:
def generate_health_data(n=100):
    """Generate a synthetic patient dataset with diagnosis-consistent symptoms.

    Each row picks a random diagnosis, samples 3 or 4 of that diagnosis's
    symptoms, and derives severity, a high-risk flag and a care recommendation
    from the symptom count, diagnosis and age.

    Args:
        n: Number of rows to generate.

    Returns:
        pandas.DataFrame with columns Patient ID, Name, Gender, Age, Region,
        Symptoms (comma-separated string), Diagnosis, Severity, High Risk
        and Recommended Care, Timestamp. For ``n == 0`` the frame is empty
        but still carries these columns.
    """
    diagnosis_symptom_map = {
        'Cold': ['Cough', 'Sneezing', 'Runny Nose', 'Fever'],
        'Flu': ['Fever', 'Fatigue', 'Cough', 'Body Ache', 'Headache'],
        'Migraine': ['Headache', 'Nausea', 'Sensitivity to Light', 'Blurred Vision'],
        'COVID-19': ['Fever', 'Cough', 'Fatigue', 'Loss of Smell', 'Shortness of Breath'],
        'Gastritis': ['Nausea', 'Vomiting', 'Stomach Pain', 'Loss of Appetite'],
        'Asthma': ['Shortness of Breath', 'Wheezing', 'Chest Tightness', 'Cough'],
        'Diabetes': ['Frequent Urination', 'Fatigue', 'Blurred Vision', 'Increased Thirst'],
        'Hypertension': ['Headache', 'Dizziness', 'Nosebleeds', 'Fatigue'],
        'Anemia': ['Fatigue', 'Pale Skin', 'Shortness of Breath', 'Dizziness'],
        'Depression': ['Fatigue', 'Sadness', 'Loss of Interest', 'Sleep Disturbance'],
        'Appendicitis': ['Abdominal Pain', 'Nausea', 'Fever', 'Loss of Appetite'],
        'UTI': ['Burning Sensation', 'Frequent Urination', 'Pelvic Pain', 'Cloudy Urine'],
        'Allergy': ['Sneezing', 'Runny Nose', 'Itchy Eyes', 'Cough'],
        'Chickenpox': ['Fever', 'Rash', 'Itchy Skin', 'Fatigue'],
        'Dengue': ['High Fever', 'Headache', 'Joint Pain', 'Skin Rash']
    }
    # Built once instead of on every loop iteration.
    diagnoses = list(diagnosis_symptom_map)
    severe_diagnoses = ['COVID-19', 'Dengue', 'Appendicitis']

    columns = [
        "Patient ID", "Name", "Gender", "Age", "Region", "Symptoms",
        "Diagnosis", "Severity", "High Risk", "Recommended Care", "Timestamp",
    ]

    data = []

    for _ in range(n):
        patient_id = fake.uuid4()
        name = fake.name()
        gender = random.choice(['Male', 'Female', 'Other'])
        age = random.randint(1, 90)
        city = fake.city()

        diagnosis = random.choice(diagnoses)
        possible_symptoms = diagnosis_symptom_map[diagnosis]
        num_symptoms = random.randint(3, min(4, len(possible_symptoms)))
        symptoms = random.sample(possible_symptoms, k=num_symptoms)

        # Severity logic based on symptom count
        # NOTE(review): 3 symptoms -> "Moderate" while 4 symptoms of a
        # non-listed diagnosis -> "Mild" looks inverted; kept as-is, confirm
        # the intended ordering.
        if num_symptoms == 3:
            severity = "Moderate"
        else:
            severity = "Severe" if diagnosis in severe_diagnoses else "Mild"

        # Risk logic
        high_risk = age > 60 or severity == "Severe"

        # Label logic: high_risk is already True for every "Severe" case, so
        # it alone decides the recommendation.
        label = "Needs Hospitalization" if high_risk else "Home Care"

        entry = {
            "Patient ID": patient_id,
            "Name": name,
            "Gender": gender,
            "Age": age,
            "Region": city,
            "Symptoms": ", ".join(symptoms),
            "Diagnosis": diagnosis,
            "Severity": severity,
            "High Risk": high_risk,
            "Recommended Care": label,
            "Timestamp": fake.date_time_this_year()
        }
        data.append(entry)

    # Passing the columns explicitly keeps the schema even when data is empty.
    return pd.DataFrame(data, columns=columns)

# Build a 5-row sample and print a preview of it.
df = generate_health_data(n=5)
print(df.head(n=5))

                             Patient ID              Name  Gender  Age  \
0  1417c2f9-eb91-431a-9eb0-5b4a1b4ffd5f       Mary Austin  Female    1   
1  9313b3a7-e5a1-46e4-a188-6146fc40a3d4   Katherine Joyce    Male   34   
2  7a732f35-6c55-40f0-8c9c-f41c86969901       Amanda Reed  Female   35   
3  0227b8d5-b070-4a53-90ce-857f75c72fb6  Daniel Carpenter    Male   37   
4  78aa16ea-44f7-401a-bcb5-26d7434a130e       Grace White    Male   16   

              Region                                           Symptoms  \
0  Port Timothymouth           Dizziness, Nosebleeds, Fatigue, Headache   
1       Raymondshire                      Headache, Nosebleeds, Fatigue   
2      South Tiffany          Loss of Smell, Cough, Shortness of Breath   
3        Michaelview                    Headache, High Fever, Skin Rash   
4          Clarkstad  Loss of Interest, Sleep Disturbance, Sadness, ...   

      Diagnosis  Severity  High Risk Recommended Care  \
0  Hypertension      Mild      False        Hom