In [1]:
# Install into the running kernel's environment via the explicit %pip magic
# (a bare `pip` only works while IPython automagic is enabled). Faker is pinned
# to the release this notebook was last executed with so reruns use the same
# data providers.
%pip install faker==37.6.0 pandas

Collecting faker
  Downloading faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.6.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.9 MB ? eta -:--:--
   ---------------- ----------------------- 0.8/1.9 MB 1.8 MB/s eta 0:00:01
   -------------------------- ------------- 1.3/1.9 MB 2.0 MB/s eta 0:00:01
   -------------------------------- ------- 1.6/1.9 MB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 2.0 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.6.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime

# Seed both random sources so that Restart Kernel -> Run All regenerates the
# same CSV files. The stdlib `random` module drives the ages, genders, ids and
# outcomes below; Faker draws names and dates from its own generator, which is
# seeded separately through the class-level Faker.seed().
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

fake = Faker()

# -------------------------
# 1. Real Vaccines (static)
# -------------------------
# One record per vaccine, in the same order as `vaccine_columns` below.
# The first field (vaccine_id) is the key the trials section samples from.
vaccines = [
    (1, "Pfizer-BioNTech", "Pfizer, BioNTech", "2020-12-11"),
    (2, "Moderna", "Moderna", "2020-12-18"),
    (3, "Covaxin", "Bharat Biotech", "2021-01-03"),
    (4, "AstraZeneca (Covishield)", "Oxford, AstraZeneca", "2020-12-30"),
    (5, "Sputnik V", "Gamaleya Research Institute", "2020-08-11"),
    (6, "Sinovac", "Sinovac Biotech", "2021-02-05"),
]

vaccine_columns = [
    "vaccine_id",
    "vaccine_name",
    "manufacturer",
    "approval_date",  # kept as the ISO-formatted strings given above
]

# Materialise the table and write it out without the pandas row index.
df_vaccines = pd.DataFrame(data=vaccines, columns=vaccine_columns)
df_vaccines.to_csv(path_or_buf="vaccines.csv", index=False)

# -------------------------
# 2. Countries (static)
# -------------------------
# One record per country, in the same order as `country_columns` below.
# The first field (country_id) is the key the patients section samples from.
countries = [
    (1, "India", "Asia"),
    (2, "USA", "North America"),
    (3, "Brazil", "South America"),
    (4, "UK", "Europe"),
    (5, "Nigeria", "Africa"),
    (6, "Australia", "Oceania"),
]

country_columns = ["country_id", "country_name", "continent"]

# Materialise the table and write it out without the pandas row index.
df_countries = pd.DataFrame(data=countries, columns=country_columns)
df_countries.to_csv(path_or_buf="countries.csv", index=False)

# -------------------------
# 3. Patients (synthetic, 10k)
# -------------------------
# Generates num_patients rows of (patient_id, name, age, gender, country_id)
# and writes them to patients.csv. patient_id runs from 1 to num_patients.
num_patients = 10000
patients = []

# The set of valid country ids does not change inside the loop, so build it
# once instead of once per patient.
country_ids = [c[0] for c in countries]

for pid in range(1, num_patients + 1):
    # NOTE: the name is drawn independently of `gender`, so the two are not
    # guaranteed to agree with each other.
    name = fake.name()
    age = random.randint(18, 80)  # randint is inclusive on both ends
    gender = random.choice(["Male", "Female"])
    country_id = random.choice(country_ids)
    patients.append((pid, name, age, gender, country_id))

df_patients = pd.DataFrame(patients, columns=["patient_id", "name", "age", "gender", "country_id"])
df_patients.to_csv("patients.csv", index=False)

# -------------------------
# 4. Trials (synthetic, 50k)
# -------------------------
# Generates num_trials rows of (trial_id, patient_id, vaccine_id, trial_date,
# outcome) and writes them to trials.csv. patient_id is drawn uniformly from
# 1..num_patients, so one patient can appear in several trials.
num_trials = 50000
trials = []

# Vaccine outcome probabilities
# Each list holds exactly 100 labels, so a uniform random.choice over it
# returns every outcome with the percentage given by its repeat count
# (e.g. 90% / 8% / 2% for vaccine 1).
outcome_probs = {
    1: ["Success"] * 90 + ["Side Effect"] * 8 + ["Failure"] * 2,  # Pfizer
    2: ["Success"] * 90 + ["Side Effect"] * 8 + ["Failure"] * 2,  # Moderna
    3: ["Success"] * 80 + ["Side Effect"] * 15 + ["Failure"] * 5, # Covaxin
    4: ["Success"] * 80 + ["Side Effect"] * 15 + ["Failure"] * 5, # AstraZeneca
    5: ["Success"] * 70 + ["Side Effect"] * 20 + ["Failure"] * 10,# Sputnik V
    6: ["Success"] * 70 + ["Side Effect"] * 20 + ["Failure"] * 10 # Sinovac
}

# Loop invariants: the candidate vaccine ids and the date window are the same
# for every trial, so they are built once rather than on each iteration.
vaccine_ids = [v[0] for v in vaccines]
trial_window_start = datetime(2020, 6, 1)
trial_window_end = datetime(2023, 12, 31)

for tid in range(1, num_trials + 1):
    patient_id = random.randint(1, num_patients)  # inclusive on both ends
    vaccine_id = random.choice(vaccine_ids)
    trial_date = fake.date_between(start_date=trial_window_start, end_date=trial_window_end)
    outcome = random.choice(outcome_probs[vaccine_id])
    trials.append((tid, patient_id, vaccine_id, trial_date, outcome))

df_trials = pd.DataFrame(trials, columns=["trial_id", "patient_id", "vaccine_id", "trial_date", "outcome"])
df_trials.to_csv("trials.csv", index=False)

print("✅ Dataset generated: vaccines.csv, countries.csv, patients.csv, trials.csv")


✅ Dataset generated: vaccines.csv, countries.csv, patients.csv, trials.csv
