In [None]:
# importng modules

In [38]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, date, timedelta

In [39]:
# settings

In [40]:
Faker.seed(42)
random.seed(42)
fake = Faker("en_NZ")  # NZ locale for names

NUM_PATIENTS = 350
NUM_DOCTORS = 25
NUM_VISITS = 12000

VISIT_START = datetime(2022, 1, 1).date()
VISIT_END = datetime(2024, 12, 31).date()

In [41]:
# blood gruops - distributed logically

In [42]:
blood_types = ["O+", "A+", "B+", "AB+", "O-", "A-", "B-", "AB-"]
blood_weights = [38, 34, 9, 3, 6, 6, 3, 1]  # realistic percentages

In [43]:
# Towns in Auckland 

In [44]:
coastal_towns = ["Takapuna", "Albany", "Silverdale", "Howick", "Glenfield"]
inland_towns = ["Henderson", "Manukau", "Papakura", "New Lynn", "Pukekohe"]
all_towns = coastal_towns + inland_towns

In [45]:
# specialities

In [46]:
specialties = [
    "General Practitioner", "Cardiologist", "Dermatologist", "Pediatrician",
    "Orthopedic Surgeon", "Neurologist", "Psychiatrist", "ENT Specialist",
    "Endocrinologist", "Pulmonologist"
]

In [47]:
# departments

In [48]:
departments = {
    "General Practitioner": "General Medicine",
    "Cardiologist": "Cardiology",
    "Dermatologist": "Dermatology",
    "Pediatrician": "Pediatrics",
    "Orthopedic Surgeon": "Orthopedics",
    "Neurologist": "Neurology",
    "Psychiatrist": "Psychiatry",
    "ENT Specialist": "ENT",
    "Endocrinologist": "Endocrinology",
    "Pulmonologist": "Pulmonology"
}

In [49]:
# seasonal symptoms and diagnosis

In [50]:
seasonal_data = {
    "Summer": {
        "months": [12, 1, 2],
        "symptoms": ["Skin Rash", "Heat Exhaustion", "Hay Fever", "Nausea"],
        "diagnoses": ["Eczema", "Allergic Rhinitis", "Food Poisoning"]
    },
    "Autumn": {
        "months": [3, 4, 5],
        "symptoms": ["Cough", "Fever", "Asthma", "Sneezing"],
        "diagnoses": ["Bronchitis", "Viral Infection", "Asthma Flare-up"]
    },
    "Winter": {
        "months": [6, 7, 8],
        "symptoms": ["Cough", "Fever", "Chest Pain", "Shortness of Breath"],
        "diagnoses": ["Influenza", "Pneumonia", "Angina"]
    },
    "Spring": {
        "months": [9, 10, 11],
        "symptoms": ["Hay Fever", "Asthma", "Skin Rash", "Sinus Pain"],
        "diagnoses": ["Allergic Rhinitis", "Asthma Flare-up", "Sinusitis"]
    }
}

In [51]:
# symptom to speciality mapping

In [54]:
symptom_specialty_map = {
    "Skin Rash": "Dermatologist",
    "Heat Exhaustion": "General Practitioner",
    "Hay Fever": "ENT Specialist",
    "Nausea": "General Practitioner",
    "Cough": "Pulmonologist",
    "Fever": "General Practitioner",
    "Asthma": "Pulmonologist",
    "Sneezing": "ENT Specialist",
    "Chest Pain": "Cardiologist",
    "Shortness of Breath": "Cardiologist",
    "Sinus Pain": "ENT Specialist"
}


In [53]:
#symptom to diagnosis mapping

In [55]:
symptom_to_diag = {
    "Skin Rash": ["Eczema", "Contact Dermatitis"],
    "Heat Exhaustion": ["Heat Stroke", "Dehydration"],
    "Hay Fever": ["Allergic Rhinitis"],
    "Nausea": ["Food Poisoning", "Gastroenteritis"],
    "Cough": ["Bronchitis", "Viral Infection"],
    "Fever": ["Viral Infection", "Influenza"],
    "Asthma": ["Asthma Flare-up"],
    "Sneezing": ["Allergic Rhinitis", "Common Cold"],
    "Chest Pain": ["Angina", "Myocardial Infarction"],
    "Shortness of Breath": ["Pneumonia", "Asthma Flare-up"],
    "Sinus Pain": ["Sinusitis"]
}

In [56]:
# Age based chronic condition

In [57]:
def assign_chronic_condition(age):
    if age <= 12:
        return random.choices(["Asthma", "None", "Diabetes (Type 1)"], weights=[50, 40, 10])[0]
    elif 13 <= age <= 19:
        return random.choices(["Asthma", "None", "Anxiety"], weights=[40, 40, 20])[0]
    elif 20 <= age <= 35:
        return random.choices(["None", "Asthma", "Anxiety"], weights=[50, 30, 20])[0]
    elif 36 <= age <= 59:
        return random.choices(["Hypertension", "Diabetes (Type 2)", "Arthritis", "None"], weights=[40, 30, 20, 10])[0]
    else:
        return random.choices(["Hypertension", "Arthritis", "Heart Disease", "Chronic Kidney Disease"], weights=[35, 30, 20, 15])[0]

In [58]:
# generating patients

In [59]:
patients = []
six_years_ago = date.today() - timedelta(days=6*365)

for pid in range(1, NUM_PATIENTS + 1):
    gender = random.choice(["Male", "Female"])
    town = random.choice(all_towns)
    town_category = "Coastal" if town in coastal_towns else "Inland"
    dob = fake.date_of_birth(minimum_age=1, maximum_age=90)
    # Age as of today (ok for chronic assignment)
    age = int((date.today() - dob).days // 365)
    # Registration date must be <= VISIT_END
    reg_date = fake.date_between(start_date=six_years_ago, end_date=VISIT_END)
    chronic = assign_chronic_condition(age)
    blood = random.choices(blood_types, weights=blood_weights, k=1)[0]
    insurance = random.choices(["Yes", "No"], weights=[7,3], k=1)[0]

    patients.append({
        "PatientID": pid,
        "Name": fake.name_male() if gender == "Male" else fake.name_female(),
        "Gender": gender,
        "DOB": dob,
        "Age": age,
        "City": "Auckland",
        "Town": town,
        "TownCategory": town_category,
        "Insurance": insurance,
        "BloodType": blood,
        "ChronicCondition": chronic,
        "RegistrationDate": reg_date
    })


In [60]:
patients_df = pd.DataFrame(patients)
patients_df.to_csv("aucklandpatients.csv", index=False)

In [61]:
# patients

In [65]:
doctors = []
# create a pool of doctors by speciality so we can pick later
for did in range(1, NUM_DOCTORS + 1):
    speciality = random.choice(specialties)
    doctors.append({
        "DoctorID": did,
        "Name": "Dr."+fake.name(),
        "Speciality": speciality,
        "Department": departments[speciality],
        "ExperienceYears": random.randint(2, 35),
        "City": "Auckland",
        "Town": random.choice(all_towns)
    })


In [66]:
doctors_df = pd.DataFrame(doctors)
doctors_df.to_csv("aucklanddoctors.csv", index=False)

In [67]:
# build a mapping speciality to list of Doctors ids

In [68]:
doctors_by_speciality = {}
for doc in doctors:
    doctors_by_speciality.setdefault(doc["Speciality"], []).append(doc["DoctorID"])

In [69]:
doctors_by_speciality

{'ENT Specialist': [1, 7, 16],
 'Neurologist': [2, 4, 6],
 'Cardiologist': [3, 14],
 'General Practitioner': [5, 11, 18],
 'Endocrinologist': [8, 15, 25],
 'Orthopedic Surgeon': [9, 17, 22, 23, 24],
 'Pulmonologist': [10],
 'Pediatrician': [12, 19],
 'Psychiatrist': [13, 20, 21]}

In [70]:
# build a mapping department to doctor ids 

In [71]:
doctors_by_department = {}
for doc in doctors:
    doctors_by_department.setdefault(doc["Department"], []).append(doc["DoctorID"])

In [72]:
doctors_by_department

{'ENT': [1, 7, 16],
 'Neurology': [2, 4, 6],
 'Cardiology': [3, 14],
 'General Medicine': [5, 11, 18],
 'Endocrinology': [8, 15, 25],
 'Orthopedics': [9, 17, 22, 23, 24],
 'Pulmonology': [10],
 'Pediatrics': [12, 19],
 'Psychiatry': [13, 20, 21]}

In [73]:
# generating visits

In [77]:
visits = []

for _ in range(NUM_VISITS):
    # --- pick patient ---
    patient = random.choice(patients)
    patient_id = patient["PatientID"]
    patient_age = patient["Age"]
    patient_town = patient["Town"]
    town_category = patient["TownCategory"]
    reg_date = pd.to_datetime(patient["RegistrationDate"]).date()

    # ensure visit date >= registration date
    if reg_date > VISIT_END:
        continue  # skip impossible cases

    start_for_patient = max(reg_date, VISIT_START)
    days_range = (VISIT_END - start_for_patient).days
    if days_range <= 0:
        visit_date = start_for_patient
    else:
        visit_date = start_for_patient + timedelta(days=random.randint(0, days_range))

    # --- season selection ---
    month = visit_date.month
    for sname, details in seasonal_data.items():
        if month in details["months"]:
            season = sname
            season_symptoms = details["symptoms"]
            season_diagnoses = details["diagnoses"]
            break

    # --- symptom selection ---
    symptom = random.choice(season_symptoms)

    # coastal bias: in summer coastal towns → ear/skin/heat
    if town_category == "Coastal" and season == "Summer":
        symptom = random.choices(
            ["Skin Rash", "Heat Exhaustion", "Ear Infection", symptom],
            weights=[30, 25, 20, 25],
            k=1
        )[0]

    # age bias: kids & seniors
    if patient_age <= 12:
        symptom = random.choices(
            ["Ear Infection", "Common Cold", "Asthma", symptom],
            weights=[30, 25, 20, 25],
            k=1
        )[0]
    elif patient_age >= 65 and season == "Winter":
        symptom = random.choices(
            ["Chest Pain", "Shortness of Breath", "Cough", symptom],
            weights=[30, 25, 25, 20],
            k=1
        )[0]

    # --- doctor & department ---
    specialty = symptom_specialty_map.get(symptom, "General Practitioner")
    department = departments.get(specialty, "General Medicine")

    eligible_doctors = doctors_by_speciality.get(specialty, [])
    if not eligible_doctors:
        eligible_doctors = doctors_by_department.get(department, [])
    if not eligible_doctors:
        eligible_doctors = [d["DoctorID"] for d in doctors]  # fallback all doctors

    doctor_id = random.choice(eligible_doctors)

    # --- diagnosis selection ---
    diag_candidates = symptom_to_diag.get(symptom, []) + season_diagnoses
    if patient_age <= 12:
        diag_candidates += ["Ear Infection", "Common Cold"]
    if patient_age >= 65:
        diag_candidates += ["Pneumonia", "Heart Disease"]

    diagnosis = random.choice(list(set(diag_candidates)))  # avoid duplicate list entries

    # --- visit type & billing ---
    visit_type = random.choices(
        ["Outpatient", "Emergency", "Inpatient"],
        weights=[70, 20, 10],
        k=1
    )[0]
    payment_method = random.choices(
        ["Insurance", "Self-pay", "ACC"],
        weights=[70, 20, 10],
        k=1
    )[0]
    billing_amount = {
        "Outpatient": round(random.uniform(50, 200), 2),
        "Emergency": round(random.uniform(200, 1500), 2),
        "Inpatient": round(random.uniform(500, 5000), 2)
    }[visit_type]

    # --- follow-up logic ---
    base_followup_prob = 0.18
    if diagnosis in ["Arthritis", "Heart Disease", "Pneumonia", "Bronchitis", "Asthma Flare-up"]:
        base_followup_prob += 0.25
    if patient["ChronicCondition"] != "None":
        base_followup_prob += 0.08

    follow_up_required = "Yes" if random.random() < base_followup_prob else "No"

    followup_doctor_id = None
    if follow_up_required == "Yes":
        if random.random() < 0.7:
            followup_doctor_id = doctor_id
        else:
            same_dept_doctors = doctors_by_department.get(department, [])
            other_docs = [d for d in same_dept_doctors if d != doctor_id]
            followup_doctor_id = random.choice(other_docs) if other_docs else doctor_id

    # --- append visit (no VisitID yet) ---
    visits.append({
        "PatientID": patient_id,
        "DoctorID": doctor_id,
        "FollowUpDoctorID": followup_doctor_id,
        "Town": patient_town,
        "TownCategory": town_category,
        "Symptom": symptom,
        "Diagnosis": diagnosis,
        "VisitDate": visit_date,
        "VisitType": visit_type,
        "BillingAmount": billing_amount,
        "InsuranceUsed": "Yes" if payment_method == "Insurance" else "No",
        "PaymentMethod": payment_method,
        "PrescriptionGiven": random.choices(["Yes", "No"], weights=[70, 30], k=1)[0],
        "FollowUpRequired": follow_up_required,
        "ReadmissionFlag": None,  # will calculate later
        "VisitDurationMinutes": random.randint(5, 240)
    })

# --- chronological VisitID assignment ---
visits.sort(key=lambda x: x["VisitDate"])
for idx, visit in enumerate(visits, start=1):
    visit["VisitID"] = idx

In [78]:
# Convert to DataFrame and compute ReadmissionFlag
visits_df = pd.DataFrame(visits)

# Convert VisitDate to datetime.date if needed
visits_df["VisitDate"] = pd.to_datetime(visits_df["VisitDate"]).dt.date

# Sort visits by PatientID and VisitDate
visits_df.sort_values(["PatientID", "VisitDate"], inplace=True)
visits_df.reset_index(drop=True, inplace=True)

# Compute ReadmissionFlag: a visit is readmission if previous visit for same patient exists and
# current_visit_date - previous_visit_date <= 30 days AND previous was NOT a scheduled follow-up.
readmission_flags = []
prev_dates = {}  # patient_id -> (prev_date, prev_followup_required)

for idx, row in visits_df.iterrows():
    pid = row["PatientID"]
    vdate = pd.to_datetime(row["VisitDate"]).date()
    prev = prev_dates.get(pid)
    flag = "No"
    if prev:
        prev_date, prev_followup = prev
        days_diff = (vdate - prev_date).days
        # if current visit within 30 days of previous visit and previous was not a scheduled follow-up -> readmission
        if days_diff <= 30 and (prev_followup == "No"):
            flag = "Yes"
    # update prev_dates using current visit
    prev_dates[pid] = (vdate, row["FollowUpRequired"])
    readmission_flags.append(flag)

visits_df["ReadmissionFlag"] = readmission_flags


In [79]:
visits_df.to_csv("aucklandvisits.csv", index=False)

In [80]:
print("Datasets generated: patients.csv, doctors.csv, visits.csv")
print("  - RegistrationDate capped <="," VISIT_END")
print("  - VisitDate always >= RegistrationDate")
print("  - Symptom -> Specialty -> Doctor mapping applied")
print("  - FollowUpDoctor assigned (same or same-department)")
print("  - ReadmissionFlag computed (unplanned revisit within 30 days)")
print("  - Age-based chronic conditions, seasonal & coastal/inland bias, blood type weights applied")

Datasets generated: patients.csv, doctors.csv, visits.csv
  - RegistrationDate capped <=  VISIT_END
  - VisitDate always >= RegistrationDate
  - Symptom -> Specialty -> Doctor mapping applied
  - FollowUpDoctor assigned (same or same-department)
  - ReadmissionFlag computed (unplanned revisit within 30 days)
  - Age-based chronic conditions, seasonal & coastal/inland bias, blood type weights applied
