<a href="https://colab.research.google.com/github/wbarghout/Collecting-Data-Using-APIs/blob/main/create_a_synthetic_diabetes_patient_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Set seed for reproducibility. This seeds NumPy's legacy global RNG, which
# every np.random.* draw below relies on.
np.random.seed(42)

# Generate 100,000 observations
n = 100000


def _choice_with_missing(labels, probs, size):
    """Draw `size` labels from `labels` with probabilities `probs`.

    `labels` may mix strings with `np.nan`. Handing such a list directly to
    `np.random.choice` coerces it to a fixed-width string array, which turns
    `np.nan` into the literal text 'nan'. Sampling *positions* and indexing an
    object array instead keeps the missing entries as genuine NaN.

    Returns a 1-D object array of length `size`.
    """
    picks = np.random.choice(len(labels), size=size, p=probs)
    return np.array(labels, dtype=object)[picks]


def _make_postcodes(size, missing_rate=0.05):
    """Build `size` synthetic postcodes shaped like 'AB07 03CD'.

    Layout: two capital letters, a two-digit number in 01-19, a space, a
    two-digit number in 01-09, two capital letters. A row is left as NaN when
    its uniform draw is <= `missing_rate` (about 5% by default).

    All random numbers are drawn as whole arrays (one call per field) instead
    of one scalar call per field per row, which is far faster at this size.

    Returns a list of `size` entries, each either a str or np.nan.
    """
    # 65 is ord('A'); four letter codes per row (two leading, two trailing).
    letter_codes = (np.random.randint(0, 26, size=(size, 4)) + 65).tolist()
    first_numbers = np.random.randint(1, 20, size=size).tolist()
    second_numbers = np.random.randint(1, 10, size=size).tolist()
    missing_flags = (np.random.rand(size) <= missing_rate).tolist()
    return [
        np.nan if is_missing
        else f"{chr(a)}{chr(b)}{first:02d} {second:02d}{chr(c)}{chr(d)}"
        for (a, b, c, d), first, second, is_missing
        in zip(letter_codes, first_numbers, second_numbers, missing_flags)
    ]


# --- Generate Base Data ---
# Categorical columns already carry some noise (blank strings, short labels,
# 'Unknown') and true missing values; further issues are injected afterwards.
data = {
    # Date of visit: n values evenly spaced from 2022-01-01 to 2023-12-31,
    # reduced to plain datetime.date objects (so the column is sorted).
    "Visit_Date": pd.date_range(start='2022-01-01', end='2023-12-31', periods=n).date,

    # Patient ID: drawn with replacement from 1000..1999, so IDs repeat.
    "Patient_ID": np.random.choice(np.arange(1000, 2000), size=n, replace=True),

    # Age: normal(55, 15) clipped to [18, 100] and truncated to whole years.
    "Age": np.clip(np.random.normal(loc=55, scale=15, size=n), 18, 100).astype(int),

    # Gender: mixed long/short labels, a blank string, and real NaN (2%).
    "Gender": _choice_with_missing(
        ['Male', 'Female', 'M', 'F', 'Other', ' ', np.nan],
        [0.35, 0.35, 0.1, 0.1, 0.05, 0.03, 0.02],
        n,
    ),

    # Diagnosis: diabetes-related labels plus blank, 'Unknown' and real NaN (1%).
    "Diagnosis": _choice_with_missing(
        ['Type 1 Diabetes', 'Type 2 Diabetes', 'Prediabetes', 'Gestational Diabetes', ' ', 'Unknown', np.nan],
        [0.4, 0.45, 0.05, 0.03, 0.03, 0.03, 0.01],
        n,
    ),

    # Insurance status: 'Yes' / 'No' with real NaN (5%).
    "Has_Insurance": _choice_with_missing(['Yes', 'No', np.nan], [0.7, 0.25, 0.05], n),

    # Postcode: synthetic UK-like codes, about 5% missing.
    "Postcode": _make_postcodes(n),

    # Total cost: gamma(shape=2, scale=50), right-skewed, rounded to 2 decimals.
    "Total_Cost": np.abs(np.random.gamma(shape=2, scale=50, size=n)).round(2),
}

# Time-related columns (in minutes): |normal(30, 15)| rounded to 1 decimal.
time_columns = [
    "Registration time", "Nursing time", "Laboratory time",
    "Consultation time", "Pharmacy time"
]
for col in time_columns:
    data[col] = np.abs(np.random.normal(loc=30, scale=15, size=n)).round(1)

# Convert to DataFrame
df = pd.DataFrame(data)

# --- Introduce Data Quality Issues ---

# One dedicated, seeded generator is shared by every step below. Passing the
# same *integer* seed to each df.sample call would make pandas pick the same
# rows every time (identical missing rows across columns, identical outlier
# rows across time columns, and nested 1%/2%/3% subsets). Sharing a
# RandomState object instead advances its state between calls, so each step
# corrupts an independent set of rows while the run stays reproducible.
issue_rng = np.random.RandomState(42)

# 1. Missing Data (5% of rows blanked per column, chosen independently)
for col in ['Diagnosis', 'Postcode', 'Has_Insurance']:
    df.loc[df.sample(frac=0.05, random_state=issue_rng).index, col] = np.nan

# 2. Duplicate Rows (1% duplicates, appended at the end of the frame)
duplicates = df.sample(frac=0.01, random_state=issue_rng)
df = pd.concat([df, duplicates], ignore_index=True)

# 3. Inconsistent Gender Labels (e.g., "M" vs "Male")
# The Gender column is generated with both short ('M', 'F') and long
# ('Male', 'Female') labels. They are deliberately left untouched here:
# mapping M -> Male / F -> Female would remove the very inconsistency this
# section is meant to introduce.

# 4. Invalid Diagnoses (2% of rows overwritten with a non-existent label)
df.loc[df.sample(frac=0.02, random_state=issue_rng).index, 'Diagnosis'] = 'Type 3 Diabetes'

# 5. Negative Total_Cost (1% of rows get their sign flipped)
df.loc[df.sample(frac=0.01, random_state=issue_rng).index, 'Total_Cost'] *= -1

# 6. Outliers in Time Columns (3% of rows per column, chosen independently)
for col in time_columns:
    # Get the indices of the sample
    sample_indices = df.sample(frac=0.03, random_state=issue_rng).index

    # Generate integer outliers in [200, 500) with one value per sampled row
    random_values = issue_rng.randint(200, 500, size=len(sample_indices))

    # Assign the values to the selected rows and column
    df.loc[sample_indices, col] = random_values

# --- Save to CSV ---
# Write the frame to the working directory without the positional index
# column, then report completion on stdout.
df.to_csv(path_or_buf='diabetes_patient_dataset.csv', index=False)
print("Dataset generated!")

Dataset generated!
