In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def generate_digital_twin_nhs(n_patients=5000, seed=42):
    """Generate a synthetic appointment dataset with a binary ``DNA_Event`` label.

    Each row is one simulated patient appointment. The label is drawn from a
    Bernoulli distribution whose probability is an additive combination of
    age, deprivation decile, travel distance and appointment type (see the
    "ground truth" section in the body).

    Args:
        n_patients: Number of rows to generate.
        seed: Value passed to ``np.random.seed`` before sampling so results
            are reproducible. Pass ``None`` to leave NumPy's global random
            state untouched. Note that a non-``None`` seed reseeds the
            *global* NumPy generator, as the original implementation did.

    Returns:
        A ``pandas.DataFrame`` with columns ``Age``, ``IMD_Decile``,
        ``Distance_KM``, ``DNA_Event`` and one ``type_<name>`` indicator
        column per appointment type. All five indicator columns are always
        present (in alphabetical order), even if a type was never sampled.
    """
    if seed is not None:
        np.random.seed(seed)

    # NOTE: the order of the random draws below is deliberate; changing it
    # changes the generated data for a given seed.

    # 1. Demographic Features
    # Ages are drawn from a normal distribution (mean 52, sd 18), clipped to
    # the 18-98 range and truncated to whole years.
    age = np.random.normal(52, 18, n_patients).clip(18, 98).astype(int)

    # 2. Socio-Economic Features (The ONS IMD Factor)
    # IMD Decile: 1 is most deprived (higher DNA risk), 10 is least.
    # The sampling weights lean towards the more deprived deciles.
    imd_weights = [0.15, 0.12, 0.11, 0.10, 0.10, 0.10, 0.09, 0.08, 0.08, 0.07]
    imd_decile = np.random.choice(range(1, 11), n_patients, p=imd_weights)

    # 3. Accessibility Features
    # Distance in KM to the nearest GP/Hospital, bounded to [0.5, 20].
    distance_km = np.random.gamma(shape=2, scale=1.5, size=n_patients).clip(0.5, 20)

    # 4. Appointment Details
    # 'Specialist' and 'Mental Health' often have higher DNA rates than 'Routine GP'
    app_types = ['Routine GP', 'Specialist', 'Mental Health', 'Dental', 'Physiotherapy']
    appointment_type = np.random.choice(app_types, n_patients, p=[0.4, 0.2, 0.15, 0.15, 0.1])

    # 5. Hidden "Ground Truth" Logic (The Signal the ML must find)
    # Risk factors: High deprivation (low decile), long distance, and certain appt types
    base_risk = 0.05
    age_risk = np.where(age > 75, -0.05, 0)  # Elderly patients often have LOWER DNA rates
    imd_risk = (11 - imd_decile) * 0.03
    dist_risk = (distance_km / 10) * 0.1
    type_risk = np.where(np.isin(appointment_type, ['Mental Health', 'Specialist']), 0.1, 0)

    # Keep the per-row probability inside [0.02, 0.9] before sampling the label.
    total_risk = (base_risk + age_risk + imd_risk + dist_risk + type_risk).clip(0.02, 0.9)
    dna_event = np.random.binomial(1, total_risk)

    df = pd.DataFrame({
        'Age': age,
        'IMD_Decile': imd_decile,
        'Distance_KM': distance_km,
        # A categorical with the full category list makes get_dummies emit a
        # column for every appointment type, sampled or not. Sorting keeps the
        # same column order get_dummies produces for plain strings.
        'Appointment_Type': pd.Categorical(appointment_type, categories=sorted(app_types)),
        'DNA_Event': dna_event,
    })

    # One-hot encoding for the categorical appointment type
    df = pd.get_dummies(df, columns=['Appointment_Type'], prefix='type')

    return df

# Build the 5000-row synthetic dataset and report the mean of DNA_Event
nhs_df = generate_digital_twin_nhs(n_patients=5000)
dna_rate = nhs_df['DNA_Event'].mean()
print(f"Dataset generated. DNA Rate: {dna_rate:.2%}")

Dataset generated. DNA Rate: 28.96%
