In [8]:
import pandas as pd
import numpy as np

# Simulate right-censored survival data for a birth cohort.
#
# Outcome of interest: age at disease onset, which may be censored by age at death.
# For every individual both an age at death and an age at disease onset are drawn;
# individuals whose onset precedes death are events, everyone else is censored at death.
# Disease onset depends on genotype (more risk alleles -> earlier onset on average),
# on bmi and on sex; bmi itself depends on genotype and sex.
# The data are meant for estimating the effect of genotype on age at disease onset.

np.random.seed(29032025)  # fixed seed: the generated cohort is reproducible

N = 50000  # cohort size
MAF = 0.1  # minor allele frequency 10% -> 81% AA (0 risk alleles), 18% Aa (1), 1% aa (2)
HR = 1.7  # hazard ratio per risk allele (effect size for genotype)
a = 3.2503  # Weibull shape parameter for age at disease onset (argument of np.random.weibull)
b = 173.0897  # 'baseline' Weibull scale parameter for age at disease onset
# Per the original author's note, a and b were chosen so that the 'baseline'
# prevalence of the disease is about 10% and the median age of disease onset is about 70.

# NOTE: all random draws share the global numpy RNG, so the order of the draws
# below (sex, genotype, bmi noise, onset, death) must not be changed.
sex = np.random.binomial(1, 0.5, N)  # 0: female, 1: male
genotype = np.random.binomial(2, MAF, N)  # number of risk alleles 'a': 0: AA, 1: Aa, 2: aa

# bmi depends on genotype and sex, plus log-normal individual variation
bmi_noise = np.random.lognormal(2.1, 0.4, N)
bmi = 15.5 + 0.9 * genotype + 1.2 * sex + bmi_noise

# Log hazard ratio relative to the baseline individual (AA, bmi 25, female).
log_hazard_ratio = np.log(HR) * genotype + 0.07 * (bmi - 25) + 0.3 * sex
# For a Weibull with shape a, multiplying the hazard by h is the same as
# multiplying the scale by h ** (-1 / a); b_g is the individual-specific scale.
b_g = b * np.exp(log_hazard_ratio) ** (-1 / a)

# genotype-sex-bmi-specific age at disease onset: standard Weibull draw times individual scale
standard_onset = np.random.weibull(a, N)
age_at_disease_onset = standard_onset * b_g

# age at death: Weibull with shape 9 and scale 80 (original note: estimated from EstBB cohort)
age_at_death = np.random.weibull(9, N) * 80

# observed age is whichever comes first, disease onset or death
age = np.minimum(age_at_disease_onset, age_at_death)
# event = 1 when the disease occurred no later than death, 0 when censored by death
event = (age_at_disease_onset <= age_at_death).astype(int)

# human-readable sex labels, looked up by the 0/1 code
sex_label = np.array(["female", "male"])[sex]

# observed age is stored in whole years (fraction truncated)
df = pd.DataFrame(
    {
        "age": age.astype(int),
        "event": event,
        "genotype": genotype,
        "bmi": bmi,
        "sex": sex_label,
    }
)
df.to_csv("CVD_data.csv", index=False)