In [None]:
# %pip install polars
# %pip install numpy

Collecting numpy
  Downloading numpy-2.4.2-cp312-cp312-macosx_14_0_arm64.whl.metadata (6.6 kB)
Downloading numpy-2.4.2-cp312-cp312-macosx_14_0_arm64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-2.4.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import polars as pl 
import uuid
import numpy as np

In [2]:
RANDOM_SEED = 42
TOTAL_PATIENTS = 10000
def generate_data(num_patients, random_seed):
    np.random.seed(random_seed)
    data = []
    for _ in range(num_patients):
        # set age 
        age = np.random.randint(18, 90, size = 1)

        # random selection with probabilities
        sex = np.random.choice(
            ["Male","Female","Other","Unknown"], 
            size = 1, 
            p = [0.45, 0.48, 0.02, 0.05]
        )

        # random selection with probabilities
        race = np.random.choice(
            ["White","Black or African American","Asian","American Indian or Alaska Native","Native Hawaiian or Pacific Islander","Other Race","Unknown"], 
            size = 1, 
            p = [0.67, 0.16, 0.08, 0.02, 0.02, 0.01, 0.04]
        )

        # conditional based on gender
        height = np.random.normal(
            loc = 167 + (8 if sex == "Male" else (-5 if sex == "Female" else 0)),
            scale = 6.5 + (0.5 if sex == "Male" else (-0.5 if sex == "Female" else 0))
        )

        # first calculate bmi to derive weight 
        bmi = np.random.normal(
            loc = 27 + (0 if race == "Male" else -0.5) + (0 if age < 50 else 1),
            scale = 5
        )
        weight = bmi * (height / 100.0) ** 2

        # calculate blood pressure readings + heart rate - dependent on age & partially to each other
        systolic_bp = 110 + 0.5*(age - 40) + np.random.normal(loc = 0, scale = 12, size = 1)
        diastolic_bp = 0.6*systolic_bp + np.random.normal(loc = 0, scale = 8, size = 1)
        heart_rate = 75 - 0.05*(systolic_bp-120) + np.random.normal(loc = 0, scale = 7, size = 1)

        # clip blood pressures and HR to realistic values
        systolic_bp = np.clip(systolic_bp, 90, 200)
        diastolic_bp = np.clip(diastolic_bp, 50, 120)
        heart_rate = np.clip(heart_rate, 40, 140)

        row = [int(age[0]), str(sex[0]), str(race[0]), int(height), int(weight), int(systolic_bp[0]), int(diastolic_bp[0]), int(heart_rate[0])]
        data.append(row)
    df = pl.DataFrame(
        data, 
        orient='row',
        schema = {
            "patient_age":pl.Int32, 
            "patient_gender":pl.Utf8, 
            "patient_race":pl.Utf8, 
            "patient_height_cm":pl.Int32, 
            "patient_weight_kg":pl.Int32, 
            "patient_systolic_bp":pl.Int32, 
            "patient_diastolic_bp":pl.Int32, 
            "patient_heart_rate":pl.Int32
        }
    )
    df = df.with_columns(
        pl.when(pl.col("patient_gender") == "Unknown").then(None).otherwise(pl.col("patient_gender")).alias("patient_gender"),
        pl.when(pl.col("patient_race") == "Unknown").then(None).otherwise(pl.col("patient_race")).alias("patient_race"),
        pl.when(pl.lit(np.random.rand(df.height)) < 0.05).then(None).otherwise(pl.col("patient_diastolic_bp")).alias("patient_diastolic_bp"),
        pl.when(pl.lit(np.random.rand(df.height)) < 0.05).then(None).otherwise(pl.col("patient_systolic_bp")).alias("patient_systolic_bp"),
        pl.when(pl.lit(np.random.rand(df.height)) < 0.05).then(None).otherwise(pl.col("patient_heart_rate")).alias("patient_heart_rate")
    )

    return df

In [3]:
data = generate_data(TOTAL_PATIENTS, RANDOM_SEED)


In [4]:
data

patient_age,patient_gender,patient_race,patient_height_cm,patient_weight_kg,patient_systolic_bp,patient_diastolic_bp,patient_heart_rate
i32,str,str,i32,i32,i32,i32,i32
69,,"""Black or African American""",159,74,127,84,70
38,"""Female""","""White""",158,36,116,67,74
79,"""Male""","""White""",171,82,115,72,71
77,"""Male""","""White""",172,94,113,69,
21,"""Male""",,176,96,90,50,81
…,…,…,…,…,…,…,…
81,"""Female""","""White""",163,85,137,76,61
42,,"""White""",162,43,125,63,68
22,"""Female""","""White""",159,87,92,70,65
63,"""Female""","""Black or African American""",166,64,143,81,80


In [14]:
data

patient_age,patient_gender,patient_race,patient_height_cm,patient_weight_kg,patient_systolic_bp,patient_diastolic_bp,patient_heart_rate
i32,str,str,i32,i32,i32,i32,i32
66,"""Unknown""","""White""",173,81,119,72,77
33,"""Female""","""White""",159,47,100,50,73
35,"""Male""","""White""",170,53,116,69,60
58,"""Female""","""White""",165,76,124,80,75
54,"""Male""","""Unknown""",168,92,100,54,69
…,…,…,…,…,…,…,…
49,"""Male""","""Asian""",169,68,114,76,78
63,"""Male""","""White""",163,58,129,57,65
34,"""Female""","""Asian""",157,78,114,76,77
31,"""Female""","""White""",159,61,113,72,75
