In [1]:
import numpy as np
import pandas as pd

# -----------------------------
# Configuration
# -----------------------------
RANDOM_SEED = 42
BASE_CSV_PATH = "base_data.csv"
OUTPUT_CSV_PATH = "base_data_expanded.csv"
TARGET_ROWS = 5500  # target size in the 5k–6k range

# Feature columns that the synthetic generation below samples from the base
# dataset. Checked right after loading so a schema problem fails here with a
# clear message instead of as a KeyError in the middle of the generation step.
REQUIRED_BASE_COLUMNS = (
    "shift_duration_hours",
    "consecutive_work_days",
    "night_work_fraction",
    "weather_stress_index",
    "self_reported_tiredness",
)

# Seed NumPy's global RNG so every np.random.* call below is reproducible.
np.random.seed(RANDOM_SEED)

# -----------------------------
# Load base dataset
# -----------------------------
df_base = pd.read_csv(BASE_CSV_PATH)

missing_base_columns = [
    column for column in REQUIRED_BASE_COLUMNS if column not in df_base.columns
]
if missing_base_columns:
    raise ValueError(
        f"{BASE_CSV_PATH} is missing required columns: {missing_base_columns}"
    )
if df_base.empty:
    # Resampling needs at least one observed row to draw from.
    raise ValueError(f"{BASE_CSV_PATH} contains no rows to sample from")

# -----------------------------
# Utility: empirical sampler
# -----------------------------
def sample_from_empirical(series, size, noise_scale=0.05, clip_min=None, clip_max=None):
    """
    Sample values from the empirical distribution of a column
    with small continuous noise added to avoid exact duplication.

    Missing values (NaN) in ``series`` are ignored: they are neither drawn
    as samples nor allowed to turn the noise scale into NaN (which would
    make every returned sample NaN).

    Parameters
    ----------
    series : pandas.Series or array-like of numbers
        Observed values to resample from.
    size : int or tuple of int
        Number (shape) of samples to draw, forwarded to NumPy.
    noise_scale : float, default 0.05
        Standard deviation of the Gaussian noise, expressed as a fraction
        of the standard deviation of the observed values.
    clip_min, clip_max : float or None
        Optional lower / upper bound applied to the noisy samples. Clipping
        is skipped only when both are None.

    Returns
    -------
    numpy.ndarray
        Float array of resampled, jittered (and optionally clipped) values.

    Raises
    ------
    ValueError
        If samples are requested but ``series`` has no non-missing values.
    """
    observed = np.asarray(series, dtype=float)
    observed = observed[~np.isnan(observed)]

    if observed.size == 0:
        if np.prod(size) == 0:
            # Nothing requested and nothing to draw from: empty result.
            return np.empty(size, dtype=float)
        raise ValueError("series has no non-missing values to sample from")

    # Draw order (choice first, then normal) is kept so that, for input
    # without NaNs, results under a fixed global seed are unchanged.
    samples = np.random.choice(observed, size=size, replace=True).astype(float)
    noise = np.random.normal(0, noise_scale * np.std(observed), size=size)
    samples = samples + noise

    if clip_min is not None or clip_max is not None:
        samples = np.clip(samples, clip_min, clip_max)

    return samples

# -----------------------------
# Generate synthetic features
# -----------------------------
# Number of synthetic rows needed to bring the dataset up to TARGET_ROWS.
n_new = TARGET_ROWS - len(df_base)
if n_new < 0:
    # A negative count would otherwise fail inside NumPy with an opaque
    # "negative dimensions are not allowed" error.
    raise ValueError(
        f"Base dataset already has {len(df_base)} rows, which exceeds "
        f"TARGET_ROWS={TARGET_ROWS}; increase TARGET_ROWS or trim the base data."
    )

# Every column below draws from NumPy's global RNG, so the order of the dict
# entries determines which random numbers each column receives. Keep the
# order fixed to keep the output reproducible for a given seed.
synthetic = pd.DataFrame({
    # Continuous: resampled with jitter, then clipped to 2–12 hours.
    "shift_duration_hours": sample_from_empirical(
        df_base["shift_duration_hours"],
        n_new,
        noise_scale=0.05,
        clip_min=2.0,
        clip_max=12.0
    ),
    # Integer-valued: resampled as-is (no noise), so only observed values occur.
    "consecutive_work_days": np.random.choice(
        df_base["consecutive_work_days"].values,
        size=n_new,
        replace=True
    ).astype(int),
    # Continuous fraction: resampled with jitter, then clipped to [0, 1].
    "night_work_fraction": sample_from_empirical(
        df_base["night_work_fraction"],
        n_new,
        noise_scale=0.08,
        clip_min=0.0,
        clip_max=1.0
    ),
    # Resampled as-is (no noise, no cast), so only observed values occur.
    "weather_stress_index": np.random.choice(
        df_base["weather_stress_index"].values,
        size=n_new,
        replace=True
    ),
    # Integer-valued: resampled as-is (no noise), so only observed values occur.
    "self_reported_tiredness": np.random.choice(
        df_base["self_reported_tiredness"].values,
        size=n_new,
        replace=True
    ).astype(int),
})

# -----------------------------
# Workload risk scoring function
# -----------------------------
def compute_workload_score(df):
    """
    Weighted additive workload risk score, one value per row of ``df``.

    Each feature is scaled (hours / 12, days / 7, tiredness / 5; the night
    fraction and weather index are used as-is) and multiplied by its weight:
    0.30, 0.20, 0.20, 0.15 and 0.15 respectively.

    This is a RELATIVE workload risk signal, not a medical model.
    """
    shift_term = 0.30 * (df["shift_duration_hours"] / 12.0)
    streak_term = 0.20 * (df["consecutive_work_days"] / 7.0)
    night_term = 0.20 * df["night_work_fraction"]
    weather_term = 0.15 * df["weather_stress_index"]
    tiredness_term = 0.15 * (df["self_reported_tiredness"] / 5.0)

    # Summed strictly left to right so floating-point results stay stable.
    return shift_term + streak_term + night_term + weather_term + tiredness_term

# Base score
raw_scores = compute_workload_score(synthetic)
synthetic["raw_workload_score"] = raw_scores

# -----------------------------
# Controlled stochastic noise
# -----------------------------
# Gaussian jitter with a standard deviation equal to 5% of the raw score's
# sample standard deviation, one draw per synthetic row.
noise_sigma = 0.05 * synthetic["raw_workload_score"].std()
noise = np.random.normal(loc=0.0, scale=noise_sigma, size=synthetic.shape[0])

synthetic["noisy_workload_score"] = synthetic["raw_workload_score"] + noise

# -----------------------------
# Percentile-based labeling
# -----------------------------
# Cut points at the 33rd and 66th percentiles of the noisy score; they are
# read as module-level names by assign_label.
score_cutoffs = np.percentile(synthetic["noisy_workload_score"], [33, 66])
low_thresh, high_thresh = score_cutoffs

def assign_label(score):
    """
    Map a workload score to "Low", "Medium" or "High".

    Uses the module-level ``low_thresh`` and ``high_thresh`` cut points; both
    bounds are inclusive on the lower bucket. A score that compares false
    against both thresholds (e.g. NaN) falls through to "High".
    """
    if score <= low_thresh:
        return "Low"
    return "Medium" if score <= high_thresh else "High"

# Label every synthetic row from its noisy score.
synthetic["workload_risk_label"] = synthetic["noisy_workload_score"].map(assign_label)

# -----------------------------
# Cleanup helper columns
# -----------------------------
# The two score columns were only needed to derive the label.
helper_columns = ["raw_workload_score", "noisy_workload_score"]
synthetic = synthetic.drop(helper_columns, axis="columns")

# -----------------------------
# Combine with original dataset
# -----------------------------
# Base rows first, synthetic rows after, with a fresh 0..n-1 index.
df_final = pd.concat([df_base, synthetic], axis="index", ignore_index=True)

# -----------------------------
# Save expanded dataset
# -----------------------------
df_final.to_csv(path_or_buf=OUTPUT_CSV_PATH, index=False)

# -----------------------------
# Final output object
# -----------------------------
df_final


Unnamed: 0,shift_duration_hours,consecutive_work_days,night_work_fraction,weather_stress_index,self_reported_tiredness,workload_risk_label
0,3.200000,1,0.000000,0.0,1,Low
1,4.500000,2,0.100000,0.0,2,Low
2,5.000000,2,0.000000,0.5,2,Low
3,6.000000,3,0.200000,0.0,2,Low
4,3.800000,1,0.000000,0.0,1,Low
...,...,...,...,...,...,...
5495,8.443555,3,0.403036,0.5,2,Medium
5496,7.287239,5,0.737127,0.5,3,High
5497,8.041221,2,0.718778,1.0,3,High
5498,8.456852,5,0.204125,0.0,4,Medium


In [2]:
# Shuffle all rows so the base rows no longer sit in one block ahead of the
# synthetic ones, then renumber the index. The seed comes from RANDOM_SEED so
# that changing the configured seed also changes the shuffle.
# NOTE: this rebinds df_final, so re-running this cell shuffles the already
# shuffled frame again; re-run the cell that builds df_final first.
df_final = df_final.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
df_final

Unnamed: 0,shift_duration_hours,consecutive_work_days,night_work_fraction,weather_stress_index,self_reported_tiredness,workload_risk_label
0,9.970953,5,0.201340,1.0,2,High
1,7.308434,2,0.000000,0.5,4,Low
2,11.577056,4,0.711393,0.0,4,High
3,12.000000,5,0.175598,0.5,2,High
4,7.990528,2,0.266654,0.0,4,Low
...,...,...,...,...,...,...
5495,8.150207,4,0.208650,0.5,3,Medium
5496,11.916509,4,0.019929,0.5,3,High
5497,6.602785,1,0.402228,0.5,2,Low
5498,9.341936,6,0.240118,1.0,3,High


In [3]:
# Write the current df_final to OUTPUT_CSV_PATH without the index column,
# replacing any file already at that path.
df_final.to_csv(path_or_buf=OUTPUT_CSV_PATH, index=False)