In [1]:
import pandas as pd
import numpy as np

# -----------------------------
# CONFIG
# -----------------------------
BASE_PATH = "route_risk_synthetic_base.csv"
OUTPUT_PATH = "route_risk_expanded_10k.csv"
TARGET_ROWS = 10000
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)

# -----------------------------
# LOAD BASE DATA
# -----------------------------
base_df = pd.read_csv(BASE_PATH)

# -----------------------------
# SAMPLING WITH REPLACEMENT
# -----------------------------
expanded_df = base_df.sample(
    n=TARGET_ROWS,
    replace=True,
    random_state=RANDOM_SEED
).reset_index(drop=True)

# -----------------------------
# ADD CONTROLLED NOISE
# -----------------------------

def add_noise(series, noise_pct, min_val=None, max_val=None):
    noise = np.random.normal(0, noise_pct * series.mean(), size=len(series))
    new_vals = series + noise
    if min_val is not None:
        new_vals = np.maximum(new_vals, min_val)
    if max_val is not None:
        new_vals = np.minimum(new_vals, max_val)
    return new_vals

expanded_df["route_distance_km"] = add_noise(
    expanded_df["route_distance_km"], 0.08, min_val=0.5
)

expanded_df["route_duration_min"] = add_noise(
    expanded_df["route_duration_min"], 0.08, min_val=3
)

expanded_df["intersection_density"] = add_noise(
    expanded_df["intersection_density"], 0.12, min_val=0.05
)

expanded_df["shift_duration_hours"] = add_noise(
    expanded_df["shift_duration_hours"], 0.10, min_val=1, max_val=14
)

# Discrete features (small perturbations)
expanded_df["fatigue_score"] = (
    expanded_df["fatigue_score"] +
    np.random.choice([-1, 0, 1], size=TARGET_ROWS, p=[0.15, 0.7, 0.15])
).clip(1, 5)

# -----------------------------
# RE-LABEL RISK (IMPORTANT)
# -----------------------------

def compute_risk_label(row):
    score = (
        0.4 * row["route_duration_min"] +
        15 * row["intersection_density"] +
        20 * row["is_night"] +
        25 * row["weather_stress_index"] +
        10 * row["fatigue_score"] +
        5 * row["shift_duration_hours"]
    )
    if score > 120:
        return "High"
    elif score > 70:
        return "Medium"
    else:
        return "Low"

expanded_df["route_risk_label"] = expanded_df.apply(
    compute_risk_label, axis=1
)

# -----------------------------
# SHUFFLE FINAL DATASET
# -----------------------------
expanded_df = expanded_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# -----------------------------
# SAVE
# -----------------------------
expanded_df.to_csv(OUTPUT_PATH, index=False)

print("Expanded dataset saved to:", OUTPUT_PATH)
print(expanded_df["route_risk_label"].value_counts())


Expanded dataset saved to: route_risk_expanded_10k.csv
route_risk_label
High      5285
Medium    4387
Low        328
Name: count, dtype: int64


In [2]:
dict={1:1,2:3}
list(dict.keys())

[1, 2]

In [6]:
import joblib
import pandas as pd

MODEL_PATH ='route_risk_logreg.joblib'
sample_input = {
        "route_distance_km": 8.5,
        "route_duration_min": 42,
        "intersection_density": 1.4,
        "is_night": 1,
        "weather_stress_index": 0.5,
        "fatigue_score": 4,
        "shift_duration_hours": 9
    }

model = joblib.load(MODEL_PATH)
coef_df = pd.DataFrame(
    model.coef_,
    columns=list(sample_input.keys()),
    index=["Low", "Medium", "High"]) # type: ignore
print(coef_df)

        route_distance_km  route_duration_min  intersection_density  is_night  \
Low             -0.338343            8.644582             11.154316  4.462522   
Medium           0.007167           -8.869026            -11.474453 -4.427661   
High             0.331176            0.224444              0.320137 -0.034861   

        weather_stress_index  fatigue_score  shift_duration_hours  
Low                 5.911628       2.319215             13.797843  
Medium             -6.391879      -2.299160            -13.850114  
High                0.480251      -0.020055              0.052271  
