In [8]:
# =========================================
# === Cell 1: Imports + Config
# =========================================
import logging
from pathlib import Path
from typing import Dict, List, Tuple

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, roc_auc_score, confusion_matrix
)
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline as SkPipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from sklearn.metrics import precision_recall_curve

logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s - %(message)s")

# -------------------------
# Config
# -------------------------
DATA_DIR = Path("datasets")
MODULE = "BBB"
PRESENTATIONS = ["2013B", "2013J"]

CUTOFFS = [3, 5, 7, 10, 14, 21, 30, 45, 60, 90, 120, 150, 180]

WINDOW_DAYS = 14
HALF_WINDOW = 7
HORIZON = 14

VAR_THRESH = 0.0
RANDOM_SEED = 42

BUNDLE_PATH = "short_term_inactive_next14days.bundle.pkl"

# “decline blend” config (đổi nếu bạn muốn)
ALPHA_MODEL = 0.70  # 0.7 = ưu tiên model dự đoán bỏ học; 0.3 = decline hỗ trợ
DECLINE_WEIGHTS = {
    "drop_ratio": 0.30,
    "drop_trend": 0.25,
    "low_last7": 0.20,
    "high_streak": 0.15,
    "high_since": 0.10,
}


In [9]:
# =========================================
# === Cell 2: Load raw OULAD
# =========================================
def load_raw(data_dir: Path) -> Dict[str, pd.DataFrame]:
    return {
        "student_info": pd.read_csv(data_dir / "studentInfo.csv"),
        "student_reg": pd.read_csv(data_dir / "studentRegistration.csv"),
        "student_vle": pd.read_csv(data_dir / "studentVle.csv"),
    }


def prepare_students(raw: Dict[str, pd.DataFrame], module: str, presentations: List[str]):
    reg_mod = raw["student_reg"][
        (raw["student_reg"]["code_module"] == module)
        & (raw["student_reg"]["code_presentation"].isin(presentations))
    ].copy()

    reg_lookup = reg_mod[["id_student", "date_registration"]].drop_duplicates()

    students = raw["student_info"][
        (raw["student_info"]["code_module"] == module)
        & (raw["student_info"]["code_presentation"].isin(presentations))
        & (raw["student_info"]["id_student"].isin(reg_lookup["id_student"]))
    ].copy()

    vle_mod = raw["student_vle"][
        (raw["student_vle"]["code_module"] == module)
        & (raw["student_vle"]["code_presentation"].isin(presentations))
    ].merge(reg_lookup, on="id_student", how="inner")

    # relative day since registration
    vle_mod["days_since_reg"] = vle_mod["date"] - vle_mod["date_registration"]

    vle_mod = vle_mod[vle_mod["days_since_reg"].notna()].copy()
    vle_mod = vle_mod[vle_mod["days_since_reg"] >= 0].copy()

    logging.info("So hoc vien hop le: %d", students["id_student"].nunique())
    logging.info("So ban ghi VLE: %d", len(vle_mod))
    return students, vle_mod


In [10]:
# =========================================
# === Cell 3: Helpers (label + features)
# =========================================
def compute_inactivity_streak(days_list: List[int], start_day: int, end_day: int) -> int:
    """
    Chuỗi ngày vắng liên tiếp tính từ end_day lùi về start_day.
    Nếu không có activity trong window => streak = window_length
    """
    if not days_list:
        return end_day - start_day + 1

    active = set(days_list)
    streak, d = 0, end_day
    while d >= start_day and d not in active:
        streak += 1
        d -= 1
    return streak


MIN_FUTURE_ACTIVE_DAYS = 1
MIN_FUTURE_CLICKS = 3

def build_short_term_label(
    vle_mod: pd.DataFrame,
    cutoff: int,
    horizon: int,
    min_future_active_days: int = MIN_FUTURE_ACTIVE_DAYS,
    min_future_clicks: int = MIN_FUTURE_CLICKS
) -> pd.DataFrame:
    """
    y_short = 1 nếu trong (cutoff, cutoff+horizon] hoạt động rất ít:
      - số ngày active <= min_future_active_days  OR
      - tổng clicks <= min_future_clicks
    """
    future = vle_mod[(vle_mod["days_since_reg"] > cutoff) & (vle_mod["days_since_reg"] <= cutoff + horizon)]

    if len(future) == 0:
        return pd.DataFrame(columns=["id_student", "y_short"])

    fut_agg = (
        future.groupby("id_student")
        .agg(
            future_clicks=("sum_click", "sum"),
            future_active_days=("days_since_reg", "nunique"),
        )
        .reset_index()
    )

    fut_agg["y_short"] = (
        (fut_agg["future_active_days"] <= min_future_active_days)
        | (fut_agg["future_clicks"] <= min_future_clicks)
    ).astype(int)

    return fut_agg[["id_student", "y_short"]]


def clean_input_features(df: pd.DataFrame, feature_cols: List[str]) -> pd.DataFrame:
    out = df.copy()
    for c in feature_cols:
        out[c] = pd.to_numeric(out[c], errors="coerce")
    out = out.fillna(0)

    # clip ratio trong [0,1]
    for c in ["active_ratio_total", "active_ratio_14"]:
        if c in out.columns:
            out[c] = out[c].clip(0, 1)

    # non-negative
    for c in out.columns:
        out[c] = np.maximum(out[c].values, 0)

    return out


def build_features_short_term(
    students: pd.DataFrame,
    vle_mod: pd.DataFrame,
    cutoffs: List[int],
    window_days: int,
    half_window: int,
    horizon: int,
) -> Tuple[pd.DataFrame, List[str]]:

    student_ids = students["id_student"].unique()
    augmented = []

    for cutoff in cutoffs:
        w_start = max(0, cutoff - (window_days - 1))
        w_end = cutoff

        vle_cum = vle_mod[vle_mod["days_since_reg"] <= cutoff].copy()
        vle_win = vle_cum[vle_cum["days_since_reg"] >= w_start].copy()

        base = pd.DataFrame({"id_student": student_ids})
        base["days_elapsed_since_reg"] = cutoff

        # ---- label ----
        label_df = build_short_term_label(vle_mod, cutoff, horizon=horizon)
        merged = base.merge(label_df, on="id_student", how="left")

        # không có future record => coi là vắng => y_short=1
        merged["y_short"] = merged["y_short"].fillna(1).astype(int)

        # ---- cumulative agg ----
        cum_agg = (
            vle_cum.groupby("id_student")
            .agg(
                total_clicks=("sum_click", "sum"),
                active_days_total=("days_since_reg", "nunique"),
                last_active=("days_since_reg", "max"),
            )
            .reset_index()
        )

        den_total = max(cutoff + 1, 1)
        cum_agg["clicks_per_day_total"] = cum_agg["total_clicks"] / den_total
        cum_agg["active_ratio_total"] = cum_agg["active_days_total"] / den_total
        cum_agg["days_since_last_active"] = cutoff - cum_agg["last_active"]
        cum_agg["avg_clicks_per_active_day_total"] = (
            cum_agg["total_clicks"] / cum_agg["active_days_total"].replace(0, np.nan)
        ).fillna(0)

        # ---- window 14 agg ----
        win_agg = (
            vle_win.groupby("id_student")
            .agg(
                clicks_last_14_days=("sum_click", "sum"),
                active_days_14=("days_since_reg", "nunique"),
            )
            .reset_index()
        )
        win_agg["clicks_per_day_14"] = win_agg["clicks_last_14_days"] / window_days
        win_agg["active_ratio_14"] = win_agg["active_days_14"] / window_days

        # split 14: [w_start..first_end] và [second_start..w_end]
        first_end = min(w_end, w_start + (half_window - 1))
        second_start = min(w_end, first_end + 1)

        clicks_0_7 = (
            vle_win[(vle_win["days_since_reg"] >= w_start) & (vle_win["days_since_reg"] <= first_end)]
            .groupby("id_student")["sum_click"]
            .sum()
            .reset_index(name="clicks_0_7")
        )
        clicks_8_14 = (
            vle_win[(vle_win["days_since_reg"] >= second_start) & (vle_win["days_since_reg"] <= w_end)]
            .groupby("id_student")["sum_click"]
            .sum()
            .reset_index(name="clicks_8_14")
        )

        clicks_last_7 = (
            vle_cum[vle_cum["days_since_reg"] > (cutoff - 7)]
            .groupby("id_student")["sum_click"]
            .sum()
            .reset_index(name="clicks_last_7_days")
        )

        # inactivity streak in last 14 days
        days_list = (
            vle_win.groupby("id_student")["days_since_reg"]
            .apply(lambda x: sorted(x.unique()))
            .reset_index()
            .rename(columns={"days_since_reg": "active_days_list"})
        )
        days_list["inactivity_streak_14"] = days_list["active_days_list"].apply(
            lambda lst: compute_inactivity_streak(lst, w_start, w_end)
        )
        streak = days_list[["id_student", "inactivity_streak_14"]]

        # ---- merge ----
        merged = merged.merge(cum_agg, on="id_student", how="left")
        merged = merged.merge(win_agg, on="id_student", how="left")
        merged = merged.merge(clicks_0_7, on="id_student", how="left")
        merged = merged.merge(clicks_8_14, on="id_student", how="left")
        merged = merged.merge(clicks_last_7, on="id_student", how="left")
        merged = merged.merge(streak, on="id_student", how="left")

        fill0 = [
            "total_clicks","active_days_total","last_active",
            "clicks_per_day_total","active_ratio_total","days_since_last_active",
            "avg_clicks_per_active_day_total",
            "clicks_last_14_days","active_days_14","clicks_per_day_14","active_ratio_14",
            "clicks_last_7_days","clicks_0_7","clicks_8_14","inactivity_streak_14",
        ]
        for col in fill0:
            if col in merged.columns:
                merged[col] = merged[col].fillna(0)

        merged["trend_click_14"] = merged["clicks_8_14"] - merged["clicks_0_7"]
        merged["ratio_click_14"] = (merged["clicks_8_14"] + 1) / (merged["clicks_0_7"] + 1)

        merged["active_ratio_total"] = merged["active_ratio_total"].clip(0, 1)
        merged["active_ratio_14"] = merged["active_ratio_14"].clip(0, 1)

        augmented.append(merged)

    final_df = pd.concat(augmented, ignore_index=True)

    feature_cols = [
        "days_elapsed_since_reg",
        "clicks_per_day_total",
        "active_ratio_total",
        "avg_clicks_per_active_day_total",
        "days_since_last_active",
        "clicks_last_14_days",
        "active_days_14",
        "clicks_per_day_14",
        "active_ratio_14",
        "clicks_last_7_days",
        "clicks_0_7",
        "clicks_8_14",
        "trend_click_14",
        "ratio_click_14",
        "inactivity_streak_14",
    ]
    return final_df, feature_cols


In [11]:
# =========================================
# === Cell 4: Decline score + blend + thresholds helpers
# =========================================
def compute_decline_score(df: pd.DataFrame, weights: Dict[str, float]) -> np.ndarray:
    """
    Rule-based decline score (0..1) để bắt case “tụt mạnh” (Medium).
    """
    ratio = df["ratio_click_14"].astype(float).clip(0, 3)
    trend = df["trend_click_14"].astype(float).clip(-50, 50)
    last7  = df["clicks_last_7_days"].astype(float).clip(0, 200)
    streak = df["inactivity_streak_14"].astype(float).clip(0, 14)
    since  = df["days_since_last_active"].astype(float).clip(0, 60)

    drop_ratio  = (1 - (ratio / 1.0)).clip(0, 1)      # ratio<1 => tăng rủi ro
    drop_trend  = (-trend / 30).clip(0, 1)            # trend âm => tăng rủi ro
    low_last7   = (1 - (last7 / 20)).clip(0, 1)       # 7 ngày gần đây ít => rủi ro
    high_streak = (streak / 14).clip(0, 1)
    high_since  = (since / 14).clip(0, 1)

    score = (
        weights["drop_ratio"] * drop_ratio +
        weights["drop_trend"] * drop_trend +
        weights["low_last7"] * low_last7 +
        weights["high_streak"] * high_streak +
        weights["high_since"] * high_since
    )
    return np.clip(score, 0, 1)


def find_threshold_for_high_recall(y_true: np.ndarray, proba: np.ndarray, min_precision: float = 0.30):
    """
    Chọn threshold sao cho recall cao nhất nhưng precision >= min_precision.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, proba)
    best_thr, best_recall = 0.5, -1

    for p, r, t in zip(precision[:-1], recall[:-1], thresholds):
        if p >= min_precision and r > best_recall:
            best_recall = r
            best_thr = float(t)

    return best_thr, best_recall


def fit_platt_scaler(oof_proba: np.ndarray, y_true: np.ndarray, random_seed: int = 42):
    """
    Platt scaling: fit LogisticRegression trên (oof_proba -> y)
    """
    x = oof_proba.reshape(-1, 1)
    clf = LogisticRegression(solver="lbfgs", max_iter=2000, random_state=random_seed)
    clf.fit(x, y_true.astype(int))
    return clf


def apply_platt_scaler(platt_model, proba: np.ndarray):
    x = proba.reshape(-1, 1)
    return platt_model.predict_proba(x)[:, 1]


def make_eval_pipe(model):
    return ImbPipeline([
        ("variance_threshold", VarianceThreshold(VAR_THRESH)),
        ("smote", SMOTE(random_state=RANDOM_SEED)),
        ("power_transformer", PowerTransformer()),
        ("classifier", model),
    ])


def make_prod_pipe(model):
    return SkPipeline([
        ("variance_threshold", VarianceThreshold(VAR_THRESH)),
        ("power_transformer", PowerTransformer()),
        ("classifier", model),
    ])


In [12]:
# =========================================
# === Cell 5: Models
# =========================================
MODELS = {
    "LogisticRegression": LogisticRegression(
        penalty="l2",
        solver="lbfgs",
        max_iter=3000,
        class_weight="balanced",
        random_state=RANDOM_SEED
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        max_depth=14,
        min_samples_split=10,
        min_samples_leaf=4,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        class_weight="balanced"
    ),
    "GradientBoosting": GradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=3,
        min_samples_leaf=30,
        min_samples_split=20,
        n_estimators=200,
        random_state=RANDOM_SEED
    ),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(128,),
        max_iter=1500,
        early_stopping=True,
        random_state=RANDOM_SEED
    )
}


In [13]:
# =========================================
# === Cell 6: Train/Eval GroupKFold + Choose best + Save bundle
# =========================================
raw = load_raw(DATA_DIR)
students, vle_mod = prepare_students(raw, MODULE, PRESENTATIONS)

final_df, feature_cols = build_features_short_term(
    students, vle_mod, CUTOFFS,
    window_days=WINDOW_DAYS, half_window=HALF_WINDOW, horizon=HORIZON
)

X = clean_input_features(final_df[feature_cols].copy(), feature_cols)
y = final_df["y_short"].astype(int).values
groups = final_df["id_student"].values

logging.info("Train samples: %d", len(final_df))
logging.info("Positive rate (vang_14days): %.2f%%", 100 * y.mean())

gkf = GroupKFold(n_splits=5)

summary_rows = []
oof_store = {}

for name, model in MODELS.items():
    fold_rows = []
    oof_proba = np.zeros(len(X), dtype=float)

    for tr_idx, te_idx in gkf.split(X, y, groups):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]

        pipe = make_eval_pipe(model)
        pipe.fit(X_tr, y_tr)

        y_pred = pipe.predict(X_te)
        proba = pipe.predict_proba(X_te)[:, 1]
        oof_proba[te_idx] = proba

        auc = roc_auc_score(y_te, proba)
        tn, fp, fn, tp = confusion_matrix(y_te, y_pred).ravel()

        fold_rows.append({
            "model": name,
            "accuracy": accuracy_score(y_te, y_pred),
            "f1": f1_score(y_te, y_pred),
            "recall_pos(vang)": recall_score(y_te, y_pred),
            "specificity": tn / (tn + fp + 1e-9),
            "auc": auc,
        })

    df_fold = pd.DataFrame(fold_rows)
    summary_rows.append({
        "model": name,
        "mean_accuracy": df_fold["accuracy"].mean(),
        "mean_f1": df_fold["f1"].mean(),
        "mean_recall_pos(vang)": df_fold["recall_pos(vang)"].mean(),
        "mean_specificity": df_fold["specificity"].mean(),
        "mean_auc": df_fold["auc"].mean(),
    })
    oof_store[name] = oof_proba

summary_df = pd.DataFrame(summary_rows).sort_values("mean_f1", ascending=False)
display(summary_df)

best_model_name = summary_df.iloc[0]["model"]
best_model = MODELS[best_model_name]
print("✅ Best model:", best_model_name)

# --- OOF -> calibrate ---
best_oof = oof_store[best_model_name]
platt = fit_platt_scaler(best_oof, y, random_seed=RANDOM_SEED)
best_oof_cal = apply_platt_scaler(platt, best_oof)

# --- compute decline on full training rows ---
decline_all = compute_decline_score(final_df, DECLINE_WEIGHTS)

# --- build blended risk OOF (raw & calibrated) ---
risk_oof_raw = ALPHA_MODEL * best_oof + (1 - ALPHA_MODEL) * decline_all
risk_oof_cal = ALPHA_MODEL * best_oof_cal + (1 - ALPHA_MODEL) * decline_all

# --- choose thresholds ON RISK (not proba) ---
risk_thr_high_raw, _ = find_threshold_for_high_recall(y, risk_oof_raw, min_precision=0.45)
risk_thr_high_cal, _ = find_threshold_for_high_recall(y, risk_oof_cal, min_precision=0.45)

risk_thr_med_raw, _  = find_threshold_for_high_recall(y, risk_oof_raw, min_precision=0.30)
risk_thr_med_cal, _  = find_threshold_for_high_recall(y, risk_oof_cal, min_precision=0.30)

print(f"✅ RISK raw thr_med={risk_thr_med_raw:.3f}, thr_high={risk_thr_high_raw:.3f}")
print(f"✅ RISK cal thr_med={risk_thr_med_cal:.3f}, thr_high={risk_thr_high_cal:.3f}")


# Threshold cho “cảnh báo bỏ học” (High) ưu tiên recall nhưng giữ precision tối thiểu
thr_high_raw, rec_high_raw = find_threshold_for_high_recall(y, best_oof, min_precision=0.45)
thr_high_cal, rec_high_cal = find_threshold_for_high_recall(y, best_oof_cal, min_precision=0.45)

# Threshold cho “nhắc nhở sớm” (Medium) dễ hơn (precision thấp hơn)
thr_med_raw, _ = find_threshold_for_high_recall(y, best_oof, min_precision=0.30)
thr_med_cal, _ = find_threshold_for_high_recall(y, best_oof_cal, min_precision=0.30)

print(f"✅ Raw thr_med={thr_med_raw:.3f}, thr_high={thr_high_raw:.3f}")
print(f"✅ Cal thr_med={thr_med_cal:.3f}, thr_high={thr_high_cal:.3f}")

# --- train production pipeline on full data ---
prod_pipe = make_prod_pipe(best_model)
prod_pipe.fit(X, y)

bundle = {
    "version": "v1",
    "module": MODULE,
    "presentations": PRESENTATIONS,
    "horizon_days": HORIZON,
    "window_days": WINDOW_DAYS,

    "pipeline": prod_pipe,
    "feature_cols": feature_cols,

    # calibrator
    "platt": platt,

    # blend config
    "blend_params": {
        "alpha_model": float(ALPHA_MODEL),
    },
    "decline_cfg": {
        "weights": DECLINE_WEIGHTS,
    },

    # thresholds (ưu tiên dùng calibrated)
    "thresholds": {
        "raw": {
            "medium": float(thr_med_raw),
            "high": float(thr_high_raw),
        },
        "cal": {
            "medium": float(thr_med_cal),
            "high": float(thr_high_cal),
        },
        "notes": "Use calibrated thresholds for production."
    },
}

joblib.dump(bundle, BUNDLE_PATH)
print(f"✅ Saved bundle to: {BUNDLE_PATH}")


[2025-12-28 22:29:39,717] INFO - So hoc vien hop le: 3960
[2025-12-28 22:29:39,718] INFO - So ban ghi VLE: 864034
[2025-12-28 22:29:41,068] INFO - Train samples: 51480
[2025-12-28 22:29:41,070] INFO - Positive rate (vang_14days): 71.77%


Unnamed: 0,model,mean_accuracy,mean_f1,mean_recall_pos(vang),mean_specificity,mean_auc
1,RandomForest,0.861888,0.904064,0.906962,0.747212,0.874004
3,MLP,0.859751,0.902104,0.900739,0.755213,0.871031
0,LogisticRegression,0.859538,0.902092,0.901864,0.751837,0.852395
2,GradientBoosting,0.858683,0.901105,0.897286,0.760353,0.87496


✅ Best model: RandomForest
✅ RISK raw thr_med=0.000, thr_high=0.000
✅ RISK cal thr_med=0.060, thr_high=0.060
✅ Raw thr_med=0.000, thr_high=0.000
✅ Cal thr_med=0.086, thr_high=0.086
✅ Saved bundle to: short_term_inactive_next14days.bundle.pkl


In [14]:
# =========================================
# === Cell 7: Demo predict on your samples (Low/Medium/High)
# =========================================
def predict_risk_from_bundle(bundle: dict, df: pd.DataFrame, use_calibrated: bool = True) -> pd.DataFrame:
    cols = bundle["feature_cols"]
    pipe = bundle["pipeline"]
    platt = bundle["platt"]
    alpha = float(bundle["blend_params"]["alpha_model"])
    weights = bundle["decline_cfg"]["weights"]

    X = clean_input_features(df[cols].copy(), cols)

    proba_raw = pipe.predict_proba(X)[:, 1]
    proba = apply_platt_scaler(platt, proba_raw) if use_calibrated else proba_raw

    decline = compute_decline_score(df, weights)
    risk = alpha * proba + (1 - alpha) * decline

    bundle["risk_thresholds"] = {
      "raw": {"medium": float(risk_thr_med_raw), "high": float(risk_thr_high_raw)},
      "cal": {"medium": float(risk_thr_med_cal), "high": float(risk_thr_high_cal)},
      "notes": "Thresholds computed on blended risk (alpha*proba + (1-alpha)*decline)."
    }
    
    thr = bundle["risk_thresholds"]["cal"] if use_calibrated else bundle["risk_thresholds"]["raw"]
    thr_m, thr_h = float(thr["medium"]), float(thr["high"])

    level = np.where(risk >= thr_h, "high",
             np.where(risk >= thr_m, "medium", "low"))

    out = df[["student_id"]].copy()
    out["proba"] = proba
    out["decline"] = decline
    out["risk"] = risk
    out["risk_level"] = level
    return out


samples = [
  {
    "student_id": "2",
    "days_elapsed_since_reg": 32,
    "clicks_per_day_total": 2.3125,
    "active_ratio_total": 0.71875,
    "avg_clicks_per_active_day_total": 3.217391304347826,
    "days_since_last_active": 0,
    "clicks_last_14_days": 39,
    "active_days_14": 11,
    "clicks_per_day_14": 2.7857142857142856,
    "active_ratio_14": 0.7857142857142857,
    "clicks_last_7_days": 17,
    "clicks_0_7": 21,
    "clicks_8_14": 18,
    "trend_click_14": -3,
    "ratio_click_14": 0.8636363636363636,
    "inactivity_streak_14": 1
  },
  {
    "student_id": "6",
    "days_elapsed_since_reg": 27,
    "clicks_per_day_total": 1.5555555555555556,
    "active_ratio_total": 0.4444444444444444,
    "avg_clicks_per_active_day_total": 3.5,
    "days_since_last_active": 2,
    "clicks_last_14_days": 30,
    "active_days_14": 9,
    "clicks_per_day_14": 2.142857142857143,
    "active_ratio_14": 0.6428571428571429,
    "clicks_last_7_days": 3,
    "clicks_0_7": 27,
    "clicks_8_14": 3,
    "trend_click_14": -24,
    "ratio_click_14": 0.14285714285714285,
    "inactivity_streak_14": 2
  },
  {
    "student_id": "7",
    "days_elapsed_since_reg": 32,
    "clicks_per_day_total": 0.15625,
    "active_ratio_total": 0.15625,
    "avg_clicks_per_active_day_total": 1,
    "days_since_last_active": 3,
    "clicks_last_14_days": 2,
    "active_days_14": 2,
    "clicks_per_day_14": 0.14285714285714285,
    "active_ratio_14": 0.14285714285714285,
    "clicks_last_7_days": 1,
    "clicks_0_7": 1,
    "clicks_8_14": 1,
    "trend_click_14": 0,
    "ratio_click_14": 1,
    "inactivity_streak_14": 6
  }
]

demo_df = pd.DataFrame(samples)
bundle = joblib.load(BUNDLE_PATH)

pred_demo = predict_risk_from_bundle(bundle, demo_df, use_calibrated=True)
display(pred_demo)


Unnamed: 0,student_id,proba,decline,risk,risk_level
0,2,0.108018,0.106623,0.1076,high
1,6,0.157859,0.662857,0.309359,high
2,7,0.354062,0.275714,0.330558,high
