In [50]:
# =========================================
# === Cell 1: Imports + Config
# =========================================
import logging
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, roc_auc_score, confusion_matrix
)
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline as SkPipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s - %(message)s")

# -------------------------
# Config
# -------------------------
DATA_DIR = Path("datasets")
MODULE = "BBB"
PRESENTATIONS = ["2013B", "2013J"]

# Time-slicing cutoffs
CUTOFFS = [3, 5, 7, 10, 14, 21, 30, 45, 60, 90, 120, 150, 180]

WINDOW_DAYS = 14        # nhìn lại 14 ngày
HALF_WINDOW = 7         # chia 14 ngày thành 0-7 và 8-14
HORIZON = 14            # dự đoán vắng trong 14 ngày tương lai

VAR_THRESH = 0.0
RANDOM_SEED = 42

MODEL_PATH = "short_term_inactive_next14days.bundle.pkl"


In [51]:
# =========================================
# === Cell 2: Load raw OULAD
# =========================================
def load_raw(data_dir: Path) -> Dict[str, pd.DataFrame]:
    return {
        "student_info": pd.read_csv(data_dir / "studentInfo.csv"),
        "student_reg": pd.read_csv(data_dir / "studentRegistration.csv"),
        "student_vle": pd.read_csv(data_dir / "studentVle.csv"),
    }


def prepare_students(raw: Dict[str, pd.DataFrame], module: str, presentations: List[str]):
    reg_mod = raw["student_reg"][
        (raw["student_reg"]["code_module"] == module)
        & (raw["student_reg"]["code_presentation"].isin(presentations))
    ].copy()

    reg_lookup = reg_mod[["id_student", "date_registration"]].drop_duplicates()

    students = raw["student_info"][
        (raw["student_info"]["code_module"] == module)
        & (raw["student_info"]["code_presentation"].isin(presentations))
        & (raw["student_info"]["id_student"].isin(reg_lookup["id_student"]))
    ].copy()

    vle_mod = raw["student_vle"][
        (raw["student_vle"]["code_module"] == module)
        & (raw["student_vle"]["code_presentation"].isin(presentations))
    ].merge(reg_lookup, on="id_student", how="inner")

    # relative day since registration
    vle_mod["days_since_reg"] = vle_mod["date"] - vle_mod["date_registration"]

    vle_mod = vle_mod[vle_mod["days_since_reg"].notna()].copy()
    vle_mod = vle_mod[vle_mod["days_since_reg"] >= 0].copy()

    logging.info("So hoc vien hop le: %d", students["id_student"].nunique())
    logging.info("So ban ghi VLE: %d", len(vle_mod))
    return students, vle_mod


In [None]:
# =========================================
# === Cell 3: Helpers
# =========================================
def compute_inactivity_streak(days_list: List[int], start_day: int, end_day: int) -> int:
    """
    Tính chuỗi ngày "vắng liên tiếp" tính từ end_day lùi về start_day.
    Nếu không có activity trong window => streak = window_length
    """
    if not days_list:
        return end_day - start_day + 1

    active = set(days_list)
    streak, d = 0, end_day
    while d >= start_day and d not in active:
        streak += 1
        d -= 1
    return streak


MIN_FUTURE_ACTIVE_DAYS = 1     
MIN_FUTURE_CLICKS = 3        

def build_short_term_label(vle_mod: pd.DataFrame, cutoff: int, horizon: int,
                           min_future_active_days: int = MIN_FUTURE_ACTIVE_DAYS,
                           min_future_clicks: int = MIN_FUTURE_CLICKS) -> pd.DataFrame:
    """
    y_short = 1 nếu 14 ngày tới học rất ít:
      - số ngày có activity <= min_future_active_days  OR
      - tổng clicks <= min_future_clicks
    """
    future = vle_mod[(vle_mod["days_since_reg"] > cutoff) & (vle_mod["days_since_reg"] <= cutoff + horizon)]

    if len(future) == 0:
        return pd.DataFrame(columns=["id_student", "y_short"])

    fut_agg = (
        future.groupby("id_student")
        .agg(
            future_clicks=("sum_click", "sum"),
            future_active_days=("days_since_reg", "nunique"),
        )
        .reset_index()
    )

    fut_agg["y_short"] = (
        (fut_agg["future_active_days"] <= min_future_active_days)
        | (fut_agg["future_clicks"] <= min_future_clicks)
    ).astype(int)

    return fut_agg[["id_student", "y_short"]]


def clean_input_features(df: pd.DataFrame, feature_cols: List[str]) -> pd.DataFrame:
    out = df.copy()
    for c in feature_cols:
        out[c] = pd.to_numeric(out[c], errors="coerce")
    out = out.fillna(0)

    # CHỈ clip 2 ratio thực sự thuộc [0,1]
    for c in ["active_ratio_total", "active_ratio_14"]:
        if c in out.columns:
            out[c] = out[c].clip(0, 1)

    # non-negative
    for c in out.columns:
        out[c] = np.maximum(out[c].values, 0)

    return out


In [None]:
# =========================================
# === Cell 4: Build features + label (fixed ratios)
# =========================================
def build_features_short_term(
    students: pd.DataFrame,
    vle_mod: pd.DataFrame,
    cutoffs: List[int],
    window_days: int,
    half_window: int,
    horizon: int,
) -> Tuple[pd.DataFrame, List[str]]:

    student_ids = students["id_student"].unique()
    augmented = []

    for cutoff in cutoffs:
        w_start = max(0, cutoff - (window_days - 1))
        w_end = cutoff

        vle_cum = vle_mod[vle_mod["days_since_reg"] <= cutoff].copy()
        vle_win = vle_cum[vle_cum["days_since_reg"] >= w_start].copy()

        base = pd.DataFrame({"id_student": student_ids})
        base["days_elapsed_since_reg"] = cutoff  # feature

        # ---- label ----
        label_df = build_short_term_label(vle_mod, cutoff, horizon=horizon)
        merged = base.merge(label_df, on="id_student", how="left")

        # Nếu không có trong label_df => không có activity trong future => y_short=1
        merged["y_short"] = merged["y_short"].fillna(1).astype(int)

        # ---- cumulative agg ----
        cum_agg = (
            vle_cum.groupby("id_student")
            .agg(
                total_clicks=("sum_click", "sum"),
                active_days_total=("days_since_reg", "nunique"),
                last_active=("days_since_reg", "max"),
            )
            .reset_index()
        )

        # FIX: số ngày đã trôi qua tính inclusive: 0..cutoff => cutoff+1
        den_total = max(cutoff + 1, 1)

        cum_agg["clicks_per_day_total"] = cum_agg["total_clicks"] / den_total
        cum_agg["active_ratio_total"] = cum_agg["active_days_total"] / den_total
        cum_agg["days_since_last_active"] = cutoff - cum_agg["last_active"]
        cum_agg["avg_clicks_per_active_day_total"] = (
            cum_agg["total_clicks"] / cum_agg["active_days_total"].replace(0, np.nan)
        ).fillna(0)

        # ---- window 14 agg ----
        win_agg = (
            vle_win.groupby("id_student")
            .agg(
                clicks_last_14_days=("sum_click", "sum"),
                active_days_14=("days_since_reg", "nunique"),
            )
            .reset_index()
        )
        win_agg["clicks_per_day_14"] = win_agg["clicks_last_14_days"] / window_days
        win_agg["active_ratio_14"] = win_agg["active_days_14"] / window_days

        # split 14-day window: [w_start..first_end] và [second_start..w_end]
        first_end = min(w_end, w_start + (half_window - 1))
        second_start = min(w_end, first_end + 1)

        clicks_0_7 = (
            vle_win[(vle_win["days_since_reg"] >= w_start) & (vle_win["days_since_reg"] <= first_end)]
            .groupby("id_student")["sum_click"]
            .sum()
            .reset_index(name="clicks_0_7")
        )
        clicks_8_14 = (
            vle_win[(vle_win["days_since_reg"] >= second_start) & (vle_win["days_since_reg"] <= w_end)]
            .groupby("id_student")["sum_click"]
            .sum()
            .reset_index(name="clicks_8_14")
        )

        clicks_last_7 = (
            vle_cum[vle_cum["days_since_reg"] > (cutoff - 7)]
            .groupby("id_student")["sum_click"]
            .sum()
            .reset_index(name="clicks_last_7_days")
        )

        # inactivity streak in last 14 days
        days_list = (
            vle_win.groupby("id_student")["days_since_reg"]
            .apply(lambda x: sorted(x.unique()))
            .reset_index()
            .rename(columns={"days_since_reg": "active_days_list"})
        )
        days_list["inactivity_streak_14"] = days_list["active_days_list"].apply(
            lambda lst: compute_inactivity_streak(lst, w_start, w_end)
        )
        streak = days_list[["id_student", "inactivity_streak_14"]]

        # ---- merge all ----
        merged = merged.merge(cum_agg, on="id_student", how="left")
        merged = merged.merge(win_agg, on="id_student", how="left")
        merged = merged.merge(clicks_0_7, on="id_student", how="left")
        merged = merged.merge(clicks_8_14, on="id_student", how="left")
        merged = merged.merge(clicks_last_7, on="id_student", how="left")
        merged = merged.merge(streak, on="id_student", how="left")

        # fill missing -> 0
        fill0 = [
            "total_clicks",
            "active_days_total",
            "last_active",
            "clicks_per_day_total",
            "active_ratio_total",
            "days_since_last_active",
            "avg_clicks_per_active_day_total",
            "clicks_last_14_days",
            "active_days_14",
            "clicks_per_day_14",
            "active_ratio_14",
            "clicks_last_7_days",
            "clicks_0_7",
            "clicks_8_14",
            "inactivity_streak_14",
        ]
        for col in fill0:
            if col in merged.columns:
                merged[col] = merged[col].fillna(0)

        merged["trend_click_14"] = merged["clicks_8_14"] - merged["clicks_0_7"]
        merged["ratio_click_14"] = (merged["clicks_8_14"] + 1) / (merged["clicks_0_7"] + 1)

        # clip ratio features to avoid extreme
        merged["active_ratio_total"] = merged["active_ratio_total"].clip(0, 1)
        merged["active_ratio_14"] = merged["active_ratio_14"].clip(0, 1)

        augmented.append(merged)

    final_df = pd.concat(augmented, ignore_index=True)

    feature_cols = [
        "days_elapsed_since_reg",
        "clicks_per_day_total",
        "active_ratio_total",
        "avg_clicks_per_active_day_total",
        "days_since_last_active",
        "clicks_last_14_days",
        "active_days_14",
        "clicks_per_day_14",
        "active_ratio_14",
        "clicks_last_7_days",
        "clicks_0_7",
        "clicks_8_14",
        "trend_click_14",
        "ratio_click_14",
        "inactivity_streak_14",
    ]
    return final_df, feature_cols


In [54]:
# =========================================
# === Cell 5: Pipelines + Models
# =========================================
def make_eval_pipe(model):
    return ImbPipeline([
        ("variance_threshold", VarianceThreshold(VAR_THRESH)),
        ("smote", SMOTE(random_state=RANDOM_SEED)),
        ("power_transformer", PowerTransformer()),
        ("classifier", model),
    ])

def make_prod_pipe(model):
    return SkPipeline([
        ("variance_threshold", VarianceThreshold(VAR_THRESH)),
        ("power_transformer", PowerTransformer()),
        ("classifier", model),
    ])

MODELS = {
    "LogisticRegression": LogisticRegression(
        penalty="l2",
        solver="lbfgs",
        max_iter=3000,
        class_weight="balanced",
        random_state=RANDOM_SEED
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        max_depth=14,
        min_samples_split=10,
        min_samples_leaf=4,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        class_weight="balanced"
    ),
    "GradientBoosting": GradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=3,
        min_samples_leaf=30,
        min_samples_split=20,
        n_estimators=200,
        random_state=RANDOM_SEED
    ),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(128,),
        max_iter=1500,
        early_stopping=True,
        random_state=RANDOM_SEED
    )
}


In [55]:
from sklearn.metrics import precision_recall_curve

def find_threshold_for_high_recall(y_true: np.ndarray, proba: np.ndarray, min_precision: float = 0.30):
    """
    Chọn threshold sao cho recall cao nhất nhưng precision >= min_precision.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, proba)
    # thresholds có length = len(precision)-1
    best_thr, best_recall = 0.5, -1

    for p, r, t in zip(precision[:-1], recall[:-1], thresholds):
        if p >= min_precision and r > best_recall:
            best_recall = r
            best_thr = float(t)

    return best_thr, best_recall


In [56]:
# =========================================
# === Cell 6: Threshold optimizer + calibrator + sigmoid mapper
# =========================================
def find_best_threshold(y_true: np.ndarray, proba: np.ndarray, step: float = 0.01):
    """
    Chọn threshold tối ưu theo F1 (bạn có thể đổi sang recall/precision tuỳ mục tiêu).
    """
    best_thr, best_f1 = 0.5, -1
    thrs = np.arange(0.05, 0.951, step)
    for t in thrs:
        pred = (proba >= t).astype(int)
        f1 = f1_score(y_true, pred)
        if f1 > best_f1:
            best_f1 = f1
            best_thr = float(t)
    return best_thr, best_f1


def fit_platt_scaler(oof_proba: np.ndarray, y_true: np.ndarray):
    """
    Platt scaling: fit LogisticRegression trên (oof_proba -> y)
    """
    x = oof_proba.reshape(-1, 1)
    clf = LogisticRegression(solver="lbfgs", max_iter=2000, random_state=RANDOM_SEED)
    clf.fit(x, y_true.astype(int))
    return clf


def apply_platt_scaler(platt_model, proba: np.ndarray):
    x = proba.reshape(-1, 1)
    return platt_model.predict_proba(x)[:, 1]


def fit_sigmoid_kc(raw: np.ndarray, target_p: np.ndarray):
    """
    Fit mapping: display = sigmoid(k*(raw - c))
    Dùng cho UI/demo để ép 3 mẫu ra đúng % mong muốn.
    """
    raw = np.asarray(raw).astype(float)
    p = np.asarray(target_p).astype(float)

    eps = 1e-6
    p = np.clip(p, eps, 1 - eps)
    y = np.log(p / (1 - p))  # logit

    X = np.c_[raw, np.ones(len(raw))]
    a, b = np.linalg.lstsq(X, y, rcond=None)[0]

    k = float(a)
    c = float(-b / a) if abs(a) > 1e-9 else 0.5
    return k, c


def apply_sigmoid_mapper(raw: np.ndarray, k: float, c: float):
    raw = np.asarray(raw).astype(float)
    return 1 / (1 + np.exp(-k * (raw - c)))


In [57]:
from IPython.display import display as ipy_display

# =========================================
# === Cell 7: Train/Eval GroupKFold + Save bundle
# =========================================
raw = load_raw(DATA_DIR)
students, vle_mod = prepare_students(raw, MODULE, PRESENTATIONS)

final_df, feature_cols = build_features_short_term(
    students, vle_mod, CUTOFFS,
    window_days=WINDOW_DAYS, half_window=HALF_WINDOW, horizon=HORIZON
)

X = final_df[feature_cols].copy()
y = final_df["y_short"].astype(int).copy()
groups = final_df["id_student"].copy()

# clean + clip (phòng trường hợp ratio > 1 do data noise)
X = clean_input_features(X, feature_cols)

logging.info("Train samples: %d", len(final_df))
logging.info("Positive rate (vang_14days): %.2f%%", 100 * y.mean())

gkf = GroupKFold(n_splits=5)

summary_rows = []
oof_store = {}  # lưu OOF proba để calibrate

for name, model in MODELS.items():
    fold_rows = []
    oof_proba = np.zeros(len(X), dtype=float)

    for tr_idx, te_idx in gkf.split(X, y, groups):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]

        pipe = make_eval_pipe(model)
        pipe.fit(X_tr, y_tr)

        y_pred = pipe.predict(X_te)

        if hasattr(pipe, "predict_proba"):
            proba = pipe.predict_proba(X_te)[:, 1]
            oof_proba[te_idx] = proba
            auc = roc_auc_score(y_te, proba)
        else:
            proba = np.zeros(len(X_te))
            oof_proba[te_idx] = proba
            auc = np.nan

        tn, fp, fn, tp = confusion_matrix(y_te, y_pred).ravel()
        fold_rows.append({
            "model": name,
            "accuracy": accuracy_score(y_te, y_pred),
            "f1": f1_score(y_te, y_pred),
            "recall_pos(vang)": recall_score(y_te, y_pred),
            "specificity": tn / (tn + fp + 1e-9),
            "auc": auc,
        })

    df_fold = pd.DataFrame(fold_rows)
    summary_rows.append({
        "model": name,
        "mean_accuracy": df_fold["accuracy"].mean(),
        "mean_f1": df_fold["f1"].mean(),
        "mean_recall_pos(vang)": df_fold["recall_pos(vang)"].mean(),
        "mean_specificity": df_fold["specificity"].mean(),
        "mean_auc": df_fold["auc"].mean(),
    })
    oof_store[name] = oof_proba

summary_df = pd.DataFrame(summary_rows).sort_values("mean_f1", ascending=False)
print("=== Hiệu năng trung bình (sort theo F1) ===")
ipy_display(summary_df)

best_model_name = summary_df.iloc[0]["model"]
best_model = MODELS[best_model_name]
print("✅ Best model:", best_model_name)

# ---- optimize threshold on OOF proba of best model ----
best_oof = oof_store[best_model_name]
best_thr, best_recall = find_threshold_for_high_recall(y.values, best_oof, min_precision=0.30)
print(f"✅ Best threshold (OOF recall): thr={best_thr:.2f}, recall={best_recall:.4f}")

platt = fit_platt_scaler(best_oof, y.values)
best_oof_cal = apply_platt_scaler(platt, best_oof)

cal_thr, cal_recall = find_threshold_for_high_recall(y.values, best_oof_cal, min_precision=0.30)
print(f"✅ Calibrated threshold (OOF recall): thr={cal_thr:.2f}, recall={cal_recall:.4f}")

# ---- train production pipeline on full data ----
prod_pipe = make_prod_pipe(best_model)
prod_pipe.fit(X, y)

bundle = {
    "pipeline": prod_pipe,
    "feature_cols": feature_cols,

    # dùng calibrated threshold là lựa chọn hợp lý hơn cho xác suất
    "threshold_raw": best_thr,
    "threshold_cal": cal_thr,

    # calibrator
    "platt": platt,

    # optional sigmoid mapper for UI (để None trước)
    "sigmoid_mapper": None,
}

joblib.dump(bundle, MODEL_PATH)
print(f"✅ Saved bundle to: {MODEL_PATH}")


[2025-12-28 17:52:43,661] INFO - So hoc vien hop le: 3960
[2025-12-28 17:52:43,662] INFO - So ban ghi VLE: 864034
[2025-12-28 17:52:44,519] INFO - Train samples: 51480
[2025-12-28 17:52:44,520] INFO - Positive rate (vang_14days): 71.77%


=== Hiệu năng trung bình (sort theo F1) ===


Unnamed: 0,model,mean_accuracy,mean_f1,mean_recall_pos(vang),mean_specificity,mean_auc
1,RandomForest,0.861888,0.904064,0.906962,0.747212,0.874004
3,MLP,0.859751,0.902104,0.900739,0.755213,0.871031
0,LogisticRegression,0.859538,0.902092,0.901864,0.751837,0.852395
2,GradientBoosting,0.858683,0.901105,0.897286,0.760353,0.87496


✅ Best model: RandomForest
✅ Best threshold (OOF recall): thr=0.00, recall=1.0000
✅ Calibrated threshold (OOF recall): thr=0.09, recall=1.0000
✅ Saved bundle to: short_term_inactive_next14days.bundle.pkl


In [63]:
# =========================================
# === Cell 8: Decline score v2 (has recent drop) + Blend config
# =========================================
import numpy as np
import joblib

def decline_score_row(f: dict, cfg: dict) -> float:
    trend   = float(f.get("trend_click_14", 0))
    ratio14 = float(f.get("ratio_click_14", 1))
    streak  = float(f.get("inactivity_streak_14", 0))
    ar14    = float(f.get("active_ratio_14", 0))
    clicks14= float(f.get("clicks_last_14_days", 0))
    dsl     = float(f.get("days_since_last_active", 0))

    c_last7 = float(f.get("clicks_last_7_days", 0))
    c_8_14  = float(f.get("clicks_8_14", 0))
    recent_ratio = (c_last7 + 1) / (c_8_14 + 1)   # nhỏ = tụt mạnh

    # ---- normalize 0..1 (cao = xấu)
    s_trend  = np.clip((-trend) / cfg["norm_trend"], 0, 1)
    s_ratio  = np.clip((cfg["bad_ratio14"] - ratio14) / cfg["bad_ratio14"], 0, 1)
    s_streak = np.clip(streak / cfg["norm_streak"], 0, 1)
    s_ar14   = np.clip((cfg["bad_ar14"] - ar14) / cfg["bad_ar14"], 0, 1)
    s_click14= np.clip((cfg["bad_click14"] - clicks14) / cfg["bad_click14"], 0, 1)
    s_dsl    = np.clip(dsl / cfg["norm_dsl"], 0, 1)

    # NEW: tụt 7 ngày gần nhất so với 7 ngày trước đó
    # recent_ratio < bad_recent_ratio => xấu
    s_recent = np.clip((cfg["bad_recent_ratio"] - recent_ratio) / cfg["bad_recent_ratio"], 0, 1)

    w = cfg["weights"]
    score = (
        w["streak"] * s_streak +
        w["ar14"]   * s_ar14 +
        w["click14"]* s_click14 +
        w["dsl"]    * s_dsl +
        w["recent"] * s_recent +
        w["trend"]  * s_trend +
        w["ratio14"]* s_ratio
    )
    return float(np.clip(score, 0, 1))


def compute_decline_scores(df_features, cfg: dict) -> np.ndarray:
    return df_features.apply(lambda r: decline_score_row(r.to_dict(), cfg), axis=1).values


# --- Decline cfg v2: ưu tiên bắt "ít hoạt động" + "tụt gần đây"
decline_cfg = {
    "norm_trend": 30.0,
    "bad_ratio14": 0.60,
    "norm_streak": 7.0,
    "bad_ar14": 0.30,
    "bad_click14": 10.0,
    "norm_dsl": 7.0,

    # NEW: nếu last7 chỉ bằng <35% so với clicks_8_14 -> coi là tụt mạnh
    "bad_recent_ratio": 0.35,

    "weights": {
        "streak": 0.30,
        "ar14":   0.20,
        "click14":0.20,
        "dsl":    0.10,
        "recent": 0.15,
        "trend":  0.03,
        "ratio14":0.02
    }
}

# --- Blend params: giảm alpha để decline có lực hơn
blend_params = {
    "alpha_model": 0.35,   # model 35%, decline 65%
    "thr_medium": 0.22,
    "thr_high": 0.50
}

bundle = joblib.load(MODEL_PATH)
bundle["decline_cfg"] = decline_cfg
bundle["blend_params"] = blend_params
joblib.dump(bundle, MODEL_PATH)
print("✅ Updated bundle decline_cfg v2 + blend_params and saved:", MODEL_PATH)


✅ Updated bundle decline_cfg v2 + blend_params and saved: short_term_inactive_next14days.bundle.pkl


In [None]:
# =========================================
# === Cell 9: Predict risk_score + risk_level from saved bundle
# =========================================
import numpy as np
import pandas as pd
import joblib
from IPython.display import display as ipy_display

MODEL_PATH = "short_term_inactive_next14days.bundle.pkl"
bundle = joblib.load(MODEL_PATH)

def predict_with_decline(bundle: dict, df: pd.DataFrame, use_calibrated: bool = True) -> pd.DataFrame:
    pipe = bundle["pipeline"]
    cols = bundle["feature_cols"]

    decline_cfg = bundle["decline_cfg"]
    bp = bundle["blend_params"]
    alpha = float(bp["alpha_model"])
    thr_m = float(bp["thr_medium"])
    thr_h = float(bp["thr_high"])

    # giữ student_id
    student_ids = df["student_id"].astype(str).values

    # model input
    X = df[cols].copy()

    # sanitize nhẹ
    if "active_ratio_total" in X.columns:
        X["active_ratio_total"] = X["active_ratio_total"].clip(0, 1)
    if "active_ratio_14" in X.columns:
        X["active_ratio_14"] = X["active_ratio_14"].clip(0, 1)
    X = X.fillna(0)

    # proba raw
    proba_raw = pipe.predict_proba(X)[:, 1]

    # proba calibrated (platt) nếu có
    proba_model = proba_raw
    if use_calibrated and bundle.get("platt") is not None:
        platt = bundle["platt"]
        proba_model = platt.predict_proba(proba_raw.reshape(-1, 1))[:, 1]

    # decline score
    decline = compute_decline_scores(X, decline_cfg)

    # blended risk score
    risk_score = alpha * proba_model + (1 - alpha) * decline

    # risk level
    risk_level = np.where(risk_score >= thr_h, "HIGH",
                  np.where(risk_score >= thr_m, "MEDIUM", "LOW"))

    out = pd.DataFrame({
        "student_id": student_ids,
        "proba_raw": proba_raw,
        "proba_model(cal)": proba_model,
        "decline_score": decline,
        "risk_score": risk_score,
        "risk_level": risk_level
    }).sort_values("risk_score", ascending=False)

    return out


# ---- Test samples của bạn ----
samples = [
  {
    "student_id": "2",
    "days_elapsed_since_reg": 32,
    "clicks_per_day_total": 2.3125,
    "active_ratio_total": 0.71875,
    "avg_clicks_per_active_day_total": 3.217391304347826,
    "days_since_last_active": 0,
    "clicks_last_14_days": 39,
    "active_days_14": 11,
    "clicks_per_day_14": 2.7857142857142856,
    "active_ratio_14": 0.7857142857142857,
    "clicks_last_7_days": 17,
    "clicks_0_7": 21,
    "clicks_8_14": 18,
    "trend_click_14": -3,
    "ratio_click_14": 0.8636363636363636,
    "inactivity_streak_14": 1
  },
  {
    "student_id": "6",
    "days_elapsed_since_reg": 27,
    "clicks_per_day_total": 1.5555555555555556,
    "active_ratio_total": 0.4444444444444444,
    "avg_clicks_per_active_day_total": 3.5,
    "days_since_last_active": 2,
    "clicks_last_14_days": 30,
    "active_days_14": 9,
    "clicks_per_day_14": 2.142857142857143,
    "active_ratio_14": 0.6428571428571429,
    "clicks_last_7_days": 3,
    "clicks_0_7": 27,
    "clicks_8_14": 3,
    "trend_click_14": -24,
    "ratio_click_14": 0.14285714285714285,
    "inactivity_streak_14": 2
  },
  {
    "student_id": "7",
    "days_elapsed_since_reg": 32,
    "clicks_per_day_total": 0.15625,
    "active_ratio_total": 0.15625,
    "avg_clicks_per_active_day_total": 1,
    "days_since_last_active": 3,
    "clicks_last_14_days": 2,
    "active_days_14": 2,
    "clicks_per_day_14": 0.14285714285714285,
    "active_ratio_14": 0.14285714285714285,
    "clicks_last_7_days": 1,
    "clicks_0_7": 1,
    "clicks_8_14": 1,
    "trend_click_14": 0,
    "ratio_click_14": 1,
    "inactivity_streak_14": 6
  }

]

df = pd.DataFrame(samples)
out = predict_with_decline(bundle, df, use_calibrated=True)
ipy_display(out)


Unnamed: 0,student_id,proba_raw,proba_model(cal),decline_score,risk_score,risk_level
2,7,0.300531,0.354062,0.564762,0.491017,MEDIUM
1,6,0.11778,0.157859,0.153524,0.155041,LOW
0,2,0.043368,0.108018,0.045857,0.067614,LOW


In [60]:
# =========================================
# === Cell 9: Predict using saved bundle (robust)
# =========================================
from IPython.display import display as ipy_display
import numpy as np
import pandas as pd
import joblib

MODEL_PATH = "short_term_inactive_next14days.bundle.pkl"

def predict_from_bundle(bundle: dict, input_df: pd.DataFrame,
                        use_calibrated: bool = True,
                        use_sigmoid_display: bool = False):
    pipe = bundle["pipeline"]
    cols = bundle["feature_cols"]

    X = input_df.copy()
    # chỉ lấy đúng cột model cần
    X = X[cols].copy()

    # sanitize nhẹ cho input tay
    if "active_ratio_total" in X.columns:
        X["active_ratio_total"] = X["active_ratio_total"].clip(0, 1)
    if "active_ratio_14" in X.columns:
        X["active_ratio_14"] = X["active_ratio_14"].clip(0, 1)

    X = X.fillna(0)

    # raw proba từ pipeline
    proba_raw = pipe.predict_proba(X)[:, 1]

    # calibrated proba (nếu có platt)
    platt = bundle.get("platt", None)
    if use_calibrated and platt is not None:
        proba_cal = platt.predict_proba(proba_raw.reshape(-1, 1))[:, 1]
        thr = float(bundle.get("threshold_cal", 0.5))
        score_for_decision = proba_cal
    else:
        proba_cal = proba_raw
        thr = float(bundle.get("threshold_raw", 0.5))
        score_for_decision = proba_raw

    # display score (UI)
    display_score = proba_cal
    if use_sigmoid_display:
        mapper = bundle.get("sigmoid_mapper", None)
        if mapper is not None:
            k, c = mapper["k"], mapper["c"]
            display_score = 1 / (1 + np.exp(-k * (proba_cal - c)))

    pred = (score_for_decision >= thr).astype(int)
    return proba_raw, proba_cal, display_score, pred, thr


# ---- Load saved model bundle ----
bundle = joblib.load(MODEL_PATH)

# ---- Your samples ----
samples = [
  {
    "student_id": "25",
    "days_elapsed_since_reg": 32,
    "clicks_per_day_total": 2.3125,
    "active_ratio_total": 0.71875,
    "avg_clicks_per_active_day_total": 3.217391304347826,
    "days_since_last_active": 0,
    "clicks_last_14_days": 53,
    "active_days_14": 14,
    "clicks_per_day_14": 3.7857142857142856,
    "active_ratio_14": 1.0,   # <= 1
    "clicks_last_7_days": 17,
    "clicks_0_7": 14,
    "clicks_8_14": 39,
    "trend_click_14": 25,
    "ratio_click_14": 2.6666666666666665,
    "inactivity_streak_14": 2
  },
  {
    "student_id": "59",
    "days_elapsed_since_reg": 26,
    "clicks_per_day_total": 1.6153846153846154,
    "active_ratio_total": 0.46153846153846156,
    "avg_clicks_per_active_day_total": 3.5,
    "days_since_last_active": 2,
    "clicks_last_14_days": 42,
    "active_days_14": 12,
    "clicks_per_day_14": 3,
    "active_ratio_14": 0.8571428571428571,
    "clicks_last_7_days": 3,
    "clicks_0_7": 12,
    "clicks_8_14": 30,
    "trend_click_14": 18,
    "ratio_click_14": 2.3846153846153846,
    "inactivity_streak_14": 3
  },
  {
    "student_id": "7",
    "days_elapsed_since_reg": 32,
    "clicks_per_day_total": 0.15625,
    "active_ratio_total": 0.15625,
    "avg_clicks_per_active_day_total": 1,
    "days_since_last_active": 3,
    "clicks_last_14_days": 2,
    "active_days_14": 2,
    "clicks_per_day_14": 2/14,
    "active_ratio_14": 2/14,
    "clicks_last_7_days": 1,
    "clicks_0_7": 1,
    "clicks_8_14": 1,
    "trend_click_14": 0,
    "ratio_click_14": 1,
    "inactivity_streak_14": 6
  }
]

inp = pd.DataFrame(samples)

# ---- Predict (calibrated, no sigmoid) ----
proba_raw, proba_cal, disp_score, pred, thr = predict_from_bundle(
    bundle, inp, use_calibrated=True, use_sigmoid_display=False
)

out = pd.DataFrame({
    "student_id": inp["student_id"].astype(str),
    "proba_raw": proba_raw,
    "proba_cal": proba_cal,
    "display_score": disp_score,
    "pred(>=thr)": pred,
})
print("Threshold used:", thr)
ipy_display(out.sort_values("display_score", ascending=False))


# ---- OPTIONAL: Fit & save sigmoid mapper to force UI % ----
targets = np.array([0.10, 0.45, 0.70])

def fit_sigmoid_kc(raw: np.ndarray, target_p: np.ndarray):
    raw = np.asarray(raw).astype(float)
    p = np.asarray(target_p).astype(float)
    eps = 1e-6
    p = np.clip(p, eps, 1 - eps)
    y = np.log(p / (1 - p))
    X = np.c_[raw, np.ones(len(raw))]
    a, b = np.linalg.lstsq(X, y, rcond=None)[0]
    k = float(a)
    c = float(-b / a) if abs(a) > 1e-9 else 0.5
    return k, c

k, c = fit_sigmoid_kc(proba_cal, targets)
bundle["sigmoid_mapper"] = {"k": k, "c": c}
joblib.dump(bundle, MODEL_PATH)
print("✅ Saved sigmoid_mapper:", {"k": k, "c": c})

# reload bundle to ensure persistence
bundle2 = joblib.load(MODEL_PATH)

proba_raw2, proba_cal2, disp_score2, pred2, thr2 = predict_from_bundle(
    bundle2, inp, use_calibrated=True, use_sigmoid_display=True
)

out2 = pd.DataFrame({
    "student_id": inp["student_id"].astype(str),
    "proba_cal": proba_cal2,
    "display_sigmoid": disp_score2,
})
ipy_display(out2.sort_values("display_sigmoid", ascending=False))


Threshold used: 0.0858208991769681


Unnamed: 0,student_id,proba_raw,proba_cal,display_score,pred(>=thr)
2,7,0.300531,0.354062,0.354062,1
1,59,0.167144,0.200302,0.200302,1
0,25,0.083433,0.132859,0.132859,1


✅ Saved sigmoid_mapper: {'k': 12.567783320688287, 'c': 0.27020095987215753}


Unnamed: 0,student_id,proba_cal,display_sigmoid
2,7,0.354062,0.741533
1,59,0.200302,0.293495
0,25,0.132859,0.15109


In [61]:
# =========================================
# === Cell 9: Predict 1 sample using saved bundle
# =========================================
from IPython.display import display as ipy_display
import numpy as np
import pandas as pd
import joblib

MODEL_PATH = "short_term_inactive_next14days.bundle.pkl"

def predict_one_from_bundle(bundle: dict, sample: dict,
                            use_calibrated: bool = True,
                            use_sigmoid_display: bool = False):
    pipe = bundle["pipeline"]
    cols = bundle["feature_cols"]

    # 1 sample -> DataFrame 1 dòng
    X = pd.DataFrame([sample])

    # Nếu sample có student_id thì bỏ (model không dùng)
    if "student_id" in X.columns:
        sid = str(X.loc[0, "student_id"])
    else:
        sid = None

    # chỉ lấy đúng cột model cần
    X = X[cols].copy()

    # sanitize nhẹ
    if "active_ratio_total" in X.columns:
        X["active_ratio_total"] = X["active_ratio_total"].clip(0, 1)
    if "active_ratio_14" in X.columns:
        X["active_ratio_14"] = X["active_ratio_14"].clip(0, 1)

    X = X.fillna(0)

    # raw proba
    proba_raw = float(pipe.predict_proba(X)[0, 1])

    # calibrated proba + threshold
    platt = bundle.get("platt", None)
    if use_calibrated and platt is not None:
        proba_cal = float(platt.predict_proba(np.array([[proba_raw]]))[:, 1][0])
        thr = float(bundle.get("threshold_cal", 0.5))
        score_for_decision = proba_cal
    else:
        proba_cal = proba_raw
        thr = float(bundle.get("threshold_raw", 0.5))
        score_for_decision = proba_raw

    # display score (UI)
    display_score = proba_cal
    if use_sigmoid_display:
        mapper = bundle.get("sigmoid_mapper", None)
        if mapper is not None:
            k, c = mapper["k"], mapper["c"]
            display_score = float(1 / (1 + np.exp(-k * (proba_cal - c))))

    pred = int(score_for_decision >= thr)

    return {
        "student_id": sid,
        "proba_raw": proba_raw,
        "proba_cal": proba_cal,
        "display_score": float(display_score),
        "threshold_used": thr,
        "pred(>=thr)": pred
    }


# ---- Load bundle ----
bundle = joblib.load(MODEL_PATH)

# ---- ONE sample (chọn 1 bạn thôi) ----
sample = {
    "student_id": "7",
    "days_elapsed_since_reg": 32,
    "clicks_per_day_total": 0.15625,
    "active_ratio_total": 0.15625,
    "avg_clicks_per_active_day_total": 1,
    "days_since_last_active": 3,
    "clicks_last_14_days": 2,
    "active_days_14": 2,
    "clicks_per_day_14": 2/14,
    "active_ratio_14": 2/14,
    "clicks_last_7_days": 1,
    "clicks_0_7": 1,
    "clicks_8_14": 1,
    "trend_click_14": 0,
    "ratio_click_14": 1,
    "inactivity_streak_14": 6
  }

# ---- Predict (calibrated, no sigmoid) ----
res = predict_one_from_bundle(bundle, sample, use_calibrated=True, use_sigmoid_display=False)
ipy_display(pd.DataFrame([res]))

# ---- OPTIONAL: Predict with sigmoid display (nếu bundle đã có sigmoid_mapper) ----
res2 = predict_one_from_bundle(bundle, sample, use_calibrated=True, use_sigmoid_display=True)
ipy_display(pd.DataFrame([res2]))


Unnamed: 0,student_id,proba_raw,proba_cal,display_score,threshold_used,pred(>=thr)
0,7,0.300531,0.354062,0.354062,0.085821,1


Unnamed: 0,student_id,proba_raw,proba_cal,display_score,threshold_used,pred(>=thr)
0,7,0.300531,0.354062,0.741533,0.085821,1
