# 05 — Sequence Forecast (Greedy / Beam) for Task‑A
# 05 — 任务A多步序列预测（贪心 / Beam）

In [None]:
# 项目根路径
#Project root
import sys
from pathlib import Path
# Resolve the project root: start from the current working directory and step
# up one level when we appear to be inside a sub-folder (e.g. "notebooks")
# while the parent directory is the one that actually holds `utils/`.
proj_root = Path.cwd()
_in_notebooks_dir = proj_root.name.lower() == "notebooks"
_utils_here = (proj_root / "utils").exists()
_utils_above = (proj_root.parent / "utils").exists()
if _utils_above and (_in_notebooks_dir or not _utils_here):
    proj_root = proj_root.parent

# Make `utils.*` importable without adding the same entry twice.
_root_entry = str(proj_root)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

import numpy as np, pandas as pd, polars as pl, joblib
from utils.config import DATA_DIR, INTERIM_DIR, PROCESSED_DIR
from utils.splits import temporal_split, add_crisis_flag
from utils.candidates import build_origin_next_transitions, global_mf_next, build_pc_coords, build_candidates_for_split
from utils.features import build_ports_attr, compute_port_degree, attach_port_side, build_sample_side, merge_all_features

# Load the Task-A samples and the cleaned port calls (parquet), plus the
# trades and vessels tables (CSV, with date parsing enabled).
samples = pl.read_parquet(PROCESSED_DIR / "samples_taskA.parquet")
pc = pl.read_parquet(INTERIM_DIR / "port_calls.cleaned.parquet")
tr, vs = (
    pl.read_csv(DATA_DIR / csv_name, try_parse_dates=True)
    for csv_name in ("trades.csv", "vessels.csv")
)

# Split the samples into train / val / test and add the crisis flag to each part.
train, val, test = (add_crisis_flag(part) for part in temporal_split(samples))

# Transition table and global fallback are built from the training split only;
# port coordinates come from the cleaned port calls.
trans = build_origin_next_transitions(train)
g_top = global_mf_next(trans)
pc_coords = build_pc_coords(pc)

# Pick one test sample (the first row) as the starting point for forecasting.
seed = test.head(1)
print(seed)

In [None]:
# Load a trained ranker. The GBDT bundle is preferred whenever its file exists;
# otherwise fall back to the logistic-regression artifact.
import joblib
lr_path = PROCESSED_DIR / "model_taskA_logreg.joblib"
gbdt_path = PROCESSED_DIR / "model_taskA_gbdt.joblib"
use_gbdt = gbdt_path.exists()

if not use_gbdt:
    # The LR artifact is stored as a single object (a pipeline).
    clf = joblib.load(lr_path)
else:
    # The GBDT artifact is a dict bundling the classifier, its encoder and
    # the numeric / categorical column lists.
    pack = joblib.load(gbdt_path)
    clf, enc = pack["clf"], pack["enc"]
    num_cols, cat_cols = pack["num_cols"], pack["cat_cols"]

In [None]:
# Greedy one-step predictor: shared lookup tables.
# These module-level tables are built once here and read by
# rank_topk_for_sample on every call.
from utils.features import build_sample_side, merge_all_features, build_ports_attr, compute_port_degree, attach_port_side

# Port-side attributes, derived from the port coordinates table.
ports_attr  = build_ports_attr(pc_coords)
# Port degree statistics, derived from the training transition table.
port_degree = compute_port_degree(trans)
# Sample-side features, built from all samples, the port calls and the vessels table.
s_side = build_sample_side(samples, pc, vs)

def rank_topk_for_sample(sample_row: pl.DataFrame, k=5, *, num_cols=None, cat_cols=None):
    """Score the next-port candidates of one sample and return the best ``k``.

    Candidates are generated for ``sample_row`` (without injecting the true
    label), enriched with port-side and sample-side features, scored with the
    module-level model ``clf`` and sorted by descending score.

    Relies on module-level state prepared earlier in the notebook:
    ``trans``, ``pc_coords``, ``g_top``, ``ports_attr``, ``port_degree``,
    ``s_side``, ``clf`` and, for the non-pipeline model, ``enc``.

    Args:
        sample_row: Polars frame holding the sample to forecast from.
        k: Maximum number of candidates to return.
        num_cols: Numeric feature columns fed to the model. Defaults to the
            list originally hard-coded here; pass the list stored with the
            model when it differs.
        cat_cols: Categorical feature columns fed to the model. Same default
            rule as ``num_cols``.

    Returns:
        A pandas DataFrame with columns ``candidate`` and ``score``, sorted by
        ``score`` in descending order and truncated to at most ``k`` rows.
    """
    if num_cols is None:
        num_cols = ["dist_km", "is_same_region", "in_cnt", "out_cnt", "age",
                    "prev_dist_km", "last_leg_knots_est", "month_sin", "month_cos",
                    "dow_sin", "dow_cos", "is_crisis_time", "dist_x_crisis"]
    if cat_cols is None:
        cat_cols = ["origin", "candidate", "vessel_type", "dwt_bucket", "product_family_dom"]
    num_cols, cat_cols = list(num_cols), list(cat_cols)

    # Build the candidate set (the true label is deliberately not injected).
    cands = build_candidates_for_split(
        sample_row, trans, pc_coords,
        add_true_label=False, N=10, M=10, global_top1=g_top,
    )
    cands = attach_port_side(cands, ports_attr, port_degree)
    # Flag crisis periods on the sample before merging the features.
    sample_row = add_crisis_flag(sample_row)
    cands = merge_all_features(cands, s_side, sample_row)

    # Backfill feature columns the merge did not produce with neutral values.
    fillers = [pl.lit(0.0).alias(c) for c in num_cols if c not in cands.columns]
    fillers += [pl.lit("unk").alias(c) for c in cat_cols if c not in cands.columns]
    if fillers:
        cands = cands.with_columns(fillers)

    # "origin" and "candidate" are both id columns and categorical features;
    # de-duplicate (order-preserving) so each column is selected exactly once.
    cols = list(dict.fromkeys(["sample_port_call_id", "origin", "candidate", *num_cols, *cat_cols]))
    pdf = cands.select(cols).to_pandas()

    if hasattr(clf, "predict_proba") and hasattr(clf, "steps"):
        # Pipeline model (LR): it consumes the raw feature frame directly.
        proba = clf.predict_proba(pdf[num_cols + cat_cols])[:, 1]
    else:
        # Bare classifier (GBDT): encode categoricals with the stored encoder.
        X_num = pdf[num_cols].to_numpy()
        X_cat = enc.transform(pdf[cat_cols])
        # The encoder may return a scipy sparse matrix, which np.hstack cannot
        # stack with a dense array; densify it first.
        if hasattr(X_cat, "toarray"):
            X_cat = X_cat.toarray()
        # Column 1 of predict_proba is presumably the positive class - confirm
        # against the training notebook.
        proba = clf.predict_proba(np.hstack([X_num, X_cat]))[:, 1]

    ranked = pdf.assign(score=proba).sort_values("score", ascending=False)
    return ranked.head(k)[["candidate", "score"]]

# Demo: run a one-step prediction for the seed sample and print its top-5 candidates.
topk = rank_topk_for_sample(seed, k=5)
print(topk)