# 04  Candidate Recall + GBDT Ranker

In [1]:
#Adjustable: Add parent directory (which contains utils/) to Python search path
import sys, os
sys.path.append(os.path.abspath(".."))  #  notebooks  sys.path

In [2]:
#Project root
from pathlib import Path
proj_root = Path.cwd()
if (proj_root.name.lower() == "notebooks" or (proj_root/"utils").exists() is False) and (proj_root.parent/"utils").exists():
    proj_root = proj_root.parent
if str(proj_root) not in sys.path:
    sys.path.append(str(proj_root))
print("Project root:", proj_root)

#Unified imports
import numpy as np, pandas as pd, polars as pl, json, joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier

from utils.config import DATA_DIR, INTERIM_DIR, PROCESSED_DIR
from utils.etl_clean import ensure_interim
from utils.splits import temporal_split, add_crisis_flag
from utils.candidates import build_origin_next_transitions, global_mf_next, build_pc_coords, build_candidates_for_split
from utils.features import build_ports_attr, compute_port_degree, attach_port_side, build_sample_side, merge_all_features
from utils.metrics import eval_topk_mrr

Project root: /Users/wangwei/Documents/Folders/工作/Kpler


In [3]:
samples = pl.read_parquet(PROCESSED_DIR / "samples_taskA.parquet")
pc = pl.read_parquet(INTERIM_DIR / "port_calls.cleaned.parquet")
tr = pl.read_csv(DATA_DIR / "trades.csv",  try_parse_dates=True)
vs = pl.read_csv(DATA_DIR / "vessels.csv", try_parse_dates=True)

train, val, test = temporal_split(samples)
train = add_crisis_flag(train); val = add_crisis_flag(val); test = add_crisis_flag(test)

trans = build_origin_next_transitions(train)
g_top = global_mf_next(trans)
pc_coords = build_pc_coords(pc)

cand_train = build_candidates_for_split(train, trans, pc_coords, add_true_label=True,  N=10, M=10, global_top1=g_top)
cand_val   = build_candidates_for_split(val,   trans, pc_coords, add_true_label=True,  N=10, M=10, global_top1=g_top)
cand_test  = build_candidates_for_split(test,  trans, pc_coords, add_true_label=False, N=10, M=10, global_top1=g_top)

ports_attr  = build_ports_attr(pc_coords)
port_degree = compute_port_degree(trans)
cand_train  = attach_port_side(cand_train, ports_attr, port_degree)
cand_val    = attach_port_side(cand_val,   ports_attr, port_degree)
cand_test   = attach_port_side(cand_test,  ports_attr, port_degree)

s_side   = build_sample_side(samples, pc, vs)
cand_train = merge_all_features(cand_train, s_side, train)
cand_val   = merge_all_features(cand_val,   s_side, val)
cand_test  = merge_all_features(cand_test,  s_side, test)

In [4]:
#  GBDT HGB
num_cols = ["dist_km","is_same_region","in_cnt","out_cnt","age",
            "prev_dist_km","last_leg_knots_est","month_sin","month_cos","dow_sin","dow_cos",
            "is_crisis_time","dist_x_crisis"]
cat_cols = ["origin","candidate","vessel_type","dwt_bucket","product_family_dom"]

def to_xy(df: pl.DataFrame):
    keep = ["sample_port_call_id","origin","candidate","label","y"] + num_cols + cat_cols
    for c in num_cols:
        if c not in df.columns: df = df.with_columns(pl.lit(0.0).alias(c))
    for c in cat_cols:
        if c not in df.columns: df = df.with_columns(pl.lit("unk").alias(c))
    pdf = df.select(keep).to_pandas()
    X_num = pdf[num_cols].values
    #  denseHGB 
    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    X_cat = enc.fit_transform(pdf[cat_cols])
    X = np.hstack([X_num, X_cat])
    y = pdf["y"].values
    meta = pdf[["sample_port_call_id","origin","candidate","label"]]
    return X, y, meta, enc

Xtr, ytr, mtr, enc = to_xy(cand_train)
#  val/test  encoder
def to_xy_with_enc(df: pl.DataFrame, enc):
    keep = ["sample_port_call_id","origin","candidate","label","y"] + num_cols + cat_cols
    for c in num_cols:
        if c not in df.columns: df = df.with_columns(pl.lit(0.0).alias(c))
    for c in cat_cols:
        if c not in df.columns: df = df.with_columns(pl.lit("unk").alias(c))
    pdf = df.select(keep).to_pandas()
    X_num = pdf[num_cols].values
    X_cat = enc.transform(pdf[cat_cols])
    X = np.hstack([X_num, X_cat])
    y = pdf["y"].values
    meta = pdf[["sample_port_call_id","origin","candidate","label"]]
    return X, y, meta

Xva, yva, mva = to_xy_with_enc(cand_val, enc)
Xte, yte, mte = to_xy_with_enc(cand_test, enc)

clf = HistGradientBoostingClassifier(max_depth=8, learning_rate=0.08, max_iter=300)
clf.fit(Xtr, ytr)

def rank_predict_hgb(clf, X, meta, ks=(1,3,5)):
    proba = clf.predict_proba(X)[:,1]
    meta2 = meta.copy()
    meta2["score"] = proba
    topk = {}
    for sid, g in meta2.groupby("sample_port_call_id"):
        g2 = g.sort_values("score", ascending=False)
        topk[sid] = g2["candidate"].tolist()
    truth, preds = [], []
    for sid, g in meta2.groupby("sample_port_call_id"):
        truth.append(g["label"].iloc[0])
        preds.append(topk[sid])
    return preds, truth

from utils.metrics import eval_topk_mrr
preds_val, truth_val = rank_predict_hgb(clf, Xva, mva)
preds_te,  truth_te  = rank_predict_hgb(clf, Xte, mte)
print("VAL:", eval_topk_mrr([p[:5] for p in preds_val], truth_val, ks=(1,3,5)))
print("TEST:", eval_topk_mrr([p[:5] for p in preds_te],  truth_te,  ks=(1,3,5)))

#  encoder
import joblib
joblib.dump({"clf":clf, "enc":enc, "num_cols":num_cols, "cat_cols":cat_cols}, PROCESSED_DIR / "model_taskA_gbdt.joblib")
print("saved:", PROCESSED_DIR / "model_taskA_gbdt.joblib")

DuplicateError: the name 'origin' is duplicate

It's possible that multiple expressions are returning the same default column name. If this is the case, try renaming the columns with `.alias("new_name")` to avoid duplicate column names.