# 03  Candidate Recall + Logistic Ranker

In [1]:
#Adjustable: Add parent directory (which contains utils/) to Python search path
import sys, os
sys.path.append(os.path.abspath(".."))  #  notebooks  sys.path

In [2]:
# === Unified imports ===
import polars as pl

# --- Standard imports ---
import numpy as np, pandas as pd, json, joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss

from utils.config import DATA_DIR, INTERIM_DIR, PROCESSED_DIR
from utils.etl_clean import ensure_interim
from utils.splits import temporal_split, add_crisis_flag
from utils.candidates import build_origin_next_transitions, global_mf_next, build_pc_coords, build_candidates_for_split
from utils.features import build_ports_attr, compute_port_degree, attach_port_side, build_sample_side, merge_all_features
from utils.metrics import eval_topk_mrr

# Force Polars to use all CPU cores
os.environ["POLARS_MAX_THREADS"] = str(os.cpu_count())

# Display and string cache setup
pl.Config.set_tbl_rows(5)
pl.Config.set_tbl_cols(10)
pl.Config.set_tbl_formatting("ASCII_FULL")

# Force Polars to use all CPU cores
os.environ["POLARS_MAX_THREADS"] = str(os.cpu_count())

# Compatibility: enable global string cache for joins
if hasattr(pl, "enable_string_cache"):
    pl.enable_string_cache()   # Polars ≥ 1.0
elif hasattr(pl, "toggle_string_cache"):
    pl.toggle_string_cache(True)  # legacy

In [6]:
%%time
#Load samples and cleaned table
samples = pl.read_parquet(PROCESSED_DIR / "samples_taskA.parquet")
pc = pl.read_parquet(INTERIM_DIR / "port_calls.cleaned.parquet")
tr = pl.read_csv(DATA_DIR / "trades.csv",  try_parse_dates=True)
vs = pl.read_csv(DATA_DIR / "vessels.csv", try_parse_dates=True)

train, val, test = temporal_split(samples)
train = add_crisis_flag(train); val = add_crisis_flag(val); test = add_crisis_flag(test)

# Build transition tables and coordinate maps from training data
trans = build_origin_next_transitions(train)
g_top = global_mf_next(trans)
pc_coords = build_pc_coords(pc)

cache_train_path = PROCESSED_DIR / "cand_train_cached.parquet"
cache_train_sampled_path = PROCESSED_DIR / "cand_train_cached_500k.parquet"

if cache_train_sampled_path.exists():
    cand_train = pl.read_parquet(cache_train_sampled_path)
else:
    if cache_train_path.exists():
        cand_train = pl.read_parquet(cache_train_path)  
    else:
        cand_train = build_candidates_for_split(train, trans, pc_coords, add_true_label=True,  N=10, M=10, global_top1=g_top)
        cand_train.write_parquet(cache_train_path)
        
    print(f"Sampling training set ({len(cand_train):,} rows) down to 500k rows...")
    cand_train = cand_train.sample(n=min(500_000, len(cand_train)), seed=42)

    cand_train.write_parquet(cache_train_sampled_path)
    print(f" Saved sampled training set to: {cache_train_sampled_path}")

cache_val_path = PROCESSED_DIR / "cand_val_cached.parquet"
if cache_val_path.exists():
    cand_val = pl.read_parquet(cache_val_path)
else:
    cand_val = build_candidates_for_split(val,  trans, pc_coords, add_true_label=True,  N=10, M=10, global_top1=g_top)
    cand_val.write_parquet(cache_val_path)

cache_test_path = PROCESSED_DIR / "cand_test_cached.parquet"
if cache_test_path.exists():
    cand_test = pl.read_parquet(cache_test_path)
else:
    cand_test = build_candidates_for_split(test, trans, pc_coords, add_true_label=False, N=10, M=10, global_top1=g_top)
    cand_test.write_parquet(cache_test_path)

ports_attr  = build_ports_attr(pc_coords)
port_degree = compute_port_degree(trans)
cand_train  = attach_port_side(cand_train, ports_attr, port_degree)
cand_val    = attach_port_side(cand_val,   ports_attr, port_degree)
cand_test   = attach_port_side(cand_test,  ports_attr, port_degree)

s_side   = build_sample_side(samples, pc, vs)
cand_train = merge_all_features(cand_train, s_side, train)
cand_val   = merge_all_features(cand_val,   s_side, val)
cand_test  = merge_all_features(cand_test,  s_side, test)

print("cand_train:", cand_train.shape)

Sampling training set (4,857,768 rows) down to 500k rows...
 Saved sampled training set to: /Users/wangwei/Documents/Folders/工作/Kpler/data/processed/cand_train_cached_500k.parquet
cand_train: (1418642, 42)
CPU times: user 2min 34s, sys: 6.04 s, total: 2min 41s
Wall time: 2min 32s


In [7]:
def assert_no_dup_names(df):
    cols = df.columns
    assert len(cols) == len(set(cols)), \
        f"duplicate columns: {[c for c in set(cols) if cols.count(c)>1]}"
assert_no_dup_names(cand_train)

In [9]:
%%time
# Logistic Ranker — One-Hot + Logistic Regression
num_cols = [
    "dist_km","is_same_region","in_cnt","out_cnt","age",
    "prev_dist_km","last_leg_knots_est",
    "month_sin","month_cos","dow_sin","dow_cos",
    "is_crisis_time","dist_x_crisis"
]
cat_cols = ["origin","candidate","vessel_type","dwt_bucket","product_family_dom"]

def to_xy(df: pl.DataFrame):
    """Prepare feature matrix X, label y, and metadata for ranking."""

    # 1) make 'keep' UNIQUE 
    base = ["sample_port_call_id","origin","candidate","label","y"]
    keep = list(dict.fromkeys(base + num_cols + cat_cols))  # preserves order, drops dups

    # 2) add missing columns with defaults
    for c in keep:
        if c not in df.columns:
            df = df.with_columns((pl.lit(0.0) if c in num_cols else pl.lit("unk")).alias(c))

    # 3) select & ensure one row per (sample, candidate)
    df = df.select(keep).unique(subset=["sample_port_call_id","candidate"], keep="first")

    # 4) to pandas for sklearn
    pdf = df.to_pandas()
    X = pdf[num_cols + cat_cols]
    y = pdf["y"].astype(int).values
    meta = pdf[["sample_port_call_id","origin","candidate","label"]]
    return X, y, meta

# === build matrices ===
Xtr, ytr, mtr = to_xy(cand_train)
Xva, yva, mva = to_xy(cand_val)
Xte, yte, mte = to_xy(cand_test)

# === pipeline ===
numeric_transformer = Pipeline(steps=[
     ('imputer', SimpleImputer(strategy='median')),
     ('scaler', StandardScaler())
 ])
 
preproc = ColumnTransformer(
    transformers=[
    ("num", numeric_transformer, num_cols), 
     ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols)
    ],
    remainder="drop", 
    sparse_threshold=1.0
    )

clf = LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=-1)
pipe = Pipeline([("prep", preproc), ("clf", clf)])
pipe.fit(Xtr, ytr)

# === ranking ===
def rank_predict(pipe, X, meta, ks=(1,3,5)):
    proba = pipe.predict_proba(X)[:, 1]
    meta2 = meta.copy()
    meta2["score"] = proba
    topk, truth = {}, []
    for sid, g in meta2.groupby("sample_port_call_id"):
        g2 = g.sort_values("score", ascending=False)
        topk[sid] = g2["candidate"].tolist()
        truth.append(g["label"].iloc[0])
    preds = [topk[sid] for sid in meta2["sample_port_call_id"].unique()]
    return preds, truth

preds_val, truth_val = rank_predict(pipe, Xva, mva)
preds_te,  truth_te  = rank_predict(pipe, Xte, mte)

print("VAL:",  eval_topk_mrr([p[:5] for p in preds_val], truth_val, ks=(1,3,5)))
print("TEST:", eval_topk_mrr([p[:5] for p in preds_te],  truth_te,  ks=(1,3,5)))

outm = PROCESSED_DIR / "model_taskA_logreg.joblib"
joblib.dump(pipe, outm)
print("Model saved to:", outm)

VAL: {'hits@1': 0.005480329531502998, 'hits@3': 0.016654507926905215, 'hits@5': 0.024643689614063807, 'mrr': 0.012072145809979687}
TEST: {'hits@1': 0.006820396232543034, 'hits@3': 0.018048531526933604, 'hits@5': 0.025982461838259176, 'mrr': 0.013557277409177365}
Model saved to: /Users/wangwei/Documents/Folders/工作/Kpler/data/processed/model_taskA_logreg.joblib
CPU times: user 10.7 s, sys: 3.88 s, total: 14.6 s
Wall time: 19.4 s
