# 02  Baselines: Global-MF & Conditional Markov
# 02   &  Markov

In [1]:
#Adjustable: Add parent directory (which contains utils/) to Python search path
import sys, os
sys.path.append(os.path.abspath(".."))  #  notebooks  sys.path

In [2]:
import json, numpy as np, pandas as pd, polars as pl
from utils.config import DATA_DIR, INTERIM_DIR, PROCESSED_DIR
from utils.etl_clean import ensure_interim, build_samples_taskA
from utils.splits import temporal_split, add_crisis_flag
from utils.candidates import build_origin_next_transitions, global_mf_next
from utils.metrics import eval_topk_mrr

In [3]:
#Load samples; rebuild if missing
s_path = PROCESSED_DIR / "samples_taskA.parquet"
if not s_path.exists():
    pc = ensure_interim()
    tr = pl.read_csv(DATA_DIR / "trades.csv",  try_parse_dates=True)
    vs = pl.read_csv(DATA_DIR / "vessels.csv", try_parse_dates=True)
    _ = build_samples_taskA(pc, tr, vs)

samples = pl.read_parquet(s_path)
train, val, test = temporal_split(samples)
train = add_crisis_flag(train); val = add_crisis_flag(val); test = add_crisis_flag(test)
print("shapes:", train.shape, val.shape, test.shape)

shapes: (260606, 16) (56201, 16) (21553, 16)


In [4]:
#Build transitions & global most-frequent
trans = build_origin_next_transitions(train)
g_top = global_mf_next(trans)
print("GLOBAL_TOP1:", g_top)

GLOBAL_TOP1: Singapore


In [5]:
#Evaluation: Val/Test
def predict_top1_gmf(n):
    return [[g_top] for _ in range(n)]

def topk_by_markov(split_df: pl.DataFrame, K=5):
    #  Top-K 
    topk_map = {}
    for origin, sub in trans.group_by("destination", maintain_order=True):
        arr = sub.sort("cnt", descending=True)["next_call_name"].to_list()[:K]
        topk_map[origin] = arr
    preds = [topk_map.get(o, [g_top]) for o in split_df["destination"].to_list()]
    return preds

val_truth = val["next_call_name"].to_list()
test_truth= test["next_call_name"].to_list()

pred0_val  = predict_top1_gmf(len(val_truth))
pred0_test = predict_top1_gmf(len(test_truth))
res0_val   = eval_topk_mrr(pred0_val,  val_truth, ks=(1,3,5))
res0_test  = eval_topk_mrr(pred0_test, test_truth, ks=(1,3,5))

pred1_val  = topk_by_markov(val, K=5)
pred1_test = topk_by_markov(test, K=5)
res1_val   = eval_topk_mrr(pred1_val,  val_truth, ks=(1,3,5))
res1_test  = eval_topk_mrr(pred1_test, test_truth, ks=(1,3,5))

print("Val  Global-MF:", res0_val)
print("Val  Markov-1 :", res1_val)
print("Test Global-MF:", res0_test)
print("Test Markov-1 :", res1_test)

out = PROCESSED_DIR / "baseline_taskA_metrics.json"
import json
out.write_text(json.dumps({"val_gmf":res0_val,"val_mkv":res1_val,"test_gmf":res0_test,"test_mkv":res1_test}, indent=2), encoding="utf-8")
print("saved:", out)

Val  Global-MF: {'hits@1': 0.03674311844984965, 'hits@3': 0.03674311844984965, 'hits@5': 0.03674311844984965, 'mrr': 0.03674311844984965}
Val  Markov-1 : {'hits@1': 0.03674311844984965, 'hits@3': 0.03674311844984965, 'hits@5': 0.03674311844984965, 'mrr': 0.03674311844984965}
Test Global-MF: {'hits@1': 0.038324131211432286, 'hits@3': 0.038324131211432286, 'hits@5': 0.038324131211432286, 'mrr': 0.038324131211432286}
Test Markov-1 : {'hits@1': 0.038324131211432286, 'hits@3': 0.038324131211432286, 'hits@5': 0.038324131211432286, 'mrr': 0.038324131211432286}
saved: /Users/wangwei/Documents/Folders/工作/Kpler/data/processed/baseline_taskA_metrics.json
