In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

# Put the parent of the current working directory on the import path so the
# `src.*` imports below can be resolved. The membership check keeps re-running
# this cell from adding duplicate entries.
ROOT = Path("..").resolve()
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.config import RANDOM_SEED
from src.data_loading import load_all_raw
from src.preprocessing import make_processed_games, save_processed_games
from src.modeling import (
    build_feature_table,
    split_by_season,
    train_logistic_baseline,
    predict_logistic_proba,
    train_bn_model,
    predict_bn_proba,
    evaluate_model,
)

# Show up to 100 columns when pandas renders a wide frame.
pd.options.display.max_columns = 100

# Seed NumPy's global RNG with the seed imported from src.config so that
# anything drawing from np.random is repeatable between runs.
np.random.seed(RANDOM_SEED)


In [3]:
# Load the raw inputs and turn them into the processed games data.
raw = load_all_raw()
games_processed = make_processed_games(raw)

# Build the modelling table together with the list of feature columns.
table, feature_cols = build_feature_table(games_processed)

# split_by_season yields three partitions (train, validation, test); each one
# unpacks into a feature matrix, a target, and a third object kept as *_df.
(
    (train_X, train_y, train_df),
    (val_X, val_y, val_df),
    (test_X, test_y, test_df),
) = split_by_season(table, feature_cols)

In [None]:
# Logistic baseline: fit on the training split, then score validation and test.
#
# NOTE(review): treat the metrics from this cell with suspicion. According to
# the original author's note, the data still contains the values that are used
# to calculate the score, so the features likely leak the target.
# TODO: confirm and drop those columns from the feature table.
log_clf = train_logistic_baseline(train_X, train_y)

# Predict in the same order as before: validation first, then test.
val_proba_log, test_proba_log = (
    predict_logistic_proba(log_clf, split_X) for split_X in (val_X, test_X)
)

metrics_log_val = evaluate_model("Logistic (val)", val_y, val_proba_log)
metrics_log_test = evaluate_model("Logistic (test)", test_y, test_proba_log)

Logistic (val): acc=1.000, logloss=0.001, auc=1.000
Logistic (test): acc=1.000, logloss=0.001, auc=1.000


In [5]:

# Naive Bayes BN: fit on the training frame, then score validation and test.
# train_bn_model returns the fitted model plus two companions; `disc` must be
# handed back to predict_bn_proba (presumably a discretizer — TODO confirm).
nb, disc, featlist = train_bn_model(train_df, feature_cols)

# Predict in the same order as before: validation first, then test.
val_proba_bn, test_proba_bn = (
    predict_bn_proba(nb, split_df, feature_cols, disc)
    for split_df in (val_df, test_df)
)

metrics_bn_val = evaluate_model("BN (val)", val_y, val_proba_bn)
metrics_bn_test = evaluate_model("BN (test)", test_y, test_proba_bn)


BN (val): acc=0.902, logloss=0.232, auc=0.968
BN (test): acc=0.886, logloss=0.261, auc=0.959
