In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Diagnostics notebook\n",
    "\n",
    "Trains an XGBoost model on curated orderbook + results, computes calibration and EV curves."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "from sklearn.calibration import calibration_curve\n",
    "\n",
    "from ml.features import build_features\n",
    "\n",
    "# --- Configuration (everything a reader might tune) ---\n",
    "# Input locations (replace with actual MinIO parquet load).\n",
    "ORDERBOOK_PATH = \"/mnt/data/sample_orderbook.parquet\"\n",
    "RESULTS_PATH = \"/mnt/data/sample_results.parquet\"\n",
    "DECISION_SECS = 30      # forwarded to build_features as decision_secs\n",
    "SEED = 42               # drives the hold-out split and the XGBoost seed\n",
    "TEST_FRACTION = 0.2     # share of rows held out for the diagnostics below\n",
    "USE_GPU = True          # set to False to train on CPU\n",
    "\n",
    "# --- Load data ---\n",
    "orderbook_df = pd.read_parquet(ORDERBOOK_PATH)\n",
    "results_df = pd.read_parquet(RESULTS_PATH)\n",
    "\n",
    "# --- Build features ---\n",
    "X, y = build_features(orderbook_df, results_df, decision_secs=DECISION_SECS)\n",
    "\n",
    "# Drop non-numeric for training\n",
    "Xnum = X.select_dtypes(include=[np.number]).fillna(0)\n",
    "y_arr = np.asarray(y)\n",
    "\n",
    "# --- Hold-out split ---\n",
    "# Calibration / EV measured on the rows the model was fitted on is optimistically\n",
    "# biased, so the diagnostics below only use rows the model never saw.\n",
    "# NOTE(review): a random row split can still leak if several rows share a market;\n",
    "# prefer a market- or time-based split if X exposes such a key -- TODO confirm.\n",
    "rng = np.random.default_rng(SEED)\n",
    "shuffled = rng.permutation(len(Xnum))\n",
    "n_test = int(round(TEST_FRACTION * len(Xnum)))\n",
    "assert 0 < n_test < len(Xnum), 'need enough rows to hold out an evaluation set'\n",
    "test_idx, train_idx = shuffled[:n_test], shuffled[n_test:]\n",
    "y_test = y_arr[test_idx]\n",
    "\n",
    "# --- Train GPU XGBoost ---\n",
    "dtrain = xgb.DMatrix(Xnum.iloc[train_idx], label=y_arr[train_idx])\n",
    "dtest = xgb.DMatrix(Xnum.iloc[test_idx], label=y_test)\n",
    "\n",
    "# XGBoost 2.0 deprecated tree_method='gpu_hist' in favour of tree_method='hist'\n",
    "# combined with device='cuda'; older releases only understand 'gpu_hist'.\n",
    "xgb_major = int(xgb.__version__.split('.')[0])\n",
    "if not USE_GPU:\n",
    "    tree_params = {'tree_method': 'hist'}\n",
    "elif xgb_major >= 2:\n",
    "    tree_params = {'tree_method': 'hist', 'device': 'cuda'}\n",
    "else:\n",
    "    tree_params = {'tree_method': 'gpu_hist'}\n",
    "\n",
    "params = {\n",
    "    'max_depth': 6,\n",
    "    'eta': 0.1,\n",
    "    'objective': 'binary:logistic',\n",
    "    'eval_metric': 'logloss',\n",
    "    'seed': SEED,\n",
    "    **tree_params,\n",
    "}\n",
    "bst = xgb.train(params, dtrain, num_boost_round=200)\n",
    "\n",
    "# Predicted probabilities for the held-out rows only.\n",
    "probs = bst.predict(dtest)\n",
    "\n",
    "# --- Calibration curve ---\n",
    "frac_pos, mean_pred = calibration_curve(y_test, probs, n_bins=10)\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "ax.plot(mean_pred, frac_pos, marker='o')\n",
    "ax.plot([0, 1], [0, 1], '--', color='gray')  # perfect-calibration diagonal\n",
    "ax.set_xlabel(\"Predicted win probability\")\n",
    "ax.set_ylabel(\"Empirical win rate\")\n",
    "ax.set_title(\"Calibration curve\")\n",
    "plt.show()\n",
    "\n",
    "# --- EV curve (unit back bet, 5% commission) ---\n",
    "# The 'ltp' column is used as the odds taken, as in the payout formula below.\n",
    "odds = X[\"ltp\"].to_numpy(dtype=float)[test_idx]\n",
    "commission = 0.05\n",
    "\n",
    "stakes = 1.0\n",
    "# Winners pay (odds - 1) * stake net of commission; losers forfeit the stake.\n",
    "profit = np.where(y_test == 1, (odds - 1) * stakes * (1 - commission), -stakes)\n",
    "# A row without a finite price cannot be bet on; leaving it in would turn the\n",
    "# mean profit of every threshold that selects it into NaN.\n",
    "bettable = np.isfinite(odds)\n",
    "\n",
    "thresholds = np.linspace(0.05, 0.95, 10)\n",
    "evs, covs = [], []\n",
    "for t in thresholds:\n",
    "    mask = (probs >= t) & bettable\n",
    "    covs.append(mask.mean())\n",
    "    # No bets at this threshold: record NaN (a gap in the plot) rather than a\n",
    "    # made-up EV of 0, which would read as break-even.\n",
    "    evs.append(profit[mask].mean() if mask.any() else np.nan)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(thresholds, evs, marker='o')\n",
    "ax.set_xlabel(\"Threshold\")\n",
    "ax.set_ylabel(\"Expected Value per bet\")\n",
    "ax.set_title(\"EV curve\")\n",
    "ax.axhline(0, color='gray', linestyle='--')\n",
    "plt.show()\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(thresholds, covs, marker='o')\n",
    "ax.set_xlabel(\"Threshold\")\n",
    "ax.set_ylabel(\"Coverage (fraction of bets)\")\n",
    "ax.set_title(\"Coverage vs threshold\")\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
