In [2]:
!pip install xgboost pandas numpy scikit-learn tqdm
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb
from sklearn.model_selection import GroupKFold

Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Using cached xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl (2.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.0.0


In [3]:
DATA_PATH = 'data/'

In [4]:
df_seeds = pd.read_csv(DATA_PATH + 'MNCAATourneySeeds.csv').reset_index(drop=True)
df_season_results = pd.read_csv(DATA_PATH + 'MRegularSeasonCompactResults.csv').reset_index(drop=True)
df_tourney_results = pd.read_csv(DATA_PATH + 'MNCAATourneyCompactResults.csv').reset_index(drop=True)


In [5]:
df_seeds

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374
...,...,...,...
2621,2025,Z12,1161
2622,2025,Z13,1213
2623,2025,Z14,1423
2624,2025,Z15,1303


In [6]:
df_season_results

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0
...,...,...,...,...,...,...,...,...
192925,2025,132,1196,86,1397,77,N,0
192926,2025,132,1272,84,1412,72,N,0
192927,2025,132,1276,59,1458,53,N,0
192928,2025,132,1433,68,1206,63,N,0


In [7]:
df_team_season_results = pd.concat(
    [
        df_season_results[["Season", "WTeamID", "DayNum", "WScore", "LScore"]]
        .assign(GameResult="W")
        .rename(
            columns={"WTeamID": "TeamID", "WScore": "TeamScore", "LScore": "OppScore"}
        ),
        df_season_results[["Season", "LTeamID", "DayNum", "WScore", "LScore"]]
        .assign(GameResult="L")
        .rename(
            columns={"LTeamID": "TeamID", "LScore": "TeamScore", "WScore": "OppScore"}
        ),
    ]
).reset_index(drop=True)
df_team_season_results

Unnamed: 0,Season,TeamID,DayNum,TeamScore,OppScore,GameResult
0,1985,1228,20,81,64,W
1,1985,1106,25,77,70,W
2,1985,1112,25,63,56,W
3,1985,1165,25,70,54,W
4,1985,1192,25,86,74,W
...,...,...,...,...,...,...
385855,2025,1397,132,77,86,L
385856,2025,1412,132,72,84,L
385857,2025,1458,132,53,59,L
385858,2025,1206,132,63,68,L


In [8]:
# Score Differential
df_team_season_results["ScoreDiff"] = (
    df_team_season_results["TeamScore"] - df_team_season_results["OppScore"]
)
df_team_season_results["Win"] = (df_team_season_results["GameResult"] == "W").astype(
    "int"
)
df_team_season_results

Unnamed: 0,Season,TeamID,DayNum,TeamScore,OppScore,GameResult,ScoreDiff,Win
0,1985,1228,20,81,64,W,17,1
1,1985,1106,25,77,70,W,7,1
2,1985,1112,25,63,56,W,7,1
3,1985,1165,25,70,54,W,16,1
4,1985,1192,25,86,74,W,12,1
...,...,...,...,...,...,...,...,...
385855,2025,1397,132,77,86,L,-9,0
385856,2025,1412,132,72,84,L,-12,0
385857,2025,1458,132,53,59,L,-6,0
385858,2025,1206,132,63,68,L,-5,0


In [9]:
df_team_season_results.sample(10, random_state=529)

Unnamed: 0,Season,TeamID,DayNum,TeamScore,OppScore,GameResult,ScoreDiff,Win
250441,1999,1212,56,67,86,L,-19,0
350856,2019,1242,47,76,80,L,-4,0
216978,1991,1163,38,64,79,L,-15,0
34375,1993,1448,97,106,69,W,37,1
137265,2015,1413,73,74,61,W,13,1
294002,2008,1210,89,86,88,L,-2,0
286829,2007,1160,40,69,72,L,-3,0
341713,2017,1204,89,66,78,L,-12,0
324106,2014,1328,56,98,102,L,-4,0
248076,1998,1247,103,80,86,L,-6,0


In [10]:
# Aggregate the data
team_season_agg = (
    df_team_season_results.groupby(["Season", "TeamID"])
    .agg(
        AvgScoreDiff=("ScoreDiff", "mean"),
        MedianScoreDiff=("ScoreDiff", "median"),
        MinScoreDiff=("ScoreDiff", "min"),
        MaxScoreDiff=("ScoreDiff", "max"),
        Wins=("Win", "sum"),
        Losses=("GameResult", lambda x: (x == "L").sum()),
        WinPercentage=("Win", "mean"),
    )
    .reset_index()
)

In [11]:
team_season_agg.head()

Unnamed: 0,Season,TeamID,AvgScoreDiff,MedianScoreDiff,MinScoreDiff,MaxScoreDiff,Wins,Losses,WinPercentage
0,1985,1102,-5.791667,-5.5,-41,29,5,19,0.208333
1,1985,1103,-3.043478,-2.0,-22,16,9,14,0.391304
2,1985,1104,7.8,6.5,-12,25,21,9,0.7
3,1985,1106,-3.791667,-1.5,-35,28,10,14,0.416667
4,1985,1108,7.96,4.0,-15,35,19,6,0.76


In [12]:
df_seeds

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374
...,...,...,...
2621,2025,Z12,1161
2622,2025,Z13,1213
2623,2025,Z14,1423
2624,2025,Z15,1303


In [13]:
df_seeds["ChalkSeed"] = (
    df_seeds["Seed"].str.replace("a", "").str.replace("b", "").str[1:].astype("int")
)

team_season_agg = team_season_agg.merge(
    df_seeds, on=["Season", "TeamID"], how="left"
)



In [14]:
df_seeds

Unnamed: 0,Season,Seed,TeamID,ChalkSeed
0,1985,W01,1207,1
1,1985,W02,1210,2
2,1985,W03,1228,3
3,1985,W04,1260,4
4,1985,W05,1374,5
...,...,...,...,...
2621,2025,Z12,1161,12
2622,2025,Z13,1213,13
2623,2025,Z14,1423,14
2624,2025,Z15,1303,15


In [15]:
team_season_agg

Unnamed: 0,Season,TeamID,AvgScoreDiff,MedianScoreDiff,MinScoreDiff,MaxScoreDiff,Wins,Losses,WinPercentage,Seed,ChalkSeed
0,1985,1102,-5.791667,-5.5,-41,29,5,19,0.208333,,
1,1985,1103,-3.043478,-2.0,-22,16,9,14,0.391304,,
2,1985,1104,7.800000,6.5,-12,25,21,9,0.700000,X07,7.0
3,1985,1106,-3.791667,-1.5,-35,28,10,14,0.416667,,
4,1985,1108,7.960000,4.0,-15,35,19,6,0.760000,,
...,...,...,...,...,...,...,...,...,...,...,...
13383,2025,1476,-3.566667,-2.0,-35,26,13,17,0.433333,,
13384,2025,1477,-10.516129,-9.0,-35,22,5,26,0.161290,,
13385,2025,1478,-9.466667,-8.5,-41,16,7,23,0.233333,,
13386,2025,1479,-5.964286,-5.5,-34,22,12,16,0.428571,,


In [16]:
team_season_agg.shape, df_seeds.shape

((13388, 11), (2626, 4))

In [17]:
df_team_tourney_results = pd.concat(
    [
        df_tourney_results[
            ["Season", "WTeamID", "LTeamID", "WScore", "LScore"]
        ]
        .assign(GameResult="W")
        .rename(
            columns={
                "WTeamID": "TeamID",
                "LTeamID": "OppTeamID",
                "WScore": "TeamScore",
                "LScore": "OppScore",
            }
        ),
        df_tourney_results[
            ["Season", "LTeamID", "WTeamID", "LScore", "WScore"]
        ]
        .assign(GameResult="L")
        .rename(
            columns={
                "LTeamID": "TeamID",
                "WTeamID": "OppTeamID",
                "LScore": "TeamScore",
                "WScore": "OppScore",
            }
        ),
    ]
).reset_index(drop=True)

df_team_tourney_results["Win"] = (df_team_tourney_results["GameResult"] == "W").astype(
    "int"
)

In [18]:
df_team_tourney_results.head()

Unnamed: 0,Season,TeamID,OppTeamID,TeamScore,OppScore,GameResult,Win
0,1985,1116,1234,63,54,W,1
1,1985,1120,1345,59,58,W,1
2,1985,1207,1250,68,43,W,1
3,1985,1229,1425,58,55,W,1
4,1985,1242,1325,49,38,W,1


In [19]:
df_historic_tourney_features = df_team_tourney_results.merge(
    team_season_agg[
        ["Season","TeamID", "WinPercentage", "MedianScoreDiff", "ChalkSeed"]
    ],
    on=["Season","TeamID"],
    how="left",
).merge(
    team_season_agg[
        ["Season","TeamID", "WinPercentage", "MedianScoreDiff", "ChalkSeed"]
    ].rename(
        columns={
            "TeamID": "OppTeamID",
            "WinPercentage": "OppWinPercentage",
            "MedianScoreDiff": "OppMedianScoreDiff",
            "ChalkSeed": "OppChalkSeed",
        }
    ),
    on=["Season","OppTeamID"],
)

In [20]:
df_historic_tourney_features

Unnamed: 0,Season,TeamID,OppTeamID,TeamScore,OppScore,GameResult,Win,WinPercentage,MedianScoreDiff,ChalkSeed,OppWinPercentage,OppMedianScoreDiff,OppChalkSeed
0,1985,1116,1234,63,54,W,1,0.636364,5.0,9.0,0.666667,9.5,8.0
1,1985,1120,1345,59,58,W,1,0.620690,2.0,11.0,0.680000,9.0,6.0
2,1985,1207,1250,68,43,W,1,0.925926,14.0,1.0,0.379310,-3.0,16.0
3,1985,1229,1425,58,55,W,1,0.740741,6.0,9.0,0.678571,2.5,8.0
4,1985,1242,1325,49,38,W,1,0.766667,5.5,3.0,0.740741,6.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5031,2024,1181,1301,64,76,L,0,0.750000,12.0,4.0,0.611111,6.0,11.0
5032,2024,1397,1345,66,72,L,0,0.750000,10.5,2.0,0.878788,8.0,1.0
5033,2024,1104,1163,72,86,L,0,0.656250,8.0,4.0,0.911765,14.0,1.0
5034,2024,1301,1345,50,63,L,0,0.611111,6.0,11.0,0.878788,8.0,1.0


In [21]:
df_historic_tourney_features["WinPctDiff"] = (
    df_historic_tourney_features["WinPercentage"]
    - df_historic_tourney_features["OppWinPercentage"]
)

df_historic_tourney_features["ChalkSeedDiff"] = (
    df_historic_tourney_features["ChalkSeed"]
    - df_historic_tourney_features["OppChalkSeed"]
)

df_historic_tourney_features["MedianScoreDiffDiff"] = (
    df_historic_tourney_features["MedianScoreDiff"]
    - df_historic_tourney_features["OppMedianScoreDiff"]
)

In [22]:
df_historic_tourney_features.columns

Index(['Season', 'TeamID', 'OppTeamID', 'TeamScore', 'OppScore', 'GameResult',
       'Win', 'WinPercentage', 'MedianScoreDiff', 'ChalkSeed',
       'OppWinPercentage', 'OppMedianScoreDiff', 'OppChalkSeed', 'WinPctDiff',
       'ChalkSeedDiff', 'MedianScoreDiffDiff'],
      dtype='object')

In [23]:
df_historic_tourney_features.sample(5, random_state=529)

Unnamed: 0,Season,TeamID,OppTeamID,TeamScore,OppScore,GameResult,Win,WinPercentage,MedianScoreDiff,ChalkSeed,OppWinPercentage,OppMedianScoreDiff,OppChalkSeed,WinPctDiff,ChalkSeedDiff,MedianScoreDiffDiff
3405,1999,1335,1196,61,75,L,0,0.807692,11.0,11.0,0.714286,13.0,6.0,0.093407,5.0,-2.0
4384,2014,1113,1400,85,87,L,0,0.65625,5.0,10.0,0.69697,5.0,7.0,-0.04072,3.0,0.0
2003,2016,1218,1143,77,66,W,1,0.833333,11.0,13.0,0.69697,8.0,4.0,0.136364,9.0,3.0
2918,1991,1453,1277,58,60,L,0,0.785714,6.5,12.0,0.642857,3.5,5.0,0.142857,7.0,3.0
4714,2019,1125,1268,77,79,L,0,0.833333,12.5,11.0,0.6875,8.0,6.0,0.145833,5.0,4.5


In [24]:
fivethiryeight_scores = pd.read_csv(DATA_PATH + "538Ratings.csv").reset_index(drop=True)

In [25]:
fivethiryeight_scores.head()

Unnamed: 0,Season,TeamID,TeamName,538rating
0,2016,1242,Kansas,94.46
1,2016,1314,North Carolina,93.94
2,2016,1438,Virginia,92.46
3,2016,1277,Michigan State,91.84
4,2016,1328,Oklahoma,89.96


In [26]:
df_historic_tourney_features = df_historic_tourney_features.merge(
    fivethiryeight_scores.drop("TeamName", axis=1),
    on=["Season", "TeamID"],
    how="left",
).dropna(subset=["538rating"])

df_historic_tourney_features = df_historic_tourney_features.merge(
    fivethiryeight_scores.drop("TeamName", axis=1).rename(
        columns={"TeamID": "OppTeamID"}
    ),
    on=["Season", "OppTeamID"],
    how="left",
    suffixes=("", "Opp"),
)

In [27]:
df_historic_tourney_features["538rating_diff"] = (
    df_historic_tourney_features["538rating"]
    - df_historic_tourney_features["538ratingOpp"]
)

In [28]:
df_historic_tourney_features[
    ["Season", "TeamID", "538rating", "538ratingOpp", "538rating_diff"]
].head()

Unnamed: 0,Season,TeamID,538rating,538ratingOpp,538rating_diff
0,2016,1195,71.41,66.72,4.69
1,2016,1455,86.59,85.59,1.0
2,2016,1221,66.85,67.96,-1.11
3,2016,1276,79.57,79.93,-0.36
4,2016,1114,78.9,88.68,-9.78


In [29]:
df_historic_tourney_features["BaselinePred"] = (
    df_historic_tourney_features["ChalkSeed"]
    < df_historic_tourney_features["OppChalkSeed"]
)

df_historic_tourney_features.loc[
    df_historic_tourney_features["ChalkSeed"]
    == df_historic_tourney_features["OppChalkSeed"],
    "BaselinePred",
] = (
    df_historic_tourney_features["WinPercentage"]
    > df_historic_tourney_features["OppWinPercentage"]
)

In [30]:
df_historic_tourney_features

Unnamed: 0,Season,TeamID,OppTeamID,TeamScore,OppScore,GameResult,Win,WinPercentage,MedianScoreDiff,ChalkSeed,OppWinPercentage,OppMedianScoreDiff,OppChalkSeed,WinPctDiff,ChalkSeedDiff,MedianScoreDiffDiff,538rating,538ratingOpp,538rating_diff,BaselinePred
0,2016,1195,1192,96,65,W,1,0.551724,2.0,16.0,0.548387,2.0,16.0,0.003337,0.0,0.0,71.41,66.72,4.69,True
1,2016,1455,1435,70,50,W,1,0.741935,17.0,11.0,0.593750,10.0,11.0,0.148185,0.0,7.0,86.59,85.59,1.00,True
2,2016,1221,1380,59,55,W,1,0.424242,-4.0,16.0,0.612903,4.0,16.0,-0.188661,0.0,-8.0,66.85,67.96,-1.11,False
3,2016,1276,1409,67,62,W,1,0.636364,5.0,11.0,0.645161,8.0,11.0,-0.008798,0.0,-3.0,79.57,79.93,-0.36,False
4,2016,1114,1345,85,83,W,1,0.870968,8.0,12.0,0.764706,13.0,5.0,0.106262,7.0,-5.0,78.90,88.68,-9.78,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
931,2023,1400,1274,81,88,L,0,0.764706,7.0,2.0,0.781250,7.0,5.0,-0.016544,-3.0,0.0,90.08,83.60,6.48,True
932,2023,1166,1361,56,57,L,0,0.636364,10.0,6.0,0.812500,8.0,5.0,-0.176136,1.0,2.0,87.56,86.04,1.52,False
933,2023,1274,1163,59,72,L,0,0.781250,7.0,5.0,0.757576,12.0,4.0,0.023674,1.0,-5.0,83.60,89.24,-5.64,False
934,2023,1194,1361,71,72,L,0,0.906250,12.5,9.0,0.812500,8.0,5.0,0.093750,4.0,4.5,82.57,86.04,-3.47,False


In [31]:
cv_scores_baseline = []
for season in df_historic_tourney_features["Season"].unique():
    pred = df_historic_tourney_features.query("Season == @season")[
        "BaselinePred"
    ].astype("int")
    y = df_historic_tourney_features.query("Season == @season")["Win"]
    score = accuracy_score(y, pred)
    score_ll = log_loss(y, pred)
    cv_scores_baseline.append(score)
    print(f"Holdout season {season} - Accuracy {score:0.4f} Log Loss {score_ll:0.4f}")

print(f"Baseline accuracy {np.mean(cv_scores_baseline):0.4f}")

Holdout season 2016 - Accuracy 0.6716 Log Loss 11.8352
Holdout season 2017 - Accuracy 0.7463 Log Loss 9.1454
Holdout season 2018 - Accuracy 0.6716 Log Loss 11.8352
Holdout season 2019 - Accuracy 0.6866 Log Loss 11.2973
Holdout season 2021 - Accuracy 0.6970 Log Loss 10.9223
Holdout season 2022 - Accuracy 0.6418 Log Loss 12.9112
Holdout season 2023 - Accuracy 0.7015 Log Loss 10.7593
Baseline accuracy 0.6881


In [32]:
FEATURES = [
    # "WinPercentage",
    "MedianScoreDiff",
    # "ChalkSeed",
    # "OppWinPercentage",
    # "OppMedianScoreDiff",
    # "OppChalkSeed",
    "WinPctDiff",
    "ChalkSeedDiff",
    # "538rating",
    # "538ratingOpp",
    # "538rating_diff",
]
TARGET = "Win"


X = df_historic_tourney_features[FEATURES]
y = df_historic_tourney_features[TARGET]
groups = df_historic_tourney_features["Season"]
seasons = df_historic_tourney_features["Season"].unique()

# Setup cross-validation
gkf = GroupKFold(n_splits=df_historic_tourney_features["Season"].nunique())
cv_results = []
models = []

season_idx = 0
for train_index, test_index in gkf.split(X, y, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Prepare the model
    model = xgb.XGBRegressor(
        eval_metric="logloss",
        n_estimators=1_000,
        learning_rate=0.001,
    )
    holdout_season = seasons[season_idx]
    print(f"Holdout Season: {holdout_season}")
    # Train the model
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    # Predict on the test set
    y_pred = model.predict(X_test)
    score_ll = log_loss(y_test, y_pred)
    y_pred = y_pred > 0.5
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    cv_results.append(accuracy)
    season_idx += 1
    print(f"Season {holdout_season}: {accuracy} {score_ll}")
    models.append(model)
# Print the average accuracy across all folds
print("Average CV Accuracy:", np.mean(cv_results))

Holdout Season: 2016
[0]	validation_0-logloss:0.69300
[100]	validation_0-logloss:0.67923
[200]	validation_0-logloss:0.66667
[300]	validation_0-logloss:0.65777
[400]	validation_0-logloss:0.65071
[500]	validation_0-logloss:0.64567
[600]	validation_0-logloss:0.64215
[700]	validation_0-logloss:0.63889
[800]	validation_0-logloss:0.63657
[900]	validation_0-logloss:0.63478
[999]	validation_0-logloss:0.63471
Season 2016: 0.6865671641791045 0.6347127836438969
Holdout Season: 2017
[0]	validation_0-logloss:0.69289
[100]	validation_0-logloss:0.67175
[200]	validation_0-logloss:0.65607
[300]	validation_0-logloss:0.64405
[400]	validation_0-logloss:0.63488
[500]	validation_0-logloss:0.62738
[600]	validation_0-logloss:0.62191
[700]	validation_0-logloss:0.61847
[800]	validation_0-logloss:0.61573
[900]	validation_0-logloss:0.61483
[999]	validation_0-logloss:0.61459
Season 2017: 0.7164179104477612 0.6145922924640412
Holdout Season: 2018
[0]	validation_0-logloss:0.69291
[100]	validation_0-logloss:0.67130
[

In [33]:
TEST_SEASON = 2025  # Change to 2025 when it comes out!

seeds_2025 = pd.read_csv(DATA_PATH + "2025_tourney_seeds.csv")


seeds_2025["ChalkSeed"] = (
    seeds_2025["Seed"].str.replace("a", "").str.replace("b", "").str[1:].astype("int")
)
seeds_2025

Unnamed: 0,Tournament,Seed,TeamID,ChalkSeed
0,M,W01,1181,1
1,M,W02,1104,2
2,M,W03,1458,3
3,M,W04,1112,4
4,M,W05,1332,5
...,...,...,...,...
59,M,Z12,1161,12
60,M,Z13,1213,13
61,M,Z14,1423,14
62,M,Z15,1303,15


In [34]:
tourney_pairs = (
    seeds_2025.merge(seeds_2025, on=["Tournament"], suffixes=("", "Opp"))
    .assign(Season=TEST_SEASON)
    .query("TeamID != TeamIDOpp")
    .rename(columns={"Tournament": "League"})
)

tourney_pairs = (
    tourney_pairs.merge(
        team_season_agg[
            ["Season", "TeamID", "WinPercentage", "MedianScoreDiff"]
        ],
        on=["Season", "TeamID"],
        how="left",
    )
    .merge(
        team_season_agg[
            ["Season", "TeamID", "WinPercentage", "MedianScoreDiff"]
        ].rename(
            columns={
                "TeamID": "TeamIDOpp",
                "WinPercentage": "OppWinPercentage",
                "MedianScoreDiff": "OppMedianScoreDiff",
            }
        ),
        on=["Season", "TeamIDOpp"],
    )
    .reset_index(drop=True)
)

tourney_pairs["OppChalkSeed"] = (
    tourney_pairs["SeedOpp"]
    .str.replace("a", "")
    .str.replace("b", "")
    .str[1:]
    .astype("int")
)

In [35]:
tourney_pairs = tourney_pairs.merge(
    fivethiryeight_scores.drop("TeamName", axis=1),
    on=["Season", "TeamID"],
    how="left",
)

tourney_pairs = tourney_pairs.merge(
    fivethiryeight_scores.drop("TeamName", axis=1).rename(
        columns={"TeamID": "TeamIDOpp"}
    ),
    on=["Season", "TeamIDOpp"],
    how="left",
    suffixes=("", "Opp"),
)

# Diff features
tourney_pairs["538rating_diff"] = (
    tourney_pairs["538rating"] - tourney_pairs["538ratingOpp"]
)

tourney_pairs["BaselinePred"] = (
    tourney_pairs["ChalkSeed"] < tourney_pairs["OppChalkSeed"]
)

tourney_pairs.loc[
    tourney_pairs["ChalkSeed"] == tourney_pairs["OppChalkSeed"],
    "BaselinePred",
] = (
    tourney_pairs["WinPercentage"] > tourney_pairs["OppWinPercentage"]
)

tourney_pairs["WinPctDiff"] = (
    tourney_pairs["WinPercentage"] - tourney_pairs["OppWinPercentage"]
)

tourney_pairs["ChalkSeedDiff"] = (
    tourney_pairs["ChalkSeed"] - tourney_pairs["OppChalkSeed"]
)

tourney_pairs["MedianScoreDiffDiff"] = (
    tourney_pairs["MedianScoreDiff"] - tourney_pairs["OppMedianScoreDiff"]
)

In [36]:
tourney_pairs.head()

Unnamed: 0,League,Seed,TeamID,ChalkSeed,SeedOpp,TeamIDOpp,ChalkSeedOpp,Season,WinPercentage,MedianScoreDiff,OppWinPercentage,OppMedianScoreDiff,OppChalkSeed,538rating,538ratingOpp,538rating_diff,BaselinePred,WinPctDiff,ChalkSeedDiff,MedianScoreDiffDiff
0,M,W01,1181,1,W02,1104,2,2025,0.911765,21.5,0.757576,7.0,2,,,,True,0.154189,-1,14.5
1,M,W01,1181,1,W03,1458,3,2025,0.911765,21.5,0.742857,10.0,3,,,,True,0.168908,-2,11.5
2,M,W01,1181,1,W04,1112,4,2025,0.911765,21.5,0.647059,9.0,4,,,,True,0.264706,-3,12.5
3,M,W01,1181,1,W05,1332,5,2025,0.911765,21.5,0.727273,6.0,5,,,,True,0.184492,-4,15.5
4,M,W01,1181,1,W06,1140,6,2025,0.911765,21.5,0.727273,10.0,6,,,,True,0.184492,-5,11.5


In [37]:
models

[XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.001, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=1000, n_jobs=None,
              num_parallel_tree=None, ...),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enab

In [38]:
for i, model in enumerate(models):
    tourney_pairs[f"pred_model{i}"] = model.predict(tourney_pairs[FEATURES])

In [39]:
tourney_pairs

Unnamed: 0,League,Seed,TeamID,ChalkSeed,SeedOpp,TeamIDOpp,ChalkSeedOpp,Season,WinPercentage,MedianScoreDiff,...,WinPctDiff,ChalkSeedDiff,MedianScoreDiffDiff,pred_model0,pred_model1,pred_model2,pred_model3,pred_model4,pred_model5,pred_model6
0,M,W01,1181,1,W02,1104,2,2025,0.911765,21.5,...,0.154189,-1,14.5,0.618502,0.674659,0.677127,0.644435,0.699901,0.628543,0.628362
1,M,W01,1181,1,W03,1458,3,2025,0.911765,21.5,...,0.168908,-2,11.5,0.726723,0.755181,0.688265,0.735345,0.734813,0.628543,0.705230
2,M,W01,1181,1,W04,1112,4,2025,0.911765,21.5,...,0.264706,-3,12.5,0.650047,0.720708,0.697057,0.711823,0.579176,0.674271,0.567219
3,M,W01,1181,1,W05,1332,5,2025,0.911765,21.5,...,0.184492,-4,15.5,0.707114,0.760297,0.731411,0.713278,0.722327,0.791422,0.597692
4,M,W01,1181,1,W06,1140,6,2025,0.911765,21.5,...,0.184492,-5,11.5,0.675584,0.656770,0.728245,0.703214,0.722327,0.688930,0.597692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,M,Z16,1313,16,Z11,1179,11,2025,0.677419,6.0,...,-0.225806,5,-4.0,0.321461,0.326336,0.496564,0.307767,0.368958,0.379266,0.423431
4028,M,Z16,1313,16,Z12,1161,12,2025,0.677419,6.0,...,-0.057875,4,-2.0,0.528836,0.513948,0.468790,0.451062,0.499967,0.690797,0.509941
4029,M,Z16,1313,16,Z13,1213,13,2025,0.677419,6.0,...,-0.103831,3,-2.0,0.268259,0.297029,0.330787,0.419660,0.306462,0.320500,0.361748
4030,M,Z16,1313,16,Z14,1423,14,2025,0.677419,6.0,...,-0.103831,2,-3.0,0.268259,0.297029,0.330787,0.419660,0.306462,0.320500,0.361748


In [40]:
tourney_pairs["Pred"] = tourney_pairs[
    [f for f in tourney_pairs.columns if "model" in f]
].mean(axis=1)

tourney_pairs["ID"] = (
    tourney_pairs["Season"].astype("str")
    + "_"
    + tourney_pairs["TeamID"].astype("str")
    + "_"
    + tourney_pairs["TeamIDOpp"].astype("str")
)

preds = tourney_pairs.copy()
print(preds['ID'])

0       2025_1181_1104
1       2025_1181_1458
2       2025_1181_1112
3       2025_1181_1332
4       2025_1181_1140
             ...      
4027    2025_1313_1179
4028    2025_1313_1161
4029    2025_1313_1213
4030    2025_1313_1423
4031    2025_1313_1303
Name: ID, Length: 4032, dtype: object


In [41]:
tourney_pairs.head()

Unnamed: 0,League,Seed,TeamID,ChalkSeed,SeedOpp,TeamIDOpp,ChalkSeedOpp,Season,WinPercentage,MedianScoreDiff,...,MedianScoreDiffDiff,pred_model0,pred_model1,pred_model2,pred_model3,pred_model4,pred_model5,pred_model6,Pred,ID
0,M,W01,1181,1,W02,1104,2,2025,0.911765,21.5,...,14.5,0.618502,0.674659,0.677127,0.644435,0.699901,0.628543,0.628362,0.653076,2025_1181_1104
1,M,W01,1181,1,W03,1458,3,2025,0.911765,21.5,...,11.5,0.726723,0.755181,0.688265,0.735345,0.734813,0.628543,0.70523,0.710586,2025_1181_1458
2,M,W01,1181,1,W04,1112,4,2025,0.911765,21.5,...,12.5,0.650047,0.720708,0.697057,0.711823,0.579176,0.674271,0.567219,0.657186,2025_1181_1112
3,M,W01,1181,1,W05,1332,5,2025,0.911765,21.5,...,15.5,0.707114,0.760297,0.731411,0.713278,0.722327,0.791422,0.597692,0.717649,2025_1181_1332
4,M,W01,1181,1,W06,1140,6,2025,0.911765,21.5,...,11.5,0.675584,0.65677,0.728245,0.703214,0.722327,0.68893,0.597692,0.681823,2025_1181_1140


In [42]:
from tqdm import tqdm

# Load and filter data
round_slots = pd.read_csv(
    DATA_PATH + "MNCAATourneySlots.csv"
)
round_slots = round_slots[round_slots["Season"] == 2025]
round_slots = round_slots[
    round_slots["Slot"].str.contains("R")
]  # Filter out First Four

seeds = pd.read_csv(
    DATA_PATH + "2025_tourney_seeds.csv"
)
seeds_m = seeds[seeds["Tournament"] == "M"]

preds["ID"] = preds["ID"].str.split("_")

In [46]:
def prepare_data(seeds, preds):
    # Function preparing the data for the simulation
    seed_dict = seeds.set_index("Seed")["TeamID"].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}
    probas_dict = {}

    for teams, proba in zip(preds["ID"], preds["Pred"]):
        team1, team2 = teams[1], teams[2]

        probas_dict.setdefault(team1, {})[team2] = proba
        probas_dict.setdefault(team2, {})[team1] = 1 - proba

    return seed_dict, inverted_seed_dict, probas_dict


def simulate(round_slots, seeds, inverted_seeds, probas, sim=True):
    """
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - probas (dict): Dictionary containing matchup probabilities.
    - sim (boolean): Simulates match if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    """
    winners = []
    slots = []

    for slot, strong, weak in zip(
        round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed
    ):
        team_1, team_2 = seeds[strong], seeds[weak]

        # Get the probability of team_1 winning
        proba = probas[str(team_1)][str(team_2)]

        if sim:
            # Randomly determine the winner based on the probability
            winner = np.random.choice([team_1, team_2], p=[proba, 1 - proba])
        else:
            # Determine the winner based on the higher probability
            winner = [team_1, team_2][np.argmax([proba, 1 - proba])]

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    # Convert winners to original seeds using the inverted_seeds dictionary
    return [inverted_seeds[w] for w in winners], slots


def run_simulation(brackets=1, seeds=None, preds=None, round_slots=None, sim=True):
    """
    Runs a simulation of bracket tournaments.

    Parameters:
    - brackets (int): Number of brackets to simulate.
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - preds (pd.DataFrame): DataFrame containing prediction information for each match-up.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - sim (boolean): Simulates matches if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    """
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict, probas_dict = prepare_data(seeds, preds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, probas_dict, sim)

        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({"Bracket": bracket, "Slot": slots, "Team": results})

    return result_df


n_brackets = 10000000
result_m = run_simulation(
    brackets=n_brackets, seeds=seeds_m, preds=preds, round_slots=round_slots, sim=True
)
result_m["Tournament"] = "M"
submission = pd.concat([result_m])
submission = submission.reset_index(drop=True)
submission.index.names = ["RowId"]
submission = submission.reset_index()

 41%|████      | 4089716/10000000 [40:05<57:56, 1700.22it/s]  


KeyboardInterrupt: 

In [None]:
submission

Unnamed: 0,RowId,Bracket,Slot,Team,Tournament
0,0,1,R1W1,W01,M
1,1,1,R1W2,W02,M
2,2,1,R1W3,W14,M
3,3,1,R1W4,W13,M
4,4,1,R1W5,W05,M
...,...,...,...,...,...
62999995,62999995,1000000,R4Y1,Y01,M
62999996,62999996,1000000,R4Z1,Z03,M
62999997,62999997,1000000,R5WX,X08,M
62999998,62999998,1000000,R5YZ,Z03,M


In [None]:
result_with_names = submission.rename(columns={"Team": "Seed"}).merge(
    seeds, on=["Seed"], how="left"
)

teams = pd.read_csv(DATA_PATH + "MTeams.csv")

result_with_names = result_with_names.merge(
    teams[["TeamID", "TeamName"]], how="left"
)

In [None]:
result_with_names


Unnamed: 0,RowId,Bracket,Slot,Seed,Tournament_x,Tournament_y,TeamID,TeamName
0,0,1,R1W1,W01,M,M,1181,Duke
1,1,1,R1W2,W02,M,M,1104,Alabama
2,2,1,R1W3,W14,M,M,1285,Montana
3,3,1,R1W4,W13,M,M,1103,Akron
4,4,1,R1W5,W05,M,M,1332,Oregon
...,...,...,...,...,...,...,...,...
62999995,62999995,1000000,R4Y1,Y01,M,M,1120,Auburn
62999996,62999996,1000000,R4Z1,Z03,M,M,1403,Texas Tech
62999997,62999997,1000000,R5WX,X08,M,M,1211,Gonzaga
62999998,62999998,1000000,R5YZ,Z03,M,M,1403,Texas Tech


In [None]:
slot_team_counts = result_with_names.groupby(['Slot', 'TeamName']).size().reset_index(name='Count')

# Sort the results to find the team with the highest count for each slot
slot_team_counts = slot_team_counts.sort_values(by=['Slot', 'Count'], ascending=[True, False])

# Display the results
print(slot_team_counts)

#Now for each slot, we will select the team with the highest count
final_submission = slot_team_counts.groupby('Slot').first().reset_index()
final_submission

     Slot       TeamName   Count
0    R1W1           Duke  765503
1    R1W1   Mt St Mary's  234497
2    R1W2        Alabama  779598
3    R1W2  Robert Morris  220402
5    R1W3      Wisconsin  790856
..    ...            ...     ...
366  R6CH  Robert Morris     889
373  R6CH           Troy     855
358  R6CH   Mt St Mary's     846
361  R6CH     Norfolk St     613
367  R6CH           SIUE     569

[384 rows x 3 columns]


Unnamed: 0,Slot,TeamName,Count
0,R1W1,Duke,765503
1,R1W2,Alabama,779598
2,R1W3,Wisconsin,790856
3,R1W4,Arizona,694401
4,R1W5,Oregon,648881
...,...,...,...
58,R4Y1,Auburn,151152
59,R4Z1,Florida,211040
60,R5WX,Duke,137320
61,R5YZ,Florida,140336


In [None]:
final_submission.to_csv('submission_1000000_sims.csv', index=False)