In [None]:
import numpy as np
import pandas as pd
import os
import xgboost as xgb
from sklearn.metrics import log_loss
from scipy.interpolate import UnivariateSpline
import statsmodels.api as sm
import matplotlib.pyplot as plt
import collections
import scipy.stats as stats
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import warnings
from tqdm import tqdm
from scipy.stats import linregress

# Notebook-wide configuration: hide FutureWarning messages and raise the pandas
# column display limit.
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option("display.max_column", 999)
# Show what is available under the input directory (path is relative to the
# working directory).
print(os.listdir("../input"))
# Folder that every CSV in this notebook is read from.
DATA_PATH = "/kaggle/input/march-machine-learning-mania-2024/"
seeds_2024 = pd.read_csv(DATA_PATH + "2024_tourney_seeds.csv")
# Path of the slots file handed to BracketChecker.
fname_slots = DATA_PATH + 'MNCAATourneySlots.csv'

In [None]:
# Analyze a bracket submission
class BracketChecker(object):
    '''Validate a single-tournament bracket against the tournament Slot layout.

    The layout is read once from a Slots CSV (columns used: Season, Slot,
    StrongSeed, WeakSeed) and turned into two lookup tables: the Slot that the
    winner of a Seed/Slot plays next, and the ordered list of Slots each Seed
    has to win to take the title.
    '''

    def __init__(self, fname_slots):
        '''
        Parameters
        ----------
        fname_slots : str
            path to the file containing tournament Slots info (anything
            accepted by ``pandas.read_csv``)
        '''
        self.fname_slots = fname_slots
        self.df_slots = self._make_df_slots(self.fname_slots)
        self.set_slots = set(self.df_slots['Slot'])
        self.n_slots = self.df_slots.shape[0]
        self.dict_next_slot = self._make_dict_next_slot()
        self.dict_paths_to_victory = self._make_dict_paths_to_victory()

    @staticmethod
    def _make_df_slots(fname_slots):
        '''Reduced version of the dataframe containing the slots information.

        Parameters
        ----------
        fname_slots : str
            path to the file containing tournament Slots info
            Mens or Womens tournament should result in the same output here

        Returns
        -------
        df_slots : pandas DataFrame
            Slots info for NCAA tournament for Round 1 and later, one row per
            Slot, without the Season column
        '''
        all_slots = pd.read_csv(fname_slots)

        # Only keep slots that are part of the traditional tournament
        # (Slot names starting with 'R'; play-ins are dropped).
        is_main_draw = all_slots['Slot'].str.startswith('R')

        # Except for play-ins (which we don't care about) the tournament has
        # the same structure every year, so keep the first occurrence of each
        # Slot and drop the Season column. Non-inplace calls are used so that
        # we never mutate a filtered slice of the original frame.
        return (all_slots[is_main_draw]
                .drop_duplicates('Slot')
                .drop(columns='Season'))

    def _make_dict_next_slot(self):
        '''Makes a dictionary where the value is the next Slot played by the
        team that wins the Slot (or holds the Seed) specified by key.

        Returns
        -------
        next_slot : dict
        '''
        next_slot = {}
        slot_rows = zip(self.df_slots['Slot'],
                        self.df_slots['StrongSeed'],
                        self.df_slots['WeakSeed'])
        for slot, strong, weak in slot_rows:
            # both entrants of a Slot advance into that same Slot
            next_slot[strong] = slot
            next_slot[weak] = slot
        return next_slot

    def _make_dict_paths_to_victory(self):
        '''Dictionary with paths to victory for every seed.

        Returns
        -------
        paths : dict
            Each key is a Seed in the tournament (W01..Z16).  Each value is an
            ordered list containing the Slots that must be won by that Seed to
            win the tournament
        '''
        seeds = [f'{region}{num:02d}' for region in 'WXYZ'
                 for num in range(1, 17)]

        paths = {}
        for seed in seeds:
            slot = seed
            path = []
            # follow the chain Seed -> Slot -> next Slot until nothing follows
            while slot in self.dict_next_slot:
                slot = self.dict_next_slot[slot]
                path.append(slot)
            paths[seed] = path
        return paths

    def check_predicted_slots(self, df_bracket):
        '''Checks a bracket dataframe to see if all Slot predictions are
        present and there are no doubles or extra slots.

        Note: this does not check for bracket consistency.

        Parameters
        ----------
        df_bracket : pandas DataFrame
            in the format of the submission file, but only a single bracket
            for a single tournament

        Returns
        -------
        pass_check : bool
            the bracket contained 1 prediction for every required slot,
            and nothing extra
        errors : dict
            empty when the check passes; otherwise a dictionary explaining
            the errors that were found, with keys
            'missing' : list of slots that need to be predicted
                but were missing from the bracket
            'overrepped' : list of slots that were predicted more than
                once in the bracket (e.g., you picked two different teams
                to win the same slot)
            'extra' : list of slots that were included in the bracket but
                should not be (useful in case of typos)
        '''
        pass_check = (self.set_slots == set(df_bracket['Slot'])
                      and self.n_slots == df_bracket.shape[0])
        if pass_check:
            return pass_check, {}

        # unique list of slots that are predicted and how often each appears
        pred_slots, count_pred_slots = np.unique(df_bracket['Slot'],
                                                 return_counts=True)
        set_pred_slots = set(pred_slots)

        errors = {
            'missing': list(self.set_slots.difference(set_pred_slots)),
            'overrepped': list(pred_slots[count_pred_slots > 1]),
            'extra': list(set_pred_slots.difference(self.set_slots)),
        }
        return pass_check, errors

    def check_consistency(self, df_bracket):
        '''
        Checks to make sure that brackets adhere to the following consistency
        rule:
            "The predictions in each bracket must follow valid tournament
            paths. In other words, if a team is predicted to win a game in
            Round N, that team must have also been predicted as the winner of
            Round N-1 in one of the respective feeder games. The winner of a
            game in Round 1 must be one of the two teams scheduled to play in
            that game."

        Parameters
        ----------
        df_bracket : pandas DataFrame
            in the format of the submission file, but only a single bracket
            for a single tournament

        Returns
        -------
        pass_check : bool
            if True, the bracket passed the consistency check
        inconsistencies : dict
            dictionary where keys are Seeds that were deemed to have
            inconsistent paths in the tournament and values are the paths as
            they appeared in the bracket. A Team value that is not one of the
            known Seeds is reported here as well.
        '''
        # predicted paths of teams according to this bracket
        predicted_paths = self.calc_predicted_paths(df_bracket)

        inconsistencies = {}
        for seed, path in predicted_paths.items():
            # Allowed path to victory. An unknown Seed has no allowed path, so
            # any Slot it is predicted to win counts as an inconsistency
            # instead of raising KeyError.
            allowed_ptv = self.dict_paths_to_victory.get(seed, [])
            # the predicted path must be a prefix of the allowed path
            if path != allowed_ptv[0:len(path)]:
                inconsistencies[seed] = path

        pass_check = (len(inconsistencies) == 0)

        return pass_check, inconsistencies

    @staticmethod
    def calc_predicted_paths(df_bracket):
        '''
        Calculates a dictionary of lists where keys are Seeds in the tournament
        and values are ordered lists of the predicted Slots won by that Seed.

        Parameters
        ----------
        df_bracket : pandas DataFrame
            in the format of the submission file, but only a single bracket
            for a single tournament

        Returns
        -------
        dict of lists where keys are the values of the 'Team' column and
        values are the sorted lists of 'Slot' values predicted for that Team
        '''
        return df_bracket.groupby('Team')['Slot'].apply(sorted).to_dict()

# Shared checker instance built from the slots CSV at `fname_slots`.
bracket_checker = BracketChecker(fname_slots)

In [None]:
# Analyze a bracket score
def average_bracket_score(prediction_df: pd.DataFrame, tournament_result_df: pd.DataFrame):
    """
    Average the bracket score over every bracket contained in a prediction.

    `prediction_df` is split on its 'Bracket' column; each sub-frame is scored
    against `tournament_result_df` with `bracket_score` and the mean of those
    scores is returned. Raises ZeroDivisionError when `prediction_df` holds no
    bracket at all.
    """
    bracket_ids = prediction_df["Bracket"].unique()
    per_bracket_scores = [
        bracket_score(prediction_df[prediction_df["Bracket"] == bracket_id], tournament_result_df)
        for bracket_id in bracket_ids
    ]
    return sum(per_bracket_scores) / len(per_bracket_scores)


def bracket_score(bracket_prediction_df: pd.DataFrame, tournament_result_df: pd.DataFrame):
    """
    Score one predicted bracket against the actual tournament outcome.

    Both frames are in submission format (columns 'Tournament', 'Slot', 'Team').
    Rows are paired on ('Tournament', 'Slot'); a correct pick in round r is
    worth 2 ** (r - 1) points, i.e. 1, 2, 4, 8, 16, 32 for rounds 1..6. The
    round number is the second character of the Slot name.
    """
    paired = bracket_prediction_df.merge(
        tournament_result_df,
        on=["Tournament", "Slot"],
        suffixes=("_pred", "_actual"),
    )
    round_number = paired["Slot"].str[1].astype(int)
    points_available = 2 ** (round_number - 1)
    picked_correctly = paired["Team_pred"] == paired["Team_actual"]
    # booleans act as 0/1 multipliers, so only correct picks contribute
    return (points_available * picked_correctly).sum()

def get_tournament_result(season: int, tournament: str):
    """
    Return the result of a specified tournament in the submission format.

    Reads the '<tournament>NCAATourneyCompactResults.csv' and
    '<tournament>NCAATourneySeeds.csv' files under DATA_PATH, attaches the seed
    of the winning and of the losing team to every game, keeps only the first
    three characters of each seed, restricts the games to `season` and hands
    them to `to_submission_format`.
    """
    games = pd.read_csv(DATA_PATH + f"{tournament}NCAATourneyCompactResults.csv")
    seeds = pd.read_csv(DATA_PATH + f"{tournament}NCAATourneySeeds.csv")

    # the same seed table is joined twice, once per side of the game
    winner_seeds = seeds.rename(columns={"Seed": "WinningSeed", "TeamID": "WTeamID"})
    loser_seeds = seeds.rename(columns={"Seed": "LosingSeed", "TeamID": "LTeamID"})
    seeded_games = (games
                    .merge(winner_seeds, on=["Season", "WTeamID"])
                    .merge(loser_seeds, on=["Season", "LTeamID"]))

    seeded_games["WinningSeed"] = seeded_games["WinningSeed"].str[:3]
    seeded_games["LosingSeed"] = seeded_games["LosingSeed"].str[:3]

    in_season = seeded_games["Season"] == season
    season_games = seeded_games.loc[in_season, ["Season", "WinningSeed", "LosingSeed"]]
    return to_submission_format(season_games, tournament)


def to_submission_format(result_df: pd.DataFrame, tournament: str, bracket: int = 1):
    """
    Transform result dataframe with cols 'WinningSeed' and 'LosingSeed' to submission format.

    The bracket is rebuilt round by round (1..6): the slots of a round are
    listed, the two entrants of every slot are filled in (from the seeds in
    round 1, from the winners of the earlier rounds afterwards) and the winner
    is looked up in `result_df`.

    Parameters
    ----------
    result_df : games of one tournament with 'WinningSeed' and 'LosingSeed'
    tournament : value written to the 'Tournament' column
    bracket : value written to the 'Bracket' column (default 1)

    Returns
    -------
    DataFrame with columns 'RowId', 'Tournament', 'Bracket', 'Slot', 'Team';
    'RowId' counts rows from 1.
    """
    completed_rounds = []
    # Winners of the rounds finished so far; empty before round 1, which does
    # not need it because its entrants come straight from the seeds.
    bracket_df = pd.DataFrame(columns=["Slot", "Team", "Team1", "Team2"])
    for rnd in range(1, 7):
        round_df = pd.DataFrame({"Slot": get_slots(rnd), "Team1": "", "Team2": "", "Team": ""})
        fill_round_with_teams(rnd, round_df, bracket_df)
        fill_round_with_winners(round_df, result_df)
        completed_rounds.append(round_df)
        # Concatenate only real rounds: concatenating onto the empty
        # placeholder frame is deprecated in recent pandas.
        bracket_df = pd.concat(completed_rounds)
    bracket_df['RowId'] = list(range(1, 1 + len(bracket_df)))
    bracket_df['Tournament'] = tournament
    # honour the caller's bracket id (it used to be hard-coded to 1)
    bracket_df['Bracket'] = bracket
    return bracket_df[['RowId', 'Tournament', 'Bracket', 'Slot', 'Team']]


def fill_round_with_teams(rnd: int, round_df: pd.DataFrame, bracket_df: pd.DataFrame):
    """
    Fill 'Team1' and 'Team2' columns in the round_df DataFrame (in place).

    The round, region and chalk seed are read from characters 1, 2 and 3 of
    each Slot name; the `rnd` argument itself is not consulted. Round-1 slots
    pair the chalk seed with seed (17 - chalk seed) of the same region. Later
    slots take the winners of their two feeder slots from `bracket_df`.
    """
    for row_label, slot in round_df["Slot"].items():
        slot_round, region, chalk_seed = int(slot[1]), slot[2], slot[3]
        if slot_round != 1:
            first_feeder, second_feeder = get_prev_slots(slot_round, region, chalk_seed)
            round_df.at[row_label, "Team1"] = get_slot_winner(first_feeder, bracket_df)
            round_df.at[row_label, "Team2"] = get_slot_winner(second_feeder, bracket_df)
            continue
        # first round: entrants are fully determined by the slot name
        round_df.at[row_label, "Team1"] = region + chalk_seed.zfill(2)
        round_df.at[row_label, "Team2"] = region + str(17 - int(chalk_seed)).zfill(2)


def fill_round_with_winners(round_df: pd.DataFrame, result_df: pd.DataFrame):
    """
    Fill 'Team' column in the round_df DataFrame (in place).

    For every row the game between 'Team1' and 'Team2' is looked up in
    `result_df` (either one may be the winner) and the 'WinningSeed' of the
    first matching game is stored. Raises IndexError when `result_df` holds no
    game between the two teams.
    """
    winners = result_df["WinningSeed"]
    losers = result_df["LosingSeed"]
    matchups = zip(round_df.index, round_df["Team1"], round_df["Team2"])
    for row_label, first_team, second_team in matchups:
        first_beat_second = (winners == first_team) & (losers == second_team)
        second_beat_first = (winners == second_team) & (losers == first_team)
        game_winner = winners[first_beat_second | second_beat_first].values[0]
        round_df.at[row_label, "Team"] = game_winner


def get_slot_winner(slot: str, df: pd.DataFrame):
    """
    Get the identifier of the winner of a given slot.

    Returns the 'Team' value of the first row of `df` whose 'Slot' equals
    `slot`; raises IndexError when no row matches.
    """
    is_requested_slot = df["Slot"] == slot
    winners = df.loc[is_requested_slot, "Team"]
    return winners.values[0]


def get_prev_slots(rnd: int, region: str, chalk_seed: str):
    """
    Return the two slots of the previous round that feed the given slot.

    The slot is identified by its round, region and chalk seed (the pieces of
    a Slot name such as 'R2W1').
    - rounds 2-4: same region, the chalk seed itself and its mirror
      (1 + 2 ** (5 - rnd) - chalk seed) in round rnd - 1
    - round 5: the two regional finals behind 'WX', otherwise those of 'YZ'
    - round 6: the two round-5 slots
    Any other round yields None.
    """
    if rnd == 6:
        return ["R5WX", "R5YZ"]
    if rnd == 5:
        if f"{region}{chalk_seed}" == "WX":
            return ["R4W1", "R4X1"]
        return ["R4Y1", "R4Z1"]
    if 2 <= rnd <= 4:
        feeder_round = rnd - 1
        mirror_seed = 1 + int(2 ** (4 - feeder_round) - int(chalk_seed))
        prefix = f"R{feeder_round}{region}"
        return [f"{prefix}{chalk_seed}", f"{prefix}{mirror_seed}"]
    return None


def get_slots(rnd: int):
    """
    Get list of all slots for a given round.

    Rounds 1-4 have 2 ** (4 - rnd) slots in each of the regions W, X, Y, Z,
    named 'R<rnd><region><n>' with n counting from 1. Round 5 consists of
    'R5WX' and 'R5YZ', round 6 of 'R6CH'. Any other round yields None.
    """
    if rnd == 5:
        return ["R5WX", "R5YZ"]
    if rnd == 6:
        return ["R6CH"]
    if not (1 <= rnd <= 4):
        return None

    slots_per_region = int(2 ** (4 - rnd))
    slots = []
    for region in "WXYZ":
        for seed in range(1, 1 + slots_per_region):
            slots.append(f"R{rnd}{region}{seed}")
    return slots

def evaluate(prediction_df: pd.DataFrame,
             season: int,
             tournaments = ('M', 'W')):
    """
    For a prediction for a previous tournament given in the submission format compute the target metric.

    Parameters
    ----------
    prediction_df : submission-format frame; rows are split on 'Tournament'
    season : season whose actual results are used as ground truth
    tournaments : tournament codes to score (default: 'M' and 'W')

    Returns
    -------
    Mean over the scored tournaments of the average bracket score.

    Notes
    -----
    A tournament is scored only when the prediction holds exactly 63 rows for
    it; any other row count causes that tournament to be skipped.

    Raises
    ------
    ValueError
        when no tournament could be scored (previously an opaque
        ZeroDivisionError).
    """
    tournament_scores = []
    for tournament in tournaments:
        tournament_prediction_df = prediction_df[prediction_df["Tournament"] == tournament]
        if len(tournament_prediction_df) != 63:
            continue
        tournament_result_df = get_tournament_result(season, tournament)
        # predictions first, results second: the order average_bracket_score expects
        tournament_scores.append(average_bracket_score(tournament_prediction_df, tournament_result_df))
    if not tournament_scores:
        raise ValueError(
            "prediction_df holds no complete 63-row bracket for any of the "
            f"requested tournaments {list(tournaments)}"
        )
    return sum(tournament_scores) / len(tournament_scores)

# Load the data

In [None]:
# NCAA tournament games with detailed results, men and women.
men_tourney_results = pd.read_csv(DATA_PATH + "MNCAATourneyDetailedResults.csv")
women_tourney_results = pd.read_csv(DATA_PATH + "WNCAATourneyDetailedResults.csv")

# Tournament seeds.
men_seeds = pd.read_csv(DATA_PATH + "MNCAATourneySeeds.csv")
women_seeds = pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv")

# Regular-season games with detailed results.
men_regular_results = pd.read_csv(DATA_PATH + "MRegularSeasonDetailedResults.csv")
women_regular_results = pd.read_csv(DATA_PATH + "WRegularSeasonDetailedResults.csv")

# Team id -> team name lookup (only these two columns are kept).
men_teams = pd.read_csv(DATA_PATH + "MTeams.csv")[['TeamID', 'TeamName']]
women_teams = pd.read_csv(DATA_PATH + "WTeams.csv")[['TeamID', 'TeamName']]

# Team conference tables.
men_conferences = pd.read_csv(DATA_PATH + "MTeamConferences.csv")
women_conferences = pd.read_csv(DATA_PATH + "WTeamConferences.csv")

# Massey ordinals are currently not loaded.
# massey = pd.read_csv(DATA_PATH + "MMasseyOrdinals.csv")

def prepare_data(df):
    """
    Turn a winner/loser game table into a symmetric team-1/team-2 table.

    Every game appears twice in the output: once seen from the winner
    (W* columns become T1_*, L* columns become T2_*) and once seen from the
    loser (L* -> T1_*, W* -> T2_*, home/away flipped).

    Parameters
    ----------
    df : DataFrame with the detailed-results columns listed in `swapped_cols`
        below. It is NOT modified, so the function can be called repeatedly
        on the same frame.

    Returns
    -------
    DataFrame with 2 * len(df) rows and a fresh 0..n-1 index, where
    - 'location' is 1 (team 1 at home), -1 (team 1 away) or 0 (neutral),
    - 'PointDiff' is T1_Score - T2_Score.
    """
    swapped_cols = ['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT',
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']

    # rename() returns new frames, so the caller's frame stays untouched and
    # no write ever goes into a slice or into Index.values.
    winner_view = df.rename(columns={'WLoc': 'location'})
    loser_view = df[swapped_cols].rename(columns={'WLoc': 'location'})

    # WLoc is the location of the winner; mirror it for the loser's row.
    mirrored = {'H': 'A', 'A': 'H'}
    loser_view['location'] = loser_view['location'].map(lambda loc: mirrored.get(loc, loc))

    # 'location' is renamed first so that it survives the W/L substitution.
    winner_view.columns = [x.replace('W', 'T1_').replace('L', 'T2_') for x in winner_view.columns]
    loser_view.columns = [x.replace('L', 'T1_').replace('W', 'T2_') for x in loser_view.columns]

    output = pd.concat([winner_view, loser_view]).reset_index(drop=True)
    # any value other than N/H/A becomes NaN and makes astype(int) fail loudly
    output['location'] = output['location'].map({'N': 0, 'H': 1, 'A': -1}).astype(int)

    output['PointDiff'] = output['T1_Score'] - output['T2_Score']

    return output

In [None]:
# Build the symmetric team-1/team-2 frames (two rows per game) for every dataset.
men_regular_data = prepare_data(men_regular_results)
men_tourney_data = prepare_data(men_tourney_results)
women_regular_data = prepare_data(women_regular_results)
women_tourney_data = prepare_data(women_tourney_results)

# Quick size check of the four prepared frames.
print(men_regular_data.shape)
print(men_tourney_data.shape)
print(women_regular_data.shape)
print(women_tourney_data.shape)

men_regular_data.head()

# Feature engineering!

In [None]:
# Per-game columns (both sides plus the point margin) that get averaged per
# team and season further down.
boxscore_cols = ['T1_Score', 'T2_Score', 
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_FTM', 'T1_FTA', 'T1_OR', 'T1_DR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_Blk', 'T1_PF', 
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_FTM', 'T2_FTA', 'T2_OR', 'T2_DR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk', 'T2_PF', 
        'PointDiff']

# Only keep relevant columns
# (tournament frames are reduced to ids, scores and the point margin)
men_tourney_data = men_tourney_data[['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID' ,'T2_Score', 'PointDiff']]
women_tourney_data = women_tourney_data[['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID' ,'T2_Score', 'PointDiff']]

In [None]:
def _regular_season_record(compact_results):
    """
    Per (Season, team) win ratio and average margins from compact results.

    Parameters
    ----------
    compact_results : DataFrame with 'Season', 'WTeamID', 'LTeamID' and a
        'point_diff' column (WScore - LScore, so never negative).

    Returns
    -------
    DataFrame with columns
    - 'Season', 'T1_TeamID'
    - 'T1_point_diff_wins'   : mean margin of the games the team won
    - 'T1_point_diff_losses' : mean margin of the games the team lost
                               (a positive number; NaN for unbeaten teams)
    - 'T1_win_ratio'         : wins / (wins + losses)
    """
    # Aggregate only 'point_diff': averaging the whole frame would also hit
    # the string column WLoc, which raises on pandas >= 2.0.
    as_winner = (compact_results
                 .groupby(['Season', 'WTeamID'])['point_diff']
                 .agg(num_wins='count', T1_point_diff_wins='mean')
                 .reset_index()
                 .rename(columns={'WTeamID': 'T1_TeamID'}))
    as_loser = (compact_results
                .groupby(['Season', 'LTeamID'])['point_diff']
                .agg(num_losses='count', T1_point_diff_losses='mean')
                .reset_index()
                .rename(columns={'LTeamID': 'T1_TeamID'}))

    # Outer join + zero fill: an unbeaten team has no row on the loser side
    # (and a winless team none on the winner side); with a left join the
    # former got a NaN win ratio and the latter vanished altogether.
    record = as_winner.merge(as_loser, on=['Season', 'T1_TeamID'], how='outer')
    wins = record['num_wins'].fillna(0)
    losses = record['num_losses'].fillna(0)
    record['T1_win_ratio'] = wins / (wins + losses)
    return record.drop(columns=['num_wins', 'num_losses'])


# Compact regular-season results from 2003 on, with the winning margin added.
men_regular_compact = (
    pd.read_csv(DATA_PATH + "MRegularSeasonCompactResults.csv")
    .query('Season >= 2003')
    .assign(point_diff=lambda games: games['WScore'] - games['LScore'])
)
men_more_regular_stats = _regular_season_record(men_regular_compact)

women_regular_compact = (
    pd.read_csv(DATA_PATH + "WRegularSeasonCompactResults.csv")
    .query('Season >= 2003')
    .assign(point_diff=lambda games: games['WScore'] - games['LScore'])
)
women_more_regular_stats = _regular_season_record(women_regular_compact)

In [None]:
# Mean regular season statistics
def _mean_season_statistics(regular_data, more_regular_stats, stat_cols):
    """
    Average `stat_cols` per (Season, T1_TeamID) and attach the extra stats.

    The averaged columns get a '_mean' suffix; `more_regular_stats` is
    left-joined on ['Season', 'T1_TeamID'].
    """
    season_means = (regular_data
                    .groupby(['Season', 'T1_TeamID'])[stat_cols]
                    .mean()              # string alias instead of the deprecated agg([np.mean])
                    .add_suffix('_mean')
                    .reset_index())
    return season_means.merge(more_regular_stats, on=['Season', 'T1_TeamID'], how='left')


def _relabel_for_side(season_statistics, side):
    """
    Copy `season_statistics` with every column except 'Season' prefixed by `side`.

    Inside a column name 'T1_' is dropped (the team's own stat) and 'T2_'
    becomes 'opponent_', e.g. with side='T2_': 'T2_Score_mean' ->
    'T2_opponent_Score_mean' and 'T1_TeamID' -> 'T2_TeamID'.
    """
    relabelled = season_statistics.copy()
    # Build the final names in one go instead of patching Index.values afterwards.
    relabelled.columns = [
        col if col == 'Season'
        else side + col.replace('T1_', '').replace('T2_', 'opponent_')
        for col in season_statistics.columns
    ]
    return relabelled


men_season_statistics = _mean_season_statistics(men_regular_data, men_more_regular_stats, boxscore_cols)
women_season_statistics = _mean_season_statistics(women_regular_data, women_more_regular_stats, boxscore_cols)

men_season_statistics_T1 = _relabel_for_side(men_season_statistics, 'T1_')
men_season_statistics_T2 = _relabel_for_side(men_season_statistics, 'T2_')

women_season_statistics_T1 = _relabel_for_side(women_season_statistics, 'T1_')
women_season_statistics_T2 = _relabel_for_side(women_season_statistics, 'T2_')

# The per-team feature tables are grown from the T1-labelled statistics.
men_regular_stats = men_season_statistics_T1 

women_regular_stats = women_season_statistics_T1

In [None]:
# Team names
# Attach each team's name as 'T1_TeamName'; the join key 'TeamID' coming from
# the lookup table is dropped again afterwards.
men_regular_stats = (
    men_regular_stats
    .merge(men_teams, left_on='T1_TeamID', right_on='TeamID', how='left')
    .rename(columns={'TeamName': 'T1_TeamName'})
    .drop(columns=['TeamID'])
)

women_regular_stats = (
    women_regular_stats
    .merge(women_teams, left_on='T1_TeamID', right_on='TeamID', how='left')
    .rename(columns={'TeamName': 'T1_TeamName'})
    .drop(columns=['TeamID'])
)

# Conferences
# Conference abbreviations flagged as "power 6".
# NOTE(review): only these exact abbreviations are matched; a conference that
# appears under another abbreviation in some seasons would not be flagged
# there - confirm against the conference table.
p6 = {'acc', 'sec', 'big_east', 'big_ten', 'big_twelve', 'pac_twelve'}

men_regular_stats = (
    men_regular_stats
    .merge(men_conferences, left_on=['T1_TeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
    .rename(columns={'ConfAbbrev': 'T1_Conference'})
    .drop(columns=['TeamID'])
    .assign(T1_power6=lambda stats: stats['T1_Conference'].isin(p6).astype(int))
)

women_regular_stats = (
    women_regular_stats
    .merge(women_conferences, left_on=['T1_TeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
    .rename(columns={'ConfAbbrev': 'T1_Conference'})
    .drop(columns=['TeamID'])
    .assign(T1_power6=lambda stats: stats['T1_Conference'].isin(p6).astype(int))
)

men_regular_stats.head()

In [None]:
# Last 14 & 28 days mean stats
def _late_season_win_ratio(regular_data, day_cutoff, ratio_name):
    """
    Share of games won per (Season, T1_TeamID) among games with DayNum > day_cutoff.

    A game counts as won when PointDiff > 0. The result has the columns
    'Season', 'T1_TeamID' and `ratio_name`; teams without a game after the
    cutoff do not appear.
    """
    late_games = regular_data.loc[regular_data.DayNum > day_cutoff].reset_index(drop=True)
    late_games['win'] = np.where(late_games['PointDiff'] > 0, 1, 0)
    return (late_games
            .groupby(['Season', 'T1_TeamID'])['win']
            .mean()
            .reset_index(name=ratio_name))


# Cutoffs: DayNum > 118 for the 14-day window, DayNum > 104 for the 28-day one.
men_last14days_stats_T1 = _late_season_win_ratio(men_regular_data, 118, 'T1_win_ratio_14d')
women_last14days_stats_T1 = _late_season_win_ratio(women_regular_data, 118, 'T1_win_ratio_14d')

men_last28days_stats_T1 = _late_season_win_ratio(men_regular_data, 104, 'T1_win_ratio_28d')
women_last28days_stats_T1 = _late_season_win_ratio(women_regular_data, 104, 'T1_win_ratio_28d')

# Left joins: a team without late-season games keeps its row and gets NaN.
men_regular_stats = (
    men_regular_stats
    .merge(men_last14days_stats_T1, on=['Season', 'T1_TeamID'], how='left')
    .merge(men_last28days_stats_T1, on=['Season', 'T1_TeamID'], how='left')
)

women_regular_stats = (
    women_regular_stats
    .merge(women_last14days_stats_T1, on=['Season', 'T1_TeamID'], how='left')
    .merge(women_last28days_stats_T1, on=['Season', 'T1_TeamID'], how='left')
)

men_regular_stats.head()

In [None]:
# Seeds
# Numeric seed = characters 1-2 of the seed string (the character in front and
# anything after the two digits are ignored).
men_seeds['seed'] = men_seeds['Seed'].str[1:3].astype(int)
women_seeds['seed'] = women_seeds['Seed'].str[1:3].astype(int)

seed_column_names = {'TeamID': 'T1_TeamID', 'seed': 'T1_seed'}
men_seeds_T1 = men_seeds[['Season', 'TeamID', 'seed']].rename(columns=seed_column_names)
women_seeds_T1 = women_seeds[['Season', 'TeamID', 'seed']].rename(columns=seed_column_names)

# Left joins: teams without a seed in a season get NaN in 'T1_seed'.
men_regular_stats = men_regular_stats.merge(men_seeds_T1, on=['Season', 'T1_TeamID'], how='left')
women_regular_stats = women_regular_stats.merge(women_seeds_T1, on=['Season', 'T1_TeamID'], how='left')

men_regular_stats.head()

In [None]:
def calculate_elo(teams, data, initial_rating=2000, k=140):
    '''
    Function to calculate each team's Elo rating game by game.

    Parameters:
    - teams (array-like): Team-IDs; every id occurring in `data` must be
      included, otherwise a KeyError is raised.
    - data (pd.DataFrame): DataFrame with all matches in chronological order;
      the columns 'WTeamID' and 'LTeamID' are read.
    - initial_rating (float): Initial rating of an unranked team (default: 2000).
      It is also used as the scale of the expected-score curve.
    - k (float): K-factor, determining the impact of each match on team ratings (default: 140).

    Returns:
    - list: rating of the winning team BEFORE each game, one entry per row of `data`
    - list: rating of the losing team BEFORE each game, one entry per row of `data`
    '''

    # Current rating of every team
    ratings = {team: initial_rating for team in teams}

    # Pre-game ratings, in game order
    winner_history, loser_history = [], []

    # Only the two id columns are needed, so iterate over them directly
    # instead of materialising a Series per game with iterrows().
    matchups = zip(data['WTeamID'], data['LTeamID'])
    for winner, loser in tqdm(matchups, total=len(data)):
        winner_rating = ratings[winner]
        loser_rating = ratings[loser]
        winner_history.append(winner_rating)
        loser_history.append(loser_rating)

        # Expected outcomes from the rating gap
        expected_winner = 1 / (1 + 10 ** ((loser_rating - winner_rating) / initial_rating))
        expected_loser = 1 / (1 + 10 ** ((winner_rating - loser_rating) / initial_rating))

        # Actual outcome is 1 for the winner and 0 for the loser
        ratings[winner] += k * (1 - expected_winner)
        ratings[loser] += k * (0 - expected_loser)

        # Ensure that ratings do not go below 1
        if ratings[loser] < 1:
            ratings[loser] = 1

    return winner_history, loser_history

def create_elo_data(teams, data, initial_rating=2000, k=140):
    '''
    Function to create a DataFrame containing summary statistics of Elo ratings
    for teams based on historical match data.

    Parameters:
    - teams (array-like): Containing Team-IDs.
    - data (pd.DataFrame): DataFrame with all matches in chronological order.
      Needs the columns Season, DayNum, WTeamID, LTeamID and tourney
      (rows with tourney != 0 still move the ratings but are excluded from
      the per-season summary).
    - initial_rating (float): Initial rating of an unranked team (default: 2000).
    - k (float): K-factor, determining the impact of each match on team ratings (default: 140).

    Returns:
    - DataFrame: One row per (TeamID, Season) with Rating_Mean, Rating_Median,
      Rating_Std, Rating_Min, Rating_Max, Rating_Last and Rating_Trend, all
      computed from the team's pre-game ratings. Rating_Trend is the slope of a
      linear fit over the game sequence and is NaN for seasons with fewer than
      two games.
    '''

    def _trend(season_ratings):
        # A slope needs at least two points; linregress cannot fit fewer.
        if len(season_ratings) < 2:
            return np.nan
        return linregress(range(len(season_ratings)), season_ratings).slope

    r1, r2 = calculate_elo(teams, data, initial_rating, k)

    # Stack the winners' rows on top of the losers' rows so that every game
    # contributes one row per participating team.
    rating_df = pd.DataFrame({
        'Season': np.concatenate([data.Season, data.Season]),
        'DayNum': np.concatenate([data.DayNum, data.DayNum]),
        'TeamID': np.concatenate([data.WTeamID, data.LTeamID]),
        'Rating': np.concatenate([r1, r2]),
        'Tourney': np.concatenate([data.tourney, data.tourney])
    })

    # Chronological order within each team/season, tournament games removed
    rating_df = rating_df.sort_values(['TeamID', 'Season', 'DayNum'])
    rating_df = rating_df[rating_df['Tourney'] == 0]

    # Aggregate on the Rating column only, so the trend function never sees
    # the grouping columns.
    season_ratings = rating_df.groupby(['TeamID', 'Season'])['Rating']
    results = season_ratings.agg(['mean', 'median', 'std', 'min', 'max', 'last'])
    results.columns = ['Rating_Mean', 'Rating_Median', 'Rating_Std', 'Rating_Min', 'Rating_Max', 'Rating_Last']
    results['Rating_Trend'] = season_ratings.apply(_trend)

    return results.reset_index()

In [None]:
# Get ELO ratings for regular seasons
# Ratings are replayed over regular-season AND tournament games (in
# Season/DayNum order); create_elo_data then summarises only the
# regular-season rows per team and season.

# ---- Men ----
regular_m = pd.read_csv(DATA_PATH + 'MRegularSeasonCompactResults.csv')
tourney_m = pd.read_csv(DATA_PATH + 'MNCAATourneyCompactResults.csv')
teams_m = pd.read_csv(DATA_PATH + 'MTeams.csv')

regular_m['tourney'] = 0
tourney_m['tourney'] = 1

data_m = (
    pd.concat([regular_m, tourney_m])
    .sort_values(['Season', 'DayNum'])
    .reset_index(drop=True)
)

elo_df_men = create_elo_data(teams_m.TeamID, data_m).rename(columns={
    'Rating_Mean': 'T1_rating_mean', 'Rating_Median': 'T1_rating_median', 'Rating_Std': 'T1_rating_std',
    'Rating_Min': 'T1_rating_min', 'Rating_Max': 'T1_rating_max', 'Rating_Last': 'T1_rating_last',
    'Rating_Trend': 'T1_rating_trend',
})
men_regular_stats = (
    men_regular_stats
    .merge(elo_df_men, left_on=['Season', 'T1_TeamID'], right_on=['Season', 'TeamID'], how='left')
    .drop(columns=['TeamID'])
)

# ---- Women ----
regular_w = pd.read_csv(DATA_PATH + 'WRegularSeasonCompactResults.csv')
tourney_w = pd.read_csv(DATA_PATH + 'WNCAATourneyCompactResults.csv')
teams_w = pd.read_csv(DATA_PATH + 'WTeams.csv')

regular_w['tourney'] = 0
tourney_w['tourney'] = 1

data_w = (
    pd.concat([regular_w, tourney_w])
    .sort_values(['Season', 'DayNum'])
    .reset_index(drop=True)
)

elo_df_women = create_elo_data(teams_w.TeamID, data_w).rename(columns={
    'Rating_Mean': 'T1_rating_mean', 'Rating_Median': 'T1_rating_median', 'Rating_Std': 'T1_rating_std',
    'Rating_Min': 'T1_rating_min', 'Rating_Max': 'T1_rating_max', 'Rating_Last': 'T1_rating_last',
    'Rating_Trend': 'T1_rating_trend',
})
women_regular_stats = (
    women_regular_stats
    .merge(elo_df_women, left_on=['Season', 'T1_TeamID'], right_on=['Season', 'TeamID'], how='left')
    .drop(columns=['TeamID'])
)

men_regular_stats.head()

In [None]:
# Advanced stats ONLY for tourney teams (ONLY MEN)
adv_stats = pd.read_csv('/kaggle/input/march-madness-data/KenPom Barttorvik.csv')
orig_men_teams = men_teams['TeamName'].unique()

# Spelling used in the advanced-stats file -> spelling used in men_teams['TeamName']
team_dict = {
    'Abilene Christian': 'Abilene Chr',
    'Albany': 'SUNY Albany', 
    'American': 'American Univ',
    'Arkansas Pine Bluff': 'Ark Pine Bluff',
    'Boston University': 'Boston Univ',
    'Cal St. Bakersfield': 'CS Bakersfield',
    'Cal St. Fullerton': 'CS Fullerton',
    'Cal St. Northridge': 'CS Northridge',
    'Coastal Carolina': 'Coastal Car',
    'College of Charleston': 'Col Charleston',
    'East Tennessee St.': 'ETSU',
    'Eastern Kentucky': 'E Kentucky',
    'Eastern Washington': 'E Washington',
    'Fairleigh Dickinson': 'F Dickinson',
    'Florida Atlantic': 'FL Atlantic',
    'Florida Gulf Coast': 'FL Gulf Coast',
    'George Washington': 'G Washington',
    'Grambling St.': 'Grambling',
    'Green Bay': 'WI Green Bay',
    'Kennesaw St.': 'Kennesaw',
    'Kent St.': 'Kent',
    'Little Rock': 'Ark Little Rock',
    'Louisiana Lafayette': 'Louisiana',
    'Loyola Chicago': 'Loyola-Chicago',
    'Middle Tennessee': 'MTSU',
    'Milwaukee': 'WI Milwaukee',
    'Mississippi Valley St.': 'MS Valley St',
    "Mount St. Mary's": "Mt St Mary's",
    'North Carolina A&T': 'NC A&T',
    'North Carolina Central': 'NC Central',
    'North Carolina St.': 'NC State',
    'North Dakota St.': 'N Dakota St',
    'Northern Colorado': 'N Colorado',
    'Northern Kentucky': 'N Kentucky',
    'Northwestern St.': 'Northwestern LA',
    'Prairie View A&M': 'Prairie View',
    "Saint Joseph's": "St Joseph's PA",
    'Saint Louis': 'St Louis',
    "Saint Mary's": "St Mary's CA",
    "Saint Peter's": "St Peter's",
    'South Dakota St.': 'S Dakota St',
    'Southeast Missouri St.': 'SE Missouri St',
    'Southern': 'Southern Univ',
    'St. Bonaventure': "St Bonaventure",
    "St. John's": "St John's",
    'Stephen F. Austin': 'SF Austin',
    'Texas A&M Corpus Chris': 'TAM C. Christi',
    'Texas Southern': 'TX Southern',
    'UTSA': 'UT San Antonio',
    'Western Kentucky': 'WKU',
    'Western Michigan': 'W Michigan'
}

# Normalise team names: explicit mapping first, otherwise strip a trailing
# period; everything else is left untouched.
adv_stats['TEAM'] = [
    team_dict[name] if name in team_dict else (name[:-1] if name.endswith('.') else name)
    for name in adv_stats['TEAM']
]

# Names that still have no counterpart in men_teams are dropped by the inner
# merge below, so list them for inspection.
men_teams_adv_stats = adv_stats['TEAM'].unique()
missing_teams = [x for x in men_teams_adv_stats if x not in orig_men_teams]
print("Missing teams:", missing_teams)

adv_stats = adv_stats.drop(columns = ['CONF', 'CONF ID', 'QUAD NO', 'QUAD ID', 'TEAM NO', 'TEAM ID', 'SEED', 'ROUND', 'GAMES', 'W', 'L', 'WIN%'])

adv_stats = (
    pd.merge(adv_stats, men_teams, left_on='TEAM', right_on='TeamName', how='inner')
    .drop(columns = ['TeamName', 'TEAM'])
)

# Two copies of the table, one per side of a matchup. The key columns are
# renamed first so that only the remaining all-uppercase stat columns receive
# the T1_/T2_ prefix (with spaces turned into underscores).
T1_adv_stats = adv_stats.rename(columns={"YEAR": "Season", "TeamID": "T1_TeamID"})
T1_adv_stats = T1_adv_stats.rename(columns={
    col: "T1_" + '_'.join(col.split()) for col in T1_adv_stats.columns if col.isupper()
})

T2_adv_stats = adv_stats.rename(columns={"YEAR": "Season", "TeamID": "T2_TeamID"})
T2_adv_stats = T2_adv_stats.rename(columns={
    col: "T2_" + '_'.join(col.split()) for col in T2_adv_stats.columns if col.isupper()
})

T2_adv_stats.head(5)

In [None]:
# Tourney data
# Mirror the T1_* regular-season tables so they can be joined on the T2 side.
men_team2_regular_stats = men_regular_stats.rename(columns={
    col: col.replace("T1_", "T2_") for col in men_regular_stats.columns if col.startswith("T1_")
})
women_team2_regular_stats = women_regular_stats.rename(columns={
    col: col.replace("T1_", "T2_") for col in women_regular_stats.columns if col.startswith("T1_")
})

# Men: regular-season stats are left-joined, the advanced stats are
# inner-joined, so games where either team has no advanced stats are dropped.
men_tourney_data = men_tourney_data.merge(men_regular_stats, on = ['Season', 'T1_TeamID'], how = 'left')
men_tourney_data = men_tourney_data.merge(T1_adv_stats, on=['Season', 'T1_TeamID'], how='inner')
men_tourney_data = men_tourney_data.merge(men_team2_regular_stats, on = ['Season', 'T2_TeamID'], how = 'left')
men_tourney_data = men_tourney_data.merge(T2_adv_stats, on=['Season', 'T2_TeamID'], how='inner')
men_tourney_data["Seed_diff"] = men_tourney_data["T1_seed"] - men_tourney_data["T2_seed"]

# Women: regular-season stats only.
women_tourney_data = women_tourney_data.merge(women_regular_stats, on = ['Season', 'T1_TeamID'], how = 'left')
women_tourney_data = women_tourney_data.merge(women_team2_regular_stats, on = ['Season', 'T2_TeamID'], how = 'left')
women_tourney_data["Seed_diff"] = women_tourney_data["T1_seed"] - women_tourney_data["T2_seed"]

men_tourney_data.head()

In [None]:
# Per-team stats whose season means ('<team>_<stat>_mean' and
# '<team>_opponent_<stat>_mean') are removed by fix_data.
FIX_DATA_RAW_STATS = [
    'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
    'Ast', 'TO', 'Stl', 'Blk', 'PF',
]

# Column suffixes for which fix_data adds 'diff_<suffix>' = T1 - T2.
FIX_DATA_DIFF_COLUMNS = [
    'Score_mean',
    'opponent_Score_mean',
    'PointDiff_mean',
    'point_diff_wins',
    'point_diff_losses',
    'win_ratio',
    'win_ratio_14d',
    'win_ratio_28d',
    'rating_mean',
    'rating_median',
    'rating_std',
    'rating_min',
    'rating_max',
    'rating_last',
    'rating_trend',
    'K_TEMPO',
    'K_TEMPO_RANK',
    'KADJ_T',
    'KADJ_T_RANK',
    'K_OFF',
    'KO_RANK',
    'KADJ_O',
    'KADJ_O_RANK',
    'K_DEF',
    'KD_RANK',
    'KADJ_D',
    'KADJ_D_RANK',
    'KADJ_EM',
    'KADJ_EM_RANK',
    'BADJ_EM',
    'BADJ_O',
    'BADJ_D',
    'BARTHAG',
    'EFG%',
    'EFG%D',
    'FTR',
    'FTRD',
    'TOV%',
    'TOV%D',
    'OREB%',
    'DREB%',
    'OP_OREB%',
    'OP_DREB%',
    'RAW_T',
    '2PT%',
    '2PT%D',
    '3PT%',
    '3PT%D',
    'BLK%',
    'BLKED%',
    'AST%',
    'OP_AST%',
    '2PTR',
    '3PTR',
    '2PTRD',
    '3PTRD',
    'BADJ_T',
    'AVG_HGT',
    'EFF_HGT',
    'EXP',
    'TALENT',
    'FT%',
    'OP_FT%',
    'PPPO',
    'PPPD',
    'ELITE_SOS',
    'WAB',
    'BADJ_EM_RANK',
    'BADJ_O_RANK',
    'BADJ_D_RANK',
    'BARTHAG_RANK',
    'EFG%_RANK',
    'EFGD%_RANK',
    'FTR_RANK',
    'FTRD_RANK',
    'TOV%_RANK',
    'TOV%D_RANK',
    'OREB%_RANK',
    'DREB%_RANK',
    'OP_OREB%_RANK',
    'OP_DREB%_RANK',
    'RAW_T_RANK',
    '2PT%_RANK',
    '2PT%D_RANK',
    '3PT%_RANK',
    '3PT%D_RANK',
    'BLK%_RANK',
    'BLKED%_RANK',
    'AST%_RANK',
    'OP_AST%_RANK',
    '2PTR_RANK',
    '3PTR_RANK',
    '2PTRD_RANK',
    '3PTRD_RANK',
    'BADJT_RANK',
    'AVG_HGT_RANK',
    'EFF_HGT_RANK',
    'EXP_RANK',
    'TALENT_RANK',
    'FT%_RANK',
    'OP_FT%_RANK',
    'PPPO_RANK',
    'PPPD_RANK',
    'ELITE_SOS_RANK'
]

# (offensive stat, matching defensive/opponent stat). fix_data adds
# 'adj_T1_<off>' = T1 offence - T2 defence and the mirrored 'adj_T2_<off>'.
FIX_DATA_OFF_DEF_PAIRS = [
    ('EFG%', 'EFG%D'),        # Effective Field Goal Percentage vs. Opponent Effective Field Goal Percentage
    ('FTR', 'FTRD'),          # Free Throw Rate vs. Opponent Free Throw Rate
    ('TOV%', 'TOV%D'),        # Turnover Percentage vs. Opponent Turnover Percentage
    ('OREB%', 'OP_OREB%'),    # Offensive Rebound Percentage vs. Opponent Offensive Rebound Percentage
    ('DREB%', 'OP_DREB%'),    # Defensive Rebound Percentage vs. Opponent Defensive Rebound Percentage
    ('2PT%', '2PT%D'),        # Two-Point Field Goal Percentage vs. Opponent Two-Point Field Goal Percentage
    ('3PT%', '3PT%D'),        # Three-Point Field Goal Percentage vs. Opponent Three-Point Field Goal Percentage
    ('AST%', 'OP_AST%'),      # Assist Percentage vs. Opponent Assist Percentage
    ('K_OFF', 'K_DEF'),
    ('KADJ_O', 'KADJ_D'),
    ('BADJ_O', 'BADJ_D'),
    ('BLK%', 'BLKED%'),
    ('PPPO', 'PPPD'),         # Points Per Possession Offense vs. Points Per Possession Defense
    ('FT%', 'OP_FT%')         # Free Throw Percentage vs. Opponent Free Throw Percentage
]


def fix_data(tourney_data):
    '''
    Feature engineering for the men's matchup frame.

    Steps:
    1. Drop the season means listed in FIX_DATA_RAW_STATS for both teams and
       their opponents.
    2. Append 'diff_<suffix>' = T1_<suffix> - T2_<suffix> for every suffix in
       FIX_DATA_DIFF_COLUMNS.
    3. Append 'adj_T1_<off>' / 'adj_T2_<off>' (own offence minus the other
       team's defence) for every pair in FIX_DATA_OFF_DEF_PAIRS.

    Parameters:
    - tourney_data (pd.DataFrame): One row per matchup with all T1_*/T2_*
      columns referenced above. The input frame is not modified.

    Returns:
    - DataFrame: A new frame with the original columns (minus the dropped
      ones) followed by the diff_* columns and then the adj_* columns.

    Raises:
    - KeyError: if a required column is missing (e.g. when the function is
      applied a second time to its own output).
    '''
    dropped = [
        f'{team}_{side}{stat}_mean'
        for team in ('T1', 'T2')
        for side in ('', 'opponent_')
        for stat in FIX_DATA_RAW_STATS
    ]
    tourney_data = tourney_data.drop(columns=dropped)

    # Collect every derived column first and attach them with a single concat;
    # inserting ~130 columns one at a time fragments the frame and makes
    # pandas emit a PerformanceWarning.
    derived = {}
    for col_suffix in FIX_DATA_DIFF_COLUMNS:
        derived[f'diff_{col_suffix}'] = tourney_data[f'T1_{col_suffix}'] - tourney_data[f'T2_{col_suffix}']

    for off_col, def_col in FIX_DATA_OFF_DEF_PAIRS:
        derived['adj_T1_' + off_col] = tourney_data['T1_' + off_col] - tourney_data['T2_' + def_col]
        derived['adj_T2_' + off_col] = tourney_data['T2_' + off_col] - tourney_data['T1_' + def_col]

    derived_frame = pd.concat(list(derived.values()), axis=1, keys=list(derived.keys()))
    return pd.concat([tourney_data, derived_frame], axis=1)

# Only the men's frame carries the advanced-stat columns fix_data needs.
# The frame is rebound to the engineered result, so this cell cannot be run
# twice: the second call fails because the dropped columns are gone.
men_tourney_data = fix_data(men_tourney_data)
men_tourney_data.head(5)

# Building models

In [None]:
# XGBoost model config
def cauchyobj(preds, dtrain):
    '''
    Custom XGBoost objective returning gradient and hessian of a Cauchy-type
    loss on the residual x = preds - labels with scale c = 5000:

        grad = x / (1 + x^2 / c^2)
        hess = c^2 * (c^2 - x^2) / (x^2 + c^2)^2

    Parameters:
    - preds (np.ndarray): Current predictions.
    - dtrain: Object exposing `get_label()` (an xgboost DMatrix in this notebook).

    Returns:
    - (np.ndarray, np.ndarray): gradient and hessian, element-wise.
    '''
    scale = 5000
    residual = preds - dtrain.get_label()
    gradient = residual / (residual**2/scale**2+1)
    hessian = -scale**2*(residual**2-scale**2)/(residual**2+scale**2)**2
    return gradient, hessian


# Submission settings.
# For quick test runs the notebook previously used eta = 0.05,
# num_parallel_tree = 3 and repeat_cv = 3; all other values were the same.
# NOTE(review): 'silent' may be ignored by recent XGBoost releases
# ('verbosity' is the newer name) - confirm against the installed version.
param = {
    'eval_metric': 'mae',
    'booster': 'gbtree',
    'eta': 0.02,
    'subsample': 0.35,
    'colsample_bytree': 0.7,
    'num_parallel_tree': 10,
    'min_child_weight': 40,
    'gamma': 10,
    'max_depth': 3,
    'silent': 1,
}

# Number of repeated 5-fold CV runs (one seed per repeat); every later stage
# trains one model / spline per repeat.
repeat_cv = 10

In [None]:
def train_xgboost(dtrain):
    '''
    Run `repeat_cv` independent 5-fold cross-validations (KFold seeded with
    the repeat index) using the global `param` and the `cauchyobj` objective,
    with early stopping after 25 rounds without improvement.

    Parameters:
    - dtrain (xgb.DMatrix): Training matrix with labels.

    Returns:
    - list: The result returned by xgb.cv for every repeat.
    - list: For every repeat, the NUMBER of boosting rounds with the lowest
      mean test MAE. The CV result has one row per round, so row index i
      corresponds to i + 1 rounds; the value can be passed directly as
      `num_boost_round`.
    '''
    xgb_cv = []

    for i in range(repeat_cv):
        print(f"Fold repeater {i}")
        xgb_cv.append(
            xgb.cv(
              params = param,
              dtrain = dtrain,
              obj = cauchyobj,
              num_boost_round = 3000,
              folds = KFold(n_splits = 5, shuffle = True, random_state = i),
              early_stopping_rounds = 25,
              verbose_eval = 50
            )
        )

    # argmin is a 0-based row index; +1 converts it into a round count.
    iteration_counts = [int(np.argmin(x['test-mae-mean'].values)) + 1 for x in xgb_cv]
    best_val_mae = [np.min(x['test-mae-mean'].values) for x in xgb_cv]
    print(iteration_counts, best_val_mae, "\n\n")
    return xgb_cv, iteration_counts

In [None]:
def oof_predictions(X, y, iteration_counts):
    '''
    Produce out-of-fold predictions for every CV repeat.

    For repeat i the data is split with the same seeded 5-fold KFold used
    during cross-validation; a model is trained on each training part for
    `iteration_counts[i]` rounds and predicts the held-out part.

    Parameters:
    - X (np.ndarray): Feature matrix (indexed by integer row arrays).
    - y (np.ndarray): Targets, aligned with X.
    - iteration_counts (sequence of int): Boosting rounds per repeat.

    Returns:
    - list of np.ndarray: One float array per repeat with the out-of-fold
      predictions clipped to [-30, 30].

    NOTE(review): xgb.train is called without `obj`, so these models use
    whatever objective `param` implies rather than `cauchyobj` - confirm this
    is intended.
    '''
    oof_preds = []
    for i in range(repeat_cv):
        print(f"Fold repeater {i}")
        # Dedicated float buffer: copying `y` would inherit its dtype, and an
        # integer target array silently truncates the float predictions.
        preds = np.zeros(len(y), dtype=float)
        kfold = KFold(n_splits = 5, shuffle = True, random_state = i)
        for train_index, val_index in kfold.split(X,y):
            dtrain_i = xgb.DMatrix(X[train_index], label = y[train_index])
            dval_i = xgb.DMatrix(X[val_index], label = y[val_index])
            model = xgb.train(
                  params = param,
                  dtrain = dtrain_i,
                  num_boost_round = iteration_counts[i],
                  verbose_eval = 50
            )
            preds[val_index] = model.predict(dval_i)
        oof_preds.append(np.clip(preds,-30,30))
    print("MAE: ", mean_absolute_error(sum(oof_preds)/len(oof_preds), y), "\n\n")
    return oof_preds

In [None]:
def raw_spline(X, y, oof_preds):
    '''
    Fit, for every CV repeat, a spline that maps a predicted point margin to
    the observed win rate (label = 1 when y > 0), print the log loss of the
    unclipped spline output and plot the first repeat's fit.

    Parameters:
    - X: Unused; kept for signature compatibility with the callers.
    - y (np.ndarray): Actual point differences.
    - oof_preds (list of np.ndarray): Out-of-fold margin predictions, one per repeat.

    Returns:
    - list: One fitted UnivariateSpline per repeat.
    '''
    labels = np.where(y > 0, 1, 0)
    spline_models = []
    for i in range(repeat_cv):
        # One spline knot candidate per distinct prediction: x = sorted unique
        # predictions, y = mean label among the rows sharing that prediction.
        x_vals, group = np.unique(np.asarray(oof_preds[i]), return_inverse=True)
        group = group.ravel()
        y_vals = np.bincount(group, weights=labels) / np.bincount(group)
        spline_model = UnivariateSpline(x_vals, y_vals)
        spline_models.append(spline_model)
        spline_fit = spline_model(oof_preds[i])
        print(f"logloss of cvsplit {i}: {log_loss(labels, spline_fit)}") 

    # Plotting: average spline output and label per integer-truncated margin
    pred_int = np.array(oof_preds[0], dtype=int)
    plot_df = pd.DataFrame({"pred": pred_int, "label": labels, "spline": spline_models[0](oof_preds[0])})
    # Column subsets of a groupby must be selected with a list; tuple-style
    # selection is rejected by pandas >= 2.0.
    plot_df = plot_df.groupby('pred')[['spline', 'label']].mean().reset_index()
    plt.figure()
    plt.plot(plot_df['pred'], plot_df['spline'])
    plt.plot(plot_df['pred'], plot_df['label'])
    plt.show()

    return spline_models

In [None]:
def good_spline(raw_data, X, y, oof_preds):
    '''
    Fit the margin -> win-probability splines used for the final predictions.

    Same fit as `raw_spline`, but the spline output is clipped to
    [0.02, 0.98] before scoring; additionally the log loss is reported per
    season (pooled over all repeats) and the first repeat's fit is plotted.

    Parameters:
    - raw_data (pd.DataFrame): Training rows aligned with `y`; only its
      `Season` column is used.
    - X: Unused; kept for signature compatibility with the callers.
    - y (np.ndarray): Actual point differences.
    - oof_preds (list of np.ndarray): Out-of-fold margin predictions, one per repeat.

    Returns:
    - list: One fitted UnivariateSpline per repeat.
    '''
    labels = np.where(y>0,1,0)
    val_cv = []
    spline_model = []
    for i in range(repeat_cv):
        # x = sorted distinct predictions, y = mean label per distinct prediction
        x_vals, group = np.unique(np.asarray(oof_preds[i]), return_inverse=True)
        group = group.ravel()
        y_vals = np.bincount(group, weights=labels) / np.bincount(group)
        spline_model.append(UnivariateSpline(x_vals, y_vals))
        spline_fit = spline_model[i](oof_preds[i])
        spline_fit = np.clip(spline_fit,0.02,0.98)
        # (An earlier revision overrode spline_fit to 1.0 / 0.0 for seed
        # matchups 1-16, 2-15, 3-14 and 4-13 whose favourite actually won;
        # that override is intentionally not applied.)
        val_cv.append(pd.DataFrame({"y":labels, "pred":spline_fit, "season":raw_data.Season}))
        print(f"adjusted logloss of cvsplit {i}: {log_loss(labels,spline_fit)}") 
    val_cv = pd.concat(val_cv)
    val_cv = val_cv.groupby('season').apply(lambda x: log_loss(x.y, x.pred))
    print(val_cv)
    plot_df = pd.DataFrame({"pred":oof_preds[0], "label":labels, "spline":spline_model[0](oof_preds[0])})
    plot_df["pred_int"] = (plot_df["pred"]).astype(int)
    # Column subsets of a groupby must be selected with a list; tuple-style
    # selection is rejected by pandas >= 2.0.
    plot_df = plot_df.groupby('pred_int')[['spline','label']].mean().reset_index()
    plt.figure()
    plt.plot(plot_df.pred_int,plot_df.spline)
    plt.plot(plot_df.pred_int,plot_df.label)
    return spline_model

In [None]:
# Column groups
good_cols = ['Season', 'T1_TeamID', 'T2_TeamID', 'T1_TeamName', 'T2_TeamName', 'T1_seed', 'T2_seed', 'T1_Score', 'T2_Score', 'PointDiff', 'T1_rating_last', 'T2_rating_last']
# Identifiers and outcome columns that must never reach the model
excluded = ['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID', 'T2_Score','PointDiff', 'T1_TeamName', 'T2_TeamName']
categorical_features = ['T1_Conference','T2_Conference', 'T1_power6', 'T2_power6']
# Men's *RANK* columns are neither encoded nor scaled; they reach the model
# through the transformer's passthrough remainder.
men_ordinal_features = [col for col in men_tourney_data.columns if 'RANK' in col]
men_continuous_features = [
    col for col in men_tourney_data.columns
    if not (col in categorical_features or col in excluded or col in men_ordinal_features)
]
women_continuous_features = [
    col for col in women_tourney_data.columns
    if not (col in categorical_features or col in excluded)
]

# Training rows: every season before 2024; the target is the point difference.
men_train_data = men_tourney_data.loc[men_tourney_data['Season'] < 2024]
men_X_train = men_train_data.drop(columns=excluded)
men_y_train = men_train_data['PointDiff'].values

women_train_data = women_tourney_data.loc[women_tourney_data['Season'] < 2024]
women_X_train = women_train_data.drop(columns=excluded)
women_y_train = women_train_data['PointDiff'].values

# One preprocessor per tournament: one-hot encode the categorical columns,
# standardise the continuous ones, pass everything else through unchanged.
# NOTE(review): newer scikit-learn releases spell `sparse` as `sparse_output`
# - confirm against the installed version.
men_preprocessor, women_preprocessor = (
    ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_features),
            ('num', StandardScaler(), continuous_features),
        ],
        remainder='passthrough'
    )
    for continuous_features in (men_continuous_features, women_continuous_features)
)

men_X_train_scaled = men_preprocessor.fit_transform(men_X_train)
men_dtrain = xgb.DMatrix(men_X_train_scaled, label = men_y_train)

women_X_train_scaled = women_preprocessor.fit_transform(women_X_train)
women_dtrain = xgb.DMatrix(women_X_train_scaled, label = women_y_train)

In [None]:
# MEN ONLY Transforming with conferences + p6
# Pipeline: repeated 5-fold CV to pick the boosting rounds -> out-of-fold
# point-margin predictions -> splines mapping a margin to a win probability.
men_xgb_cv, men_iteration_counts = train_xgboost(men_dtrain)
men_oof_preds = oof_predictions(men_X_train_scaled, men_y_train, men_iteration_counts)
# Diagnostic fit (unclipped log loss + plot); the splines used later come from good_spline.
men_raw_spline_model = raw_spline(men_X_train_scaled, men_y_train, men_oof_preds)
men_spline_model = good_spline(men_train_data, men_X_train_scaled, men_y_train, men_oof_preds)

In [None]:
# WOMEN ONLY Transforming with conferences + p6
# Same pipeline as for the men: repeated 5-fold CV -> out-of-fold margin
# predictions -> splines mapping a margin to a win probability.
women_xgb_cv, women_iteration_counts = train_xgboost(women_dtrain)
women_oof_preds = oof_predictions(women_X_train_scaled, women_y_train, women_iteration_counts)
# Diagnostic fit (unclipped log loss + plot); the splines used later come from good_spline.
women_raw_spline_model = raw_spline(women_X_train_scaled, women_y_train, women_oof_preds)
women_spline_model = good_spline(women_train_data, women_X_train_scaled, women_y_train, women_oof_preds)

In [None]:
def train_final_model(dtrain, iteration_counts):
    '''
    Train one model per CV repeat on the complete training matrix.

    Each model is trained with the global `param` for 5% more rounds than the
    repeat's cross-validated count (truncated to an integer).

    Parameters:
    - dtrain (xgb.DMatrix): Full training matrix with labels.
    - iteration_counts (sequence of int): Boosting rounds per repeat.

    Returns:
    - list: `repeat_cv` trained boosters, in repeat order.
    '''
    def _fit(repeat_idx):
        print(f"Fold repeater {repeat_idx}")
        rounds = int(iteration_counts[repeat_idx] * 1.05)
        return xgb.train(
            params = param,
            dtrain = dtrain,
            num_boost_round = rounds,
            verbose_eval = 50
        )

    return [_fit(repeat_idx) for repeat_idx in range(repeat_cv)]

def prediction(xgboost_models, spline_model, dmatrix):
    '''
    Average win probability over all CV repeats.

    For every repeat the model's predicted margin is clipped to [-30, 30],
    converted to a probability by that repeat's spline and clipped to
    [0.025, 0.975]; the per-repeat probabilities are then averaged.

    Parameters:
    - xgboost_models (sequence): One trained model per repeat.
    - spline_model (sequence): One margin -> probability callable per repeat.
    - dmatrix: Input accepted by the models' `predict`.

    Returns:
    - np.ndarray: Mean probability per row of `dmatrix`.
    '''
    per_repeat = []
    for repeat_idx in range(repeat_cv):
        margin = np.clip(xgboost_models[repeat_idx].predict(dmatrix), -30, 30)
        win_prob = spline_model[repeat_idx](margin)
        per_repeat.append(np.clip(win_prob, 0.025, 0.975))
    return sum(per_repeat) / len(per_repeat)

In [None]:
# One booster per CV repeat for each tournament, trained on all training rows.
men_xgb_model = train_final_model(men_dtrain, men_iteration_counts)
women_xgb_model = train_final_model(women_dtrain, women_iteration_counts)

# Submission time!

In [None]:
def _print_predicted_paths(bracket, tournament, teams):
    '''Print one "<seed> (<team name>): <path>" line per entry of the bracket's predicted paths, with a blank line between regions.'''
    # Seeds repeat across tournaments, so restrict the lookup to the right one.
    tourney_seeds = seeds_2024.loc[seeds_2024['Tournament'] == tournament]
    name_width = max(len(team_name) for team_name in teams['TeamName'])
    prev_letter = None
    for key, values in bracket_checker.calc_predicted_paths(bracket).items():
        current_letter = key[0]
        team_id = tourney_seeds.loc[tourney_seeds['Seed'] == key, 'TeamID'].values[0]
        team_name = teams.loc[teams['TeamID'] == team_id, 'TeamName'].values[0]

        padding = name_width - len(team_name)
        if prev_letter is not None and current_letter != prev_letter:
            print()
        prev_letter = current_letter
        print(f"{key} ({team_name}){' ' * padding}: {' '.join(map(str, values))}")


def analyzeBrackets(bracket_start_num, num_brackets, brackets, season, display = True, mens_only = True):
    '''
    Validate, score and print a range of generated brackets.

    Relies on the notebook globals `bracket_checker`, `evaluate`,
    `seeds_2024`, `men_teams` and `women_teams`.

    Parameters:
    - bracket_start_num (int): First bracket number to analyse.
    - num_brackets (int): How many consecutive brackets to analyse.
    - brackets (pd.DataFrame): Rows with Tournament, Bracket, Slot and Team columns.
    - season (int): Season to score against; for 2024 no score is computed
      and 'N/A' is shown instead.
    - display (bool): Print every team's predicted path (True) or only the
      scores (False).
    - mens_only (bool): When displaying paths, skip the women's bracket.
      The women's bracket is still validated and scored.

    Raises:
    - AssertionError: if a bracket fails the slot or consistency checks.
    '''
    men_scores = []
    women_scores = []

    for num in range(bracket_start_num, num_brackets + bracket_start_num):
        reduced_bracket_M = brackets.loc[(brackets.Tournament == 'M') & (brackets.Bracket == num)]
        reduced_bracket_W = brackets.loc[(brackets.Tournament == 'W') & (brackets.Bracket == num)]

        assert bracket_checker.check_predicted_slots(reduced_bracket_M)
        assert bracket_checker.check_consistency(reduced_bracket_M)

        assert bracket_checker.check_predicted_slots(reduced_bracket_W)
        assert bracket_checker.check_consistency(reduced_bracket_W)

        men_score = evaluate(reduced_bracket_M, season, ['M']) if season != 2024 else 'N/A'
        women_score = evaluate(reduced_bracket_W, season, ['W']) if season != 2024 else 'N/A'

        men_scores.append(men_score)
        women_scores.append(women_score)

        print("Bracket #{}:".format(num))
        if display:
            print(f"Men's (Score: {men_score}): ")
            _print_predicted_paths(reduced_bracket_M, 'M', men_teams)

            if mens_only == False:
                print(f"\n\nWomen's (Score: {women_score}): ")
                _print_predicted_paths(reduced_bracket_W, 'W', women_teams)
            print("\n\n")
        else:
            print(f"Men's (Score: {men_score}),  Women's (Score: {women_score}): ")

    # Summary only when real scores exist (not for 2024, not for an empty range)
    if season != 2024 and men_scores:
        print(f"\nMen's (Max Score): {max(men_scores)}")
        print(f"Men's (Average Score): {sum(men_scores) / len(men_scores)}")
        print(f"Women's (Max Score): {max(women_scores)}")
        print(f"Women's (Average Score): {sum(women_scores) / len(women_scores)}\n")

In [None]:
def findWinner(T1_TeamId, T2_TeamId, xgb_model, spline_model, mens = True, season = 2024, simulate = False):
    '''
    Predict a single matchup between team 1 and team 2.

    The feature row is assembled the same way as the training data: both
    teams' regular-season stats for `season` (plus the advanced stats and
    `fix_data` features for the men), passed through the fitted preprocessor
    of the respective tournament.

    Parameters:
    - T1_TeamId, T2_TeamId (int): Team IDs; the model predicts from team 1's
      point of view.
    - xgb_model, spline_model (sequence): Models and splines, one per CV repeat.
    - mens (bool): Use the men's (True) or women's (False) data and preprocessor.
    - season (int): Season whose regular-season stats are used.
    - simulate (bool): Draw the winner at random instead of picking the favourite.

    Returns:
    - (winner_id, probability). NOTE: the meaning of `probability` depends on
      the mode. With simulate=True it is always P(team 1 wins); with
      simulate=False it is the win probability of the returned winner.
    '''
    season_stats = men_regular_stats if mens else women_regular_stats
    season_stats = season_stats[season_stats['Season'] == season]

    def _team_rows(team_id):
        # Regular-season row(s) of one team, without the key columns
        rows = season_stats[season_stats['T1_TeamID'] == team_id]
        return rows.drop(columns = ['Season', 'T1_TeamID']).reset_index(drop=True)

    strongStats = _team_rows(T1_TeamId)
    weakStats = _team_rows(T2_TeamId)
    # Team 2's columns carry the T2_ prefix, exactly as in the training frame
    weakStats = weakStats.rename(columns={
        col: col.replace("T1_", "T2_") for col in weakStats.columns if col.startswith("T1_")
    })

    if mens:
        strongAdvStats = T1_adv_stats[
            (T1_adv_stats['T1_TeamID'] == T1_TeamId) & (T1_adv_stats['Season'] == season)
        ].drop(columns = ['Season', 'T1_TeamID']).reset_index(drop=True)
        strongStats = pd.concat([strongStats, strongAdvStats], axis=1)

        weakAdvStats = T2_adv_stats[
            (T2_adv_stats['T2_TeamID'] == T2_TeamId) & (T2_adv_stats['Season'] == season)
        ].drop(columns = ['Season', 'T2_TeamID']).reset_index(drop=True)
        weakStats = pd.concat([weakStats, weakAdvStats], axis=1)

    combinedStats = pd.concat([strongStats, weakStats], axis=1)
    combinedStats['Seed_diff'] = combinedStats['T1_seed'] - combinedStats['T2_seed']
    if mens:
        combinedStats = fix_data(combinedStats)

    preprocessor = men_preprocessor if mens else women_preprocessor
    features = preprocessor.transform(combinedStats)

    probability = prediction(xgb_model, spline_model, xgb.DMatrix(features))[0]

    if simulate:
        winner = np.random.choice([T1_TeamId, T2_TeamId], p=[probability, 1 - probability])
        return winner, probability
    if probability > 0.5:
        return T1_TeamId, probability
    return T2_TeamId, 1-probability

In [None]:
def run_bracket(xgb_model = None, spline_model = None, mens = True, bracket = 1, season = 2024, simulate = False, matchup_cache = None):
    '''
    Play out one complete bracket, slot by slot.

    Parameters:
    - xgb_model, spline_model (sequence or None): Models / splines, one per CV
      repeat. If either is missing, the strong seed always advances with
      probability 1.0.
    - mens (bool): Men's ("M") or women's ("W") tournament.
    - bracket (int): Bracket number written to the output.
    - season (int): Season whose team stats are used for the predictions.
    - simulate (bool): False - the favourite advances; True - the winner is
      drawn at random according to the predicted probability.
    - matchup_cache (dict or None): Maps (strong_team_id, weak_team_id) to the
      probability that the STRONG team wins. Pass the same dict to several
      calls to reuse predictions; the dict is updated in place. With None a
      fresh cache is used for this call only.

    Returns:
    - DataFrame: One row per slot with the sample submission's columns (minus
      RowId) plus `Probability`, the predicted win probability of the team
      written to `Team`.
    '''
    if matchup_cache is None:
        matchup_cache = {}

    tournament = "M" if mens else "W"

    # Output layout: sample submission columns without RowId, plus Probability
    blank = pd.read_csv(DATA_PATH + "sample_submission.csv")[:0]
    blank['Probability'] = -1
    columns = list(blank.drop("RowId", axis=1).columns)

    slots = pd.read_csv(DATA_PATH + "MNCAATourneySlots.csv")
    slots = slots.loc[slots['Season']==2023].reset_index(drop=True) # Slots are the same for any year (except last 4 play-in games, so take 2023)
    slots = slots.iloc[:-4] # 2024_tourney_seeds doesn't include first four

    teams = seeds_2024.loc[seeds_2024['Tournament']==tournament]

    slot2team = teams[["Seed","TeamID"]].set_index("Seed").to_dict()['TeamID'] # Get Team ID from seed
    team2slot = {team_id: seed for seed, team_id in slot2team.items()}
    round_winners = {seed: seed for seed in teams['Seed']} # slot/seed -> seed of the team that currently occupies it

    have_model = bool(xgb_model) and bool(spline_model)

    rows = []
    for slot, strong_slot, weak_slot in zip(slots['Slot'], slots['StrongSeed'], slots['WeakSeed']):
        strongSeed = round_winners[strong_slot]
        weakSeed = round_winners[weak_slot]

        strongId = slot2team[strongSeed]
        weakId = slot2team[weakSeed]

        matchup_key = (strongId, weakId)
        if matchup_key in matchup_cache:
            p_strong = matchup_cache[matchup_key]
        elif have_model:
            # Ask for the deterministic prediction: findWinner then returns the
            # favourite together with the favourite's win probability, which is
            # converted to the strong team's point of view before caching.
            favourite, favourite_prob = findWinner(strongId, weakId, xgb_model, spline_model, mens, season, False)
            p_strong = favourite_prob if favourite == strongId else 1 - favourite_prob
            matchup_cache[matchup_key] = p_strong
        else:
            p_strong = None

        if p_strong is None:
            pick, probability = strongSeed, 1.0
        elif simulate:
            winner = np.random.choice([strongId, weakId], p=[p_strong, 1 - p_strong])
            pick = team2slot[winner]
            probability = p_strong if winner == strongId else 1 - p_strong
        elif p_strong > 0.5:
            pick, probability = strongSeed, p_strong
        else:
            pick, probability = weakSeed, 1 - p_strong

        round_winners[slot] = round_winners[pick]
        rows.append({'Tournament': tournament, 'Bracket': bracket, 'Slot': slot, 'Team': pick, 'Probability': probability})

    # Build the frame once instead of appending row by row
    return pd.DataFrame(rows, columns=columns)

def generate_brackets(num_raw_brackets = 1, num_sim_brackets = 1, men_xgb_model = None, men_spline_model = None, women_xgb_model = None, women_spline_model = None, season = 2024):
    '''
    Generate men's and women's brackets and stack them into one frame.

    Brackets 1..num_raw_brackets are deterministic (the favourite always
    advances); the following num_sim_brackets brackets are simulated. All
    calls share one matchup cache, so every matchup is predicted at most once.

    Parameters:
    - num_raw_brackets (int): Number of deterministic brackets.
    - num_sim_brackets (int): Number of simulated brackets.
    - men_xgb_model, men_spline_model, women_xgb_model, women_spline_model:
      Models / splines forwarded to `run_bracket`.
    - season (int): Season forwarded to `run_bracket`.

    Returns:
    - DataFrame: Columns RowId, Tournament, Bracket, Slot, Team, Probability;
      for each bracket number the men's rows come first, then the women's.
    '''
    matchup_cache = {}

    def _both_tournaments(bracket, simulate):
        mens = run_bracket(men_xgb_model, men_spline_model, True, bracket, season, simulate, matchup_cache)
        womens = run_bracket(women_xgb_model, women_spline_model, False, bracket, season, simulate, matchup_cache)
        return [mens, womens]

    # Collect the per-bracket frames and concatenate once at the end;
    # concatenating inside the loop copies the growing result every time.
    frames = [pd.read_csv(DATA_PATH + "sample_submission.csv")[:0].drop("RowId", axis=1)]
    for bracket in range(1, num_raw_brackets + 1):
        frames.extend(_both_tournaments(bracket, False))

    for bracket in range(num_raw_brackets + 1, num_sim_brackets + num_raw_brackets + 1):
        frames.extend(_both_tournaments(bracket, True))

    brackets = pd.concat(frames, ignore_index=True, axis=0)
    brackets['RowId'] = brackets.reset_index().index
    brackets = brackets[['RowId', 'Tournament', 'Bracket', 'Slot', 'Team', 'Probability']]
    return brackets

In [None]:
# Bracket counts: deterministic ("raw") brackets first, then simulated ones.
# Together this yields num_raw_brackets + num_sim_brackets brackets per tournament.
num_raw_brackets = 5
num_sim_brackets = 49994
season = 2024

brackets = generate_brackets(num_raw_brackets, num_sim_brackets, men_xgb_model, men_spline_model, women_xgb_model, women_spline_model, season)

# Optional sanity check / pretty print of every generated bracket (slow for large counts):
# analyzeBrackets(1, num_raw_brackets + num_sim_brackets, brackets, season, True, True)

In [None]:
def submit(brackets, path="/kaggle/working/submission.csv"):
    '''
    Write the submission file.

    The `Probability` column is removed, `RowId` is renumbered 0..n-1 in row
    order, and the result is written as CSV without the index. The input
    frame is not modified.

    Parameters:
    - brackets (pd.DataFrame): Generated brackets including a `Probability` column.
    - path (str): Destination file (default: the Kaggle working directory).

    Returns:
    - DataFrame: The frame that was written.
    '''
    submission = brackets.reset_index(drop=True).drop(columns=['Probability'])
    submission['RowId'] = submission.reset_index().index
    submission.to_csv(path, index=False)
    return submission

In [None]:
# Write the submission CSV; the returned frame is displayed as the cell output.
submit(brackets)