In [None]:
# IS_MASTER is only defined when this notebook is executed from the MASTER
# notebook. When the notebook is run on its own the name does not exist, so
# default it to False. Only NameError is caught so that unrelated failures
# (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    IS_MASTER
except NameError:
    IS_MASTER = False

In [None]:
from tqdm import tqdm

class QuickTourneySimulator(object):
    '''Simulates tournament brackets by sampling each game from a model.

    Parameters
    ----------
    fname_slots : str
        path to the file containing tournament Slots info
    features : object
        forwarded unchanged to ``prob_func`` as its third positional argument
    model : object
        forwarded unchanged to ``prob_func`` as its fourth positional argument
    seed_dict : dict
        maps a seed label such as 'W01' to the team ID handed to ``prob_func``
    TeamDict : dict
        maps a team ID to the value written to the per-game log
        (presumably the team name -- confirm against the caller)
    seed : int, optional
        seed for the internal random number generator. The default (None)
        keeps the previous behaviour of a freshly, unpredictably seeded
        generator; pass an int to make simulations reproducible.
    '''

    def __init__(self, fname_slots, features, model, seed_dict, TeamDict,
                 seed=None):
        self.fname_slots = fname_slots
        self.df_slots = self._make_df_slots(self.fname_slots)
        self.rng = np.random.default_rng(seed)
        self.features = features
        self.model = model
        self.seed_dict = seed_dict
        self.TeamDict = TeamDict

    @staticmethod
    def _make_df_slots(fname_slots):
        '''Reduced version of dataframe containing the slots information

        Parameters
        ----------
        fname_slots : str
            path to the file containing tournament Slots info
            Mens or Womens tournament should result in the same output here

        Returns
        -------
        df_slots : pandas DataFrame
            Slots info for NCAA tournament for Round 1 and later, one row per
            distinct Slot, without the Season column

        '''
        df_slots = pd.read_csv(fname_slots)

        # Only keep slots that are part of the traditional
        # tournament (no play-ins): those slot names start with 'R'.
        is_main_bracket = df_slots['Slot'].str.startswith('R')

        # Except for play-ins (which we don't care about) the tournament has
        # the same structure every year, so keep the first occurrence of each
        # Slot and drop the Season column.
        # The operations are chained (not inplace) so that we never mutate a
        # filtered view of the frame returned by read_csv.
        return (
            df_slots[is_main_bracket]
            .drop_duplicates('Slot')
            .drop(columns='Season')
        )

    def simulate(self, prob_func, **prob_func_kwargs):
        ''' Simulates a tournament for a given function that
        assigns probabilities to the games

        Every game is also appended as one line to
        f"{OUTPUT_FOLDER}inf_results.csv" (OUTPUT_FOLDER is a module-level
        name defined outside this class).

        Parameters
        ----------
        prob_func: func
            Called as ``prob_func(strong_team_id, weak_team_id, self.features,
            self.model, **prob_func_kwargs)`` where the team IDs come from
            ``seed_dict``. It must return a 2-D array whose first row is used:
            column 1 is treated as the probability that the strong seed's team
            wins, column 0 is only written to the log.
        prob_func_kwargs: dict
            key word arguments for prob_func

        Returns
        -------
        df_tourney: pandas DataFrame
            Single simulated tournament with columns 'Slot' and 'Team', where
            'Team' is the seed label (e.g. 'W01') of the slot's winner
        '''
        # Seed labels from W01 to Z16. Each one starts out "winning" its own
        # label, so later slots can look up either a seed or an earlier slot.
        seeds = [f'{region}{num:02d}' for region in 'WXYZ'
                 for num in range(1, 17)]
        tourney = {s: s for s in seeds}

        #
        # Go through the matchups and predict which team wins.
        #
        for sl, ss, ws in zip(self.df_slots['Slot'],
                              self.df_slots['StrongSeed'],
                              self.df_slots['WeakSeed']):
            # team currently occupying the strong-seed side of this slot
            ssID = self.seed_dict[tourney[ss]]
            # team currently occupying the weak-seed side of this slot
            wsID = self.seed_dict[tourney[ws]]

            prob = prob_func(ssID, wsID, self.features, self.model,
                             **prob_func_kwargs)
            p_other = prob[0, 0]
            p_strong = prob[0, 1]

            # One uniform draw per game; collapse to a plain bool so the
            # branch below does not depend on the truth value of an array.
            strong_wins = bool(self.rng.random() < p_strong)

            line = (f"{strong_wins}, {p_other}, {p_strong}, {ssID}, {wsID}, "
                    f"{ss}, {ws}, {self.TeamDict[ssID]}, "
                    f"{self.TeamDict[wsID]}\n")
            with open(f"{OUTPUT_FOLDER}inf_results.csv", "a") as myfile:
                myfile.write(line)

            # Here we build out the dictionary of slot_winner:winning_team
            tourney[sl] = tourney[ss] if strong_wins else tourney[ws]

        df_tourney = pd.DataFrame({'Slot': list(tourney.keys()),
                                   'Team': list(tourney.values())})
        # Drop the W01..Z16 bootstrap entries; only real slots are returned.
        df_tourney = df_tourney[df_tourney['Slot'].str.startswith('R')]
        return df_tourney.reset_index(drop=True)

    def simulate_multiple(self, n, prob_func, **prob_func_kwargs):
        ''' Simulates multiple tournaments for a given function that
        assigns probabilities to the games

        Parameters
        ----------
        n: int
            number of simulations to perform
        prob_func: func
            Probability function, used exactly as described in ``simulate``
        prob_func_kwargs: dict
            key word arguments for prob_func

        Returns
        -------
        df_out: pandas DataFrame
            n simulated tournaments with columns 'Slot', 'Team' and 'Bracket'
            (empty, with those columns, when n <= 0)
        '''
        brackets = []
        for i in tqdm(range(n)):  # for each simulated bracket
            df = self.simulate(prob_func, **prob_func_kwargs)
            df['Bracket'] = i
            brackets.append(df)

        # pd.concat raises on an empty list; return an empty frame instead.
        if not brackets:
            return pd.DataFrame(columns=['Slot', 'Team', 'Bracket'])
        return pd.concat(brackets).reset_index(drop=True)


def prob_simple_seed_model(seed1, seed2, **kwargs):
    '''Baseline win probability that depends only on the seed numbers.

    Parameters
    ----------
    seed1, seed2 : str
        seed labels such as 'W01' or 'X13': one region character followed by
        the seed number. A trailing play-in suffix ('a' or 'b', as in 'W16a')
        is ignored.
    kwargs : dict
        accepted for call compatibility and ignored

    Returns
    -------
    float
        0.5 plus 0.03 for every seed line that seed1 is better (numerically
        lower) than seed2

    Notes
    -----
    NOTE(review): this takes seed labels and returns a scalar, whereas
    QuickTourneySimulator.simulate passes team IDs plus features/model
    positionally and indexes the result as a 2-D array, so this function
    cannot be used as its prob_func without an adapter.
    '''
    # Drop the region character, then any play-in suffix, before converting;
    # plain int(seed[1:]) raises ValueError on labels like 'W16a'.
    int_seed1 = int(seed1[1:].rstrip('ab'))
    int_seed2 = int(seed2[1:].rstrip('ab'))
    return 0.5 + 0.03 * (int_seed2 - int_seed1)


"""
Here we need to get the teams in the tournament, their slot numbers, seeds, etc., and then calculate their stats for the 2024 season.
As we pick two teams at a time to compare, we build the diffs, double the data, and then run inference on those two teams at a time.
"""

# Open the season statistics pickle from the EDA notebook.
# ALL_STATISTICS_OUTPUT is defined outside this cell (it must exist before
# this runs, otherwise a NameError is raised here).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by our own pipeline.
with open(ALL_STATISTICS_OUTPUT, 'rb') as f:
    all_stats = pickle.load(f)

#mm = MinMaxScaler() # This is preferred since our data is mostly normally distributed already.
# Flatten the index into regular columns so the table can be merged on them;
# get_inference_probs below joins on the 'Season' and 'level_1' columns of
# this frame (presumably 'level_1' is the team ID index level -- confirm).
all_stats_sub = all_stats.reset_index().copy()

def get_inference_probs(teamA, teamB, final_features, model, season="2024",
                        stats=None):
    '''Run the model on a single teamA-vs-teamB matchup.

    Parameters
    ----------
    teamA, teamB : int or str
        team identifiers; converted to str and matched against the
        'level_1' column of the statistics table
    final_features : list of str
        feature names expected by the model, in model order. The last four
        characters of each name are stripped to find the underlying stat
        column (presumably a 'Diff' suffix -- confirm), and the feature value
        is that stat for teamA minus the same stat for teamB.
    model : object
        anything exposing ``predict_proba(X)``
    season : int or str, optional
        season whose statistics are used; converted to str and matched
        against the 'Season' column. Defaults to "2024".
    stats : pandas DataFrame, optional
        statistics table with 'Season', 'level_1' and the stat columns.
        Defaults to the module-level ``all_stats_sub``.

    Returns
    -------
    The value returned by ``model.predict_proba`` for the one-row feature
    frame.

    Raises
    ------
    ValueError
        if no statistics row is found for either team in the given season
    '''
    if stats is None:
        stats = all_stats_sub

    submission = pd.DataFrame([[str(teamA), str(teamB), str(season)]],
                              columns=['TeamA', 'TeamB', 'Season'])
    # Attach each team's season statistics to the single matchup row.
    teamA_stats = submission.merge(stats, left_on=['Season', 'TeamA'],
                                   right_on=['Season', 'level_1'])
    teamB_stats = submission.merge(stats, left_on=['Season', 'TeamB'],
                                   right_on=['Season', 'level_1'])
    # Side-by-side stats: every stat column appears as <stat>_A and <stat>_B.
    merged_team_stats = teamA_stats.merge(teamB_stats,
                                          on=['Season', 'TeamA', 'TeamB'],
                                          suffixes=('_A', '_B'))
    if merged_team_stats.empty:
        # Fail here with a clear message instead of handing an empty frame
        # to the model.
        raise ValueError(
            f"No statistics found for teams {teamA!r} and {teamB!r} "
            f"in season {season!r}"
        )

    X_test = pd.DataFrame({
        feature: (merged_team_stats[feature[:-4] + '_A']
                  - merged_team_stats[feature[:-4] + '_B'])
        for feature in final_features
    })
    # Enforce the column order the model was given in final_features.
    X_test = X_test[final_features]

    return model.predict_proba(X_test)

NameError: name 'ALL_STATISTICS_OUTPUT' is not defined

In [None]:
# Load the trained classifier that was pickled to MODEL_OUTPUT.
with open(MODEL_OUTPUT, 'rb') as f:
    clf = pickle.load(f)

# Seed assignments for the 2024 tournaments (columns used below:
# 'Tournament', 'Seed', 'TeamID').
seed_df = pd.read_csv(f'{DATA_FOLDER}2024_tourney_seeds.csv')

# Probability function handed to the simulator for every game.
p_func = get_inference_probs

n_brackets_per_tournament = 50
df_submission = []
for tournament in ('M', 'W'):
    # Restrict the seeds to this tournament and map seed label -> team ID.
    seed_fil = seed_df['Tournament'] == tournament
    seed_df_tournament = seed_df[seed_fil]
    seed_dict = dict(zip(seed_df_tournament['Seed'],
                         seed_df_tournament['TeamID']))

    # Team lookup table matching the tournament being simulated.
    TeamDict = mTeamDict if tournament == 'M' else wTeamDict

    qts = QuickTourneySimulator(
        f'{DATA_FOLDER}{tournament}NCAATourneySlots.csv',
        features_noseed,
        clf,
        seed_dict,
        TeamDict,
    )
    tmp = qts.simulate_multiple(n_brackets_per_tournament, p_func)
    tmp['Tournament'] = tournament
    df_submission.append(tmp)

# Stack both tournaments into a single frame.
df_submission = pd.concat(df_submission)

# Number the rows consecutively and put the columns in submission order.
df_submission['RowId'] = np.arange(len(df_submission))
df_submission = df_submission[['RowId', 'Tournament', 'Bracket', 'Slot', 'Team']]

# Write out for submission.
df_submission.to_csv(f'{DATA_FOLDER}submission.csv', index=False)

38.86500000022352