In [None]:
import pandas as pd

In [None]:
# Load the win probabilities written by M3_Predictions.py.
# Later cells read the 'ID', 'Pred', 'TeamID_x', 'TeamID_y', 'SeedDiff' and 'Seed_y'
# columns of this frame, so the CSV is expected to provide them.
submission = pd.read_csv('mydata/mens/original_probabilities.csv')
submission.head()

In [None]:
def switch_teams(row):
    """Re-orient one submission row so `Pred` belongs to the first team in `ID`.

    `row['ID']` is split on '_'; fields 1 and 2 are used as the two team IDs
    (the notebook treats field 1 as the lower ID). If `TeamID_x` is not the
    field-1 team, `Pred` is replaced by `1 - Pred`. `TeamID_x` / `TeamID_y`
    are then overwritten with fields 1 and 2.

    The row is modified in place and the same object is returned, which is
    what `DataFrame.apply(..., axis=1)` needs to rebuild the frame.
    """
    id_parts = row['ID'].split('_')
    original_pred = row['Pred']
    first_id = int(id_parts[1])

    # Pred was produced for TeamID_x; flip it when that is not the first team.
    if first_id != row['TeamID_x']:
        row['Pred'] = 1 - original_pred

    row['TeamID_x'] = first_id
    row['TeamID_y'] = int(id_parts[2])
    return row

# Apply switch_teams to every row (axis = 1) and rebind `submission` to the re-oriented frame.
submission = submission.apply(switch_teams, axis = 1)

In [None]:
def set_prob_to_k(row, k, diff):
    """Return the row's probability, raised to at least `k` for big seed gaps.

    Used to floor the favorite's win probability in 1 v 16, 2 v 15 and
    3 v 14 style matchups.

    row  -- mapping / Series with 'SeedDiff', 'Pred' and 'Seed_y'
    k    -- minimum probability to enforce
    diff -- seed-difference threshold; the floor applies only when
            row['SeedDiff'] is strictly below it

    When `diff` is exactly -10 the floor is additionally skipped for rows
    whose 'Seed_y' is below 14. In every other case the original 'Pred'
    is returned unless it is below `k`, in which case `k` is returned.
    """
    # Not a large enough seed gap, or already at/above the floor: leave as is.
    if not ((row['SeedDiff'] < diff) and (row['Pred'] < k)):
        return row['Pred']

    # For the -10 threshold, only opponents seeded 14 or worse get the floor.
    if diff == -10 and row['Seed_y'] < 14:
        return row['Pred']

    return k
    
# Apply the floors in order: 3 v 14 -> minimum 80%, 2 v 15 -> minimum 92%, 1 v 16 -> minimum 97%
for min_prob, seed_gap in ((.80, -10), (.92, -12), (.97, -14)):
    submission['Pred'] = submission.apply(set_prob_to_k, axis=1, args=(min_prob, seed_gap))

In [None]:
# Team ID -> team name lookup table
team_names = (
    pd.read_csv('ncaam-march-mania-2021/MTeams.csv')
    .loc[:, ['TeamID', 'TeamName']]
)
team_names.head()

In [None]:
# Tournament seeds, restricted to the 2021 season
seeds = (
    pd.read_csv('ncaam-march-mania-2021/NCAATourneySeeds.csv')
    .loc[lambda all_seeds: all_seeds['Season'] == 2021, ['TeamID', 'Seed']]
)
seeds.head()

In [None]:
# Attach each seeded team's name (inner join on TeamID)
team_names_seeds = team_names.merge(seeds, on='TeamID')
team_names_seeds.head()

In [None]:
# Bracket slots, restricted to the 2021 season
slots = (
    pd.read_csv('ncaam-march-mania-2021/NCAATourneySlots.csv')
    .loc[lambda all_slots: all_slots['Season'] == 2021, ['Slot', 'StrongSeed', 'WeakSeed']]
)
slots.head()

In [None]:
# Resolve both sides of every slot to a team: the first merge matches the
# strong seed, the second the weak seed. pandas' default suffixes make the
# strong-seed team's columns *_x and the weak-seed team's columns *_y.
# Only slots whose two seeds both match a seeded team survive the inner joins.
team_slots = (
    slots
    .merge(team_names_seeds, left_on=['StrongSeed'], right_on=['Seed'])
    .merge(team_names_seeds, left_on=['WeakSeed'], right_on=['Seed'])
    .drop(columns=['Slot', 'StrongSeed', 'WeakSeed', 'Seed_x', 'Seed_y'])
)
team_slots.head()

In [None]:
def switch_slots(row):
    """Order a matchup row so the lower team ID sits in the *_x columns.

    When `TeamID_x` is greater than `TeamID_y`, the two IDs and the two
    team names are exchanged. The row is modified in place and returned.
    """
    if row['TeamID_x'] > row['TeamID_y']:
        # Swap IDs and names together so each name stays with its ID.
        row['TeamID_x'], row['TeamID_y'] = row['TeamID_y'], row['TeamID_x']
        row['TeamName_x'], row['TeamName_y'] = row['TeamName_y'], row['TeamName_x']
    return row

# Re-order every matchup row so its (TeamID_x, TeamID_y) pair is (lower, higher);
# a later cell merges `submission` with this frame on exactly those two columns.
team_slots = team_slots.apply(switch_slots, axis = 1)
team_slots.head()

In [None]:
# Keep only the submission rows whose team pair appears in team_slots
# (inner join), to inspect the Round 1 probabilities
submission_r1 = submission.merge(team_slots, on=['TeamID_x', 'TeamID_y'])
submission_r1

In [None]:
# Matchup IDs in the row order of submission_r1; the odds lists below must follow the same order.
ids = list(submission_r1['ID'])

# The moneyline (ML) odds from DraftKings, entered manually, in order of IDs in submission_r1
# NOTE(review): both lists are empty placeholders here. pandas raises ValueError when the
# column lists differ in length, so unless `ids` is also empty they must be filled with
# one entry per ID before this cell will run.
vegas_odds_x = [] # ML odds for TeamID_x (lower team id)
vegas_odds_y = [] # ML odds for TeamID_y (higher team id)

# make dataframe of these odds
ml_odds = pd.DataFrame({'ID': ids, 'ML_x': vegas_odds_x, 'ML_y': vegas_odds_y})
ml_odds.head()

In [None]:
def ml_to_win_probability(row, ml_col):
    """Convert the moneyline odds stored in `row[ml_col]` to an implied win probability.

    Positive odds (underdog):      100 / (100 + odds)
    Zero or negative odds (favorite): -odds / (100 - odds)

    The result still contains the bookmaker's margin; no normalisation is done here.
    """
    odds = row[ml_col]
    if odds > 0:
        # underdog: stake 100 to win `odds`
        return 100 / (100 + odds)
    # favorite: stake `-odds` to win 100
    return -1 * odds / (100 - odds)
    
# Implied win probability for each side, computed row by row from its moneyline column
ml_odds['WinProb_x'] = ml_odds.apply(ml_to_win_probability, axis=1, args=('ML_x',))
ml_odds['WinProb_y'] = ml_odds.apply(ml_to_win_probability, axis=1, args=('ML_y',))

ml_odds.head()

In [None]:
# The two implied probabilities include the bookmaker's margin, so they are averaged
# (team x's own estimate with the complement of team y's) to approximate the true probability.
# Only team x is needed: the Kaggle submission probabilities are for team x.
ml_odds['VegasWinProb'] = (ml_odds['WinProb_x'] + (1 - ml_odds['WinProb_y'])) * 0.5

# From here on only the ID and the averaged probability are kept
ml_odds = ml_odds.loc[:, ['ID', 'VegasWinProb']]
ml_odds.head()

In [None]:
# Attach the Vegas probability to each Round 1 row
submission_r1 = submission_r1.merge(ml_odds, on='ID')

# Absolute gap between the model's probability and Vegas'
submission_r1['AbsDiff'] = (submission_r1['VegasWinProb'] - submission_r1['Pred']).abs()

# Show Round 1 ordered from smallest to largest disagreement
submission_r1.sort_values('AbsDiff')

In [None]:
def team_adjustment(row, team_id, adj):
    """Return the row's win probability nudged for or against one team.

    Intended to be applied to every row of the submission so that all games
    involving `team_id` are adjusted.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'ID' and 'Pred'. 'ID' is split on '_'; field 1 is
        treated as TeamID_x and field 2 as TeamID_y. 'Pred' is treated as
        the win probability of TeamID_x.
    team_id : int
        Team whose games should be adjusted.
    adj : float
        Positive means the team is undervalued by the model, negative means
        it is overvalued. Only the sign and magnitude are used.

    Returns
    -------
    float
        The adjusted probability, or the unchanged `row['Pred']` when the
        team is not part of the matchup or `adj` is 0.

    Notes
    -----
    The step is `abs(adj)` times the distance from 'Pred' to its nearer
    bound (0 or 1), so it is smaller at the tails than at the center and
    the result stays inside [0, 1] as long as `abs(adj) <= 1`.
    """
    id_parts = row['ID'].split('_')
    team_x, team_y = int(id_parts[1]), int(id_parts[2])
    pred = row['Pred']

    # factor to add or subtract from the win probability, scaled by the distance to the nearer bound
    if pred > 0.5:
        adj_factor = abs(adj) * (1 - pred)
    else:
        adj_factor = abs(adj) * pred

    # increase probability if given team is TeamID_x and adjustment is positive
    # or given team is TeamID_y and adjustment is negative
    if (team_id == team_x and adj > 0) or (team_id == team_y and adj < 0):
        return pred + adj_factor

    # decrease probability if given team is TeamID_y and adjustment is positive
    # or given team is TeamID_x and adjustment is negative
    elif (team_id == team_y and adj > 0) or (team_id == team_x and adj < 0):
        return pred - adj_factor

    # team not involved in this game (or adj == 0): leave the probability alone
    else:
        return pred
    
# look at Vegas odds, and see which teams the model undervalues that make sense (injuries, COVID, other reasons)
# and adjust their win probabilities for all games manually using function above
#submission['Pred'] = submission.apply(lambda x: team_adjustment(x, , ), axis = 1)

In [None]:
# Repeat the Vegas comparison, this time on the altered probabilities

# Round 1 rows of the (possibly adjusted) submission, with the Vegas probability attached
submission_r1 = (
    submission
    .merge(team_slots, on=['TeamID_x', 'TeamID_y'])
    .merge(ml_odds, on='ID')
)

# Absolute gap between the model's probability and Vegas'
submission_r1['AbsDiff'] = (submission_r1['VegasWinProb'] - submission_r1['Pred']).abs()

# Show Round 1 ordered from smallest to largest disagreement
submission_r1.sort_values('AbsDiff')

In [None]:
# make manual adjustments to round 1 win probabilities based on Vegas odds

# choose the more "aggressive" odds, as there are many entries in the Kaggle competition so 
# increasing the variance increases the win probability

# make sure the adjustments are on submission and not submission_r1

### Kaggle Alterations

Kaggle allows two submission files, so it's best to alter the submission probabilities to maximize your chance of winning. To do this, I will take the most even first-round matchup and assign it a probability of 1 in the first submission and a probability of 0 in the second submission. This matchup is guaranteed to be played (it is an opening-round game), so exactly one of the two submissions will have called it correctly. That submission's log loss on the game improves from about 0.69 (−ln 0.5, the loss of a 50/50 prediction) to 0, while the other submission takes a large penalty on that game and is effectively sacrificed.

In [None]:
# Assign probability 1 (submission1) and 0 (submission2) to the most even first round matchup
# judging from submission_r1 above, the most even first round matchup is...
# Both files start as independent copies of the adjusted `submission`.
submission1 = submission.copy()
submission2 = submission.copy()
# NOTE(review): 2031 and 1 are hard-coded positions. Column 1 is presumably 'Pred' and row 2031
# the matchup named below -- confirm against the loaded CSV, because a different row or column
# order would silently overwrite the wrong cell.
submission1.iloc[2031, 1] = 1   # Oklahoma vs Miss, give Mississippi the full 100%
submission2.iloc[2031, 1] = 0   # Oklahoma vs Miss, give Oklahoma the full 100%

In [None]:
# Display the overridden row of submission1 to check that the intended matchup was changed.
submission1.iloc[2031, ]

In [None]:
# Display the overridden row of submission2 to check that the intended matchup was changed.
submission2.iloc[2031, ]

In [None]:
# Write the adjusted probabilities and the two Kaggle variants to CSV (row index omitted)
for frame, out_path in (
    (submission, 'mydata/mens/altered_probabilities.csv'),
    (submission1, 'mydata/mens/submission1_prob.csv'),
    (submission2, 'mydata/mens/submission2_prob.csv'),
):
    frame.to_csv(out_path, index=False)