In [1]:
# basic imports
import pandas as pd
import pickle

# load the model
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# model.pkl must come from a trusted source.
with open("model.pkl", "rb") as f:
    model = pickle.load(f)

In [2]:
# create a dataframe of each team's statistics
year = 2022

# load the previously generated csv files
# (the year column is dropped from the totals to prevent repetition after the merge)
df_total = pd.read_csv(f"stats/{year}-total-stats.csv").drop(columns="year")
df_records = pd.read_csv(f"records/{year}-records.csv")

# merge the records and stats of tournament teams; the left join keeps every
# row of the records file and leaves NaN stats for teams absent from the totals.
# The merge result is used directly: concatenating it onto an empty DataFrame
# is redundant for a single year and relies on deprecated pandas dtype handling
# for empty frames.
df_stats = pd.merge(df_records, df_total, on="team", how="left")

In [3]:
# function to generate the stat per game columns
def stats_per_game(df):
    """Insert a per-game average column after each season-total stat column.

    For every total in ``pts, orb, drb, reb, ast, stl, blk, tov, pf`` a column
    named ``<stat>pg`` is inserted immediately to the right of ``<stat>``. It
    holds the total divided by the number of games (``wins + losses``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``wins``, ``losses`` and the total columns listed
        above. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        Raised by ``DataFrame.insert`` when a per-game column already exists,
        e.g. when the function is called twice on the same frame.
    """
    # operate on the frame that was passed in, not on the global df_stats
    num_games = df["wins"] + df["losses"]
    # columns to be averaged per game
    total_cols = ['pts', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']
    for stat in total_cols:
        # place "<stat>pg" directly after its source column
        df.insert(df.columns.get_loc(stat) + 1, stat + "pg", df[stat] / num_games)
# add the per-game columns to df_stats (the columns are inserted in place)
stats_per_game(df_stats)

In [4]:
# add the overall win percentage directly after the "losses" column
num_games = df_stats["wins"].add(df_stats["losses"])
df_stats.insert(
    loc=df_stats.columns.get_loc("losses") + 1,
    column="winp",
    value=df_stats["wins"].div(num_games),
)

In [5]:
# add the conference win percentage directly after the "closses" column
num_cgames = df_stats["cwins"].add(df_stats["closses"])
df_stats.insert(
    loc=df_stats.columns.get_loc("closses") + 1,
    column="cwinp",
    value=df_stats["cwins"].div(num_cgames),
)

In [6]:
# add the field goal percentage (fgm / fga) directly after the "fga" column
df_stats.insert(
    loc=df_stats.columns.get_loc("fga") + 1,
    column="fgp",
    value=df_stats["fgm"].div(df_stats["fga"]),
)

In [7]:
# add the 3 point percentage (3pm / 3pa) directly after the "3pa" column
df_stats.insert(
    loc=df_stats.columns.get_loc("3pa") + 1,
    column="3pp",
    value=df_stats["3pm"].div(df_stats["3pa"]),
)

In [8]:
# add the free throw percentage (ftm / fta) directly after the "fta" column
df_stats.insert(
    loc=df_stats.columns.get_loc("fta") + 1,
    column="ftp",
    value=df_stats["ftm"].div(df_stats["fta"]),
)

In [9]:
# keep three decimal places for every numeric value in the frame
df_stats = df_stats.round(decimals=3)

In [10]:
# display the per-team statistics table built above
df_stats

Unnamed: 0,year,team,seed,wins,losses,winp,cwins,closses,cwinp,pts,...,ast,astpg,stl,stlpg,blk,blkpg,tov,tovpg,pf,pfpg
0,2022,Arizona,1,30,4,0.882,18,2,0.900,3107,...,726,21.353,249,7.324,210,6.176,497,14.618,619,18.206
1,2022,Kansas,1,27,6,0.818,14,4,0.778,3129,...,613,18.576,251,7.606,172,5.212,485,14.697,636,19.273
2,2022,Gonzaga,1,25,4,0.862,13,1,0.929,2790,...,578,19.931,205,7.069,183,6.310,369,12.724,497,17.138
3,2022,Baylor,1,25,7,0.781,14,4,0.778,2619,...,542,16.938,298,9.312,117,3.656,421,13.156,548,17.125
4,2022,Duke,2,27,7,0.794,16,4,0.800,3122,...,642,18.882,249,7.324,219,6.441,400,11.765,532,15.647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,2022,Texas A&M-CC,16,23,11,0.676,7,7,0.500,2682,...,528,15.529,298,8.765,62,1.824,491,14.441,711,20.912
64,2022,Bryant,16,22,9,0.710,15,2,0.882,2497,...,444,14.323,207,6.677,143,4.613,444,14.323,544,17.548
65,2022,Wright State,16,21,13,0.618,15,7,0.682,2729,...,506,14.882,212,6.235,107,3.147,440,12.941,523,15.382
66,2022,Texas Southern,16,18,12,0.600,13,5,0.722,2209,...,346,11.533,178,5.933,165,5.500,476,15.867,565,18.833


In [11]:
# create a dataframe of each tournament game

# load the previously generated csv files
df_bracket = pd.read_csv("test-input.csv")
# df_bracket = pd.read_csv(f"brackets/{year}-bracket.csv")
# df_bracket = df_bracket[df_bracket["round"] > 0]

# Work on a copy with a fresh 0..n-1 index (this matters once the round filter
# above is enabled). reset_index replaces the earlier concat onto an empty
# DataFrame, which relied on deprecated pandas dtype handling for empty frames.
df_games = df_bracket.reset_index(drop=True)
df_games["year"] = df_games["year"].astype(int)

In [47]:
# display the tournament games loaded above
df_games

Unnamed: 0,year,team0,team1,region
0,2022,Gonzaga,Georgia State,West Regional
1,2022,Boise State,Memphis,West Regional
2,2022,Connecticut,New Mexico State,West Regional
3,2022,Arkansas,Vermont,West Regional
4,2022,Alabama,Notre Dame,West Regional
5,2022,Texas Tech,Montana State,West Regional
6,2022,Michigan State,Davidson,West Regional
7,2022,Duke,Cal State Fullerton,West Regional
8,2022,Baylor,Norfolk State,East Regional
9,2022,North Carolina,Marquette,East Regional


In [86]:
def _build_matchup_features(games, stats):
    """Build one feature row per game: team0's statistics minus team1's.

    Parameters
    ----------
    games : pandas.DataFrame
        One row per game with ``year``, ``team0`` and ``team1`` columns.
    stats : pandas.DataFrame
        One row per (``year``, ``team``). Every column after the first two is
        differenced, so this assumes the first two columns are ``year`` and
        ``team`` (as in the displayed ``df_stats``).

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``team0``, ``team1`` followed by one difference
        column per statistic, indexed 0..n-1 in the order of ``games``.

    Raises
    ------
    ValueError
        If a team does not have exactly one row in ``stats`` for the game's
        year. Without this check the subtraction would silently misalign.
    """
    records = []
    for _, game in games.iterrows():
        per_team = []
        for side in ("team0", "team1"):
            found = stats[(stats["team"] == game[side]) & (stats["year"] == game["year"])]
            if len(found) != 1:
                raise ValueError(
                    f"expected exactly one stats row for {game[side]!r} "
                    f"in {game['year']}, found {len(found)}"
                )
            # positional slice: skip the two leading identifier columns
            per_team.append(found.iloc[0].iloc[2:])
        diff = per_team[0] - per_team[1]
        records.append(
            {
                "year": game["year"],
                "team0": game["team0"],
                "team1": game["team1"],
                **diff.to_dict(),
            }
        )
    # Build the frame once. Series.append was removed in pandas 2.0, and
    # concatenating inside the loop re-copies the frame on every game.
    return pd.DataFrame(records)


df = _build_matchup_features(df_games, df_stats)

# columns that are not passed to the model
cols_to_drop = ['year', 'team0', 'team1',
         'losses', 'winp', 'cwins',
         'closses', 'pts', 'fgm',
         'fga', '3pm', '3pa',
         'ftm', 'fta', 'orb',
         'orbpg', 'drb', 'drbpg',
         'reb', 'ast', 'stl',
         'blk', 'tov', 'pf']

# NOTE(review): the saved outputs further down show a `pred` column (and, in
# one cell, `region`) plus a `winners` list. The code that produced them is
# only present as the commented-out fragments kept below; restore or remove
# them together with the cells that display those results.
# df["pred"] = 
print(df_games)
model.predict_proba(df.drop(columns=cols_to_drop))
# df_games = 

# winners = []
# for j, match in df.iterrows():
#     if match.pred == 0:
#         winners.append(match.team0)
#     else:
#         winners.append(match.team1)

    year            team0                team1            region
0   2022          Gonzaga        Georgia State     West Regional
1   2022      Boise State              Memphis     West Regional
2   2022      Connecticut     New Mexico State     West Regional
3   2022         Arkansas              Vermont     West Regional
4   2022          Alabama           Notre Dame     West Regional
5   2022       Texas Tech        Montana State     West Regional
6   2022   Michigan State             Davidson     West Regional
7   2022             Duke  Cal State Fullerton     West Regional
8   2022           Baylor        Norfolk State     East Regional
9   2022   North Carolina            Marquette     East Regional
10  2022     Saint Mary's              Indiana     East Regional
11  2022             UCLA                Akron     East Regional
12  2022            Texas        Virginia Tech     East Regional
13  2022           Purdue                 Yale     East Regional
14  2022     Murray State

array([[0.97623644, 0.02376356],
       [0.75776068, 0.24223932],
       [0.44342835, 0.55657165],
       [0.87248395, 0.12751605],
       [0.46842183, 0.53157817],
       [0.82360535, 0.17639465],
       [0.44725952, 0.55274048],
       [0.97960127, 0.02039873],
       [0.95502793, 0.04497207],
       [0.91030395, 0.08969605],
       [0.74239679, 0.25760321],
       [0.78660953, 0.21339047],
       [0.54781248, 0.45218752],
       [0.9886059 , 0.0113941 ],
       [0.64186912, 0.35813088],
       [0.93706101, 0.06293899],
       [0.98963909, 0.01036091],
       [0.39490741, 0.60509259],
       [0.88797925, 0.11202075],
       [0.29073161, 0.70926839],
       [0.84911976, 0.15088024],
       [0.88095431, 0.11904569],
       [0.26715783, 0.73284217],
       [0.91886917, 0.08113083],
       [0.99578859, 0.00421141],
       [0.33747698, 0.66252302],
       [0.8087756 , 0.1912244 ],
       [0.54183028, 0.45816972],
       [0.55117725, 0.44882275],
       [0.93419817, 0.06580183],
       [0.

In [76]:
# display the matchup feature frame
# NOTE(review): the saved output shows a `pred` column, but the only visible
# assignment to df["pred"] is commented out — confirm how it was produced.
df

Unnamed: 0,year,team0,team1,seed,wins,losses,winp,cwins,closses,cwinp,...,astpg,stl,stlpg,blk,blkpg,tov,tovpg,pf,pfpg,pred
0,2022,Gonzaga,Georgia State,-15,7,-6,0.219,4,-4,0.286,...,5.967,-50,-2.038,53,1.667,30,0.617,31,0.495,0
1,2022,Boise State,Memphis,-1,4,-1,0.055,2,-2,0.111,...,-5.104,-67,-2.759,-60,-2.277,-104,-4.556,-95,-4.561,0
2,2022,Connecticut,New Mexico State,-7,-5,5,-0.156,0,2,-0.081,...,-0.156,26,0.813,74,2.313,-90,-2.813,16,0.5,1
3,2022,Arkansas,Vermont,-9,-2,2,-0.06,-4,4,-0.222,...,-0.091,108,3.273,69,2.091,126,3.818,149,4.515,0
4,2022,Alabama,Notre Dame,-5,-6,6,-0.188,-6,4,-0.25,...,-0.219,51,1.593,90,2.813,116,3.625,163,5.093,1
5,2022,Texas Tech,Montana State,-11,-2,2,-0.059,-4,2,-0.133,...,1.294,113,3.324,2,0.059,48,1.411,-41,-1.206,0
6,2022,Michigan State,Davidson,-3,-5,6,-0.171,-4,6,-0.283,...,1.527,48,1.271,93,2.657,127,3.439,105,2.639,1
7,2022,Duke,Cal State Fullerton,-13,6,-3,0.117,4,-1,0.094,...,7.269,16,-0.192,156,4.409,12,-0.751,26,-0.676,0
8,2022,Baylor,Norfolk State,-15,1,1,-0.019,2,2,-0.079,...,4.071,91,2.412,6,-0.044,-14,-1.344,1,-1.108,0
9,2022,North Carolina,Marquette,-1,8,-7,0.231,4,-3,0.171,...,1.704,-48,-1.751,-13,-0.573,59,1.446,8,-0.31,0


In [62]:
# display the matchup feature frame
# NOTE(review): the saved output shows `region` and `pred` columns, but the
# visible code that would add them is commented out — this output looks stale.
df

Unnamed: 0,year,team0,team1,region,seed,wins,losses,winp,cwins,closses,...,astpg,stl,stlpg,blk,blkpg,tov,tovpg,pf,pfpg,pred
0,2022,Gonzaga,Georgia State,West Regional,-15,7,-6,0.219,4,-4,...,5.967,-50,-2.038,53,1.667,30,0.617,31,0.495,0
1,2022,Boise State,Memphis,West Regional,-1,4,-1,0.055,2,-2,...,-5.104,-67,-2.759,-60,-2.277,-104,-4.556,-95,-4.561,0
2,2022,Connecticut,New Mexico State,West Regional,-7,-5,5,-0.156,0,2,...,-0.156,26,0.813,74,2.313,-90,-2.813,16,0.5,1
3,2022,Arkansas,Vermont,West Regional,-9,-2,2,-0.06,-4,4,...,-0.091,108,3.273,69,2.091,126,3.818,149,4.515,0
4,2022,Alabama,Notre Dame,West Regional,-5,-6,6,-0.188,-6,4,...,-0.219,51,1.593,90,2.813,116,3.625,163,5.093,1
5,2022,Texas Tech,Montana State,West Regional,-11,-2,2,-0.059,-4,2,...,1.294,113,3.324,2,0.059,48,1.411,-41,-1.206,0
6,2022,Michigan State,Davidson,West Regional,-3,-5,6,-0.171,-4,6,...,1.527,48,1.271,93,2.657,127,3.439,105,2.639,1
7,2022,Duke,Cal State Fullerton,West Regional,-13,6,-3,0.117,4,-1,...,7.269,16,-0.192,156,4.409,12,-0.751,26,-0.676,0
8,2022,Baylor,Norfolk State,East Regional,-15,1,1,-0.019,2,2,...,4.071,91,2.412,6,-0.044,-14,-1.344,1,-1.108,0
9,2022,North Carolina,Marquette,East Regional,-1,8,-7,0.231,4,-3,...,1.704,-48,-1.751,-13,-0.573,59,1.446,8,-0.31,0


In [46]:
# display the matchup feature frame
# NOTE(review): the saved output includes a `pred` column that the visible code
# does not create (the df["pred"] assignment is commented out) — verify.
df

Unnamed: 0,year,team0,team1,seed,wins,losses,winp,cwins,closses,cwinp,...,astpg,stl,stlpg,blk,blkpg,tov,tovpg,pf,pfpg,pred
0,2022,Gonzaga,Georgia State,-15,7,-6,0.219,4,-4,0.286,...,5.967,-50,-2.038,53,1.667,30,0.617,31,0.495,0
1,2022,Boise State,Memphis,-1,4,-1,0.055,2,-2,0.111,...,-5.104,-67,-2.759,-60,-2.277,-104,-4.556,-95,-4.561,0
2,2022,Connecticut,New Mexico State,-7,-5,5,-0.156,0,2,-0.081,...,-0.156,26,0.813,74,2.313,-90,-2.813,16,0.5,1
3,2022,Arkansas,Vermont,-9,-2,2,-0.06,-4,4,-0.222,...,-0.091,108,3.273,69,2.091,126,3.818,149,4.515,0
4,2022,Alabama,Notre Dame,-5,-6,6,-0.188,-6,4,-0.25,...,-0.219,51,1.593,90,2.813,116,3.625,163,5.093,1
5,2022,Texas Tech,Montana State,-11,-2,2,-0.059,-4,2,-0.133,...,1.294,113,3.324,2,0.059,48,1.411,-41,-1.206,0
6,2022,Michigan State,Davidson,-3,-5,6,-0.171,-4,6,-0.283,...,1.527,48,1.271,93,2.657,127,3.439,105,2.639,1
7,2022,Duke,Cal State Fullerton,-13,6,-3,0.117,4,-1,0.094,...,7.269,16,-0.192,156,4.409,12,-0.751,26,-0.676,0
8,2022,Baylor,Norfolk State,-15,1,1,-0.019,2,2,-0.079,...,4.071,91,2.412,6,-0.044,-14,-1.344,1,-1.108,0
9,2022,North Carolina,Marquette,-1,8,-7,0.231,4,-3,0.171,...,1.704,-48,-1.751,-13,-0.573,59,1.446,8,-0.31,0


In [None]:
# save the dataframe to a csv file
# df.to_csv('current-feature-vectors.csv', index=False) 

In [43]:
# NOTE(review): `winners` is only built by commented-out code in the prediction
# cell, so this raises NameError on a fresh kernel unless that loop is restored.
winners

['Gonzaga',
 'Boise State',
 'New Mexico State',
 'Arkansas',
 'Notre Dame',
 'Texas Tech',
 'Davidson',
 'Duke',
 'Baylor',
 'North Carolina',
 "Saint Mary's",
 'UCLA',
 'Texas',
 'Purdue',
 'Murray State',
 'Kentucky',
 'Arizona',
 'Texas Christian',
 'Houston',
 'Chattanooga',
 'Colorado State',
 'Tennessee',
 'Loyola (IL)',
 'Villanova',
 'Kansas',
 'Creighton',
 'Iowa',
 'Providence',
 'LSU',
 'Wisconsin',
 'Miami (FL)',
 'Auburn']