In [15]:
# basic imports
import pandas as pd

In [16]:
# create a dataframe of each team's statistics
# counting stats present in both the season-total and the tournament files
counting_cols = ['pts', 'fgm', 'fga', '3pm', '3pa', 'ftm', 'fta', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']
# one frame per season; combined with a single concat after the loop so the
# accumulated data is not re-copied on every iteration
yearly_stats = []

for year in range(2003, 2022):
    # the 2020 and 2021 seasons are excluded
    if year in (2020, 2021):
        continue
    # load the previously generated csv files
    df_tour = pd.read_csv(f"stats/{year}-tournament-stats.csv")
    df_total = pd.read_csv(f"stats/{year}-total-stats.csv")
    df_records = pd.read_csv(f"records/{year}-records.csv")
    # drop the year column to prevent repetition after the merge
    df_total = df_total.drop(columns="year")
    df_tour = df_tour.drop(columns="year")
    # merge the records and stats of tournament teams
    # NOTE(review): both merges are left merges from df_records on "team", so df1 and
    # df2 line up row for row only if "team" is unique in each stats file - confirm
    df1 = pd.merge(df_records, df_total, on="team", how="left")
    df2 = pd.merge(df_records, df_tour, on="team", how="left")
    # subtract the tournament stats from the total stats so there is no data-leakage
    df1[counting_cols] = df1[counting_cols] - df2[counting_cols]

    # keep this season's frame for the final concatenation
    yearly_stats.append(df1)

# concatenate every season into one dataframe
df_stats = pd.concat(yearly_stats, ignore_index=True)

In [17]:
# function to generate the stat per game columns
def stats_per_game(df):
    """Insert a per-game column next to each counting-stat column of ``df``.

    For every stat in pts, orb, drb, reb, ast, stl, blk, tov and pf, a new
    column named ``<stat>pg`` holding ``stat / (wins + losses)`` is inserted
    directly after the stat's own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "wins", "losses" and the nine stat columns listed above.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        From ``DataFrame.insert`` if a ``<stat>pg`` column already exists,
        e.g. when the function is applied twice to the same frame.
    """
    # work on the frame that was passed in, not on a notebook global
    num_games = df["wins"] + df["losses"]
    # columns to be averaged per game
    total_cols = ['pts', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']
    for stat in total_cols:
        # the location is looked up on every pass because each insert shifts
        # the columns to its right by one
        df.insert(df.columns.get_loc(stat) + 1, stat + "pg", df[stat] / num_games)
# apply to df_stats; the per-game columns are inserted into the frame in place
stats_per_game(df_stats)

In [18]:
# add "winp" (wins divided by games played) directly after the "losses" column
num_games = df_stats["wins"] + df_stats["losses"]
df_stats.insert(
    loc=df_stats.columns.get_loc("losses") + 1,
    column="winp",
    value=df_stats["wins"] / num_games,
)

In [19]:
# add "cwinp" (conference wins divided by conference games) directly after "closses"
num_cgames = df_stats["cwins"] + df_stats["closses"]
df_stats.insert(
    loc=df_stats.columns.get_loc("closses") + 1,
    column="cwinp",
    value=df_stats["cwins"] / num_cgames,
)

In [20]:
# add "fgp" (field goals made divided by attempted) directly after the "fga" column
df_stats.insert(
    loc=df_stats.columns.get_loc("fga") + 1,
    column="fgp",
    value=df_stats["fgm"] / df_stats["fga"],
)

In [21]:
# add "3pp" (3-pointers made divided by attempted) directly after the "3pa" column
df_stats.insert(
    loc=df_stats.columns.get_loc("3pa") + 1,
    column="3pp",
    value=df_stats["3pm"] / df_stats["3pa"],
)

In [22]:
# add "ftp" (free throws made divided by attempted) directly after the "fta" column
df_stats.insert(
    loc=df_stats.columns.get_loc("fta") + 1,
    column="ftp",
    value=df_stats["ftm"] / df_stats["fta"],
)

In [23]:
# limit every numeric column of the dataframe to 3 decimal places
df_stats = df_stats.round(decimals=3)

In [24]:
# display the per-team statistics table built in the cells above
df_stats

Unnamed: 0,year,team,seed,wins,losses,winp,cwins,closses,cwinp,pts,...,ast,astpg,stl,stlpg,blk,blkpg,tov,tovpg,pf,pfpg
0,2003,Oklahoma,1,23,7,0.767,12,4,0.750,2135.0,...,425.0,14.167,208.0,6.933,113.0,3.767,353.0,11.767,558.0,18.600
1,2003,Kentucky,1,27,5,0.844,16,0,1.000,2481.0,...,519.0,16.219,248.0,7.750,166.0,5.188,447.0,13.969,557.0,17.406
2,2003,Texas,1,21,7,0.750,13,3,0.812,2208.0,...,406.0,14.500,179.0,6.393,108.0,3.857,375.0,13.393,570.0,20.357
3,2003,Arizona,1,25,3,0.893,17,1,0.944,2386.0,...,493.0,17.607,240.0,8.571,118.0,4.214,412.0,14.714,497.0,17.750
4,2003,Wake Forest,2,22,7,0.759,12,4,0.750,2274.0,...,423.0,14.586,186.0,6.414,130.0,4.483,431.0,14.862,534.0,18.414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,2019,Prairie View A&M,16,22,12,0.647,17,1,0.944,2551.0,...,428.0,12.588,300.0,8.824,54.0,1.588,432.0,12.706,739.0,21.735
1128,2019,Fairleigh Dickinson,16,20,13,0.606,12,6,0.667,2489.0,...,466.0,14.121,253.0,7.667,120.0,3.636,443.0,13.424,554.0,16.788
1129,2019,North Dakota State,16,18,15,0.545,9,7,0.562,2416.0,...,383.0,11.606,159.0,4.818,82.0,2.485,349.0,10.576,492.0,14.909
1130,2019,North Carolina Central,16,18,15,0.545,10,6,0.625,2355.0,...,509.0,15.424,207.0,6.273,91.0,2.758,506.0,15.333,590.0,17.879


In [25]:
# create a dataframe of each tournament game
# one frame per season; combined with a single concat after the loop so the
# accumulated games are not re-copied on every iteration
yearly_games = []
for year in range(2003, 2022):
    # the 2020 and 2021 seasons are excluded
    if year in (2020, 2021):
        continue
    # load the previously generated csv files
    df_bracket = pd.read_csv(f"brackets/{year}-bracket.csv")
    # keep only the rows whose round number is positive
    df_bracket = df_bracket[df_bracket["round"] > 0]
    yearly_games.append(df_bracket)
df_games = pd.concat(yearly_games, ignore_index=True)
# make sure the year is an integer so it compares cleanly with df_stats["year"]
df_games["year"] = df_games["year"].astype(int)

In [26]:
# display the table of tournament games built in the previous cell
df_games

Unnamed: 0,year,team0,team1,winner,loser,region,round
0,2003,Kentucky,IUPUI,Kentucky,IUPUI,Midwest Regional,1.0
1,2003,Oregon,Utah,Utah,Oregon,Midwest Regional,1.0
2,2003,Wisconsin,Weber State,Wisconsin,Weber State,Midwest Regional,1.0
3,2003,Dayton,Tulsa,Tulsa,Dayton,Midwest Regional,1.0
4,2003,Missouri,Southern Illinois,Missouri,Southern Illinois,Midwest Regional,1.0
...,...,...,...,...,...,...,...
1066,2019,Tennessee,Purdue,Purdue,Tennessee,South Regional,3.0
1067,2019,Virginia,Purdue,Virginia,Purdue,South Regional,4.0
1068,2019,Michigan State,Texas Tech,Texas Tech,Michigan State,none,5.0
1069,2019,Virginia,Auburn,Virginia,Auburn,none,5.0


In [27]:
def build_feature_vectors(games, stats):
    """Build two mirrored stat-difference rows for every tournament game.

    For each row of ``games`` two output rows are produced, in this order:

    * team0 = winner, team1 = loser, features = winner stats - loser stats, result = 0
    * team0 = loser, team1 = winner, features = loser stats - winner stats, result = 1

    so ``result`` is the position (0 or 1) of the team that won the game.

    Parameters
    ----------
    games : pandas.DataFrame
        Needs the columns "year", "winner" and "loser".
    stats : pandas.DataFrame
        Needs the columns "year" and "team"; every other column is treated as
        a numeric feature and differenced.

    Returns
    -------
    pandas.DataFrame
        Columns "year", "team0", "team1", the feature columns in the order
        they appear in ``stats``, then "result". Two consecutive rows per game,
        in the order of ``games``.

    Raises
    ------
    ValueError
        If a (year, team) pair occurs more than once in ``stats``.
    KeyError
        If a winner or loser has no row in ``stats`` for that year.
    """
    key_cols = ["year", "team"]
    feature_cols = [c for c in stats.columns if c not in key_cols]
    stats_by_team = stats.set_index(key_cols)[feature_cols]
    if not stats_by_team.index.is_unique:
        raise ValueError("stats contains duplicate (year, team) rows")

    def lookup(team_col):
        # fetch the stats row of games[team_col] for every game, in game order
        keys = pd.MultiIndex.from_arrays([games["year"], games[team_col]])
        missing = keys[~keys.isin(stats_by_team.index)]
        if len(missing) > 0:
            raise KeyError(f"no stats found for {sorted(set(missing))}")
        return stats_by_team.reindex(keys).reset_index(drop=True)

    winner_stats = lookup("winner")
    loser_stats = lookup("loser")
    ids = games[["year", "winner", "loser"]].reset_index(drop=True)
    id_cols = ["year", "team0", "team1"]

    # winner listed first -> result 0
    winner_first = pd.concat(
        [ids.rename(columns={"winner": "team0", "loser": "team1"})[id_cols],
         winner_stats - loser_stats],
        axis=1,
    )
    winner_first["result"] = 0

    # loser listed first -> result 1; the difference is recomputed rather than
    # negated so equal stats give 0.0 and not -0.0
    loser_first = pd.concat(
        [ids.rename(columns={"loser": "team0", "winner": "team1"})[id_cols],
         loser_stats - winner_stats],
        axis=1,
    )
    loser_first["result"] = 1

    # interleave: winner-first rows on even positions, loser-first rows on odd ones
    n_games = len(ids)
    winner_first.index = range(0, 2 * n_games, 2)
    loser_first.index = range(1, 2 * n_games, 2)
    return pd.concat([winner_first, loser_first]).sort_index()


df = build_feature_vectors(df_games, df_stats)

df

Unnamed: 0,year,team0,team1,seed,wins,losses,winp,cwins,closses,cwinp,...,astpg,stl,stlpg,blk,blkpg,tov,tovpg,pf,pfpg,result
0,2003,Kentucky,IUPUI,-15,7,-8,0.238,6,-4,0.286,...,2.886,-34.0,-0.795,106.0,3.37,-71.0,-1.728,-84.0,-2.018,0
1,2003,IUPUI,Kentucky,15,-7,8,-0.238,-6,4,-0.286,...,-2.886,34.0,0.795,-106.0,-3.37,71.0,1.728,84.0,2.018,1
2,2003,Utah,Oregon,1,3,-4,0.118,1,-5,0.23,...,-4.211,-108.0,-3.225,29.0,1.025,-100.0,-2.738,-53.0,-1.034,0
3,2003,Oregon,Utah,-1,-3,4,-0.118,-1,5,-0.23,...,4.211,108.0,3.225,-29.0,-1.025,100.0,2.738,53.0,1.034,1
4,2003,Wisconsin,Weber State,-7,-4,2,-0.08,-2,4,-0.25,...,-0.918,-4.0,0.323,58.0,2.054,-88.0,-2.165,-151.0,-3.914,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2137,2019,Michigan State,Texas Tech,-1,-1,4,-0.111,2,0,0.022,...,4.457,-58.0,-2.378,29.0,0.406,41.0,0.079,6.0,-1.443,1
2138,2019,Virginia,Auburn,-4,-1,-3,0.074,5,-5,0.278,...,0.654,-150.0,-3.623,-46.0,-0.865,-138.0,-2.877,-167.0,-3.042,0
2139,2019,Auburn,Virginia,4,1,3,-0.074,-5,5,-0.278,...,-0.654,150.0,3.623,46.0,0.865,138.0,2.877,167.0,3.042,1
2140,2019,Virginia,Texas Tech,-2,-1,1,-0.032,2,-2,0.111,...,0.484,-56.0,-1.807,-35.0,-1.13,-107.0,-3.451,-100.0,-3.226,0


In [28]:
# write the feature vectors to disk as csv, leaving out the row index
df.to_csv(path_or_buf="feature-vectors.csv", index=False)