In [3]:
# basic imports
import pandas as pd

# Build one combined regular-season stats table and one combined bracket table
# from the per-year csv files, then write both back to disk.

# Stat columns that appear in both the season-total and the tournament files.
# For each of them the tournament value is subtracted from the season total.
cols = ['pts', 'fgm', 'fga', '3pm',
        '3pa', 'ftm', 'fta', 'orb',
        'drb', 'reb', 'ast', 'stl',
        'blk', 'tov', 'pf']

# columns to be averaged per game, and the matching "<stat>pg" column names
total_cols = ['pts', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']
per_game_cols = [col + "pg" for col in total_cols]

# (made, attempted, percentage) column names for the shooting percentages
shooting_cols = [("fgm", "fga", "fgp"), ("3pm", "3pa", "3pp"), ("ftm", "fta", "ftp")]

# Collect one frame per year and concatenate once after the loop; calling
# pd.concat inside the loop re-copies every previously accumulated row.
yearly_stats = []
yearly_brackets = []

for year in range(2003, 2025):
    # NOTE(review): 2020 and 2021 are skipped -- presumably there are no
    # usable input files for those years; confirm.
    if year in [2020, 2021]:
        continue

    # load the previously generated csv files
    df_tour_stats = pd.read_csv(f"stats/{year}_tournament_stats.csv")
    df_stats = pd.read_csv(f"stats/{year}_total_stats.csv")
    df_records = pd.read_csv(f"records/{year}_records.csv")

    # Merge the total and tour stats, then subtract the stat columns.
    # With how="left", a team without a tournament row gets NaN in every
    # "_tour" column, so its differences are NaN and dropna() below removes it.
    df_stats = pd.merge(df_stats, df_tour_stats, on=["year", "team"], how="left", suffixes=["_total", "_tour"])
    for col in cols:
        df_stats[col] = df_stats[f"{col}_total"] - df_stats[f"{col}_tour"]
    # drop all suffixed source columns in one go
    df_stats = df_stats.drop(
        columns=[f"{col}{suffix}" for col in cols for suffix in ("_total", "_tour")]
    )

    # merge in the records and drop every row that still contains a NaN
    df_stats = pd.merge(df_stats, df_records, on=["year", "team"], how="left")
    df_stats = df_stats.dropna()

    # total number of games played
    num_games = df_stats["wins"] + df_stats["losses"]

    # add each per game column directly after the column it is derived from
    for stat, stat_per_game in zip(total_cols, per_game_cols):
        df_stats.insert(df_stats.columns.get_loc(stat)+1, stat_per_game, df_stats[stat]/num_games)

    # overall win percentage, placed right after "losses"
    df_stats.insert(df_stats.columns.get_loc("losses")+1, "winp", df_stats["wins"]/num_games)

    # conference win percentage, placed right after "closses"
    num_cgames = df_stats["cwins"] + df_stats["closses"]
    df_stats.insert(df_stats.columns.get_loc("closses")+1, "cwinp", df_stats["cwins"]/num_cgames)

    # field goal, 3 point and free throw percentages, each placed right after
    # its "attempted" column
    for made, attempted, pct in shooting_cols:
        df_stats.insert(df_stats.columns.get_loc(attempted)+1, pct, df_stats[made]/df_stats[attempted])

    # round everything in the dataframe to 3 decimal places
    df_stats = df_stats.round(3)

    # load in the current year bracket
    df_bracket = pd.read_csv(f"brackets/{year}_bracket.csv")

    # remember the current year's frames for the final concatenation
    yearly_stats.append(df_stats)
    yearly_brackets.append(df_bracket)

# stack all years into the two combined tables
all_stats = pd.concat(yearly_stats, ignore_index=True)
all_brackets = pd.concat(yearly_brackets, ignore_index=True)

# write each combined table to its own file
all_stats.to_csv("stats/all_stats.csv", index=False)
all_brackets.to_csv("brackets/all_brackets.csv", index=False)


In [37]:
# Build the model input table: one row per bracket game, carrying the stats of
# both teams plus, for every stat, the difference team0 minus team1.

# stats whose team0/team1 difference is added as "<stat>_diff"
cols = [
    'pts', 'ptspg', 'fgm', 'fga', 'fgp', '3pm',
    '3pa', '3pp', 'ftm', 'fta', 'ftp',
    'orb', 'orbpg', 'drb', 'drbpg', 'reb',
    'rebpg', 'ast', 'astpg', 'stl', 'stlpg',
    'blk', 'blkpg', 'tov', 'tovpg', 'pf',
    'pfpg', 'seed', 'wins', 'losses', 'winp',
    'cwins', 'closses', 'cwinp'
]

# attach team0's stats; these columns are still unsuffixed after this step
all_data = all_brackets.merge(
    all_stats,
    how='left',
    left_on=['year', 'team0'],
    right_on=['year', 'team'],
)

# attach team1's stats; every column name now present on both sides receives
# the "_team0" (already merged) or "_team1" (newly merged) suffix
all_data = all_data.merge(
    all_stats,
    how='left',
    left_on=['year', 'team1'],
    right_on=['year', 'team'],
    suffixes=('_team0', '_team1'),
)

# the merge keys from the stats side duplicate team0 / team1
all_data = all_data.drop(columns=['team_team0', 'team_team1'])

# append all difference columns in one step, in the order given by cols
all_data = all_data.assign(**{
    stat + '_diff': all_data[stat + '_team0'].sub(all_data[stat + '_team1'])
    for stat in cols
})

all_data.to_csv("all_data.csv", index=False)