In [1]:
from IPython.display import display, HTML
# widen the notebook's main container to 90% of the browser window
display(
    HTML("<style>.container { width:90% !important; }</style>")
)

In [2]:
# basic imports
import pandas as pd

In [3]:
# counting stats that appear in both the season-total and the tournament files
STAT_COLS = ['pts', 'fgm', 'fga', '3pm',
             '3pa', 'ftm', 'fta', 'orb',
             'drb', 'reb', 'ast', 'stl',
             'blk', 'tov', 'pf']

# counting stats that additionally get a per-game ("<stat>pg") column
PER_GAME_STATS = ['pts', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']

# seasons inside the 2003-2022 range that are left out of the dataset
SKIPPED_YEARS = (2020, 2021)


def _insert_after(df, anchor, name, values):
    """Insert column `name` directly to the right of existing column `anchor` (in place)."""
    df.insert(df.columns.get_loc(anchor) + 1, name, values)


def _build_season_stats(df_total, df_tour, df_records):
    """Build one season's per-team feature table.

    Parameters
    ----------
    df_total : pd.DataFrame
        Full-season stats; must contain "year", "team" and every column in STAT_COLS.
    df_tour : pd.DataFrame
        Tournament-only stats with the same key and stat columns.
    df_records : pd.DataFrame
        Records keyed by "year" and "team"; presumably the source of the
        "wins", "losses", "cwins" and "closses" columns used below - verify
        against the CSV schema.

    Returns
    -------
    pd.DataFrame
        A new frame (inputs are not modified) with tournament totals removed
        from the counting stats, rows containing any NaN dropped, per-game and
        percentage columns inserted next to their source columns, and all
        values rounded to 3 decimal places.
    """
    # merge the total and tour stats; overlapping columns get _total/_tour suffixes
    stats = pd.merge(df_total, df_tour, on=["year", "team"], how="left",
                     suffixes=["_total", "_tour"])

    # subtract the tournament numbers from the season totals so that the
    # resulting stats do not include the tournament games themselves
    for col in STAT_COLS:
        stats[col] = stats[f"{col}_total"] - stats[f"{col}_tour"]
    stats = stats.drop(
        columns=[f"{col}{suffix}" for col in STAT_COLS for suffix in ("_total", "_tour")]
    )

    # merge in the records and drop every row with a missing value
    # (this also removes teams without a matching tournament-stats row)
    stats = pd.merge(stats, df_records, on=["year", "team"], how="left")
    stats = stats.dropna()

    # total number of games played
    num_games = stats["wins"] + stats["losses"]

    # per-game averages, each placed right after its counting stat
    for stat in PER_GAME_STATS:
        _insert_after(stats, stat, f"{stat}pg", stats[stat] / num_games)

    # overall and conference win percentage
    # NOTE(review): a team with zero conference games gets NaN/inf in "cwinp" - confirm intended
    _insert_after(stats, "losses", "winp", stats["wins"] / num_games)
    num_cgames = stats["cwins"] + stats["closses"]
    _insert_after(stats, "closses", "cwinp", stats["cwins"] / num_cgames)

    # shooting percentages: field goal, 3 point, free throw
    _insert_after(stats, "fga", "fgp", stats["fgm"] / stats["fga"])
    _insert_after(stats, "3pa", "3pp", stats["3pm"] / stats["3pa"])
    _insert_after(stats, "fta", "ftp", stats["ftm"] / stats["fta"])

    # round everything in the dataframe to 3 decimal places
    return stats.round(3)


# one processed frame per season; concatenated once after the loop instead of
# re-copying the growing result on every iteration
yearly_stats = []

for year in range(2003, 2023):
    if year in SKIPPED_YEARS:
        continue
    # load the previously generated csv files
    df_tour_stats = pd.read_csv(f"stats/{year}-tournament-stats.csv")
    df_total_stats = pd.read_csv(f"stats/{year}-total-stats.csv")
    df_records = pd.read_csv(f"records/{year}-records.csv")

    df_stats = _build_season_stats(df_total_stats, df_tour_stats, df_records)
    yearly_stats.append(df_stats)

# stack all seasons into a single frame with a fresh 0..n-1 index
all_stats = pd.concat(yearly_stats, ignore_index=True)
    
    

In [4]:
# persist the combined table; with to_csv's defaults the row index is written
# as the first (unnamed) column of the file
all_stats.to_csv(path_or_buf="all_stats.csv")

In [None]:
# all_stats.columns

In [None]:
# df_tour_stats = df_tour_stats.set_index(["year", "team"])
# df_tour_stats
# # dfB = dfB.set_index(['Department', 'Speciality', 'TargetMonth'])

In [None]:
# df_stats.dropna()

In [None]:
# df_tour_stats[df_tour_stats["team"]=="Yale"]

In [None]:
# df_stats[df_stats["team"]=="Yale"]

In [None]:
# df_records[df_records["team"]=="Yale"]

In [None]:
# df_stats[df_stats["team"].isin(teams)]

In [None]:
# # # create a dataframe of each team's statistics
# # df_stats = pd.DataFrame()

# # for year in range(2003, 2023):
# #     if year == 2020 or year == 2021:
# #         continue
# #     # load the previously generated csv files
# #     df_tour = pd.read_csv(f"stats/{year}-tournament-stats.csv")
#     df_total = pd.read_csv(f"stats/{year}-total-stats.csv")
#     df_records = pd.read_csv(f"records/{year}-records.csv")
#     # drop the year column to prevent repetition
#     df_total = df_total.drop("year", axis=1)
#     df_tour = df_tour.drop("year", axis=1)
#     # merge the records and stats of tournament teams
#     df1 = pd.merge(df_records, df_total, on="team", how="left")
#     df2 = pd.merge(df_records, df_tour, on="team", how="left")
#     # subtract the tournament stats from the total stats so there is no data-leakage
#     for col in ['pts', 'fgm', 'fga', '3pm', '3pa', 'ftm', 'fta', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']:
#         df1[col] = df1[col] - df2[col]
    
#     # concatenate the year to the rest of the dataframe
#     df_stats = pd.concat([df_stats, df1], ignore_index=True)

In [None]:
#     # total number of games played
#     num_games = df_stats["wins"] + df_stats["losses"]
    
#     # columns to be averaged per game
#     total_cols = ['pts', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']
    
#     # append pg to the column names above
#     per_game_cols = [col + "pg" for col in total_cols]
#     for stat, stat_per_game in zip(total_cols, per_game_cols):
#         # add the per game columns to the dataframe
#         df_stats.insert(df_stats.columns.get_loc(stat)+1, stat_per_game, df_stats[stat]/num_games)
#     df_stats.insert(df_stats.columns.get_loc("losses")+1, "winp", df_stats["wins"]/num_games)
    
#     # insert the conference win percentage column
#     num_cgames = df_stats["cwins"] + df_stats["closses"]
#     df_stats.insert(df_stats.columns.get_loc("closses")+1, "cwinp", df_stats["cwins"]/num_cgames)
    
#     # insert the field goal percentage column
#     df_stats.insert(df_stats.columns.get_loc("fga")+1, "fgp", df_stats["fgm"]/df_stats["fga"])
    
#     # insert the 3 point percentage column
#     df_stats.insert(df_stats.columns.get_loc("3pa")+1, "3pp", df_stats["3pm"]/df_stats["3pa"])
    
#     # insert the free throw percentage column
#     df_stats.insert(df_stats.columns.get_loc("fta")+1, "ftp", df_stats["ftm"]/df_stats["fta"])
    
#     # round everything in the dataframe to 3 decimal places
#     df_stats = df_stats.round(3)

In [None]:
# # insert the win percentage column
# num_games = df_stats["wins"] + df_stats["losses"]
# df_stats.insert(df_stats.columns.get_loc("losses")+1, "winp", df_stats["wins"]/num_games)

In [None]:
# # insert the conference win percentage column
# num_cgames = df_stats["cwins"] + df_stats["closses"]
# df_stats.insert(df_stats.columns.get_loc("closses")+1, "cwinp", df_stats["cwins"]/num_cgames)

In [None]:
# # insert the field goal percentage column
# df_stats.insert(df_stats.columns.get_loc("fga")+1, "fgp", df_stats["fgm"]/df_stats["fga"])

In [None]:
# # insert the 3 point percentage column
# df_stats.insert(df_stats.columns.get_loc("3pa")+1, "3pp", df_stats["3pm"]/df_stats["3pa"])

In [None]:
# # insert the free throw percentage column
# df_stats.insert(df_stats.columns.get_loc("fta")+1, "ftp", df_stats["ftm"]/df_stats["fta"])

In [None]:
# # round everything in the dataframe to 3 decimal places
# df_stats = df_stats.round(3)

In [None]:
# df_stats

In [None]:
# # create a dataframe of each tournament game
# df_games = pd.DataFrame()
# for year in range(2003, 2023):
#     if year == 2020 or year == 2021:
#         continue
#     # load the previously generated csv files
#     df_bracket = pd.read_csv(f"brackets/{year}-bracket.csv")
#     df_bracket = df_bracket[df_bracket["round"] > 0]
#     df_games = pd.concat([df_games, df_bracket], ignore_index=True)
# df_games["year"] = df_games["year"].astype(int)

In [None]:
# df_games

In [None]:
# df = pd.DataFrame()

# for i, game in df_games.iterrows():
#     winner = df_stats[(df_stats["team"] == game["winner"]) & (df_stats["year"] == game["year"])]
#     loser = df_stats[(df_stats["year"] == game["year"]) & (df_stats["team"] == game["loser"])]
#     winner_loser = pd.Series(dtype=object)
#     winner_loser["year"] = game["year"]
#     winner_loser["team0"] = game["winner"]
#     winner_loser["team1"] = game["loser"]
#     winner_loser = winner_loser.append(winner.squeeze()[2:] - loser.squeeze()[2:])
#     winner_loser["result"] = 0
#     df = pd.concat([df, winner_loser.to_frame().T], ignore_index=True)
#     loser_winner = pd.Series(dtype=object)
#     loser_winner["year"] = game["year"]
#     loser_winner["team0"] = game["loser"]
#     loser_winner["team1"] = game["winner"]
#     loser_winner = loser_winner.append(loser.squeeze()[2:] - winner.squeeze()[2:])
#     loser_winner["result"] = 1
#     df = pd.concat([df, loser_winner.to_frame().T], ignore_index=True)
    
# df

In [None]:
# # save the dataframe to a csv file
# df.to_csv('feature-vectors.csv', index=False) 