In [1]:
# basic imports
import pandas as pd

In [2]:
# build one dataframe of per-team season stats to eventually export to csv

# seasons 2020 and 2021 are skipped: no csv files are read for them
SKIPPED_YEARS = {2020, 2021}
# counting stats from which the tournament totals are subtracted
STAT_COLS = ['pts', 'fgm', 'fga', '3pm', '3pa', 'ftm', 'fta', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']

# collect one frame per season and concatenate once after the loop; calling
# pd.concat inside the loop re-copies every earlier season on each pass and
# starts from an empty DataFrame, which newer pandas versions warn about
season_frames = []

for year in range(2003, 2023):
    if year in SKIPPED_YEARS:
        continue
    # load the previously generated csv files
    df_tour = pd.read_csv(f"stats/{year}-tournament-stats.csv")
    df_total = pd.read_csv(f"stats/{year}-total-stats.csv")
    df_records = pd.read_csv(f"records/{year}-records.csv")
    # drop the year column from both stats frames to prevent repetition
    # after the merges (the merged frames keep the columns of df_records)
    df_total = df_total.drop(columns="year")
    df_tour = df_tour.drop(columns="year")
    # attach the total and the tournament stats to the records, one row per record
    season_stats = pd.merge(df_records, df_total, on="team", how="left")
    tournament_stats = pd.merge(df_records, df_tour, on="team", how="left")
    # subtract the tournament stats from the total stats so there is no data-leakage
    # NOTE(review): the subtraction pairs rows by position, which assumes each
    # team appears at most once in each stats file so both merges return the
    # rows of df_records in the same order - TODO confirm
    for col in STAT_COLS:
        season_stats[col] = season_stats[col] - tournament_stats[col]
    season_frames.append(season_stats)

# stack all seasons into a single dataframe with a fresh 0..n-1 index
df = pd.concat(season_frames, ignore_index=True)

In [3]:
# function to generate the stat per game columns
def stats_per_game(df):
    """Add a per-game column next to each counting stat, modifying df in place.

    For every stat in pts, orb, drb, reb, ast, stl, blk, tov and pf a column
    named "<stat>pg" is placed directly after the stat column, holding the
    stat divided by the number of games (wins + losses).

    The function is safe to call more than once on the same dataframe: an
    existing "<stat>pg" column is recomputed in place instead of making
    DataFrame.insert raise "already exists" (e.g. when the cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "wins", "losses" and every stat column listed above.

    Returns
    -------
    None
        df is mutated; nothing is returned.

    NOTE(review): the divisor is wins + losses; if those records include
    tournament games while the stat totals do not, the averages are
    understated - confirm against how the records files are generated.
    """
    num_games = df["wins"] + df["losses"]
    # columns to be averaged per game
    total_cols = ['pts', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf']
    for stat in total_cols:
        stat_per_game = stat + "pg"
        per_game_values = df[stat] / num_games
        if stat_per_game in df.columns:
            # column already present (function was run before): refresh it
            df[stat_per_game] = per_game_values
        else:
            # first run: place the per game column right after its stat
            df.insert(df.columns.get_loc(stat) + 1, stat_per_game, per_game_values)
# add the per game columns to df in place (the function returns nothing)
stats_per_game(df)

In [4]:
# win percentage = wins / (wins + losses), placed directly after "losses"
num_games = df["wins"] + df["losses"]
winp = df["wins"] / num_games
df.insert(loc=df.columns.get_loc("losses") + 1, column="winp", value=winp)

In [5]:
# conference win percentage = cwins / (cwins + closses), placed directly after "closses"
num_cgames = df["cwins"] + df["closses"]
cwinp = df["cwins"] / num_cgames
df.insert(loc=df.columns.get_loc("closses") + 1, column="cwinp", value=cwinp)

In [6]:
# field goal percentage = makes / attempts, placed directly after "fga"
fgp = df["fgm"] / df["fga"]
df.insert(loc=df.columns.get_loc("fga") + 1, column="fgp", value=fgp)

In [7]:
# 3 point percentage = makes / attempts, placed directly after "3pa"
three_point_pct = df["3pm"] / df["3pa"]
df.insert(loc=df.columns.get_loc("3pa") + 1, column="3pp", value=three_point_pct)

In [8]:
# free throw percentage = makes / attempts, placed directly after "fta"
ftp = df["ftm"] / df["fta"]
df.insert(loc=df.columns.get_loc("fta") + 1, column="ftp", value=ftp)

In [9]:
# keep 3 decimal places for every numeric column of the dataframe
df = df.round(decimals=3)

In [10]:
# write the finished dataframe to stats-df.csv, leaving the row index out
df.to_csv("stats-df.csv", index=False)