In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option("display.max_columns", None)

In [3]:
import warnings
# Installs a blanket 'ignore' filter: every warning raised after this cell is
# suppressed for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/numpy -- confirm
# that is intended before relying on a silent run.
warnings.filterwarnings('ignore')

## Process Data

### Elements

In [4]:
# Play-type labels; each one is matched as a substring of a sheet filename and
# used as the prefix of the resulting `<playtype>_Poss` / `<playtype>_PPP` columns.
# 'Miscellaneous' is deliberately left out.
playtypes = [
    'Isolation',
    'P&R Ball Handler',
    'Post-Up',
    'P&R Roll Man',
    'Spot Up',
    'Off Screen',
    'Hand Off',
    'Cut',
    'Offensive Rebounds',
    'Transition',
]

In [5]:
# "Including Passes" situation labels; each one is matched as a substring of a
# sheet filename and becomes the prefix of a `<situation>_Poss` column.
situations = [
    'Pick and Rolls Including Passes',
    'Isolations Including Passes',
    'Post-Ups Including Passes',
]

In [6]:
# Shot-type labels; each one is matched as a substring of a sheet filename and
# becomes the prefix of the `<shottype>_FGM` / `_FGA` / `_FG%` columns.
shottypes = [
    'Catch and Shoot',
    'At Rim',
    'Long (3 point jump shots)',
    'All Jump Shots off the Dribble',
]

### Functions

In [7]:
def process_all_fg_sheet(df, season):
    """Build the base per-player table from the overall field-goal sheet.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet; must contain the columns 'Player', 'Team', 'GP',
        'FG Made' and 'FG Att'. It is not modified.
    season : str
        Season label stored in the new 'Season' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Player', 'Team', 'Season', 'GP', 'FGM', 'FGA' with
        'FGM'/'FGA' cast to float, sorted by 'Team' then 'FGA', both
        descending.
    """
    # assign() works on a copy, so the caller's frame does not gain a
    # 'Season' column as a side effect.
    out = df.assign(Season=season)
    out = out[['Player', 'Team', 'Season', 'GP', 'FG Made', 'FG Att']]
    out = out.rename(columns={'FG Made': 'FGM', 'FG Att': 'FGA'})
    # The `np.float` alias was removed in NumPy 1.24; the builtin float is
    # the equivalent target type.
    out = out.astype({'FGM': float, 'FGA': float})
    return out.sort_values(['Team', 'FGA'], ascending=False)

In [8]:
def process_shottype_sheet(shottype, df, season):
    """Reduce one shot-type sheet to prefixed FGM / FGA / FG% columns.

    Parameters
    ----------
    shottype : str
        Label used as the prefix of the output field-goal columns.
    df : pandas.DataFrame
        Raw sheet; must contain 'Player', 'Team', 'GP', 'FG Made' and
        'FG Att'. It is not modified.
    season : str
        Season label stored in the new 'Season' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Player', 'Team', 'Season', 'GP' plus
        '<shottype>_FGM', '<shottype>_FGA' and '<shottype>_FG%'.
    """
    # assign() works on a copy, so the caller's frame is left untouched.
    out = df.assign(Season=season)
    out = out[['Player', 'Team', 'Season', 'GP', 'FG Made', 'FG Att']]
    out = out.rename(columns={'FG Made': 'FGM', 'FG Att': 'FGA'})
    # The `np.float` alias was removed in NumPy 1.24; use the builtin float.
    out = out.astype({'FGM': float, 'FGA': float})
    out['FG%'] = out['FGM'] / out['FGA'] * 1.0
    # Prefix every field-goal column (any name containing 'FG') with the
    # shot type so several sheets can be merged side by side.
    return out.rename(columns={col: f'{shottype}_{col}'
                               for col in out.columns
                               if 'FG' in col})

In [9]:
def process_playtype_sheet(playtype, df, season):
    """Reduce one play-type sheet to prefixed Poss / PPP columns.

    Parameters
    ----------
    playtype : str
        Label used as the prefix of the output columns.
    df : pandas.DataFrame
        Raw sheet; must contain 'Player', 'Team', 'GP', 'Poss' and 'PPP'.
        It is not modified.
    season : str
        Season label stored in the new 'Season' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Player', 'Team', 'Season', 'GP', '<playtype>_Poss' and
        '<playtype>_PPP', with the last two cast to float.
    """
    # assign() works on a copy, which avoids both mutating the caller's
    # frame and assigning onto a column-subset slice.
    out = df.assign(Season=season)
    out = out[['Player', 'Team', 'Season', 'GP', 'Poss', 'PPP']]
    # The `np.float` alias was removed in NumPy 1.24; use the builtin float.
    out = out.astype({'Poss': float, 'PPP': float})
    return out.rename(columns={col: f'{playtype}_{col}'
                               for col in ['Poss', 'PPP']})

In [10]:
def process_situation_sheet(situation, df, season):
    """Reduce one situation sheet to a single prefixed Poss column.

    Parameters
    ----------
    situation : str
        Label used as the prefix of the output column.
    df : pandas.DataFrame
        Raw sheet; must contain 'Player', 'Team', 'GP' and 'Poss'.
        It is not modified.
    season : str
        Season label stored in the new 'Season' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Player', 'Team', 'Season', 'GP' and '<situation>_Poss'
        (float).
    """
    # assign() works on a copy, which avoids both mutating the caller's
    # frame and assigning onto a column-subset slice.
    out = df.assign(Season=season)
    out = out[['Player', 'Team', 'Season', 'GP', 'Poss']]
    # The `np.float` alias was removed in NumPy 1.24; use the builtin float.
    out = out.astype({'Poss': float})
    return out.rename(columns={'Poss': f'{situation}_Poss'})

In [11]:
def _read_matching_sheet(season_dir, sheet_fns, keyword, exclude=None):
    """Load the first sheet whose filename contains ``keyword``.

    Filenames that also contain ``exclude`` (when given) are skipped.
    The CSV is read with ``header=1``, i.e. its second row supplies the
    column names.

    Raises
    ------
    FileNotFoundError
        If no filename in ``sheet_fns`` qualifies.
    """
    for sheet_fn in sheet_fns:
        if keyword not in sheet_fn:
            continue
        if exclude is not None and exclude in sheet_fn:
            continue
        return pd.read_csv(f'{season_dir}/{sheet_fn}', header=1)
    # Fail with the actual cause instead of a later, unrelated NameError or
    # KeyError raised while processing a missing or stale frame.
    raise FileNotFoundError(
        f'No sheet containing {keyword!r} found in {season_dir!r}')


def process_season(league, season,
                   shottypes=shottypes, 
                   playtypes=playtypes,
                   situations=situations):
    """Assemble one wide per-player table for a league season.

    Sheets are read from the directory ``f'{league}_{season}'`` and located
    by substring match on their filenames. Starting from the
    'All field goal attempts' sheet, every shot-type, play-type and
    situation sheet is left-merged on
    ``['Player', 'Team', 'Season', 'GP']``; missing values are filled
    with 0 after each merge.

    Parameters
    ----------
    league : str
        League name; first part of the season directory name.
    season : str
        Season label; second part of the directory name and the value of
        the 'Season' column.
    shottypes, playtypes, situations : list of str
        Labels to look up; each is both the filename substring and the
        prefix of the columns it contributes.

    Returns
    -------
    pandas.DataFrame
        The merged table, including the derived '<shottype>_FGA%',
        'Overall_Poss', '<playtype>_Poss%' and
        'P&R Ball Handler_End_Rt' columns.

    Raises
    ------
    FileNotFoundError
        If a required sheet cannot be found in the season directory.
    """
    season_dir = f'{league}_{season}'
    # List the directory once; every lookup below scans the same listing.
    sheet_fns = os.listdir(season_dir)
    merge_keys = ['Player', 'Team', 'Season', 'GP']

    # Initialize
    df = process_all_fg_sheet(
        _read_matching_sheet(season_dir, sheet_fns,
                             'All field goal attempts'),
        season)
    print('Shape after Initialization:', df.shape)

    # Process shottype
    for shottype in shottypes:
        st_df = process_shottype_sheet(
            shottype,
            _read_matching_sheet(season_dir, sheet_fns, shottype),
            season)
        df = df.merge(st_df, on=merge_keys, how='left').fillna(0)
        # Share of the player's total attempts taken as this shot type.
        df[f'{shottype}_FGA%'] = df[f'{shottype}_FGA'] / df['FGA'] * 1.0

    # Process playtype
    for playtype in playtypes:
        # 'Including Passes' sheets belong to the situations pass below.
        pt_df = process_playtype_sheet(
            playtype,
            _read_matching_sheet(season_dir, sheet_fns, playtype,
                                 exclude='Including Passes'),
            season)
        df = df.merge(pt_df, on=merge_keys, how='left').fillna(0)
    # Must run before the situation merges, which add further '*_Poss'
    # columns that would otherwise be summed in as well.
    df['Overall_Poss'] = df[[col for col in df.columns
                             if 'Poss' in col]].sum(axis=1)
    for playtype in playtypes:
        df[f'{playtype}_Poss%'] = df[f'{playtype}_Poss'] / df['Overall_Poss'] * 1.0

    # Process situations
    for situation in situations:
        sit_df = process_situation_sheet(
            situation,
            _read_matching_sheet(season_dir, sheet_fns, situation),
            season)
        df = df.merge(sit_df, on=merge_keys, how='left').fillna(0)

    # Add
    # A zero denominator is swapped for 1 so the ratio stays finite.
    pnr_incl_passes = df['Pick and Rolls Including Passes_Poss'].replace(0, 1)
    df['P&R Ball Handler_End_Rt'] = \
        df['P&R Ball Handler_Poss'] / pnr_incl_passes * 1.0

    print('Shape after Processing:', df.shape)

    return df

### Process NBA

In [12]:
# Process NBA seasons 2017-2018 through 2022-2023 and stack them.
league = 'NBA'
dfs = []
for start_year in range(2017, 2023):
    end_year = start_year + 1
    season = f'{start_year}-{end_year}'
    print(season)
    dfs.append(process_season(league, season))
    print()

# ignore_index=True yields a fresh 0..n-1 index across all seasons.
df = pd.concat(dfs, ignore_index=True)
print(df.shape)

2017-2018
Shape after Initialization: (602, 6)
Shape after Processing: (602, 57)

2018-2019
Shape after Initialization: (618, 6)
Shape after Processing: (618, 57)

2019-2020
Shape after Initialization: (589, 6)
Shape after Processing: (589, 57)

2020-2021
Shape after Initialization: (624, 6)
Shape after Processing: (624, 57)

2021-2022
Shape after Initialization: (700, 6)
Shape after Processing: (700, 57)

2022-2023
Shape after Initialization: (607, 6)
Shape after Processing: (607, 57)

(3740, 57)


In [13]:
# Write the stacked table to '<league>_offense.csv' without the index column.
df.to_csv(
    f'{league}_offense.csv',
    index=False,
)

### Process Euroleague

In [14]:
# Process Euroleague seasons 2017-2018 through 2021-2022 and stack them.
league = 'Euroleague'
dfs = []
for start_year in range(2017, 2022):
    end_year = start_year + 1
    season = f'{start_year}-{end_year}'
    print(season)
    dfs.append(process_season(league, season))
    print()

# ignore_index=True yields a fresh 0..n-1 index across all seasons.
df = pd.concat(dfs, ignore_index=True)
print(df.shape)

2017-2018
Shape after Initialization: (248, 6)
Shape after Processing: (248, 57)

2018-2019
Shape after Initialization: (247, 6)
Shape after Processing: (247, 57)

2019-2020
Shape after Initialization: (290, 6)
Shape after Processing: (290, 57)

2020-2021
Shape after Initialization: (290, 6)
Shape after Processing: (290, 57)

2021-2022
Shape after Initialization: (283, 6)
Shape after Processing: (283, 57)

(1358, 57)


In [15]:
# Write the stacked table to '<league>_offense.csv' without the index column.
df.to_csv(
    f'{league}_offense.csv',
    index=False,
)

### Process EuroCup

In [16]:
# Process EuroCup seasons 2017-2018 through 2021-2022 and stack them.
league = 'EuroCup'
dfs = []
for start_year in range(2017, 2022):
    end_year = start_year + 1
    season = f'{start_year}-{end_year}'
    print(season)
    dfs.append(process_season(league, season))
    print()

# ignore_index=True yields a fresh 0..n-1 index across all seasons.
df = pd.concat(dfs, ignore_index=True)
print(df.shape)

2017-2018
Shape after Initialization: (333, 6)
Shape after Processing: (333, 57)

2018-2019
Shape after Initialization: (354, 6)
Shape after Processing: (354, 57)

2019-2020
Shape after Initialization: (331, 6)
Shape after Processing: (331, 57)

2020-2021
Shape after Initialization: (351, 6)
Shape after Processing: (351, 57)

2021-2022
Shape after Initialization: (299, 6)
Shape after Processing: (299, 57)

(1668, 57)


In [17]:
# Write the stacked table to '<league>_offense.csv' without the index column.
df.to_csv(
    f'{league}_offense.csv',
    index=False,
)

### Process NCAA

In [14]:
# Process 'College Men' seasons 2017-2018 through 2022-2023 and stack them.
league = 'College Men'
dfs = []
for start_year in range(2017, 2023):
    end_year = start_year + 1
    season = f'{start_year}-{end_year}'
    print(season)
    dfs.append(process_season(league, season))
    print()

# ignore_index=True yields a fresh 0..n-1 index across all seasons.
df = pd.concat(dfs, ignore_index=True)
print(df.shape)

2017-2018
Shape after Initialization: (4786, 6)
Shape after Processing: (4786, 57)

2018-2019
Shape after Initialization: (4780, 6)
Shape after Processing: (4780, 57)

2019-2020
Shape after Initialization: (4791, 6)
Shape after Processing: (4791, 57)

2020-2021
Shape after Initialization: (4898, 6)
Shape after Processing: (4898, 57)

2021-2022
Shape after Initialization: (5027, 6)
Shape after Processing: (5027, 57)

2022-2023
Shape after Initialization: (4962, 6)
Shape after Processing: (4962, 57)

(29244, 57)


In [15]:
# Write the stacked table to '<league>_offense.csv' without the index column.
df.to_csv(
    f'{league}_offense.csv',
    index=False,
)

### Process CBA

In [20]:
# Process CBA seasons 2017-2018 through 2021-2022 and stack them.
league = 'CBA'
dfs = []
for start_year in range(2017, 2022):
    end_year = start_year + 1
    season = f'{start_year}-{end_year}'
    print(season)
    dfs.append(process_season(league, season))
    print()

# ignore_index=True yields a fresh 0..n-1 index across all seasons.
df = pd.concat(dfs, ignore_index=True)
print(df.shape)

2017-2018
Shape after Initialization: (325, 6)
Shape after Processing: (325, 57)

2018-2019
Shape after Initialization: (364, 6)
Shape after Processing: (364, 57)

2019-2020
Shape after Initialization: (361, 6)
Shape after Processing: (361, 57)

2020-2021
Shape after Initialization: (370, 6)
Shape after Processing: (370, 57)

2021-2022
Shape after Initialization: (380, 6)
Shape after Processing: (380, 57)

(1800, 57)


In [21]:
# Write the stacked table to '<league>_offense.csv' without the index column.
df.to_csv(
    f'{league}_offense.csv',
    index=False,
)