In [1]:
import os
import sys

# Working directory of the kernel. The previous expression,
# os.path.dirname(os.path.abspath('__file__')), passed the *string* '__file__'
# (the name is not defined inside a notebook), so it always resolved to the
# current working directory rather than to the notebook's own location.
notebook_dir = os.getcwd()

# Get the parent directory (one level up)
parent_dir = os.path.dirname(notebook_dir)

# Add the parent directory to the system path so `utils` can be imported.
# Guarded so that re-running this cell does not keep appending duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from utils import get_categorical_columns, get_numeric_columns, get_matchups

In [2]:
import pandas as pd
# Show up to 50 columns when a DataFrame is rendered.
pd.options.display.max_columns = 50

### **Concatenating data**

In [3]:
# Directory scanned for input CSV files (relative to the working directory).
dir_path = './raw_data/teams'
# os.listdir returns entries in arbitrary order; sort the names so the files
# are always processed in the same order, on every run and platform.
csv_files = sorted(f for f in os.listdir(dir_path) if f.endswith('.csv'))

In [4]:
# Read each file named in csv_files from dir_path, one DataFrame per file.
dfs = [pd.read_csv(os.path.join(dir_path, name)) for name in csv_files]

# Stack the per-file frames; ignore_index=True renumbers the rows 0..n-1
# instead of keeping each file's own index.
df_raw = pd.concat(dfs, ignore_index=True)

### **Data transformation**

#### Building matchups

In [5]:
# Ask the project helpers (imported from utils) for the categorical and the
# numeric columns of df_raw.
# NOTE(review): presumably each returns a list of column names selected by
# dtype — confirm against utils.
# Neither cat_cols nor num_cols is referenced by the later cells visible here.
cat_cols = get_categorical_columns(df_raw)
num_cols = get_numeric_columns(df_raw)

In [6]:
# Numeric columns to discard from the raw frame.
unuseful_num_cols = [
    # Non-rank columns
    'GP',
    'TEAM_ID',
    'W_PCT',
    # *_RANK columns
    'GP_RANK',
    'W_RANK',
    'L_RANK',
    'W_PCT_RANK',
    'MIN_RANK',
    'OFF_RATING_RANK',
    'DEF_RATING_RANK',
    'NET_RATING_RANK',
    'AST_PCT_RANK',
    'AST_TO_RANK',
    'AST_RATIO_RANK',
    'OREB_PCT_RANK',
    'DREB_PCT_RANK',
    'REB_PCT_RANK',
    'TM_TOV_PCT_RANK',
    'EFG_PCT_RANK',
    'TS_PCT_RANK',
    'PACE_RANK',
    'PIE_RANK',
]
# Categorical columns to discard (currently none).
unuseful_cat_cols = []

# Full drop list: numeric entries first, then categorical ones.
unuseful_cols = [*unuseful_num_cols, *unuseful_cat_cols]

In [7]:
# Display every column label of df_raw, as a reference for the drop list
# defined in unuseful_cols.
df_raw.columns

Index(['TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'E_OFF_RATING',
       'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING', 'E_NET_RATING',
       'NET_RATING', 'AST_PCT', 'AST_TO', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT',
       'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'E_PACE', 'PACE',
       'PACE_PER40', 'POSS', 'PIE', 'GP_RANK', 'W_RANK', 'L_RANK',
       'W_PCT_RANK', 'MIN_RANK', 'OFF_RATING_RANK', 'DEF_RATING_RANK',
       'NET_RATING_RANK', 'AST_PCT_RANK', 'AST_TO_RANK', 'AST_RATIO_RANK',
       'OREB_PCT_RANK', 'DREB_PCT_RANK', 'REB_PCT_RANK', 'TM_TOV_PCT_RANK',
       'EFG_PCT_RANK', 'TS_PCT_RANK', 'PACE_RANK', 'PIE_RANK'],
      dtype='object')

In [8]:
# Build a new frame without the columns named in unuseful_cols; drop() is
# not called in place, so df_raw keeps all of its columns.
df_cleaned = df_raw.drop(unuseful_cols, axis='columns')

In [9]:
# Preview the first rows of df_cleaned to check which columns remain.
df_cleaned.head()

Unnamed: 0,TEAM_NAME,W,L,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,E_NET_RATING,NET_RATING,AST_PCT,AST_TO,AST_RATIO,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0,Atlanta Hawks,38,44,3966.0,103.4,104.9,104.1,105.7,-0.7,-0.7,0.667,1.63,18.8,0.256,0.711,0.485,0.159,0.515,0.554,96.9,95.41,79.51,7892,0.502
1,Boston Celtics,25,57,3946.0,99.7,101.9,105.2,106.5,-5.5,-4.6,0.576,1.37,16.2,0.314,0.708,0.505,0.163,0.477,0.517,95.9,94.28,78.57,7745,0.461
2,Brooklyn Nets,44,38,3976.0,104.4,105.4,104.9,106.7,-0.6,-1.3,0.585,1.44,16.7,0.26,0.686,0.475,0.155,0.514,0.555,93.7,92.42,77.02,7663,0.492
3,Charlotte Bobcats,43,39,3981.0,101.2,103.0,101.2,103.3,0.1,-0.4,0.597,1.76,16.9,0.269,0.73,0.5,0.131,0.481,0.522,94.7,92.93,77.44,7712,0.508
4,Chicago Bulls,48,34,3986.0,99.7,101.7,97.8,99.9,1.9,1.9,0.654,1.52,17.6,0.321,0.713,0.518,0.162,0.471,0.518,92.7,90.84,75.7,7549,0.528


In [10]:
# Write the cleaned table to disk without the index column. to_csv does not
# create missing parent directories, so make sure ./data exists first;
# otherwise a run on a fresh checkout fails at this last step.
os.makedirs('./data', exist_ok=True)
df_cleaned.to_csv('./data/teams.csv', index=False)