In [1]:
import os
import sys

# Directory the kernel is running in. Notebooks do not define `__file__`, so the
# working directory stands in for the notebook's location.
notebook_dir = os.getcwd()

# Parent directory (one level up); the `utils` import that follows is presumably
# resolved from there.
parent_dir = os.path.dirname(notebook_dir)

# Make the parent directory importable. The membership check keeps this cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from utils import get_categorical_columns, get_numeric_columns, get_matchups

In [2]:
import pandas as pd
# Let pandas render up to 50 columns before it switches to a truncated view.
pd.set_option(
    'display.max_columns',
    50,
)

### **Concatenating data**

In [3]:
# Folder holding the raw player CSV files (relative to the working directory).
dir_path = './raw_data/players'

# os.listdir returns entries in arbitrary order; sorting makes the order in which
# files are later read (and therefore the row order of the combined data)
# reproducible across machines and runs.
csv_files = sorted(f for f in os.listdir(dir_path) if f.endswith('.csv'))

In [4]:
# Load each discovered CSV into its own DataFrame, in the order given by csv_files.
dfs = [pd.read_csv(os.path.join(dir_path, file_name)) for file_name in csv_files]

# Stack all frames into a single table; ignore_index=True discards the per-file
# row labels and renumbers the rows 0..n-1.
df_raw = pd.concat(dfs, ignore_index=True)

### **Data transformation**

#### Building matchups

In [5]:
# get_matchups is a project helper from utils; it returns two values that are
# stored as the home and away team of each row.
# NOTE(review): presumably derived from the MATCHUP column — confirm in utils.
home_teams, away_teams = get_matchups(df_raw)

# Attach both sides of the matchup as new columns on the raw frame.
df_raw['HOME_TEAM'], df_raw['AWAY_TEAM'] = home_teams, away_teams

In [6]:
# Map each team abbreviation to its full name. If an abbreviation occurs with
# more than one name, the name from the last matching row wins.
team_names_dict = dict(zip(df_raw['TEAM_ABBREVIATION'], df_raw['TEAM_NAME']))

# Swap the values in the matchup columns for full team names
# (presumably they hold abbreviations at this point — confirm against get_matchups).
# The result is assigned back rather than using replace(..., inplace=True) on the
# selected column: that chained in-place form acts on an intermediate object,
# emits a warning in pandas 2.x, and leaves df_raw unchanged under Copy-on-Write.
df_raw['HOME_TEAM'] = df_raw['HOME_TEAM'].replace(team_names_dict)
df_raw['AWAY_TEAM'] = df_raw['AWAY_TEAM'].replace(team_names_dict)

#### Feature selection

In [7]:
# Display the column names that the utils helper reports as categorical for df_raw;
# the output is a list of column-name strings.
get_categorical_columns(df_raw)

['SEASON_YEAR',
 'PLAYER_NAME',
 'NICKNAME',
 'TEAM_ABBREVIATION',
 'TEAM_NAME',
 'GAME_DATE',
 'MATCHUP',
 'WL',
 'HOME_TEAM',
 'AWAY_TEAM']

In [8]:
# Display the column names that the utils helper reports as numeric for df_raw;
# the output is a list of column-name strings.
get_numeric_columns(df_raw)

['PLAYER_ID',
 'TEAM_ID',
 'GAME_ID',
 'MIN',
 'FGM',
 'FGA',
 'FG_PCT',
 'FG3M',
 'FG3A',
 'FG3_PCT',
 'FTM',
 'FTA',
 'FT_PCT',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'TOV',
 'STL',
 'BLK',
 'BLKA',
 'PF',
 'PFD',
 'PTS',
 'PLUS_MINUS',
 'NBA_FANTASY_PTS',
 'DD2',
 'TD3',
 'WNBA_FANTASY_PTS',
 'GP_RANK',
 'W_RANK',
 'L_RANK',
 'W_PCT_RANK',
 'MIN_RANK',
 'FGM_RANK',
 'FGA_RANK',
 'FG_PCT_RANK',
 'FG3M_RANK',
 'FG3A_RANK',
 'FG3_PCT_RANK',
 'FTM_RANK',
 'FTA_RANK',
 'FT_PCT_RANK',
 'OREB_RANK',
 'DREB_RANK',
 'REB_RANK',
 'AST_RANK',
 'TOV_RANK',
 'STL_RANK',
 'BLK_RANK',
 'BLKA_RANK',
 'PF_RANK',
 'PFD_RANK',
 'PTS_RANK',
 'PLUS_MINUS_RANK',
 'NBA_FANTASY_PTS_RANK',
 'DD2_RANK',
 'TD3_RANK',
 'WNBA_FANTASY_PTS_RANK',
 'VIDEO_AVAILABLE_FLAG']

In [9]:
# Numeric columns to exclude from the cleaned dataset, grouped by kind.
unuseful_num_cols = [
    # *_ID columns
    'PLAYER_ID', 'TEAM_ID', 'GAME_ID',
    # *_PCT columns
    'FG_PCT', 'FG3_PCT', 'FT_PCT',
    # fantasy-points and DD2/TD3 columns
    'NBA_FANTASY_PTS', 'DD2', 'TD3', 'WNBA_FANTASY_PTS',
    # *_RANK columns
    'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK',
    'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK',
    'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK',
    'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK',
    'OREB_RANK', 'DREB_RANK', 'REB_RANK',
    'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK',
    'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK',
    'NBA_FANTASY_PTS_RANK', 'DD2_RANK', 'TD3_RANK', 'WNBA_FANTASY_PTS_RANK',
    # remaining flag column
    'VIDEO_AVAILABLE_FLAG',
]

# Categorical columns to exclude from the cleaned dataset.
unuseful_cat_cols = [
    'SEASON_YEAR',
    'NICKNAME',
    'TEAM_ABBREVIATION',
    'MATCHUP',
    'WL',
]

# Full drop list: numeric columns first, then categorical ones.
unuseful_cols = unuseful_num_cols + unuseful_cat_cols

### **Data analysis**

In [10]:
# Build the cleaned frame by removing every column on the drop list.
# drop returns a new DataFrame, so df_raw itself keeps all of its columns.
df_cleaned = df_raw.drop(labels=unuseful_cols, axis='columns')

In [11]:
# Preview the first five rows of the cleaned frame to check the remaining columns.
df_cleaned.head()

Unnamed: 0,PLAYER_NAME,TEAM_NAME,GAME_DATE,MIN,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,HOME_TEAM,AWAY_TEAM
0,Steven Adams,Oklahoma City Thunder,2014-04-16T00:00:00,10.7,0.0,1.0,0.0,0.0,1.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-7.0,Oklahoma City Thunder,Detroit Pistons
1,Steven Adams,Oklahoma City Thunder,2014-04-14T00:00:00,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,New Orleans Pelicans,Oklahoma City Thunder
2,Steven Adams,Oklahoma City Thunder,2014-04-13T00:00:00,13.1,2.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,3.0,1.0,4.0,-5.0,Indiana Pacers,Oklahoma City Thunder
3,Steven Adams,Oklahoma City Thunder,2014-04-11T00:00:00,10.1,2.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,-3.0,Oklahoma City Thunder,New Orleans Pelicans
4,Steven Adams,Oklahoma City Thunder,2014-04-09T00:00:00,19.6,2.0,2.0,0.0,0.0,3.0,5.0,2.0,5.0,7.0,0.0,0.0,1.0,1.0,0.0,3.0,3.0,7.0,5.0,LA Clippers,Oklahoma City Thunder


In [12]:
# Make sure the output folder exists so the export also works on a fresh checkout;
# to_csv does not create missing parent directories.
os.makedirs('./data', exist_ok=True)

# index=False writes only the data columns, without the row index.
df_cleaned.to_csv('./data/players.csv', index=False)