In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import math

In [42]:
## LOADING TEAM DATA (2015-2016, 2016-2017, 2017-2018, 2018-2019 NBA Seasons)

train_years = [2016, 2017, 2018]
test_years = [2019]


def load_team_misc(year):
    """Read one season's ``misc.csv`` and add the derived team columns.

    Parameters
    ----------
    year : int
        Season folder name under ``../Data/Team/``.

    Returns
    -------
    pandas.DataFrame
        The season table with three changes:
        ``Playoffs`` is 1 when the raw team name contains ``*`` and 0 otherwise,
        ``Team`` is truncated at the first ``*``,
        ``Team_year`` is ``"<Team>/<year>"``.
    """
    filepath = '../Data/Team/' + str(year) + '/misc.csv'
    # skiprows=1 drops the first line of the file so the second line becomes
    # the header (presumably a grouping row sits above it - confirm).
    year_misc = pd.read_csv(filepath, skiprows=1)

    # create playoffs flag (must be computed before the '*' is stripped)
    year_misc['Playoffs'] = year_misc['Team'].str.contains('*', regex=False).astype(int)
    # remove * from team
    year_misc['Team'] = year_misc['Team'].str.split('*').str[0]
    # add year to end of team name
    year_misc['Team_year'] = year_misc['Team'] + '/' + str(year)
    return year_misc


def load_seasons(years):
    """Load every season in ``years`` and stack them into one frame.

    Returns ``None`` when ``years`` is empty, matching the value the
    variables held before any season was loaded.
    """
    frames = [load_team_misc(year) for year in years]
    if not frames:
        return None
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)


train = load_seasons(train_years)
test = load_seasons(test_years)
        

In [43]:
# Rename Misc columns to include the O4F and D4F prefixes.
# The '.1' names are the second occurrence of a repeated column header
# (presumably the defensive copy of each stat - confirm against misc.csv).
four_factor_names = {
    # D4F_* columns
    'eFG%.1': 'D4F_eFG%',
    'FT/FGA.1': 'D4F_FT/FGA',
    'TOV%.1': 'D4F_TOV%',
    'DRB%': 'D4F_DRB%',
    # O4F_* columns
    'eFG%': 'O4F_eFG%',
    'FT/FGA': 'O4F_FT/FGA',
    'TOV%': 'O4F_TOV%',
    # NOTE(review): 'ORB%' is stored under 'O4F_DRB%', which looks like a
    # typo for 'O4F_ORB%'. Kept as-is because the exported CSVs carry this
    # column name - confirm with downstream users before renaming.
    'ORB%': 'O4F_DRB%',
}

# Same mapping for both splits; renamed in place so `train` / `test` keep
# referring to the same objects as before.
for frame in (train, test):
    frame.rename(columns=four_factor_names, inplace=True)




In [44]:
# Summary of the full training table.
# norm_std is a normalized standard deviation: |std / mean| of each column.
# Columns whose mean is close to zero get a very large norm_std, so read
# those rows with care.

summary = train.describe().transpose()
sigma = summary[['std']].values    # column vector of standard deviations
mu = summary[['mean']].values      # column vector of means
summary['norm_std'] = np.abs(sigma / mu)
summary.sort_values('norm_std', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,norm_std
MOV,93.0,0.00043,4.525282,-10.23,-3.04,0.2,2.72,11.63,10521.281438
SRS,93.0,-0.001935,4.351999,-9.92,-2.74,0.03,2.6,11.35,2248.533
NRtg,90.0,0.013333,4.717984,-10.4,-3.275,0.25,3.025,11.6,353.848782
SOS,93.0,-0.001935,0.287611,-0.56,-0.2,0.0,0.18,0.64,148.599052
Playoffs,93.0,0.516129,0.502448,0.0,0.0,1.0,1.0,1.0,0.973494
Rk,90.0,15.5,8.703932,1.0,8.0,15.5,23.0,30.0,0.561544
L,90.0,41.0,12.339759,9.0,34.0,40.0,50.0,72.0,0.30097
W,90.0,41.0,12.339759,10.0,32.0,42.0,48.0,73.0,0.30097
PW,93.0,40.956989,11.389272,16.0,33.0,42.0,49.0,67.0,0.278079
PL,93.0,41.043011,11.389272,15.0,33.0,40.0,49.0,66.0,0.277496


In [51]:
# ADD TEAM ABBREVIATIONS

# Lookup table of (abbreviation, full team name) pairs.
tm_abrv = pd.DataFrame(
    [
        ('ATL', 'Atlanta Hawks'),
        ('BKN', 'Brooklyn Nets'),
        ('BOS', 'Boston Celtics'),
        ('CHA', 'Charlotte Hornets'),
        ('CHI', 'Chicago Bulls'),
        ('CLE', 'Cleveland Cavaliers'),
        ('DAL', 'Dallas Mavericks'),
        ('DEN', 'Denver Nuggets'),
        ('DET', 'Detroit Pistons'),
        ('GSW', 'Golden State Warriors'),
        ('HOU', 'Houston Rockets'),
        ('IND', 'Indiana Pacers'),
        ('LAC', 'Los Angeles Clippers'),
        ('LAL', 'Los Angeles Lakers'),
        ('MEM', 'Memphis Grizzlies'),
        ('MIA', 'Miami Heat'),
        ('MIL', 'Milwaukee Bucks'),
        ('MIN', 'Minnesota Timberwolves'),
        ('NOP', 'New Orleans Pelicans'),
        ('NYK', 'New York Knicks'),
        ('OKC', 'Oklahoma City Thunder'),
        ('ORL', 'Orlando Magic'),
        ('PHI', 'Philadelphia 76ers'),
        ('PHX', 'Phoenix Suns'),
        ('POR', 'Portland Trail Blazers'),
        ('SAC', 'Sacramento Kings'),
        ('SAS', 'San Antonio Spurs'),
        ('TOR', 'Toronto Raptors'),
        ('UTA', 'Utah Jazz'),
        ('WAS', 'Washington Wizards'),
    ],
    columns=['Abrv', 'Team_full'],
)

# merge() defaults to an inner join, so any row whose Team value is not in
# the lookup table is dropped here. Team_full only serves as the join key
# and is removed again straight away.
train_1 = train.merge(tm_abrv, left_on='Team', right_on='Team_full').drop(['Team_full'], axis=1)
test_1 = test.merge(tm_abrv, left_on='Team', right_on='Team_full').drop(['Team_full'], axis=1)


In [53]:
# Display the test frame after the abbreviation merge as a visual check
# (Abrv should be the last column and Team_full should be gone).
test_1

Unnamed: 0,Rk,Team,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,O4F_eFG%,O4F_TOV%,O4F_DRB%,O4F_FT/FGA,D4F_eFG%,D4F_TOV%,D4F_DRB%,D4F_FT/FGA,Arena,Attend.,Attend./G,Playoffs,Team_year,Abrv
0,1.0,Milwaukee Bucks,26.9,60.0,22.0,61,21,8.87,-0.82,8.04,113.8,105.2,8.6,103.3,0.255,0.419,0.583,0.55,12.0,20.8,0.197,0.503,11.5,80.3,0.162,Fiserv Forum,721692,17602,1,Milwaukee Bucks/2019,MIL
1,2.0,Golden State Warriors,28.4,57.0,25.0,56,26,6.46,-0.04,6.42,115.9,109.5,6.4,100.9,0.227,0.384,0.596,0.565,12.6,22.5,0.182,0.508,11.7,77.1,0.205,Oracle Arena,803436,19596,1,Golden State Warriors/2019,GSW
2,3.0,Toronto Raptors,27.3,58.0,24.0,56,26,6.09,-0.6,5.49,113.1,107.1,6.0,100.2,0.247,0.379,0.579,0.543,12.4,21.9,0.198,0.509,13.1,77.1,0.19,Scotiabank Arena,812822,19825,1,Toronto Raptors/2019,TOR
3,4.0,Utah Jazz,27.3,50.0,32.0,54,28,5.26,0.03,5.28,110.9,105.7,5.2,100.3,0.295,0.394,0.572,0.538,13.4,22.9,0.217,0.507,12.4,80.3,0.189,Vivint Smart Home Arena,750546,18306,1,Utah Jazz/2019,UTA
4,5.0,Houston Rockets,29.2,53.0,29.0,53,29,4.77,0.19,4.96,115.5,110.7,4.8,97.9,0.279,0.519,0.581,0.542,12.0,22.8,0.221,0.525,13.4,74.4,0.21,Toyota Center,740392,18058,1,Houston Rockets/2019,HOU
5,6.0,Portland Trail Blazers,26.2,53.0,29.0,51,31,4.2,0.24,4.43,114.7,110.5,4.2,99.1,0.258,0.339,0.568,0.528,12.1,26.6,0.21,0.516,11.0,77.9,0.195,Moda Center,799345,19496,1,Portland Trail Blazers/2019,POR
6,7.0,Denver Nuggets,24.9,54.0,28.0,51,31,3.95,0.24,4.19,113.0,108.9,4.1,97.7,0.232,0.348,0.558,0.527,11.9,26.6,0.175,0.521,12.3,78.0,0.194,Pepsi Center,756457,18450,1,Denver Nuggets/2019,DEN
7,8.0,Boston Celtics,25.7,49.0,33.0,52,30,4.44,-0.54,3.9,112.2,107.8,4.4,99.6,0.215,0.381,0.567,0.534,11.5,21.6,0.173,0.514,13.4,77.0,0.198,TD Garden,763584,18624,1,Boston Celtics/2019,BOS
8,9.0,Oklahoma City Thunder,25.7,49.0,33.0,50,32,3.4,0.15,3.56,110.3,107.0,3.3,102.8,0.266,0.347,0.545,0.514,11.7,26.0,0.19,0.523,14.4,78.2,0.206,Chesapeake Energy Arena,746323,18203,1,Oklahoma City Thunder/2019,OKC
9,10.0,Indiana Pacers,27.0,48.0,34.0,50,32,3.33,-0.57,2.76,109.9,106.5,3.4,98.1,0.242,0.292,0.561,0.53,12.4,21.9,0.182,0.516,14.1,76.2,0.184,Bankers Life Fieldhouse,689310,16812,1,Indiana Pacers/2019,IND


In [54]:
# Write the cleaned train / test tables to CSV in the current working
# directory. to_csv defaults are used, so the row index is written as the
# first (unnamed) column of each file.
for cleaned, out_path in (
    (train_1, 'train_teams_cleaned.csv'),
    (test_1, 'test_teams_cleaned.csv'),
):
    cleaned.to_csv(out_path)