In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
# Load the preprocessed dataset that was serialized with pickle.
# pickle can execute arbitrary code on load, so only point this at files
# produced by this project's own pipeline.
with open("../data/processed/dataset.pickle", "rb") as reader:
    df = pickle.load(reader)

In [3]:
# Sanity check on the loaded frame: (number of rows, number of columns).
df.shape

(2906, 51)

In [4]:
# Show every row that has a missing value in at least one column.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,Player,Tm,Salary,Year,Pos,Age,G,GS,MP,FG,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
4,Gordon Hayward,Boston Celtics,31214295,2018,SF,27.0,1.0,1.0,5.0,1.0,...,0.0,17.9,0.0,0.0,0.0,0.056,-5.9,-6.1,-12.0,0.0
30,Rudy Gobert,Utah Jazz,23241573,2018,C,25.0,56.0,56.0,32.4,4.9,...,15.7,16.9,4.2,3.9,8.1,0.214,-0.3,4.6,4.3,2.9
32,DeAndre Jordan,New York Knicks,22897200,2018,C,29.0,77.0,77.0,31.5,4.8,...,16.1,15.2,6.0,3.4,9.4,0.186,0.5,1.6,2.1,2.5
54,Tristan Thompson,Cleveland Cavaliers,17469565,2018,C,26.0,53.0,22.0,20.2,2.5,...,11.8,12.6,1.8,0.7,2.5,0.113,-1.6,-0.9,-2.5,-0.1
80,Marcin Gortat,LA Clippers,13565218,2018,C,33.0,82.0,82.0,25.3,3.5,...,13.5,15.5,2.4,2.4,4.9,0.113,-1.6,1.9,0.3,1.2
91,Miles Plumlee,Atlanta Hawks,12500000,2018,C,29.0,55.0,35.0,16.7,1.9,...,23.4,12.8,0.1,0.7,0.9,0.046,-3.8,1.3,-2.5,-0.1
107,Omer Asik,Chicago Bulls,11286516,2018,C,31.0,18.0,0.0,10.1,0.5,...,24.5,8.7,-0.2,0.2,-0.1,-0.021,-8.1,0.0,-8.1,-0.3
131,Kosta Koufos,Sacramento Kings,8739500,2018,C,28.0,71.0,13.0,19.6,3.1,...,10.4,15.2,2.3,1.5,3.9,0.134,-0.8,1.4,0.5,0.9
159,Boban Marjanovic,Philadelphia 76ers,7000000,2018,C,29.0,39.0,1.0,8.6,2.0,...,15.9,29.4,1.0,0.5,1.5,0.215,-0.3,-0.3,-0.6,0.1
194,Alan Williams,Brooklyn Nets,5000000,2018,PF,25.0,5.0,0.0,14.0,1.4,...,24.2,17.5,-0.1,0.1,0.0,-0.004,-6.5,2.8,-3.7,0.0


Before we proceed, we can eliminate some of the columns, since they can be defined in terms of the others.

$FG\% = \frac{FG}{FGA}$

$3P\% = \frac{3P}{3PA}$

$2P\% = \frac{2P}{2PA}$

$FT\% = \frac{FT}{FTA}$

$TRB = ORB+DRB$

$WS = OWS+DWS$

We can eliminate these six derived columns to make things easier. I also choose to leave out Year, since we no longer need it for the categorization, although it may be put back in later for improvements.

In [8]:
# Columns to drop: the six stats that are exact functions of other columns
# (shooting percentages, total rebounds, total win shares) plus Year.
remove = [
    'FG%', '3P%', '2P%', '3P%'[:0] + 'FT%',
    'TRB', 'WS',
    'Year',
]
# Index.difference returns the remaining labels in sorted order, so
# df_smaller's columns come out sorted rather than in df's original order.
rest = df.columns.difference(remove)
df_smaller = df[rest]

In [10]:
# Pairwise correlations (pandas default: Pearson) between the numeric columns
# of the full frame.
# The frame also holds text columns (Player, Tm, Pos). pandas >= 2.0 no longer
# drops those silently in DataFrame.corr() and raises instead, so restrict to
# numeric dtypes explicitly; this reproduces what older pandas did implicitly.
# NOTE(review): this runs on df, not df_smaller, so the columns listed in
# `remove` (e.g. Year, FG%) still appear in the matrix -- confirm intended.
df.select_dtypes(include="number").corr()

Unnamed: 0,Salary,Year,Age,G,GS,MP,FG,FGA,FG%,3P,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
Salary,1.0,0.208715,0.114716,0.240775,0.514883,0.5775,0.640661,0.613245,0.174041,0.291209,...,-0.108058,0.432291,0.539029,0.484911,0.575182,0.39116,0.44359,0.151424,0.45599,0.546965
Year,0.208715,1.0,0.00789,0.029292,0.004592,-0.018352,0.028392,0.032203,0.007754,0.175602,...,-0.040094,0.027989,-0.00125,-0.013428,-0.006229,0.036913,0.057741,0.011469,0.054761,0.003987
Age,0.114716,0.00789,1.0,-0.059423,-0.02895,-0.034743,-0.086906,-0.081809,-0.045868,0.033205,...,0.044308,-0.144776,-0.02426,-0.00802,-0.020604,0.00695,-0.016688,0.042188,0.009451,-0.002994
G,0.240775,0.029292,-0.059423,1.0,0.546194,0.547593,0.434397,0.409111,0.212858,0.256747,...,-0.178298,0.114605,0.48006,0.627261,0.589573,0.378654,0.435025,0.222153,0.488793,0.406043
GS,0.514883,0.004592,-0.02895,0.546194,1.0,0.804221,0.711159,0.682892,0.199023,0.296346,...,-0.115189,0.316071,0.602481,0.678686,0.696985,0.373735,0.474687,0.240643,0.532015,0.625551
MP,0.5775,-0.018352,-0.034743,0.547593,0.804221,1.0,0.877818,0.880056,0.135727,0.46922,...,-0.202218,0.462635,0.639878,0.646504,0.710858,0.405044,0.633075,0.142452,0.609814,0.627416
FG,0.640661,0.028392,-0.086906,0.434397,0.711159,0.877818,1.0,0.977691,0.204332,0.427309,...,-0.260903,0.7523,0.707142,0.588846,0.7365,0.470861,0.674964,0.031778,0.582951,0.653949
FGA,0.613245,0.032203,-0.081809,0.409111,0.682892,0.880056,0.977691,1.0,0.041864,0.529594,...,-0.278835,0.780592,0.63249,0.525618,0.658326,0.363311,0.657149,-0.056325,0.51871,0.597867
FG%,0.174041,0.007754,-0.045868,0.212858,0.199023,0.135727,0.204332,0.041864,1.0,-0.283692,...,0.089954,-0.027869,0.334587,0.301316,0.357361,0.643822,0.287255,0.368584,0.446737,0.257726
3P,0.291209,0.175602,0.033205,0.256747,0.296346,0.46922,0.427309,0.529594,-0.283692,1.0,...,-0.289186,0.341724,0.357022,0.12451,0.304212,0.140607,0.595487,-0.32842,0.314871,0.339485
