In [124]:
%matplotlib inline

import math
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import linear_model, cross_validation, metrics
from sklearn import feature_selection as fs
from sklearn import preprocessing

# Season statistics covering 1985-2014
df = pd.read_csv('../data/cleanedSeasonStats.csv')

# Columns with no meaning for the analysis: anything whose name contains
# 'Unnamed' or whose name ends in '-'.
junk_cols = [col for col in df.columns if 'Unnamed' in col or col[-1] == '-']
df.drop(junk_cols, axis=1, inplace=True)

# scikit-learn cannot handle NaNs, so every missing value becomes 0.
# NOTE(review): this also zero-fills any missing 'next-eff' targets - confirm
# that is intended.
df.fillna(0, inplace=True)

In [125]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,year,player,totals-Rk,totals-Pos,totals-Age,totals-Tm,totals-G,totals-GS,totals-MP,totals-FG,...,advanced-OWS,advanced-DWS,advanced-WS,advanced-WS/48,advanced-OBPM,advanced-DBPM,advanced-BPM,advanced-VORP,curr-eff,next-eff
0,1985,Kareem Abdul-Jabbar,1,C,37,LAL,79,79,2630,723,...,7.6,3.6,11.2,0.204,3.6,1.3,4.9,4.6,26.1,24.2
1,1985,Alvan Adams,2,PF,30,PHO,82,69,2136,476,...,3.5,3.3,6.8,0.152,2.3,2.1,4.4,3.5,18.3,15.5
2,1985,Mark Aguirre,3,SF,25,DAL,80,79,2699,794,...,5.3,1.9,7.2,0.128,3.6,-1.9,1.7,2.5,21.2,20.1
3,1985,Danny Ainge,4,SG,25,BOS,75,73,2564,419,...,3.9,2.8,6.6,0.124,1.3,0.4,1.7,2.4,16.3,13.8
4,1985,Ron Anderson,7,SF,26,CLE,36,7,520,84,...,0.0,0.3,0.3,0.032,-2.4,-1.0,-3.5,-0.2,5.3,9.7


In [126]:
# Identify feature columns that overlap with the per-possession ones
def is_redundant(col):
    """Return True when the column name contains 'totals', 'per_game' or 'per_minute'."""
    overlapping_tables = ('totals', 'per_game', 'per_minute')
    return any(table in col for table in overlapping_tables)

# Use only per-possession features: columns from the totals, per-game and
# per-minute tables are flagged by is_redundant() and excluded.
similar_features = set(col for col in df.columns if is_redundant(col))
# Some advanced stats equal the per-100-possession stats, so remove them too.
redundant = set(['advanced-Age', 'advanced-G', 'advanced-MP'])

# Columns that are never used as predictors (year, player name, and the
# rank / position / team columns of every stats table).
useless_features = set([
    'year', 'player',
    'totals-Rk', 'totals-Pos', 'totals-Tm',
    'per_game-Rk', 'per_game-Pos', 'per_game-Tm',
    'per_minute-Rk', 'per_minute-Pos', 'per_minute-Tm',
    'per_poss-Rk', 'per_poss-Pos', 'per_poss-Tm',
    'advanced-Rk', 'advanced-Pos', 'advanced-Tm',
])
# The prediction target must not leak into the feature matrix.
bias_features = set(['next-eff'])

excluded = useless_features | bias_features | similar_features | redundant
# Filter df.columns in order instead of taking list(set(...)): a set has no
# defined order, so the feature layout (and anything indexed by position,
# such as fitted coefficients) could differ from one kernel to the next.
feature_cols = [col for col in df.columns if col not in excluded]

data = df[feature_cols]
# Optional min-max scaling of the features, currently disabled:
#data = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(data), columns=data.columns)

# Get y
target = df['next-eff']

In [134]:
# Standard linear regression on the selected features.
lin_reg = linear_model.LinearRegression(normalize=True)
# Fit on the full data set so the fitted coefficients are available on
# lin_reg; the cross-validated score is computed separately below.
lin_reg.fit(data, target)

# 5-fold cross-validation. The folds are shuffled, so random_state is pinned
# to make the reported mean R^2 repeatable from run to run.
kfold = cross_validation.KFold(len(target), n_folds=5, shuffle=True, random_state=0)
cv_scores = cross_validation.cross_val_score(lin_reg, data, target, cv=kfold, scoring='r2')
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Linear Regression standard features: {}".format(cv_scores.mean()))

Linear Regression standard features: 0.740990262838


In [128]:
# Pair every feature name with its linear-regression weight, ordered by
# decreasing absolute weight.
coefficients = sorted(
    zip(feature_cols, lin_reg.coef_),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)
# Discard features whose weight is exactly zero, then show the top 15.
t = [pair for pair in coefficients if abs(pair[1]) != 0]
t[:15]

[('advanced-WS/48', 17.891038357551373),
 ('advanced-TS%', -6.0142328819555742),
 ('advanced-STL%', -3.6769204610583017),
 ('per_poss-STL', 3.6759937765596535),
 ('per_poss-FG%', -2.0418409318655923),
 ('advanced-WS', 1.2450863028161969),
 ('advanced-OWS', -1.2214382035245412),
 ('per_poss-TRB', -1.068524935010343),
 ('advanced-DWS', -1.0341205725760108),
 ('per_poss-DRB', 0.91634568363996793),
 ('per_poss-FG', 0.86786098791226685),
 ('curr-eff', 0.81463973496940023),
 ('per_poss-2P%', 0.740718357724932),
 ('advanced-3PAr', 0.72618037409744352),
 ('advanced-FTr', 0.69012507602921891)]

In [132]:
# Lasso regression; LassoCV selects the regularisation strength using its own
# internal 5-fold cross-validation (cv=5).
lasso = linear_model.LassoCV(max_iter=5000, normalize=True, cv=5)
# Fit on the full data set so the fitted coefficients are available on lasso.
lasso.fit(data, target)

# 5-fold Cross Validation of the whole procedure. The folds are shuffled, so
# random_state is pinned to make the reported mean R^2 repeatable.
kfold = cross_validation.KFold(len(target), n_folds=5, shuffle=True, random_state=0)
cv_scores = cross_validation.cross_val_score(lasso, data, target, cv=kfold, scoring='r2')
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Lasso standard features: {}".format(cv_scores.mean()))

Lasso standard features: 0.740405857171


In [133]:
# Pair every feature name with its lasso weight, ordered by decreasing
# absolute weight.
coefficients = sorted(
    zip(feature_cols, lasso.coef_),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)
# Keep only the features the lasso did not shrink to exactly zero, then show
# the top 15.
t = [pair for pair in coefficients if abs(pair[1]) != 0]
t[:15]

[('advanced-TS%', -5.5819418582255516),
 ('advanced-WS/48', 5.1442903195585314),
 ('curr-eff', 0.78729933165804311),
 ('advanced-FTr', 0.41220480557687261),
 ('per_poss-3P%', -0.24803926700194612),
 ('advanced-VORP', 0.24328890440049392),
 ('per_poss-Age', -0.234335050220186),
 ('per_poss-TOV', 0.18970398495209673),
 ('per_poss-AST', 0.18596220298434157),
 ('per_poss-2P%', 0.17930777324885874),
 ('advanced-DWS', 0.16391259199680991),
 ('per_poss-PF', -0.085454702946412081),
 ('advanced-USG%', 0.079345329761370625),
 ('advanced-WS', 0.078827727108910894),
 ('advanced-PER', -0.067098467446330939)]