In [59]:
from __future__ import division
import numpy as np
import pandas as pd
import requests
from sklearn.linear_model import Ridge

In [60]:
# TODO: modularize

# Browser-like request headers sent with every stats.nba.com call.
HEADERS = {
    'Host': 'stats.nba.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Referer': 'stats.nba.com',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

# URL template for one player's info; format() it with the player id.
PLAYER_ENDPOINT = 'http://stats.nba.com/stats/commonplayerinfo?PlayerID={}'

# Stint columns holding the five away and five home player ids.
AWAY_PLAYERS_INDEX = [
    'AWAY_P1_ID',
    'AWAY_P2_ID',
    'AWAY_P3_ID',
    'AWAY_P4_ID',
    'AWAY_P5_ID',
]
HOME_PLAYERS_INDEX = [
    'HOME_P1_ID',
    'HOME_P2_ID',
    'HOME_P3_ID',
    'HOME_P4_ID',
    'HOME_P5_ID',
]
# All ten lineup columns, away slots first.
PLAYERS_INDEX = AWAY_PLAYERS_INDEX + HOME_PLAYERS_INDEX

In [61]:
def get_all_players(stints_df):
  """Return the distinct values found in the ten lineup columns of stints_df.

  Args:
    stints_df: DataFrame containing every column named in PLAYERS_INDEX.

  Returns:
    Array of unique values, in the order pd.unique encounters them in the
    flattened lineup block.
  """
  lineup_block = stints_df[PLAYERS_INDEX].values
  # 'K' flattens in whatever order the elements sit in memory.
  flattened = lineup_block.ravel('K')
  return pd.unique(flattened)

In [62]:
def build_matrix():
  """Build and pickle the stint design matrix and the scaled-margin target.

  Reads 'data/2017_18_stints.pkl' and writes two pickles next to it:
    * 'data/matrix.pkl': one row per stint and one column per player id,
      holding -1 for the away players, +1 for the home players and NaN for
      everyone else.
    * 'data/scaled_margin.pkl': a single 'SCALED_MARGIN' column equal to
      MARGIN / POSSESSIONS * 100.

  Both outputs go to 'data/' because that is where this notebook loads them
  from. They are float-typed; the earlier row-by-row version produced
  object-typed frames with the same values.

  Stints with zero possessions are kept; their scaled margin comes out as
  inf or NaN.
  """
  stints_df = pd.read_pickle('data/2017_18_stints.pkl')
  unique_players = get_all_players(stints_df)
  player_columns = pd.Index(unique_players)
  row_positions = np.arange(len(stints_df))

  # Fill one lineup slot (a whole column of stints) at a time. Growing a
  # DataFrame with .loc one row per stint re-allocates the frame repeatedly
  # and is far slower for tens of thousands of stints.
  encoded = np.full((len(stints_df), len(player_columns)), np.nan)
  # Away slots are written first so a home entry wins any clash, matching the
  # order the per-row assignments used.
  for slot_names, sign in ((AWAY_PLAYERS_INDEX, -1.0), (HOME_PLAYERS_INDEX, 1.0)):
    for slot in slot_names:
      column_positions = player_columns.get_indexer(stints_df[slot].values)
      encoded[row_positions, column_positions] = sign

  X = pd.DataFrame(encoded, index=stints_df.index, columns=unique_players)
  y = pd.DataFrame(
      {'SCALED_MARGIN': stints_df['MARGIN'] / stints_df['POSSESSIONS'] * 100})
  X.to_pickle('data/matrix.pkl')
  y.to_pickle('data/scaled_margin.pkl')

In [152]:
def train_ridge_regression(X, y, alpha=4000):
  """Fit a ridge regression of y on X and return the fitted estimator.

  Args:
    X: feature matrix handed to Ridge.fit.
    y: target values handed to Ridge.fit.
    alpha: regularization strength forwarded to Ridge. Defaults to 4000, the
      value that used to be hard-coded, so existing two-argument calls behave
      exactly as before.

  Returns:
    The fitted Ridge estimator.
  """
  # TODO: research alpha further
  ridge_regr = Ridge(alpha=alpha)
  return ridge_regr.fit(X, y)

In [153]:
# Load the pickled design matrix (X) and scaled-margin target (y).
X, y = (pd.read_pickle(pickle_path)
        for pickle_path in ('data/matrix.pkl', 'data/scaled_margin.pkl'))

In [154]:
stints_df = pd.read_pickle('data/2017_18_stints.pkl')
# Flag stints with no possessions or no elapsed time so they can be dropped.
no_possessions = stints_df['POSSESSIONS'] == 0.
no_time = stints_df['TIME_SECONDS'] == 0.
empty_stints = no_possessions | no_time
# print(...) with one argument gives the same text on Python 2 and also
# parses on Python 3, unlike the bare print statement.
print(X.shape)
print(y.shape)
X = X[~empty_stints]
y = y[~empty_stints]
print(X.shape)
print(y.shape)

(56537, 540)
(56537, 1)


(32092, 540)
(32092, 1)


In [155]:
# Replace missing entries with 0 before fitting.
X = X.fillna(value=0)
regr = train_ridge_regression(X, y)
# One fitted coefficient per column of X.
weights = regr.coef_

In [156]:
# Pair every player column with its fitted coefficient. Building the frame
# from a dict lets each column keep its own dtype; np.column_stack would
# coerce the ids and the weights into a single shared dtype.
# np.ravel flattens a (1, n_features) coef_ to one weight per column and also
# accepts an already 1-D coef_.
player_weights_df = pd.DataFrame(
    {'PLAYER_ID': X.columns, 'WEIGHT': np.ravel(weights)},
    columns=['PLAYER_ID', 'WEIGHT'])
player_weights_df = player_weights_df.sort_values(by=['WEIGHT'], ascending=False)

In [158]:
# Keep the first 50 rows of the weight table.
player_weights_top_50_df = player_weights_df.head(50)

In [161]:
# Look up a display name for each of the top-50 player ids, one HTTP request
# per player.
player_names = []
for player_id in player_weights_top_50_df['PLAYER_ID']:
  response = requests.get(
      PLAYER_ENDPOINT.format(int(player_id)),
      headers=HEADERS,
      # Without a timeout a stalled connection blocks the kernel indefinitely.
      timeout=30)
  # Fail on an HTTP error status here rather than with a confusing JSON or
  # key error further down.
  response.raise_for_status()
  result_set = response.json()['resultSets'][0]
  # 'headers' names the fields of each row in 'rowSet'; read the name from
  # the first row directly instead of building a throwaway DataFrame.
  name_position = result_set['headers'].index('DISPLAY_FIRST_LAST')
  player_names.append(result_set['rowSet'][0][name_position])
player_weights_top_50_df.assign(PLAYER_NAME=player_names)

Unnamed: 0,PLAYER_ID,WEIGHT,PLAYER_NAME
256,201566.0,1.627093,Russell Westbrook
145,2546.0,1.35692,Carmelo Anthony
197,2594.0,1.341402,Kyle Korver
23,203496.0,1.331459,Robert Covington
146,203500.0,1.283875,Steven Adams
323,101108.0,1.112314,Chris Paul
33,203076.0,1.046195,Anthony Davis
95,1628378.0,0.990337,Donovan Mitchell
7,200782.0,0.975562,PJ Tucker
156,202710.0,0.961509,Jimmy Butler
