In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize

In [5]:
sns.set()

## Data Prep

### Read Synergy Data

In [6]:
s_df = pd.read_csv('nba_offense.csv')
s_df = s_df[s_df['Season'] != 2016]
s_df['Player'] = s_df['Player'].apply(lambda x: ' '.join(x.split(' ')[1:]))
s_df.shape

(3135, 74)

Include labels regarding half court offense only

In [7]:
s_df['Overall_Poss'] = s_df['Overall_Poss'] - s_df['Miscellaneous_Poss'] - s_df['Transition_Poss']

In [8]:
poss_weights_cols = ['Spot Up_Poss%', 'P&R Ball Handler_Poss%', 
                     'Hand Off_Poss%', 'Off Screen_Poss%',
                     'Cut_Poss%', 'Offensive Rebounds (put backs)_Poss%',
                     'P&R Roll Man_Poss%', 'Isolation_Poss%', 
                     'Post-Up_Poss%']

In [9]:
for col in poss_weights_cols:
    s_df[col] = s_df[col.replace('%', '')] / s_df['Overall_Poss']
    s_df[col] = s_df[col].fillna(0)

Filter in players with 250 shot possessions

In [10]:
s_df = s_df[s_df['Overall_Poss'] >= 250]
s_df.shape

(1421, 74)

### Read BBR Data

In [11]:
bbr_adv_df = pd.read_csv('bbr_nba_advanced_stats.csv')
bbr_adv_df = bbr_adv_df[bbr_adv_df['season'] != 2016]
bbr_adv_df.shape

(3159, 28)

In [12]:
bbr_ros_df = pd.read_csv('bbr_nba_roster_stats.csv')
bbr_ros_df = bbr_ros_df[bbr_ros_df['season'] != 2016]
bbr_ros_df.shape

(2954, 12)

In [13]:
for col in ['player', 'player_alt']:
    bbr_ros_df[col] = bbr_ros_df[col].apply(lambda x: x.replace('(TW)', '').strip())

17-20

In [14]:
bbr_adv_df_17_20 = bbr_adv_df[bbr_adv_df['season'] != 2021].copy()
bbr_ros_df_17_20 = bbr_ros_df[bbr_ros_df['season'] != 2021].copy()
bbr_df_17_20 = bbr_adv_df_17_20.merge(bbr_ros_df_17_20, how='left',
                                      on=['player', 'season', 'team', 'player_alt'])
bbr_adv_df_17_20.shape, bbr_ros_df_17_20.shape, bbr_df_17_20.shape

((2444, 28), (2446, 12), (2444, 36))

21-22

In [15]:
bbr_adv_df_21 = bbr_adv_df[bbr_adv_df['season'] == 2021].copy()
bbr_ros_df_21 = bbr_ros_df[bbr_ros_df['season'] == 2021].copy()
bbr_df_21 = bbr_adv_df_21.merge(bbr_ros_df_21, how='left',
                                on=['player', 'season', 'player_alt'])\
                         .rename(columns={'team_x': 'team'})\
                         .drop('team_y', axis=1)
bbr_adv_df_21.shape, bbr_ros_df_21.shape, bbr_df_21.shape

((715, 28), (508, 12), (715, 36))

In [16]:
bbr_df = pd.concat([bbr_df_17_20, bbr_df_21])\
           .reset_index().drop('index', axis=1)

Filter in players that played over 200 minutes

In [17]:
bbr_df = bbr_df[bbr_df['mp'] >= 200]
bbr_df.shape

(2323, 36)

Convert heights

In [18]:
def convert_height(height):
    feet, inches = height.split('-')
    cm = int(feet) * 12 + int(inches)
    return round(cm * 2.54, 1)

In [19]:
bbr_df[bbr_df['height'].isna()].index

Int64Index([2480, 2500, 2507, 2553, 2576, 2627, 2715, 2718, 2719, 2720, 2762,
            2810, 2835, 2928, 2929, 2952, 3017, 3020, 3024, 3026, 3070],
           dtype='int64')

In [20]:
bbr_df.loc[2480, 'height'] = '6-10'
bbr_df.loc[2480, 'weight'] = 250
bbr_df.loc[2500, 'height'] = '6-7'
bbr_df.loc[2500, 'weight'] = 240
bbr_df.loc[2507, 'height'] = '6-5'
bbr_df.loc[2507, 'weight'] = 210
bbr_df.loc[2553, 'height'] = '6-7'
bbr_df.loc[2553, 'weight'] = 215
bbr_df.loc[2576, 'height'] = '6-4'
bbr_df.loc[2576, 'weight'] = 220
bbr_df.loc[2627, 'height'] = '6-6'
bbr_df.loc[2627, 'weight'] = 205
bbr_df.loc[2715, 'height'] = '6-0'
bbr_df.loc[2715, 'weight'] = 180
bbr_df.loc[2718, 'height'] = '6-5'
bbr_df.loc[2718, 'weight'] = 230
bbr_df.loc[2719, 'height'] = '6-3'
bbr_df.loc[2719, 'weight'] = 210
bbr_df.loc[2720, 'height'] = '6-6'
bbr_df.loc[2720, 'weight'] = 230
bbr_df.loc[2762, 'height'] = '6-8'
bbr_df.loc[2762, 'weight'] = 215
bbr_df.loc[2810, 'height'] = '6-8'
bbr_df.loc[2810, 'weight'] = 215
bbr_df.loc[2835, 'height'] = '6-6'
bbr_df.loc[2835, 'weight'] = 240
bbr_df.loc[2928, 'height'] = '7-0'
bbr_df.loc[2928, 'weight'] = 237
bbr_df.loc[2929, 'height'] = '6-8'
bbr_df.loc[2929, 'weight'] = 216
bbr_df.loc[2952, 'height'] = '6-0'
bbr_df.loc[2952, 'weight'] = 170
bbr_df.loc[3017, 'height'] = '6-9'
bbr_df.loc[3017, 'weight'] = 245
bbr_df.loc[3020, 'height'] = '6-2'
bbr_df.loc[3020, 'weight'] = 205
bbr_df.loc[3024, 'height'] = '6-11'
bbr_df.loc[3024, 'weight'] = 240
bbr_df.loc[3026, 'height'] = '6-3'
bbr_df.loc[3026, 'weight'] = 205
bbr_df.loc[3070, 'height'] = '6-9'
bbr_df.loc[3070, 'weight'] = 245

In [21]:
bbr_df['height_cm'] = bbr_df['height'].apply(convert_height)

Convert weights

In [22]:
bbr_df['weight_kg'] = bbr_df['weight'].apply(lambda x: round(0.453592 * x, 1))

### Merge Synergy Data & BBR Data

In [23]:
s_df['Player_Alt'] = s_df['Player'].apply(lambda x: x.replace(' ', '').lower())\
                                   .apply(lambda x: x.replace("'", ''))\
                                   .apply(lambda x: x.replace('.', ''))

In [24]:
bbr_df['player_alt_alt'] = bbr_df['player_alt'].apply(lambda x: x.replace(' ', ''))\
                                               .apply(lambda x: x.replace("'", ''))\
                                               .apply(lambda x: x.replace('.', ''))\
                                               .apply(lambda x: x.lower())

In [25]:
synergy_name_dict = {'kevinknoxii': 'kevinknox',
                     'danilogalinari': 'danilogallinari',
                     'louiswilliams': 'louwilliams',
                     'eneskanter': 'enesfreedom',
                     'terryrozieriii': 'terryrozier',
                     'cameronreddish': 'camreddish',
                     'jahilokafor': 'jahlilokafor',
                     'danuelhouse': 'danuelhousejr',
                     'alfarouqaminu': 'al-farouqaminu',
                     'nenehilario': 'nene',
                     'frankmason': 'frankmasoniii',
                     'larrynance': 'larrynancejr',
                     'wesleyiwundu': 'wesiwundu',
                     'robertwilliamsiii': 'robertwilliams',
                     'harrygilesiii': 'harrygiles',
                     'jamesennis': 'jamesennisiii',
                     'mohamedbamba': 'mobamba',
                     'xaviertillman': 'xaviertillmansr',
                     'patrickbeverly': 'patrickbeverley',
                     'malcomdelaney': 'malcolmdelaney',
                     'marcusmorrissr': 'marcusmorris',
                     'nicolasclaxton': 'nicclaxton',
                     'juanhernangomez': 'juanchohernangomez',
                     'guillermohernangomez': 'willyhernangomez',
                     'timothéluwawu-cabarrot': 'timotheluwawu-cabarrot'}
s_df['Player_Alt'] = s_df['Player_Alt'].apply(lambda x: synergy_name_dict[x] 
                                              if x in synergy_name_dict.keys()
                                              else x)

In [26]:
df = s_df.merge(bbr_df, how='left', 
                left_on=['Player_Alt', 'Team', 'Season'],
                right_on=['player_alt_alt', 'team', 'season'])
df = df[~df['player_alt_alt'].isna()].reset_index().drop('index', axis=1)
df.shape

(1420, 114)

In [27]:
# check 
df[['Player_Alt', 'Team', 'Season']].drop_duplicates().shape

(1420, 3)

## Outlier Detection

In [28]:
from iforest import *

In [30]:
X = df[poss_weights_cols].values

In [36]:
model = IsolationTreeEnsemble(sample_size=256)
model.fit(X)

<iforest.IsolationTreeEnsemble at 0x7fa7a5de3fa0>

In [37]:
df['anomaly_score'] = model.anomaly_score(X)
df['anomaly_score'].describe()

count    1420.000000
mean        0.433647
std         0.054215
min         0.351362
25%         0.392289
50%         0.417471
75%         0.462099
max         0.643274
Name: anomaly_score, dtype: float64

In [44]:
df[df['anomaly_score'] > 0.55].shape[0] / df.shape[0]

0.04154929577464789

In [46]:
profile_cols = ['Player', 'Team', 'Season', 'anomaly_score']
score_cond_1 = df['anomaly_score'] > 0.55
df[score_cond_1][profile_cols].sort_values('anomaly_score', ascending=False)

Unnamed: 0,Player,Team,Season,anomaly_score
270,Wayne Ellington,Miami Heat,2017,0.643274
387,Kevon Looney,Golden State Warriors,2018,0.631423
1361,Jarred Vanderbilt,Minnesota Timberwolves,2021,0.622546
128,Kyle Korver,Cleveland Cavaliers,2017,0.615032
497,Allen Crabbe,Brooklyn Nets,2018,0.614565
143,Andre Drummond,Detroit Pistons,2017,0.603059
607,JaVale McGee,Los Angeles Lakers,2019,0.598155
752,Russell Westbrook,Houston Rockets,2019,0.59403
808,JJ Redick,New Orleans Pelicans,2019,0.592445
1086,Willy Hernangomez,New Orleans Pelicans,2020,0.592186
