# Transforming NBA Shooting Data by Player

In [243]:
import numpy as np
import pandas as pd
import pickle

import plotly.graph_objects as go
import plotly.express as px

from nba_api.stats.endpoints.playerdashptshots import PlayerDashPtShots
from nba_api.stats.endpoints.playerprofilev2 import PlayerProfileV2

from functions import *

## Transforming NBA API Data
- Engineer points per shot (PPS) and FG2 < 10ft columns
- Add games played and team(s) columns to player (by season)

In [8]:
seasons = [
    '2013-14',
    '2014-15',
    '2015-16',
    '2016-17',
    '2017-18',
    '2018-19',
    '2019-20',
]

In [151]:
overall = shooting_by_seasons(seasons, 4)
tenplus = shooting_by_seasons(seasons, 5)

In [168]:
with open('overall.p', 'rb') as f:
    overall = pickle.load(f)

with open('tenplus.p', 'rb') as f:
    tenplus = pickle.load(f)

In [172]:
# Rename columns
overall.rename(
    columns={
        'PLAYER_ID': 'player_id',
        'PLAYER_NAME_LAST_FIRST': 'last_first',
        'CLOSE_DEF_DIST_RANGE': 'closest_def',
        'FG3M': 'threeM',
        'FG3A': 'threeA',
    },
    inplace=True
)

tenplus.rename(
    columns={
        'PLAYER_ID': 'player_id',
        'PLAYER_NAME_LAST_FIRST': 'last_first',
        'CLOSE_DEF_DIST_RANGE': 'closest_def',
        'FG2>=10M': 'midM',
        'FG2>=10A': 'midA',
    },
    inplace=True
)

In [174]:
# Keep relevant data
overall = overall[['player_id', 'last_first', 'season', 'closest_def', 'FG2M', 'FG2A', 'threeM', 'threeA']]
tenplus = tenplus[['player_id', 'last_first', 'season', 'closest_def', 'midM', 'midA']]

In [177]:
df = pd.merge(overall, tenplus, how='left', on=['player_id', 'last_first', 'season', 'closest_def'])

In [179]:
# Deal with NaN values
df['midM'] = np.where(df['midM'].isna(), 0, df['midM'])
df['midA'] = np.where(df['midA'].isna(), 0, df['midA'])

In [181]:
# Create FG2<10 columns
df['shortM'] = df['FG2M'] - df['midM']
df['shortA'] = df['FG2A'] - df['midA']

# Drop old columns
df.drop(columns=['FG2M', 'FG2A'], inplace=True)

In [183]:
# Create PPS columns
df['threePPS'] = 3 * df['threeM'] / df['threeA']
df['midPPS'] = 2 * df['midM'] / df['midA']
df['shortPPS'] = 2 * df['shortM'] / df['shortA']

In [185]:
# Deal with NaN values
df['threePPS'] = np.where(df['threePPS'].isna(), 0, df['threePPS'])
df['midPPS'] = np.where(df['midPPS'].isna(), 0, df['midPPS'])
df['shortPPS'] = np.where(df['shortPPS'].isna(), 0, df['shortPPS'])

In [162]:
# All players in seasons
players = list(df['player_id'].unique())

In [101]:
def _split_teams(team):
    l = len(team)
    splits = [ (3*i, 3*i + 3) for i in range(int(l/3)) ]
    teams = [ team[split[0]:split[1]] for split in splits ]
    return teams

In [132]:
# Helper function to get teams and games played by season for players
def _get_team_gp(player_id):
    temp = PlayerProfileV2(player_id).get_data_frames()[0]
    temp = temp.loc[
        (temp['SEASON_ID'].isin(seasons))& # Seasons with shooting data
        (temp['TEAM_ABBREVIATION'] != 'TOT')& # Remove season totals (total data not complete)
        (temp['LEAGUE_ID'] == '00') # Only keep NBA teams
    ]
    teams = temp.groupby('SEASON_ID')['TEAM_ABBREVIATION'].sum()
    teams = teams.map(_split_teams)
    gp = temp.groupby('SEASON_ID')['GP'].sum()
    temp = pd.concat([teams, gp], axis=1).reset_index()
    temp['player_id'] = player_id
    temp.rename(columns={'SEASON_ID': 'season', 'TEAM_ABBREVIATION': 'team_ab'}, inplace=True)
    return temp

In [134]:
gp_list = [ _get_team_gp(player) for player in players ]

In [137]:
temp_df = pd.concat(gp_list)

In [138]:
with open('temp_df.p', 'wb') as f:
    pickle.dump(temp_df, f)

In [187]:
team_gp = temp_df.set_index(['season', 'player_id']).to_dict()

In [218]:
# Create season, player_id tuple
temp = df.apply(lambda x: (x['season'], x['player_id']), axis=1)

# Map teams and games played from tuple
df['teams'] = temp.map(team_gp['team_ab'])
df['GP'] = temp.map(team_gp['GP'])

## Shooting Analysis by Player

In [358]:
# Group by player (across 2013-14 to 2019-20 seasons)
by_player = df.groupby('last_first')['threeM', 'threeA', 'midM', 'midA', 'shortM', 'shortA'].sum()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [359]:
# Add PPS
by_player['threePPS'] = 3 * by_player['threeM'] / by_player['threeA']
by_player['midPPS'] = 2 * by_player['midM'] / by_player['midA']
by_player['shortPPS'] = 2 * by_player['shortM'] / by_player['shortA']

In [360]:
# Deal with NaN values
by_player['threePPS'] = np.where(by_player['threePPS'].isna(), 0, by_player['threePPS'])
by_player['midPPS'] = np.where(by_player['midPPS'].isna(), 0, by_player['midPPS'])
by_player['shortPPS'] = np.where(by_player['shortPPS'].isna(), 0, by_player['shortPPS'])

In [364]:
by_player.reset_index(inplace=True)

In [366]:
temp = df.groupby(['last_first', 'season'])['GP'].mean().reset_index()
temp = temp.groupby('last_first')['GP'].sum().reset_index()
by_player = pd.merge(by_player, temp, on='last_first')

In [368]:
# Add per game attempts and makes
by_player['threeA/G'] = by_player['threeA'] / by_player['GP']
by_player['threeM/G'] = by_player['threeM'] / by_player['GP']
by_player['midA/G'] = by_player['midA'] / by_player['GP']
by_player['midM/G'] = by_player['midM'] / by_player['GP']
by_player['shortA/G'] = by_player['shortA'] / by_player['GP']
by_player['shortM/G'] = by_player['shortM'] / by_player['GP']
by_player.reset_index(inplace=True)

In [370]:
by_player.loc[by_player['last_first'] == 'Westbrook, Russell']

Unnamed: 0,index,last_first,threeM,threeA,midM,midA,shortM,shortA,threePPS,midPPS,shortPPS,GP,threeA/G,threeM/G,midA/G,midM/G,shortA/G,shortM/G
1025,1025,"Westbrook, Russell",722,2356,1309.0,3313.0,2381.0,3960.0,0.919355,0.79022,1.202525,484.0,4.867769,1.491736,6.845041,2.704545,8.181818,4.919421


In [378]:
temp = overall.groupby('last_first')['FG2A', 'threeA'].sum().reset_index()
temp.loc[temp['last_first'] == 'Westbrook, Russell']


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,last_first,FG2A,threeA
1025,"Westbrook, Russell",7273,2356


In [371]:
46 + 67 + 80 + 81 + 80 + 73 + 57

484

In [372]:
791 + 1471 + 1444 + 1941 + 1687 + 1473 + 1281

10088

In [373]:
2356 + 3313 + 3960

9629

In [347]:
df.loc[df['last_first'] == 'Durant, Kevin']

Unnamed: 0,player_id,last_first,season,closest_def,threeM,threeA,midM,midA,shortM,shortA,threePPS,midPPS,shortPPS,teams,GP
19,201142,"Durant, Kevin",2013-14,0-2 Feet - Very Tight,7,29,49.0,102.0,132.0,204.0,0.724138,0.960784,1.294118,[OKC],81
481,201142,"Durant, Kevin",2013-14,2-4 Feet - Tight,58,157,137.0,334.0,159.0,238.0,1.10828,0.820359,1.336134,[OKC],81
945,201142,"Durant, Kevin",2013-14,4-6 Feet - Open,90,226,73.0,147.0,69.0,79.0,1.19469,0.993197,1.746835,[OKC],81
1660,201142,"Durant, Kevin",2013-14,6+ Feet - Wide Open,37,75,13.0,30.0,25.0,25.0,1.48,0.866667,2.0,[OKC],81
2201,201142,"Durant, Kevin",2014-15,0-2 Feet - Very Tight,0,5,7.0,23.0,29.0,49.0,0.0,0.608696,1.183673,[OKC],27
2574,201142,"Durant, Kevin",2014-15,2-4 Feet - Tight,24,63,45.0,96.0,37.0,50.0,1.142857,0.9375,1.48,[OKC],27
3300,201142,"Durant, Kevin",2014-15,4-6 Feet - Open,24,64,28.0,45.0,14.0,17.0,1.125,1.244444,1.647059,[OKC],27
3786,201142,"Durant, Kevin",2014-15,6+ Feet - Wide Open,16,26,4.0,10.0,10.0,11.0,1.846154,0.8,1.818182,[OKC],27
3944,201142,"Durant, Kevin",2015-16,0-2 Feet - Very Tight,6,22,25.0,53.0,74.0,112.0,0.818182,0.943396,1.321429,[OKC],72
4249,201142,"Durant, Kevin",2015-16,2-4 Feet - Tight,63,167,160.0,315.0,121.0,172.0,1.131737,1.015873,1.406977,[OKC],72


In [231]:
# Over the course of 2013-14 to 2019-20 seasons, players with better midrange PPS than 3-pt PPS
mid_over_three = by_player.loc[by_player['threePPS'] < by_player['midPPS']].index

In [237]:
by_season = df.groupby(['last_first', 'season', 'GP'])['threeM', 'threeA', 'midM', 'midA', 'shortM', 'shortA'].sum().reset_index()

  """Entry point for launching an IPython kernel.


In [333]:
def shot_by_sea(df, player):
    if player == 'overall':
        temp = df.groupby('season')['threeM', 'threeA', 'midM', 'midA', 'shortM', 'shortA'].sum().reset_index()
        temp['totalA'] = temp['threeA'] + temp['midA'] + temp['shortA']
        temp['threePCT'] = temp['threeA'] / temp['totalA'] * 100
        temp['midPCT'] = temp['midA'] / temp['totalA'] * 100
        temp['shortPCT'] = temp['shortA'] / temp['totalA'] * 100
        
    else:
        temp = df.loc[df['last_first'] == player].copy()
        temp['threeA/G'] = temp['threeA'] / temp['GP']
        temp['midA/G'] = temp['midA'] / temp['GP']
        temp['shortA/G'] = temp['shortA'] / temp['GP']
        temp['totalA/G'] = temp['threeA/G'] + temp['midA/G'] + temp['shortA/G']
        temp['threePCT'] = temp['threeA/G'] / temp['totalA/G'] * 100
        temp['midPCT'] = temp['midA/G'] / temp['totalA/G'] * 100
        temp['shortPCT'] = temp['shortA/G'] / temp['totalA/G'] * 100
    
    return temp
    
def plot_shot_sel(df, player):
    temp = pd.melt(
        df,
        id_vars='season',
        value_vars=['threeA/G', 'midA/G', 'shortA/G'],
        var_name='shot_type',
        value_name='A/G'
    )
    player = player.split()
    player[0] = player[0][:-1]
    player = player[1] + ' ' + player[0]
    fig = px.line(data_frame=temp, x='season', y='A/G', color='shot_type')
    fig.update_layout(title=f'{player} Shot Attempts by Season')
    
    return fig

def plot_shot_share(df, player):
    temp = pd.melt(
        df,
        id_vars='season',
        value_vars=['threePCT', 'midPCT', 'shortPCT'],
        var_name='shot_type',
        value_name='PCT'
    )
    
    if player != 'overall':
        player = player.split()
        player[0] = player[0][:-1]
        player = player[1] + ' ' + player[0]
        
    else:
        player = 'Overall'
        
    fig = px.line(data_frame=temp, x='season', y='PCT', color='shot_type')
    fig.update_layout(title=f'{player} Shot Shares by Season')
    
    return fig

In [334]:
player = mid_over_three[14]
player_df = shot_by_sea(by_season, 'overall')
player_df


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,season,threeM,threeA,midM,midA,shortM,shortA,totalA,threePCT,midPCT,shortPCT
0,2013-14,18962,52346,24150.0,59244.0,49281.0,80193.0,191783.0,27.29439,30.891163,41.814447
1,2014-15,19247,54588,23768.0,58168.0,49023.0,80493.0,193249.0,28.247494,30.100026,41.652479
2,2015-16,20914,58751,23183.0,56308.0,49795.0,80430.0,195489.0,30.053353,28.803667,41.14298
3,2016-17,23656,65747,21678.0,52033.0,50338.0,79900.0,197680.0,33.259308,26.321833,40.418859
4,2017-18,25585,70209,19505.0,46934.0,51546.0,81049.0,198192.0,35.42474,23.681077,40.894183
5,2018-19,27822,78395,17583.0,43194.0,55116.0,96689.0,218278.0,35.915209,19.788527,44.296264
6,2019-20,25671,71711,14140.0,34174.0,46046.0,80560.0,186445.0,38.462281,18.329266,43.208453


In [331]:
plot_shot_sel(player_df, player)

KeyError: "The following 'value_vars' are not present in the DataFrame: ['midA/G', 'shortA/G', 'threeA/G']"

In [335]:
plot_shot_share(player_df, 'overall')