# Acquiring Data and Feature Engineering

In [215]:
import numpy as np
import pandas as pd
import pickle
import json

import plotly

from nba_api.stats.endpoints.playerdashptshots import PlayerDashPtShots
from nba_api.stats.endpoints.shotchartdetail import ShotChartDetail
from nba_api.stats.static import players, teams

In [216]:
# Never truncate columns when a wide DataFrame is rendered
pd.set_option('display.max_columns', None)

## Notes on Data from NBA API
- **Closest Defender Shooting Data:**
    - Is available starting with the 2013-14 season
    - Only includes data in arenas with SportVU cameras
    - 2-point field goal attempts are broken down into less than 10 feet and 10+ feet
    
- **Shot Chart Data:**
    - Mid-range shots are defined as 2-point shots greater than or equal to 8 feet

In [217]:
# Seasons available with closest defender data ('2013-14' through '2019-20'),
# written as '<start year>-<last two digits of the following year>'
seasons = [
    f'{start_year}-{(start_year + 1) % 100:02d}'
    for start_year in range(2013, 2020)
]

In [218]:
def shooting_by_seasons(seasons, df_select):
    '''
    Collect one PlayerDashPtShots result set for every requested season
    ----------
    Parameters
    ----------
    seasons: Seasons to query, one endpoint request per entry (list of str)
    df_select: Position of the wanted DataFrame in the list returned by
        get_data_frames() (int)
    ----------
    Returns
    ----------
    DataFrame with the per-season frames stacked in request order plus a
    'season' column holding the season each row came from. Row labels are
    kept from the individual frames, so they may repeat across seasons.
    '''
    per_season = [
        PlayerDashPtShots(team_id=0, player_id=0, season=season)
        .get_data_frames()[df_select]
        .assign(season=season)  # tag every row with its season
        for season in seasons
    ]
    return pd.concat(per_season)

In [219]:
# Result sets 4 and 5 of the endpoint response are used here as the
# all-distance and the 10+ feet closest-defender tables respectively
# (NOTE(review): positions inferred from the variable names — confirm against the API)
overall = shooting_by_seasons(seasons=seasons, df_select=4)
tenplus = shooting_by_seasons(seasons=seasons, df_select=5)

In [220]:
# Get all players from nba_api's static players module
players_list = players.get_players()

# Get active players dictionary {player name: player id}
# NOTE(review): the formatted name is the key, so two active players sharing
# the same first and last name would collapse into one entry — confirm this
# cannot happen for the current player list.
players_dict = {
    f'{player["last_name"]}, {player["first_name"]}': player['id']
    for player in players_list
    if player['is_active']  # truthiness test instead of comparing to True
}

print('Number of Active Players:', len(players_dict))

Number of Active Players: 519


### Cleaning and merging closest defender shooting data

In [221]:
# Keep relevant data and reorganize DataFrames for easy inspection
# The stat columns are selected with a list (double brackets): picking several
# columns from a groupby with a bare tuple is deprecated and is rejected
# outright by newer pandas releases.
group_keys = ['PLAYER_ID', 'PLAYER_NAME_LAST_FIRST', 'season', 'CLOSE_DEF_DIST_RANGE']
clean_overall = overall.groupby(group_keys)[['FG2M', 'FG2A', 'FG3M', 'FG3A']].sum().reset_index()
clean_tenplus = tenplus.groupby(group_keys)[['FG2M', 'FG2A']].sum().reset_index()

  """Entry point for launching an IPython kernel.
  


In [222]:
# Merge DataFrames
# NOTE(review): pd.merge defaults to an inner join, so a player/season/defender
# range present in only one of the two tables is dropped — confirm intended.
cd_df = (
    pd.merge(
        clean_overall,
        clean_tenplus,
        on=['PLAYER_ID', 'PLAYER_NAME_LAST_FIRST', 'season', 'CLOSE_DEF_DIST_RANGE'],
    )
    # FG2M/FG2A exist in both inputs, so the merge suffixes them:
    # _x comes from clean_overall, _y from clean_tenplus.
    # Short-range 2-pointers = overall 2-pointers minus the 10+ feet ones.
    .assign(
        shortM=lambda merged: merged['FG2M_x'] - merged['FG2M_y'],
        shortA=lambda merged: merged['FG2A_x'] - merged['FG2A_y'],
    )
    # The overall 2-point totals are no longer needed once split out
    .drop(columns=['FG2M_x', 'FG2A_x'])
)

In [223]:
# Rename columns
# API / merge-suffixed names -> short names used by the rest of the notebook.
# The *_y columns are the 10+ feet 2-point counts taken from clean_tenplus.
cd_df = cd_df.rename(columns={
    'PLAYER_ID': 'player_id',
    'PLAYER_NAME_LAST_FIRST': 'last_first',
    'CLOSE_DEF_DIST_RANGE': 'closest_def',
    'FG3M': 'threeM',
    'FG3A': 'threeA',
    'FG2M_y': 'midM',
    'FG2A_y': 'midA',
})

In [224]:
# Deal with NaN values: a missing made/attempted count is treated as zero
for count_column in ('threeM', 'threeA', 'midM', 'midA', 'shortM', 'shortA'):
    cd_df[count_column] = cd_df[count_column].fillna(0)

In [227]:
# Preview the first rows of the cleaned closest-defender table
cd_df.head()

Unnamed: 0,player_id,last_first,season,closest_def,threeM,threeA,midM,midA,shortM,shortA
0,708,"Garnett, Kevin",2013-14,0-2 Feet - Very Tight,0,0,5,15,9,31
1,708,"Garnett, Kevin",2013-14,2-4 Feet - Tight,0,0,24,67,27,46
2,708,"Garnett, Kevin",2013-14,4-6 Feet - Open,0,0,36,77,12,18
3,708,"Garnett, Kevin",2013-14,6+ Feet - Wide Open,0,3,37,80,1,2
4,708,"Garnett, Kevin",2014-15,0-2 Feet - Very Tight,0,0,4,12,13,27


### Functions to transform player shooting data

In [177]:
def player_shooting(last_first, seasons, players_dict):
    '''
    Function to get player shooting by zone and season, returns DataFrame
    ----------
    Parameters
    ----------
    last_first: Player name (str) in '{last name}, {first name}' format
    seasons: Seasons (list of str) in 'xxxx-xx' format (ex. '2013-14')
    players_dict: Dictionary for converting name (str) to player_id (int)
    ----------
    Returns
    ----------
    DataFrame stacking the first ShotChartDetail result set of every season
    that returned rows, with 'season', 'player_id' and 'last_first' columns
    added. Row labels are kept from the per-season frames and may repeat.
    ----------
    Raises
    ----------
    KeyError: last_first is not a key of players_dict
    ValueError: no requested season returned any shots (this includes an
        empty seasons list)
    '''
    player_id = players_dict[last_first]

    season_frames = []
    for season in seasons:
        response = ShotChartDetail(
            team_id=0,
            player_id=player_id,
            context_measure_simple='FGA',
            season_nullable=season,
            season_type_all_star='Regular Season',
        )
        shots = response.get_data_frames()[0]

        # Seasons without any recorded shots contribute nothing
        if shots.empty:
            continue

        # assign() returns a new frame, so the frame handed back by the
        # endpoint object is not modified
        season_frames.append(
            shots.assign(season=season, player_id=player_id, last_first=last_first)
        )

    # pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
    # same exception type with a message that names the player and seasons
    if not season_frames:
        raise ValueError(
            f'No shot chart data found for {last_first!r} in seasons {list(seasons)}'
        )

    return pd.concat(season_frames)

In [178]:
def clean_shooting_data(df):
    '''
    Function to trim shot-level data to the columns of interest and give the
    upper-case shot columns short lowercase names, returns a new DataFrame
    ----------
    Parameters
    ----------
    df: Shot-level DataFrame containing 'player_id', 'last_first', 'season'
        and the SHOT_* columns listed below (a missing column raises KeyError)
    '''
    identifier_columns = ['player_id', 'last_first', 'season']

    # Source column -> short name; insertion order fixes the output order
    short_names = {
        'SHOT_TYPE': 'shot_type',
        'SHOT_ZONE_BASIC': 'zone',
        'SHOT_ZONE_AREA': 'area',
        'SHOT_ZONE_RANGE': 'range',
        'SHOT_DISTANCE': 'distance',
        'SHOT_MADE_FLAG': 'shot_made',
    }

    # Keep relevant data, then rename; neither step touches the input frame
    selected = df[identifier_columns + list(short_names)]
    return selected.rename(columns=short_names)

In [179]:
def seasonal_breakdown(df):
    '''
    Function to convert shooting data to seasonal breakdowns
    ----------
    Parameters
    ----------
    df: Shot-level DataFrame (one row per attempt) with the columns
        'player_id', 'last_first', 'season', 'shot_type', 'zone' and
        'shot_made'. The input frame is not modified.
    ----------
    Returns
    ----------
    DataFrame indexed by (player_id, last_first, season) with attempt (A) and
    make (M) counts: threeA, threeM, midA, midM, shortA, shortM, plus totalA
    (threeA + midA + shortA)
    '''
    group_keys = ['player_id', 'last_first', 'season']
    count_columns = ['threeA', 'threeM', 'midA', 'midM', 'shortA', 'shortM']

    # Boolean masks, one per shot category, plus "shot went in"
    is_three = df['shot_type'] == '3PT Field Goal'
    is_mid = df['zone'] == 'Mid-Range'
    is_short = df['zone'].isin(['Restricted Area', 'In The Paint (Non-RA)'])
    made = df['shot_made'] == 1

    # Build the 0/1 indicator columns on a fresh frame so the caller's
    # DataFrame does not silently gain six extra columns
    indicators = df[group_keys].assign(
        threeA=is_three.astype(int),
        threeM=(is_three & made).astype(int),
        midA=is_mid.astype(int),
        midM=(is_mid & made).astype(int),
        shortA=is_short.astype(int),
        shortM=(is_short & made).astype(int),
    )

    # Select the columns with a list: tuple selection on a groupby is
    # deprecated and rejected by newer pandas releases
    totals = indicators.groupby(group_keys)[count_columns].sum()
    totals['totalA'] = totals['threeA'] + totals['midA'] + totals['shortA']
    return totals

In [181]:
# Sample player
sample_player = 'Westbrook, Russell'

# Raw shot-level rows for the sample player across every season in `seasons`
df = player_shooting(sample_player, seasons, players_dict)
# Trimmed and renamed copy of the shot log; `df` itself is left unchanged
detailed = clean_shooting_data(df)
detailed.head()

Unnamed: 0,player_id,last_first,season,shot_type,zone,area,range,distance,shot_made
0,201566,"Westbrook, Russell",2013-14,3PT Field Goal,Above the Break 3,Left Side Center(LC),24+ ft.,24,0
1,201566,"Westbrook, Russell",2013-14,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,2,1
2,201566,"Westbrook, Russell",2013-14,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,0,1
3,201566,"Westbrook, Russell",2013-14,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,0,0
4,201566,"Westbrook, Russell",2013-14,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,2,0


In [182]:
# Aggregate the cleaned shot log. seasonal_breakdown reads the renamed columns
# ('shot_type', 'zone', 'shot_made'), which only `detailed` has; the raw `df`
# returned by player_shooting still carries the upper-case SHOT_* names and
# would raise a KeyError on a fresh kernel.
simple = seasonal_breakdown(detailed)
simple



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,threeA,threeM,midA,midM,shortA,shortM,totalA
player_id,last_first,season,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
201566,"Westbrook, Russell",2013-14,214,68,223,97,354,181,791
201566,"Westbrook, Russell",2014-15,288,86,445,164,738,377,1471
201566,"Westbrook, Russell",2015-16,341,101,336,144,767,411,1444
201566,"Westbrook, Russell",2016-17,583,200,595,216,763,408,1941
201566,"Westbrook, Russell",2017-18,326,97,530,211,832,450,1688
201566,"Westbrook, Russell",2018-19,411,119,355,113,708,398,1474
201566,"Westbrook, Russell",2019-20,213,55,297,121,771,428,1281
