In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mtick
import sqlite3
import seaborn as sns
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests   
import shutil      
import datetime
from scipy.stats import norm
import os
import winsound

# Project root that every relative 'data/...' path in this notebook is resolved against.
# The NBA_PREDICTION_HOME environment variable overrides the default so the notebook
# can run on a machine other than the original author's.
home_folder = os.environ.get(
    'NBA_PREDICTION_HOME',
    'C:\\Users\\Travis\\OneDrive\\Data Science\\Personal_Projects\\Sports\\NBA_Prediction_V3_1',
)
os.chdir(home_folder)

## Scrape Tracking Data

Note: No playoff data is available (which is fine); only regular-season pages are scraped below.

In [2]:
def replace_name_values2(filename):
    """Return ``filename`` with URL punctuation and season boilerplate removed.

    The characters ``%``, ``=``, ``?`` and ``&`` are each turned into an
    underscore, then a fixed series of query-string fragments is deleted.
    The substitutions are applied in the order listed; the later fragments
    contain the underscores produced by the earlier character replacements.
    """
    substitutions = (
        ('%', '_'),
        ('=', '_'),
        ('?', '_'),
        ('&', '_'),
        ('20Season_', ''),
        ('_20Season', ''),
        ('SeasonType_', ''),
        ('sort_gdate_dir_-1_', ''),
        ('SeasonYear_', ''),
    )
    for old, new in substitutions:
        filename = filename.replace(old, new)
    return filename

In [3]:
def trans_urls(url):
    """Turn a stats-page URL into the base file name (no extension) it is saved under.

    The first 34 characters of the URL are discarded, slashes in the remainder
    become underscores, and the result is passed through
    ``replace_name_values2`` before two leftover fragments are removed.
    """
    # 34 is the length of the 'https://www.nba.com/stats/players/' prefix used by the URLs in this notebook
    tail = str(url)[34:]
    cleaned = replace_name_values2(tail.replace('/', '_'))
    for fragment in ('SeasonYear_', '_Season_'):
        cleaned = cleaned.replace(fragment, '')
    return cleaned

In [4]:
def grab_player_tracking_stats(url_list, file_folder):
    """Scrape the stats table from every page in ``url_list`` and save each as a CSV.

    For each URL the page is opened in a Chrome session, the first option of the
    drop-down located by ``xpath`` is clicked (presumably the "All" page-size
    option, so the full table is rendered on one page -- TODO confirm), and the
    table with class ``Crom_table__p1iZz`` is parsed and written to disk.

    Parameters
    ----------
    url_list : list of str
        Page URLs to scrape. Everything after the first 34 characters of each
        URL is used to build the output file name.
    file_folder : str
        Output location. It is concatenated directly with the file name, so it
        should end with a path separator.

    Returns
    -------
    None
        Progress is reported with ``print``; pages that fail to load or contain
        no stats table are skipped.
    """
    xpath = '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select/option[1]'
    total = len(url_list)
    completed = 0

    driver = webdriver.Chrome()
    try:
        for u in url_list:
            driver.get(u)
            time.sleep(1)
            # if the page does not load, go to the next in the list
            try:
                elem = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, xpath)))
            except Exception:
                # Exception (not a bare except) so Ctrl-C can still interrupt a long scrape
                print(f'{u} did not load. Moving to next url.')
                continue

            elem.click()
            parser = BeautifulSoup(driver.page_source, "lxml")
            table = parser.find("table", attrs = {"class":"Crom_table__p1iZz"})
            if table is None:
                # The drop-down loaded but the table did not; skip instead of raising AttributeError
                print(f'{u} has no stats table. Moving to next url.')
                continue

            # get the headers, dropping repeated names while keeping first-seen order
            headerlist = list(dict.fromkeys(h.text.strip() for h in table.findAll('th')))

            # one list of cell texts per <tr>; rows without <td> cells yield an empty list
            player_stats = [[td.getText().strip() for td in row.findAll('td')] for row in table.findAll('tr')]
            stats = pd.DataFrame(player_stats, columns = headerlist)

            filename = file_folder + str(u[34:]).replace('/', '_') + '.csv'
            filename = replace_name_values2(filename)
            filename = filename.replace('_Season_','')

            # save
            stats.to_csv(filename)

            # increment counter (counts successfully saved pages only)
            completed += 1
            print(f'{filename} Completed Successfully! {completed} / {total} Complete!')
    finally:
        # Always shut the browser down, even if a page raises part-way through the list
        driver.quit()

In [5]:
def append_the_data(folder, data_prefix, filename_selector):
    """Stack every matching CSV in ``folder`` into one DataFrame.

    Parameters
    ----------
    folder : str
        Directory holding the per-season CSV files.
    data_prefix : str
        Prefix added to every column name (column names are lower-cased afterwards).
    filename_selector : str
        Pattern a file name must contain to be included. It is treated as a
        regular expression, as ``Series.str.contains`` did in the earlier version.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated in sorted file-name order. Each file's
        rows gain a ``season`` column (the first ``20xx`` year found in the file
        name, as a string) and a ``season_type`` column (``'Regular'`` when the
        file name contains "Regular", otherwise ``'Playoffs'``), both prefixed
        like every other column.

    Raises
    ------
    ValueError
        If no CSV file in ``folder`` matches ``filename_selector``.
    """
    import re

    # Only .csv files are considered; sorting makes the row order independent of the OS listing order
    matching_files = sorted(
        name for name in os.listdir(folder)
        if name.lower().endswith('.csv') and re.search(filename_selector, name)
    )
    if not matching_files:
        raise ValueError(f'No CSV files in {folder!r} match {filename_selector!r}')

    appended_data = []
    for file in matching_files:
        data = pd.read_csv(os.path.join(folder, file))
        # if "Season" a column, drop it
        if 'Season' in data.columns:
            data = data.drop(columns = ['Season'])

        # Look for a four-digit 20xx year rather than the first "20" substring,
        # which could sit earlier in the file name; fall back to '' when absent
        season_match = re.search(r'20\d{2}', file)
        data['season'] = season_match.group(0) if season_match else ''
        data['season_type'] = 'Regular' if 'Regular' in file else 'Playoffs'
        # add prefix to columns
        data = data.add_prefix(data_prefix)
        data.columns = data.columns.str.lower()
        appended_data.append(data)

    return pd.concat(appended_data)

In [7]:
# Get URLS
def get_urls(seasons=None):
    """Build the table of player tracking-stat page URLs and their target file names.

    Parameters
    ----------
    seasons : list of str, optional
        Season labels such as ``'2021-22'``. Defaults to the nine seasons
        2021-22 back to 2013-14, which reproduces the original hard-coded list.

    Returns
    -------
    pandas.DataFrame
        One row per (season, stat page) with columns ``urls`` (regular-season
        page URL) and ``filename`` (``trans_urls`` applied to that URL). Rows
        are ordered season by season, with the stat pages in the order below.
    """
    if seasons is None:
        seasons = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17', '2015-16', '2014-15', '2013-14']

    # trans_urls strips the first 34 characters of a URL, which is exactly this prefix
    stats_root = 'https://www.nba.com/stats/players/'
    stat_pages = [
        'drives',
        'defensive-impact',
        'catch-shoot',
        'passing',
        'touches',
        'pullup',
        'rebounding',
        'offensive-rebounding',
        'defensive-rebounding',
        'shooting-efficiency',
        'speed-distance',
        'elbow-touch',
        'tracking-post-ups',
        'paint-touch',
    ]

    tracking_urls = [
        stats_root + page + '/?Season=' + season + '&SeasonType=Regular%20Season'
        for season in seasons
        for page in stat_pages
    ]

    to_download = pd.DataFrame(tracking_urls, columns = ['urls'])

    # create new column with filename (Series.map also copes with an empty URL list)
    to_download['filename'] = to_download['urls'].map(trans_urls)
    return to_download

In [8]:
# Build the table of every tracking-stat URL together with the file name each one is saved under
to_download = get_urls()

In [9]:
# Check to see if the files already exist
folder = 'data/player/tracking/regular_season'
# Create the folder on a first run so os.listdir (and the later download) cannot fail on a missing path
os.makedirs(folder, exist_ok=True)
file_list = os.listdir(folder)
file_list = [f.replace('.csv', '') for f in file_list]

# `to_download` becomes a plain list at the end of this cell, so rebuild the URL table
# when the cell is re-run; otherwise the `.loc` lookup below would fail on a list
url_table = to_download if isinstance(to_download, pd.DataFrame) else get_urls()

# get list of files not yet downloaded
missing_files = url_table.loc[~url_table['filename'].isin(file_list)]
to_download = missing_files['urls'].tolist()
len(to_download)


0

In [10]:
# download the files
# Skip the call when nothing is missing so no browser window is launched for an empty list
if to_download:
    grab_player_tracking_stats(to_download, 'data/player/tracking/regular_season/')

### Update This Year

In [11]:
# URL stems for every tracking-stat page; the season and season type are appended below
stats_root = 'https://www.nba.com/stats/players/'
stat_slugs = (
    'drives',
    'defensive-impact',
    'catch-shoot',
    'passing',
    'touches',
    'pullup',
    'rebounding',
    'offensive-rebounding',
    'defensive-rebounding',
    'shooting-efficiency',
    'speed-distance',
    'elbow-touch',
    'tracking-post-ups',
    'paint-touch',
)
tracking_stats = [stats_root + slug + '/?Season=' for slug in stat_slugs]

# Keep the individual stem names available, in the same order as the list
(drives, defensive_impact, catch_n_shoot, passing, touches, pullup_shooting, rebounds,
 offensive_rebounding, defensive_rebounding, shooting_efficiency, speed_distance,
 elbow_touch, postups, paint_touches) = tracking_stats

# One regular-season URL per stat page for the season being refreshed
update_urls = [stem + '2022-23' + '&SeasonType=Regular%20Season' for stem in tracking_stats]

In [13]:
# grab_player_tracking_stats opens its own Chrome session, so no separate driver is
# created here; the one this cell used to create was never used for scraping
grab_player_tracking_stats(update_urls, 'data/player/tracking/')

data/player/tracking/drives_2022-23_Regular.csv Completed Successfully! 1 / 14 Complete!
data/player/tracking/defensive-impact_2022-23_Regular.csv Completed Successfully! 2 / 14 Complete!
data/player/tracking/catch-shoot_2022-23_Regular.csv Completed Successfully! 3 / 14 Complete!
data/player/tracking/passing_2022-23_Regular.csv Completed Successfully! 4 / 14 Complete!
data/player/tracking/touches_2022-23_Regular.csv Completed Successfully! 5 / 14 Complete!
data/player/tracking/pullup_2022-23_Regular.csv Completed Successfully! 6 / 14 Complete!
data/player/tracking/rebounding_2022-23_Regular.csv Completed Successfully! 7 / 14 Complete!
data/player/tracking/offensive-rebounding_2022-23_Regular.csv Completed Successfully! 8 / 14 Complete!
data/player/tracking/defensive-rebounding_2022-23_Regular.csv Completed Successfully! 9 / 14 Complete!
data/player/tracking/shooting-efficiency_2022-23_Regular.csv Completed Successfully! 10 / 14 Complete!
data/player/tracking/speed-distance_2022-23_Reg

In [14]:
# move files to proper folders
tracking_root = 'data/player/tracking/'
# File-name marker -> sub-folder; 'Regular' is checked first, as in the original if/elif
destinations = {'Regular': 'regular_season', 'Playoffs': 'playoffs'}

files = os.listdir(tracking_root)
for f in files:
    # Only real CSV files are moved (suffix test, not a substring test)
    if not f.endswith('.csv'):
        continue
    for marker, subfolder in destinations.items():
        if marker in f:
            target_dir = tracking_root + subfolder + '/'
            # Make sure the destination exists so shutil.move cannot fail on a fresh checkout
            os.makedirs(target_dir, exist_ok=True)
            shutil.move(tracking_root + f, target_dir + f)
            break

In [11]:
# Check Again: rebuild the full URL table so it can be compared against the files now on disk

to_download = get_urls()

In [12]:
# Check to see if the files already exist
folder = 'data/player/tracking/regular_season'
# Create the folder if needed so os.listdir cannot fail on a missing path
os.makedirs(folder, exist_ok=True)
file_list = os.listdir(folder)
file_list = [f.replace('.csv', '') for f in file_list]

# `to_download` becomes a plain list at the end of this cell, so rebuild the URL table
# when the cell is re-run; otherwise the `.loc` lookup below would fail on a list
url_table = to_download if isinstance(to_download, pd.DataFrame) else get_urls()

# get list of files not yet downloaded
missing_files = url_table.loc[~url_table['filename'].isin(file_list)]
to_download = missing_files['urls'].tolist()
len(to_download)

0

In [13]:
# Report whether the re-check above found anything still missing
status_message = 'Some files not downloaded' if len(to_download) != 0 else 'All files downloaded!'
print(status_message)

All files downloaded!


### Append

In [16]:
# Append: stack the per-season files of each stat category into one frame per category
tracking_dir = 'data/player/tracking/regular_season/'
aggregates_dir = 'data/player/aggregates/'

drives_df = append_the_data(tracking_dir, 'tracking_drives__', 'drives')
defensive_impact_df = append_the_data(tracking_dir, 'tracking_defensive_impact__', 'defensive-impact')
catch_n_shoot_df = append_the_data(tracking_dir, 'tracking_catch_n_shoot__', 'catch-shoot')
passing_df = append_the_data(tracking_dir, 'tracking_passing__', 'passing')
touches_df = append_the_data(tracking_dir, 'tracking_touches__', 'touches')
pullup_shooting_df = append_the_data(tracking_dir, 'tracking_pullup_shooting__', 'pullup')
offensive_rebounding_df = append_the_data(tracking_dir, 'tracking_offensive_rebounding__', 'offensive-rebounding')
defensive_rebounding_df = append_the_data(tracking_dir, 'tracking_defensive_rebounding__', 'defensive-rebounding')
shooting_efficiency_df = append_the_data(tracking_dir, 'tracking_shooting_efficiency__', 'shooting-efficiency')
speed_distance_df = append_the_data(tracking_dir, 'tracking_speed_distance__', 'speed-distance')
elbow_touch_df = append_the_data(tracking_dir, 'tracking_elbow_touch__', 'elbow-touch')
postups_df = append_the_data(tracking_dir, 'tracking_postups__', 'post-ups')
paint_touches_df = append_the_data(tracking_dir, 'tracking_paint_touches__', 'paint-touch')

# Save: one aggregate CSV per category, written in the same order the frames were built
frames_to_save = {
    'tracking_drives': drives_df,
    'tracking_defensive_impact': defensive_impact_df,
    'tracking_catch_n_shoot': catch_n_shoot_df,
    'tracking_passing': passing_df,
    'tracking_touches': touches_df,
    'tracking_pullup_shooting': pullup_shooting_df,
    'tracking_offensive_rebounding': offensive_rebounding_df,
    'tracking_defensive_rebounding': defensive_rebounding_df,
    'tracking_shooting_efficiency': shooting_efficiency_df,
    'tracking_speed_distance': speed_distance_df,
    'tracking_elbow_touch': elbow_touch_df,
    'tracking_postups': postups_df,
    'tracking_paint_touches': paint_touches_df,
}
for file_stem, frame in frames_to_save.items():
    frame.to_csv(aggregates_dir + file_stem + '.csv')

print(f' drives: {drives_df.shape}, defensive impact: {defensive_impact_df.shape}, catch n shoot: {catch_n_shoot_df.shape}, passing: {passing_df.shape}, touches: {touches_df.shape}, pullup shooting: {pullup_shooting_df.shape}, offensive rebounding: {offensive_rebounding_df.shape}, defensive rebounding: {defensive_rebounding_df.shape}, shooting efficiency: {shooting_efficiency_df.shape}, speed distance: {speed_distance_df.shape}, elbow touch: {elbow_touch_df.shape}, postups: {postups_df.shape}, paint touches: {paint_touches_df.shape}')

 drives: (5179, 26), defensive impact: (5179, 15), catch n shoot: (5179, 15), passing: (5179, 18), touches: (5179, 22), pullup shooting: (5179, 17), offensive rebounding: (5179, 17), defensive rebounding: (5179, 17), shooting efficiency: (5179, 23), speed distance: (5179, 16), elbow touch: (5785, 27), postups: (5179, 27), paint touches: (5179, 27)


In [17]:
# Start the combined table: drives on the left, defensive impact joined on player/team/season/season type
all_tracking_data = drives_df.merge(
    defensive_impact_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_defensive_impact__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

print(f' drives: {drives_df.shape}, defensive impact: {defensive_impact_df.shape}, all_tracking_data: {all_tracking_data.shape}')

 drives: (5179, 26), defensive impact: (5179, 15), all_tracking_data: (5179, 41)


In [18]:
# add catch n shoot (left join keyed on the drives player/team/season/season type columns)
all_tracking_data2 = all_tracking_data.merge(
    catch_n_shoot_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_catch_n_shoot__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

print(f' drives: {drives_df.shape}, defensive impact: {defensive_impact_df.shape}, catch n shoot: {catch_n_shoot_df.shape}, all_tracking_data2: {all_tracking_data2.shape}')

 drives: (5179, 26), defensive impact: (5179, 15), catch n shoot: (5179, 15), all_tracking_data2: (5179, 56)


In [19]:
# add passing (left join keyed on the drives player/team/season/season type columns)
all_tracking_data3 = all_tracking_data2.merge(
    passing_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_passing__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

print(f' drives: {drives_df.shape}, defensive impact: {defensive_impact_df.shape}, catch n shoot: {catch_n_shoot_df.shape}, passing: {passing_df.shape}, all_tracking_data3: {all_tracking_data3.shape}')

 drives: (5179, 26), defensive impact: (5179, 15), catch n shoot: (5179, 15), passing: (5179, 18), all_tracking_data3: (5179, 74)


In [20]:
# add touches (left join keyed on the drives player/team/season/season type columns)
all_tracking_data4 = all_tracking_data3.merge(
    touches_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_touches__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

In [21]:
# add pullup shooting (left join keyed on the drives player/team/season/season type columns)
all_tracking_data5 = all_tracking_data4.merge(
    pullup_shooting_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_pullup_shooting__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

In [22]:
# add offensive rebounding (left join keyed on the drives player/team/season/season type columns)
all_tracking_data6 = all_tracking_data5.merge(
    offensive_rebounding_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_offensive_rebounding__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

In [23]:
# add defensive rebounding (left join keyed on the drives player/team/season/season type columns)
all_tracking_data7 = all_tracking_data6.merge(
    defensive_rebounding_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_defensive_rebounding__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

In [24]:
# add shooting efficiency (left join keyed on the drives player/team/season/season type columns)
all_tracking_data8 = all_tracking_data7.merge(
    shooting_efficiency_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_shooting_efficiency__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

In [25]:
# add speed distance (left join keyed on the drives player/team/season/season type columns)
all_tracking_data9 = all_tracking_data8.merge(
    speed_distance_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_speed_distance__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

In [26]:
# add elbow touch
# This cell previously merged catch_n_shoot_df a second time (it is already joined in
# all_tracking_data2), which duplicated those columns under _x/_y suffixes and left
# elbow_touch_df out of the combined table altogether.
# NOTE(review): anything downstream that relied on the suffixed catch-and-shoot column
# names will now see the unsuffixed names again -- confirm.
elbow_touch_keys = ['tracking_elbow_touch__player', 'tracking_elbow_touch__team', 'tracking_elbow_touch__season', 'tracking_elbow_touch__season_type']
all_tracking_data10 = pd.merge(all_tracking_data9,
                                # The shapes printed earlier show the elbow-touch frame with more rows than the
                                # other categories, so its keys may repeat (TODO confirm); keep one row per key
                                # so this left join cannot multiply rows.
                                elbow_touch_df.drop_duplicates(subset = elbow_touch_keys),
                                left_on = ['tracking_drives__player', 'tracking_drives__team' ,'tracking_drives__season', 'tracking_drives__season_type'],
                                right_on = elbow_touch_keys,
                                how = 'left')

In [27]:
# add paint touches (left join keyed on the drives player/team/season/season type columns)
all_playtype_data11 = all_tracking_data10.merge(
    paint_touches_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_paint_touches__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

In [28]:
# add post-ups (left join keyed on the drives player/team/season/season type columns)
all_playtype_data_final = all_playtype_data11.merge(
    postups_df,
    how='left',
    left_on=[f'tracking_drives__{key}' for key in ('player', 'team', 'season', 'season_type')],
    right_on=[f'tracking_postups__{key}' for key in ('player', 'team', 'season', 'season_type')],
)

In [29]:
# Write the fully merged tracking table to disk, leaving out the DataFrame index
all_playtype_data_final.to_csv('data/player/aggregates/All_tracking_data.csv', index = False)