# Scrape General Player Stats

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mtick
import sqlite3
import seaborn as sns
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests   
import shutil      
import datetime
from scipy.stats import norm
import os
import winsound

home_folder = 'C:\\Users\\Travis\\OneDrive\\Data Science\\Personal_Projects\\Sports\\NBA_Prediction_V3_1'
os.chdir(home_folder)

## Functions

In [2]:
def replace_name_values(filename):
        # replace values with dashes for compatibility
    filename = filename.replace('%','_')
    filename = filename.replace('=','_')
    filename = filename.replace('?','_')
    filename = filename.replace('&','_')
    filename = filename.replace('20Season_','')
    filename = filename.replace('20Season','')
    filename = filename.replace('SeasonType_','')
    return filename

In [3]:
def grab_player_data(url_list, file_folder):    
        
        # Scrape Season-Level player data from the url_list

        i = 0
        for u in url_list:
                
                driver.get(u)
                time.sleep(2)

                # if the page does not load, go to the next in the list
                try:
                        xpath = '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select/option[1]'
                        elem = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, xpath)))
                except:
                        print(f'{u} did not load. Moving to next url.')
                        continue

                # click "all pages"
                xpath_all = '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select/option[1]' 
                elem = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, xpath_all)))
                
                driver.find_element(by=By.XPATH, value=xpath_all).click()
                src = driver.page_source
                parser = BeautifulSoup(src, "lxml")
                table = parser.find("table", attrs = {"class":"Crom_table__p1iZz"})
                headers = table.findAll('th')
                headerlist = [h.text.strip() for h in headers[0:]] 
                row_names = table.findAll('a')                             # find rows
                row_list = [b.text.strip() for b in row_names[0:]] 
                rows = table.findAll('tr')[0:]
                player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[0:]] for i in range(len(rows))]
                tot_cols = len(player_stats[1])                           #set the length to ignore hidden columns
                headerlist = headerlist[:tot_cols]   
                stats = pd.DataFrame(player_stats, columns = headerlist)

                # assign filename
                filename = file_folder + str(u[34:]).replace('/', '_') + '.csv'
                filename = replace_name_values(filename)
                pd.DataFrame.to_csv(stats, filename)
                i += 1
                lu = len(url_list)
                # close driver
                print(f'{filename} Completed Successfully! {i} / {lu} Complete!')

        winsound.Beep(523, 500)

In [4]:
def append_the_data(folder, data_prefix, filename_selector):
    
    # Appending data together via folder and/or file name

    path = folder
    p = os.listdir(path)
    pf = pd.DataFrame(p)


    # filter for files that contain the filename_selector
    pf_reg = pf.loc[pf[0].astype(str).str.contains(filename_selector)] 

    appended_data = []
    for file in pf_reg[0]:
        data = pd.read_csv(folder + '/' + file)
        # if "Season" a column, drop it
        if 'Season' in data.columns:
            data = data.drop(columns = ['Season'])
        
        data['season'] = file[(file.find('20')):(file.find('20'))+4]
        data['season_type'] = np.where('Regular' in file, 'Regular', 'Playoffs')
        # add prefix to columns
        data = data.add_prefix(data_prefix)
        data.columns = data.columns.str.lower()
        appended_data.append(data)
    
    appended_data = pd.concat(appended_data)
    return appended_data

In [5]:
# If there are files in the general folder, move them to playoffs or reg_season
gen_files = os.listdir('data/player/general')
gen_files = [f for f in gen_files if f.endswith('.csv')]
if len(gen_files) > 0:
    for f in gen_files:
        if 'Playoffs' in f:
            os.rename('data/player/general/' + f, 'data/player/general/playoffs/' + f)
        else:
            os.rename('data/player/general/' + f, 'data/player/general/regular_season/' + f)

## Urls

In [6]:
# This gets a list of all the urls for the player general stats for PREVIOUS YEARS

years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17', 
            '2015-16','2014-15', '2013-14', '2012-13', '2011-12']
stat_types = ['traditional', 'advanced', 'misc', 'scoring', 'usage','opponent', 'defense']
season_types = ['Playoffs', 'Regular%20Season']

player_general_urls = []

for year in years:
    for stattype in stat_types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/players/'+ stattype +'?SeasonType=' + s_types + '&Season=' + year 
            player_general_urls.append(url)

In [7]:
# add the 2022-23 season to the list of urls

year = '2022-23'
season_type = 'Regular%20Season'
for stattype in stat_types:
    url = 'https://www.nba.com/stats/players/'+ stattype +'?SeasonType=' + season_type + '&Season=' + year 
    player_general_urls.append(url)


In [8]:
# translate urls to naming convention
def trans_urls(url):
    new_url = str(url)[34:].replace('/', '_')
    filename = replace_name_values(new_url)
    return filename

In [9]:
# ID Missing Files

pg_url = pd.DataFrame(player_general_urls, columns = ['url'])

# apply lambda function to get filename
pg_url['filename'] = pg_url.apply(lambda row: trans_urls(row['url']), axis=1)

# Get files in folder
folder1 = os.listdir('data/player/general/regular_season/')
folder2 = os.listdir('data/player/general/playoffs/')
folder = folder1 + folder2
folder = [x.replace('.csv', '') for x in folder]

# get list of files that need to be downloaded, the files that are not in the folder
to_download = pg_url[~pg_url['filename'].isin(folder)]
to_download

Unnamed: 0,url,filename


In [10]:
driver = webdriver.Chrome()

In [11]:
# if there are files to download, download them

# turn url to list
to_download_list = to_download['url'].tolist()

if len(to_download_list) > 0:
    grab_player_data(to_download_list, 'data/player/general/')
else:
    print('No files to download')

No files to download


### Add This Year

In [12]:
# add the 2022-23 season to the list of urls
this_year = []

year = '2022-23'
season_type = 'Regular%20Season'
for stattype in stat_types:
    url = 'https://www.nba.com/stats/players/'+ stattype +'?SeasonType=' + season_type + '&Season=' + year 
    this_year.append(url)


In [13]:
driver = webdriver.Chrome()
grab_player_data(this_year, 'data/player/general/')

data/player/general/traditional_Regular_Season_2022-23.csv Completed Successfully! 1 / 7 Complete!
data/player/general/advanced_Regular_Season_2022-23.csv Completed Successfully! 2 / 7 Complete!
data/player/general/misc_Regular_Season_2022-23.csv Completed Successfully! 3 / 7 Complete!
data/player/general/scoring_Regular_Season_2022-23.csv Completed Successfully! 4 / 7 Complete!
data/player/general/usage_Regular_Season_2022-23.csv Completed Successfully! 5 / 7 Complete!
data/player/general/opponent_Regular_Season_2022-23.csv Completed Successfully! 6 / 7 Complete!
data/player/general/defense_Regular_Season_2022-23.csv Completed Successfully! 7 / 7 Complete!


### Append Data

In [14]:
trad_data = append_the_data('data/player/general/regular_season/', 'trad_', 'traditional')
print(f' data shape: {trad_data.shape}')
trad_data.head(3)

 data shape: (6734, 33)


Unnamed: 0,trad_unnamed: 0,trad_unnamed: 1,trad_player,trad_team,trad_age,trad_gp,trad_w,trad_l,trad_min,trad_pts,...,trad_tov,trad_stl,trad_blk,trad_pf,trad_fp,trad_dd2,trad_td3,trad_+/-,trad_season,trad_season_type
0,0,,,,,,,,,,...,,,,,,,,,2011,Regular
1,1,1.0,Kevin Durant,OKC,23.0,66.0,47.0,19.0,38.6,28.0,...,3.8,1.3,1.2,2.0,46.6,18.0,0.0,5.6,2011,Regular
2,2,2.0,Kobe Bryant,LAL,33.0,58.0,36.0,22.0,38.5,27.9,...,3.5,1.2,0.3,1.8,42.1,3.0,0.0,2.4,2011,Regular


In [15]:
trad_data.to_csv('data/player/aggregates/player_general_traditional_seasonview.csv')

In [16]:
adv_data = append_the_data('data/player/general/regular_season/', 'adv_', 'advanced')
print(f' data shape: {adv_data.shape}')

 data shape: (6128, 27)


In [17]:
adv_data.to_csv('data/player/aggregates/player_general_advanced_seasonview.csv')

#### Defense stats

In [18]:
def_data = append_the_data('data/player/general/regular_season/', 'def_', 'defense')

In [19]:
def_data.to_csv('data/player/aggregates/player_general_defense_aggregates.csv')

#### Scoring Stats

In [20]:
scoring_data = append_the_data('data/player/general/regular_season/', 'scor_', 'scoring')

In [21]:
scoring_data.to_csv('data/player/aggregates/player_general_scoring_aggregates.csv')

#### Usage Stats

In [22]:
usage_data = append_the_data('data/player/general/regular_season/', 'usage_', 'usage')
usage_data.to_csv('data/player/aggregates/player_general_usage_aggregates.csv')

#### Opponent Stats

In [23]:
opponent_data = append_the_data('data/player/general/regular_season/', 'opp_', 'opponent')
opponent_data.to_csv('data/player/aggregates/player_general_opponent_aggregates.csv')

#### Misc stats

In [24]:
misc_data = append_the_data('data/player/general/regular_season/', 'misc_', 'misc')
misc_data.to_csv('data/player/aggregates/player_general_misc_aggregates.csv')

### Merge All General Stats in one File

In [25]:
trad_data

Unnamed: 0,trad_unnamed: 0,trad_unnamed: 1,trad_player,trad_team,trad_age,trad_gp,trad_w,trad_l,trad_min,trad_pts,...,trad_tov,trad_stl,trad_blk,trad_pf,trad_fp,trad_dd2,trad_td3,trad_+/-,trad_season,trad_season_type
0,0,,,,,,,,,,...,,,,,,,,,2011,Regular
1,1,1.0,Kevin Durant,OKC,23.0,66.0,47.0,19.0,38.6,28.0,...,3.8,1.3,1.2,2.0,46.6,18.0,0.0,5.6,2011,Regular
2,2,2.0,Kobe Bryant,LAL,33.0,58.0,36.0,22.0,38.5,27.9,...,3.5,1.2,0.3,1.8,42.1,3.0,0.0,2.4,2011,Regular
3,3,3.0,LeBron James,MIA,27.0,62.0,45.0,17.0,37.5,27.1,...,3.4,1.9,0.8,1.5,50.6,23.0,0.0,7.6,2011,Regular
4,4,4.0,Kevin Love,MIN,23.0,55.0,24.0,31.0,39.0,26.0,...,2.3,0.9,0.5,2.8,46.8,48.0,0.0,0.5,2011,Regular
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,601,582.0,Nate Hinton,IND,23.0,2.0,0.0,2.0,1.2,0.0,...,0.5,0.0,0.0,0.0,-0.5,0.0,0.0,2.0,2021,Regular
602,602,582.0,Sam Dekker,TOR,28.0,1.0,1.0,0.0,0.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,2021,Regular
603,603,582.0,Trayvon Palmer,DET,27.0,1.0,0.0,1.0,16.8,0.0,...,1.0,0.0,0.0,2.0,1.4,0.0,0.0,-12.0,2021,Regular
604,604,582.0,Tyler Hall,NYK,25.0,1.0,1.0,0.0,1.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,2021,Regular


In [26]:
trad_data['trad_season'] = trad_data['trad_season'].astype(np.int64)

In [27]:
adv_data.adv_season = adv_data.adv_season.astype(np.int64)

In [28]:
# merge advanced and traditional data
all_gen_data = pd.merge(adv_data, trad_data.drop_duplicates(subset = ['trad_player','trad_season', 'trad_season_type']),
                 left_on= ['adv_player','adv_season', 'adv_season_type'], 
                 right_on= ['trad_player','trad_season', 'trad_season_type'], 
                 how = 'left')

In [29]:
def_data['def_season'] = def_data['def_season'].astype(np.int64)

In [30]:
# merge defense data with adv&trad data

all_gen_data = pd.merge(all_gen_data, def_data.drop_duplicates(subset = ['def_player','def_season', 'def_season_type']), 
    left_on= ['adv_player','adv_season', 'adv_season_type'], 
    right_on= ['def_player','def_season', 'def_season_type'], 
    how = 'left')

In [31]:
scoring_data['scor_season'] = scoring_data['scor_season'].astype(np.int64)

In [32]:
# merge scoring data with adv&trad&def data

all_gen_data = pd.merge(all_gen_data, scoring_data.drop_duplicates(subset = ['scor_player','scor_season', 'scor_season_type']), 
    left_on= ['adv_player','adv_season', 'adv_season_type'], 
    right_on= ['scor_player','scor_season', 'scor_season_type'], 
    how = 'left')

In [33]:
usage_data['usage_season'] = usage_data['usage_season'].astype(np.int64)

In [34]:
# merge usage data with adv&trad&def&scoring data

all_gen_data = pd.merge(all_gen_data, usage_data.drop_duplicates(subset = ['usage_player','usage_season', 'usage_season_type'] ), 
    left_on= ['adv_player','adv_season', 'adv_season_type'], 
    right_on= ['usage_player','usage_season', 'usage_season_type'], 
    how = 'left')

In [35]:
all_gen_data.to_csv('data/player/aggregates/player_general_all_aggregates_seasonview.csv')