In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mtick
import sqlite3
import seaborn as sns
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests   
import shutil      
import datetime
from scipy.stats import norm
import os
import winsound

home_folder = 'C:\\Users\\Travis\\OneDrive\\Data Science\\Personal_Projects\\Sports\\NBA_Prediction_V3_1'
os.chdir(home_folder)

In [2]:
def replace_name_values2(filename):
        # replace values with dashes for compatibility
    filename = filename.replace('%','_')
    filename = filename.replace('=','_')
    filename = filename.replace('?','_')
    filename = filename.replace('&','_')
    filename = filename.replace('20Season_','')
    filename = filename.replace('_20Season','')
    filename = filename.replace('SeasonType_','')
    filename = filename.replace('sort_gdate_dir_-1_','')
    filename = filename.replace('SeasonYear_','')
    return filename

In [3]:
def clutch_url_to_filename(url):
    l = len('https://www.nba.com/stats/players/')
    url = url[l:]
    filename = replace_name_values2(url)
    filename = filename.replace('/', '_')
    return filename

In [4]:
def append_the_data(folder, data_prefix, filename_selector):
    # Appending data together via folder and/or file name

    path = folder
    p = os.listdir(path)
    pf = pd.DataFrame(p)


    # filter for files that contain the filename_selector
    pf_reg = pf.loc[pf[0].astype(str).str.contains(filename_selector)] 

    appended_data = []
    for file in pf_reg[0]:
        data = pd.read_csv(folder + '/' + file)
        # if "Season" a column, drop it
        if 'Season' in data.columns:
            data = data.drop(columns = ['Season'])
        
        data['season'] = file[(file.find('20')):(file.find('20'))+4]
        data['season_type'] = np.where('Regular' in file, 'Regular', 'Playoffs')
        # add prefix to columns
        data = data.add_prefix(data_prefix)
        data.columns = data.columns.str.lower()
        appended_data.append(data)
    
    appended_data = pd.concat(appended_data)
    return appended_data

In [5]:
# move files to regular season folder or playoffs folder

for f in 'data/player/clutch':
    if '.csv' in f:
        if 'Playoffs' in f:
            os.rename('data/player/clutch/' + f + '.csv', 'data/player/clutch/playoffs/' + f + '.csv')
        else:
            os.rename('data/player/clutch/' + f + '.csv', 'data/player/clutch/regular_season/' + f + '.csv')


In [6]:
# Get URLS

traditional_clutch = 'https://www.nba.com/stats/players/clutch-traditional/?Season='    
advanced_clutch = 'https://www.nba.com/stats/players/clutch-advanced/?Season='    
misc_clutch = 'https://www.nba.com/stats/players/clutch-misc/?Season=' 
scoring_clutch = 'https://www.nba.com/stats/players/clutch-scoring/?Season='
usage_clutch = 'https://www.nba.com/stats/players/clutch-usage/?Season='

clutch_stats = [traditional_clutch, advanced_clutch, misc_clutch, scoring_clutch, usage_clutch]
seasonz = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17', '2015-16', '2014-15', '2013-14']

clutch_urls = []
for s in seasonz:
        for c in clutch_stats:
                clutch_urls.append(c + s + '&SeasonType=Regular%20Season')

clutch_urls_playoffs = []
for s in seasonz:
        for c in clutch_stats:
                clutch_urls_playoffs.append(c + s + '&SeasonType=Playoffs')

In [7]:
# check urls against downloaded files

clutch_urls = pd.DataFrame(clutch_urls, columns = ['url'])
clutch_urls['filename'] = clutch_urls.apply(lambda row: clutch_url_to_filename(row['url']), axis = 1)
# get rid of the .csv

clutch_urls.head(3)

Unnamed: 0,url,filename
0,https://www.nba.com/stats/players/clutch-tradi...,clutch-traditional__Season_2021-22_Regular
1,https://www.nba.com/stats/players/clutch-advan...,clutch-advanced__Season_2021-22_Regular
2,https://www.nba.com/stats/players/clutch-misc/...,clutch-misc__Season_2021-22_Regular


In [8]:
files_in_folder = os.listdir('data/player/clutch/regular_season')

# remove .csv
files_in_folder = [f[:-4] for f in files_in_folder]

In [9]:
# get the files that are not in the clutch folder
missing = clutch_urls[~clutch_urls['filename'].isin(files_in_folder)]
missing

Unnamed: 0,url,filename


In [10]:
if len(missing) > 0:
    driver=webdriver.Chrome()
    grab_player_clutch_stats(missing, 'data/player/clutch')
else:
    print('no missing files')


no missing files


In [11]:
# append clutch advanced data
c_adv_reg = append_the_data('data/player/clutch/regular_season', 'c_adv_', 'advanced')
c_adv_reg.to_csv('data/player/aggregates/clutch_advanced_regular_season.csv')

c_adv_playoffs = append_the_data('data/player/clutch/playoffs', 'c_adv_', 'advanced')
c_adv_playoffs.to_csv('data/player/aggregates/clutch_advanced_playoffs.csv')

# merge these dataframes
clutch_advanced = pd.concat([c_adv_reg, c_adv_playoffs])
clutch_advanced.to_csv('data/player/aggregates/clutch_advanced_AllSeasons.csv')

# append clutch traditional data
c_trad_reg = append_the_data('data/player/clutch/regular_season', 'c_trad_', 'traditional')
c_trad_reg.to_csv('data/player/aggregates/clutch_traditional_regular_season.csv')

c_trad_playoffs = append_the_data('data/player/clutch/playoffs', 'c_trad_', 'traditional')
c_trad_playoffs.to_csv('data/player/aggregates/clutch_traditional_playoffs.csv')

# merge these dataframes
clutch_traditional = pd.concat([c_trad_reg, c_trad_playoffs])
clutch_traditional.to_csv('data/player/aggregates/clutch_traditional_AllSeasons.csv')

#append clutch misc data
c_misc_reg = append_the_data('data/player/clutch/regular_season', 'c_misc_', 'misc')
c_misc_reg.to_csv('data/player/aggregates/clutch_misc_regular_season.csv')

c_misc_playoffs = append_the_data('data/player/clutch/playoffs', 'c_misc_', 'misc')
c_misc_playoffs.to_csv('data/player/aggregates/clutch_misc_playoffs.csv')

# merge these dataframes
clutch_misc = pd.concat([c_misc_reg, c_misc_playoffs])
clutch_misc.to_csv('data/player/aggregates/clutch_misc_AllSeasons.csv')

# append clutch scoring data

c_scoring_reg = append_the_data('data/player/clutch/regular_season', 'c_scoring_', 'scoring')
c_scoring_reg.to_csv('data/player/aggregates/clutch_scoring_regular_season.csv')

c_scoring_playoffs = append_the_data('data/player/clutch/playoffs', 'c_scoring_', 'scoring')
c_scoring_playoffs.to_csv('data/player/aggregates/clutch_scoring_playoffs.csv')

# merge these dataframes
clutch_scoring = pd.concat([c_scoring_reg, c_scoring_playoffs])
clutch_scoring.to_csv('data/player/aggregates/clutch_scoring_AllSeasons.csv')

# append clutch usage data
c_usage_reg = append_the_data('data/player/clutch/regular_season', 'c_usage_', 'usage')
c_usage_reg.to_csv('data/player/aggregates/clutch_usage_regular_season.csv')

c_usage_playoffs = append_the_data('data/player/clutch/playoffs', 'c_usage_', 'usage')
c_usage_playoffs.to_csv('data/player/aggregates/clutch_usage_playoffs.csv')

# merge these dataframes
clutch_usage = pd.concat([c_usage_reg, c_usage_playoffs])
clutch_usage.to_csv('data/player/aggregates/clutch_usage_AllSeasons.csv')


In [12]:
# append all clutch data
adv = pd.read_csv('data/player/aggregates/clutch_advanced_AllSeasons.csv')
trad = pd.read_csv('data/player/aggregates/clutch_traditional_AllSeasons.csv')
misc = pd.read_csv('data/player/aggregates/clutch_misc_AllSeasons.csv')
scoring = pd.read_csv('data/player/aggregates/clutch_scoring_AllSeasons.csv')
usage = pd.read_csv('data/player/aggregates/clutch_usage_AllSeasons.csv')

In [13]:
print(f' Advanced is {adv.shape}, Traditional is {trad.shape}, Misc is {misc.shape}, Scoring is {scoring.shape}, Usage is {usage.shape}')

 Advanced is (5294, 27), Traditional is (5156, 34), Misc is (5294, 24), Scoring is (5294, 27), Usage is (5294, 30)


In [14]:
all_clutch = pd.merge(adv, trad.drop_duplicates(subset = ['c_trad_player', 'c_trad_season', 'c_trad_season_type']), 
                                                left_on= ['c_adv_player', 'c_adv_season', 'c_adv_season_type'], 
                                                right_on= ['c_trad_player', 'c_trad_season', 'c_trad_season_type'], 
                                                how= 'left')

In [15]:
all_clutch = pd.merge(all_clutch, misc.drop_duplicates(subset = ['c_misc_player', 'c_misc_season', 'c_misc_season_type']), 
                                                left_on= ['c_adv_player', 'c_adv_season', 'c_adv_season_type'], 
                                                right_on= ['c_misc_player', 'c_misc_season', 'c_misc_season_type'], 
                                                how= 'left')
all_clutch = pd.merge(all_clutch, scoring.drop_duplicates(subset = ['c_scoring_player', 'c_scoring_season', 'c_scoring_season_type']), 
                                                left_on= ['c_adv_player', 'c_adv_season', 'c_adv_season_type'], 
                                                right_on= ['c_scoring_player', 'c_scoring_season', 'c_scoring_season_type'], 
                                                how= 'left')
all_clutch = pd.merge(all_clutch, usage.drop_duplicates(subset = ['c_usage_player', 'c_usage_season', 'c_usage_season_type']), 
                                                left_on= ['c_adv_player', 'c_adv_season', 'c_adv_season_type'], 
                                                right_on= ['c_usage_player', 'c_usage_season', 'c_usage_season_type'], 
                                                how= 'left')
all_clutch.to_csv('data/player/aggregates/ALL_Clutch.csv')

  all_clutch = pd.merge(all_clutch, scoring.drop_duplicates(subset = ['c_scoring_player', 'c_scoring_season', 'c_scoring_season_type']),
