# Part 1: Scraping

Can we predict who will win an NBA game, given only information before the game has started?

## The Plan
- Part 1: Scraping NBA.com Data
- Part 2: Scraping team & player images
- Part 3: Get all Game-by-Game data
- Part 4: Initial Model(s)
- Part 5: Add player-level data
- Part 6: More Models
- Part 7: Conclusion

In [8]:
#Load
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mtick
import sqlite3
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import requests     # to get images
import shutil       # to save files locally
import datetime
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import os
import winsound
# NBA API
#from nba_api.stats.static import players
#from nba_api.stats.endpoints import commonplayerinfo
#from nba_api.stats.static import teams

In [13]:
### Define Player Scraping Function
def grab_player_data(url_list, file_folder):
        driver = webdriver.Chrome()
        i = 0
        for u in url_list:
                driver.get(u)
                time.sleep(7)
                xpath_all = '/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select/option[1]' # click "all pages"
                driver.find_element(by=By.XPATH, value=xpath_all).click()
                time.sleep(7)
                src = driver.page_source
                parser = BeautifulSoup(src, "lxml")
                table = parser.find("div", attrs = {"class":"nba-stat-table__overflow"})
                headers = table.findAll('th')
                headerlist = [h.text.strip() for h in headers[0:]] 
                headerlist = [a for a in headerlist if not 'RANK' in a]
                row_names = table.findAll('a')                             # find rows
                row_list = [b.text.strip() for b in row_names[0:]] 
                rows = table.findAll('tr')[0:]
                player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[0:]] for i in range(len(rows))]
                tot_cols = len(player_stats[1])                           #set the length to ignore hidden columns
                headerlist = headerlist[:tot_cols]   
                stats = pd.DataFrame(player_stats, columns = headerlist)
                filename = file_folder + str(u[34:]).replace('/', '_') + '.xlsx'
                filename = filename.replace('%','_')
                filename = filename.replace('=','_')
                filename = filename.replace('?','_')
                filename = filename.replace('&','_')
                pd.DataFrame.to_excel(stats, filename)
                i += 1
                lu = len(url_list)
                print(f'{filename} Completed Successfully! {i} / {lu} Complete!')
        winsound.Beep(523, 500)

# Part 1: Scraping NBA.com data

In order to get anything from the NBA.com website, I need to first get the player names and ids, because I will need the id's later down the road to scrape the player images. 

Luckily enough, I can grab the IDs and names from the NBA API. 


## Initial Scrapes for images

In [None]:
# Grab player ids and names from nba.com api.
'''
stats = players.get_players()
stats = pd.DataFrame(stats)
stats.head()
'''

Now, I have the player names and ids. From looking through the NBA.com website, I know I can access the player photos through this general url: 
https://cdn.nba.com/headshots/nba/latest/260x190/[PLAYER_ID].png

So, from here I add a player URL column to download the photos from. 


In [None]:
stats.head()

In [None]:
#Create URL for player photos to download
'''
stats['id'] = stats['id'].astype(np.str)
stats['photo_url'] = 'https://cdn.nba.com/headshots/nba/latest/260x190/' + stats['id'] + '.png'
stats.head()
'''

In [None]:
#stats.to_csv('scraped_data/player_data.csv')

Now, to download the images

In [None]:
'''
pic_url = stats['photo_url']
pic_url = pd.DataFrame(pic_url)
pic_url.head()
'''

In [None]:
# Download all listed pictures. Takes about 20 minutes. 
'''
for pic in pic_url['photo_url']:
    image_url = pic
    filename = image_url.split("/")[-1]
    r = requests.get(image_url, stream = True)
   
    if r.status_code == 200:                         # Check if image found 
        r.raw.decode_content = True                 # This allows image file to not have size = 0
        with open(filename,'wb') as f:              # WB is write binary
         shutil.copyfileobj(r.raw, f)
'''

Now, I need to get the team logos

In [None]:
# take https://www.nba.com/teams
# download all images

# images look like:
# https://www.nba.com/stats/media/img/teams/logos/BOS_logo.svg


In [None]:
'''
# Import teams from NBA api
teams = teams.get_teams()
abs = pd.DataFrame(teams)

#Get abbreviations to get URLs for team photos
abrevs = pd.DataFrame(abs['abbreviation'])
abrevs.head()
'''

In [None]:
'''
abrevs['abbreviation'] = abrevs['abbreviation'].astype(np.str)
abrevs['photo_url'] = 'https://www.nba.com/stats/media/img/teams/logos/' + abrevs['abbreviation'] +'_logo.svg'  
abrevs.head()
'''

Note: Downloaded them from https://www.stickpng.com/cat/sports/basketball/nba-teams?page=1 after nba.com svg downloads did not take. 


In [None]:
'''
for pic in abrevs['photo_url']:
    image_url = pic
    filename = image_url.split("/")[-1]
    r = requests.get(image_url, stream = True)
   
    if r.status_code == 200:                         # Check if image found 
        r.raw.decode_content = True                 # This allows image file to not have size = 0
        with open(filename,'wb') as f:              # WB is write binary
         shutil.copyfileobj(r.raw, f)
'''

In [None]:
'''
from bs4 import BeautifulSoup

url ="https://www.nba.com/teams"

#send get request
response = requests.get(url)

html_page = BeautifulSoup(response.text, 'html.parser')

images = html_page.find_all("img")

for index, image in enumerate(images):
    image_url= image.get("src")      #img src value
    
    image_extension= image_url.split(".")[-1]       #get image extension

    #get image data
    image_bytes = requests.get(image_url).content
    
    if image_bytes:
        #write the image data
        with open(f"Image {index+1}.{image_extension}", "wb") as file:
            file.write(image_bytes)
            print(f"Downloading image {index+1}.{image_extension}")

'''

In [None]:

'''
url = 'https://www.nba.com/players'
grab = requests.get(url)
soup = BeautifulSoup(grab.text, 'html.parser')

urls = []

driver = webdriver.Chrome()
'''

In [None]:
# Trying to piece both of these together
#pt 1
'''

driver.get(url)
time.sleep(10)
xpath3 = '/html/body/div[1]/div[2]/div[3]/section/div/div[2]/div[1]/div[6]/label/div/span'
driver.find_element(by=By.XPATH, value=xpath3).click()
time.sleep(10)

time.sleep(25)
#pt2
elems = driver.find_elements_by_xpath("//a[@href]")
for elem in elems:
    urls.append(elem.get_attribute("href"))

'''

Now, I need to get rid of the links that I dont need (everything but player). 

## Scraping Player Data

There is a ton of data to scrape from nba.com, but scraping it can be difficult, and the data is broken up in maaany parts that we need to put together. 


https://www.nba.com/stats/players/traditional/?sort=PTS&dir=-1&Season=2021-22&SeasonType=Regular%20Season

### Define Player Scraping Function

In [4]:
def grab_player_data(url_list, file_folder):
        driver = webdriver.Chrome()
        i = 0
        for u in url_list:
                driver.get(u)
                time.sleep(7)
                xpath_all = '/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select/option[1]' # click "all pages"
                driver.find_element(by=By.XPATH, value=xpath_all).click()
                time.sleep(7)
                src = driver.page_source
                parser = BeautifulSoup(src, "lxml")
                table = parser.find("div", attrs = {"class":"nba-stat-table__overflow"})
                headers = table.findAll('th')
                headerlist = [h.text.strip() for h in headers[0:]] 
                headerlist = [a for a in headerlist if not 'RANK' in a]
                row_names = table.findAll('a')                             # find rows
                row_list = [b.text.strip() for b in row_names[0:]] 
                rows = table.findAll('tr')[0:]
                player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[0:]] for i in range(len(rows))]
                tot_cols = len(player_stats[1])                           #set the length to ignore hidden columns
                headerlist = headerlist[:tot_cols]   
                stats = pd.DataFrame(player_stats, columns = headerlist)
                filename = file_folder + str(u[34:]).replace('/', '_') + '.xlsx'
                filename = filename.replace('%','_')
                filename = filename.replace('=','_')
                filename = filename.replace('?','_')
                filename = filename.replace('&','_')
                pd.DataFrame.to_excel(stats, filename)
                i += 1
                lu = len(url_list)
                print(f'{filename} Completed Successfully! {i} / {lu} Complete!')
        

### Start Scraping

In [56]:
years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
stat_types = ['traditional', 'advanced', 'misc', 'scoring', 'usage','opponent', 'defense']
season_types = ['Playoffs', 'Regular%20Season']

In [57]:
urlz = []


for year in years:
    for stattype in stat_types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/players/'+ stattype +'?SeasonType=' + s_types + '&Season=' + year 
            urlz.append(url)

In [None]:
urlz

In [59]:
pew = pd.DataFrame(urlz)
pew.shape
yee = pew.shape
yee

(70, 1)

### Player Playtype Data

In [None]:
#nba.com/stats/players/transition/?SeasonType=Playoffs&SeasonYear=2020-21

In [4]:
# https://www.nba.com/stats/players/isolation/

years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
playtypes = ['isolation', 'transition', 'ball-handler', 'roll-man', 
            'playtype-post-up','spot-up', 'hand-off', 'cut',
            'off-screen', 'putbacks', 'misc'] 
season_types = ['Playoffs', 'Regular%20Season']

In [5]:
play_urlz = []


for year in years:
    for play in playtypes:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/players/'+ play +'/?SeasonType=' + s_types + '&SeasonYear=' + year
            play_urlz.append(str(url))

In [None]:
play_urlz

In [None]:
grab_player_data(play_urlz, 'player_playtype/')

### Players Defensive Dashboard


In [22]:
# Make a new folder for the scraped data
#os.mkdir('player_d_dash')

In [61]:
# https://www.nba.com/stats/players/isolation/

years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
types = ['defense-dash-overall', 'defense-dash-3pt', 'defense-dash-2pt', 'defense-dash-lt6',
         'defense-dash-lt10', 'defense-dash-gt15' ]
season_types = ['Playoffs', 'Regular%20Season']



In [62]:
def_urlz = []


for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/players/'+ type +'?SeasonType=' + s_types + '&Season=' + year
            def_urlz.append(str(url))

In [None]:
def_urlz

In [66]:
grab_player_data(def_urlz, 'player_d_dash/')

player_d_dash/defense-dash-overall_SeasonType_Playoffs_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-overall_SeasonType_Regular_20Season_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-3pt_SeasonType_Playoffs_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-3pt_SeasonType_Regular_20Season_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-2pt_SeasonType_Playoffs_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-2pt_SeasonType_Regular_20Season_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-lt6_SeasonType_Playoffs_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-lt6_SeasonType_Regular_20Season_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-lt10_SeasonType_Playoffs_Season_2021-22.xlsx Completed Successfully!
player_d_dash/defense-dash-lt10_SeasonType_Regular_20Season_Season_2021-22.xlsx Completed Successfully!


### Player Other 

In [18]:
# make folder for player other data
#os.mkdir('player_other')

In [67]:
years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
types = ['shooting', 'opponent-shooting', 'hustle' ]
season_types = ['Playoffs', 'Regular%20Season']

In [68]:
oth_urlz = []

for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/players/'+ type +'?SeasonType=' + s_types + '&Season=' + year
            oth_urlz.append(str(url))

In [None]:

grab_player_data(oth_urlz, 'player_other/')

### Game-by-Game Team Data

Teams general stats - only one page, no need for xpath click


In [24]:
# make folder for team general data
#os.mkdir('team_general')

In [22]:
years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
types = ['traditional', 'advanced', 'four-factors', 'misc', 'scoring', 
        'opponent', 'defense']
season_types = ['Playoffs', 'Regular%20Season']

In [29]:
team_urlz = []


for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/teams/'+ type +'?SeasonType=' + s_types + '&Season=' + year
            team_urlz.append(str(url))

In [None]:
#'''
driver = webdriver.Chrome()

for u in team_urlz:
        driver.get(u)
        time.sleep(7)
        #xpath_all = '/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select/option[1]' # click "all pages"
        #driver.find_element(by=By.XPATH, value=xpath_all).click()
        #time.sleep(15)
        src = driver.page_source
        parser = BeautifulSoup(src, "lxml")
        table = parser.find("div", attrs = {"class":"nba-stat-table__overflow"})
        headers = table.findAll('th')
        headerlist = [h.text.strip() for h in headers[0:]] 
        headerlist = [a for a in headerlist if not 'RANK' in a]
        row_names = table.findAll('a')                             # find rows
        row_list = [b.text.strip() for b in row_names[0:]] 
        rows = table.findAll('tr')[0:]
        player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[0:]] for i in range(len(rows))]
        tot_cols = len(player_stats[1])                           #set the length to ignore hidden columns
        headerlist = headerlist[:tot_cols]   
        stats = pd.DataFrame(player_stats, columns = headerlist)
        filename = 'team_general/' + str(u[31:]) + '.xlsx'
        filename = filename.replace('%','_')
        filename = filename.replace('=','_')
        filename = filename.replace('?','_')
        filename = filename.replace('&','_')
        pd.DataFrame.to_excel(stats, filename)
        print(f'{filename} Completed Successfully!')

#'''

### Teams Playtypes

In [32]:
# make folder for team general data
#os.mkdir('team_playtype')

In [38]:
years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
types = ['isolation', 'transition', 'ball-handler', 'roll-man', 
            'playtype-post-up','spot-up', 'hand-off', 'cut',
            'off-screen', 'putbacks', 'misc']
season_types = ['Playoffs', 'Regular%20Season']


In [41]:
team_urlz = []


for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/teams/'+ type +'?SeasonType=' + s_types + '&Season=' + year + "&SeasonYear=" + year
            team_urlz.append(str(url))

In [None]:
team_urlz

In [None]:
#'''
driver = webdriver.Chrome()

for u in team_urlz:
        driver.get(u)
        time.sleep(7)
        #xpath_all = '/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select/option[1]' # click "all pages"
        #driver.find_element(by=By.XPATH, value=xpath_all).click()
        #time.sleep(15)
        src = driver.page_source
        parser = BeautifulSoup(src, "lxml")
        table = parser.find("div", attrs = {"class":"nba-stat-table__overflow"})
        headers = table.findAll('th')
        headerlist = [h.text.strip() for h in headers[0:]] 
        headerlist = [a for a in headerlist if not 'RANK' in a]
        row_names = table.findAll('a')                             # find rows
        row_list = [b.text.strip() for b in row_names[0:]] 
        rows = table.findAll('tr')[0:]
        player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[0:]] for i in range(len(rows))]
        tot_cols = len(player_stats[1])                           #set the length to ignore hidden columns
        headerlist = headerlist[:tot_cols]   
        stats = pd.DataFrame(player_stats, columns = headerlist)
        filename = 'team_playtype/' + str(u[31:]) + '.xlsx'
        filename = filename.replace('%','_')
        filename = filename.replace('=','_')
        filename = filename.replace('?','_')
        filename = filename.replace('&','_')
        pd.DataFrame.to_excel(stats, filename)
        print(f'{filename} Completed Successfully!')
#'''

### Team Playtype Defense (TODO!)

In [21]:
#os.mkdir('team_playtype_d')

In [6]:
years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
types = ['isolation', 'ball-handler', 'roll-man', 'playtype-post-up',
        'spot-up', 'hand-off', 'cut','off-screen']
season_types = ['Playoffs', 'Regular%20Season']

In [7]:
playd_urlz = []
for year in years:
    for t in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/teams/'+ t +'/?SeasonType=' + s_types + '&TypeGrouping=defensive&SeasonYear=' + year
            playd_urlz.append(str(url))

### Player Playtypes

In [30]:
#os.mkdir('player_playtype')

In [10]:
years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
types = ['isolation', 'transition', 'ball-handler', 'roll-man', 
            'playtype-post-up','spot-up', 'hand-off', 'cut',
            'off-screen', 'putbacks', 'misc']
season_types = ['Playoffs', 'Regular%20Season']

In [11]:
play_urlz = []


for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/players/'+ type +'?SeasonType=' + s_types + "&SeasonYear=" + year
            play_urlz.append(str(url))

In [None]:
driver = webdriver.Chrome()

for u in play_urlz:
        driver.get(u)
        time.sleep(7)
        xpath_all = '/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select/option[1]' # click "all pages"
        driver.find_element(by=By.XPATH, value=xpath_all).click()
        time.sleep(7)
        src = driver.page_source
        parser = BeautifulSoup(src, "lxml")
        table = parser.find("div", attrs = {"class":"nba-stat-table__overflow"})
        headers = table.findAll('th')
        headerlist = [h.text.strip() for h in headers[0:]] 
        headerlist = [a for a in headerlist if not 'RANK' in a]
        row_names = table.findAll('a')                             # find rows
        row_list = [b.text.strip() for b in row_names[0:]] 
        rows = table.findAll('tr')[0:]
        player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[0:]] for i in range(len(rows))]
        tot_cols = len(player_stats[1])                           #set the length to ignore hidden columns
        headerlist = headerlist[:tot_cols]   
        stats = pd.DataFrame(player_stats, columns = headerlist)
        filename = 'player_playtype/' + str(u[34:]) + '.xlsx'
        filename = filename.replace('%','_')
        filename = filename.replace('=','_')
        filename = filename.replace('?','_')
        filename = filename.replace('&','_')
        pd.DataFrame.to_excel(stats, filename)
        print(f'{filename} Completed Successfully!')

### Player Playtype Defense Download

Player Playtypes - Defense are available for the following plays:
- Isolation
- PR Ball Handler
- PR Man
- Post-up
- Spot-up
- Handoff
- Off-Screen

In [18]:
#os.mkdir('player_playtype_d')

In [13]:
years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
types = ['isolation', 'ball-handler', 'roll-man', 
            'playtype-post-up','spot-up', 'hand-off','off-screen', ]
season_types = ['Playoffs', 'Regular%20Season']

In [16]:
play_d_urlz = []

for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/players/'+ type +'/?SeasonType=' + s_types + '&TypeGrouping=defensive&SeasonYear=' + year
            play_d_urlz.append(str(url))

### Boxscores

In [44]:
# make folder for team general data
# os.mkdir('team_boxes')

In [52]:
years = ['2021-22', '2020-21', '2019-20', '2017-18', '2017-18']
types = ['boxscores-traditional', 'boxscores-advanced', 'boxscores-four-factors', 
        'boxscores-misc', 'boxscores-scoring' ]
season_types = ['Playoffs', 'Regular%20Season']

In [53]:
team_urlz = []


for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/teams/'+ type +'?SeasonType=' + s_types + '&Season=' + year + "&SeasonYear=" + year
            team_urlz.append(str(url))

In [None]:
team_urlz

### Opponent Shooting

In [48]:
# make folder for team general data
#os.mkdir('team_opp_shooting')

In [49]:
years = ['2021-22', '2020-21', '2019-20', '2017-18', '2017-18']
types = ['opponent-shooting', 'opponent-shots-general', 'opponent-shots-shotclock', 
        'opponent-shots-dribbles', 'opponent-shots-touch-time', 'opponent-shots-closest-defender',
         'opponent-shots-closest-defender-10']
season_types = ['Playoffs', 'Regular%20Season']


In [50]:
team_urlz = []


for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/teams/'+ type +'?SeasonType=' + s_types + '&Season=' + year + "&SeasonYear=" + year
            team_urlz.append(str(url))

In [None]:
team_urlz

### Player Boxscores - Get code that downloaded them in pieces

Player Boxscores are the boxscores by player. There is far more data here than ususal, and my typical scraping technique will not work due to the amount of data. Because of this, I have to download the data one month at a time, so I have to do a few more steps.

In [60]:
# make folder for player boxscores
#os.mkdir('player_box_scores')

In [91]:
s_17 = '2017-18'
s_18 = '2017-18'
s_19 = '2019-20'
s_20 = '2020-21'
s_21 = '2021-22'
yearz = ['2021', '2020', '2019', '2018', '2017']
s_years = [s_21, s_20, s_19, s_18, s_17]

#months played by year -- the months that HAD GAMES during said season
months_2021 = ['1', '2','3', '4', '5', '6', '7']
months_2020 = ['3', '4', '5', '6', '7','8', '9', '10', '11', '12']
months_2019 = ['1', '2','3', '4', '5', '6']
months_2018 = ['1', '2','3', '4', '5', '6', '7']
months_2017 = ['1', '2','3', '4', '5', '6', '7']

In [99]:
years = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
types = ['boxscores-advanced', 'boxscores-traditional']
season_types = ['Regular%20Season']


In [101]:
# List all months to scrape
urls1 = []
'''
for m in months_2021:
    for t in types: 
        url = 'https://www.nba.com/stats/players/' + str(t) +'/?Season=2021-22&sort=gdate&dir=-1&Month='+str(m) +'&SeasonType=Regular%20Season'
        urls1.append(url)

for m in months_2020:
    for t in types: 
        url = 'https://www.nba.com/stats/players/' + str(t) + '/?Season=2020-21&sort=gdate&dir=-1&Month='+str(m) +'&SeasonType=Regular%20Season'
        urls1.append(url)

for m in months_2019:
    for t in types: 
        url = 'https://www.nba.com/stats/players/' + str(t) + '/?Season=2019-20&sort=gdate&dir=-1&Month='+str(m) +'&SeasonType=Regular%20Season'
        urls1.append(url)

for m in months_2018:
    for t in types: 
        url = 'https://www.nba.com/stats/players/' + str(t) + '/?Season=2018-19&sort=gdate&dir=-1&Month='+str(m) +'&SeasonType=Regular%20Season'
        urls1.append(url)

for m in months_2018:
    for t in types: 
        url = 'https://www.nba.com/stats/players/' + str(t) + '/?Season=2017-18&sort=gdate&dir=-1&Month='+str(m) +'&SeasonType=Regular%20Season'
        urls1.append(url)

lendo = len(urls1)
lendo
'''

40

### Working with player boxscore data
Should I combine it all by player & game to broaden out the columns?

### Adding in BigDataBall Data (purchased)

I wanted to get another data source for this, so I bought data from BigDataBall.com. 

- 2020-2021 game-by-game data
- 2019-2020 game-by-game data
- 2018-2019 game-by-game data
- 2017-2018 game-by-game data

## Downloading from basketballreference.com - Contract Data

Trades and player transactions are on individual player pages 
- Tyson Chandler : https://www.basketball-reference.com/players/c/chandty01.html
- Robert Covington:https://www.basketball-reference.com/players/c/covinro01.html
- Nikola Jokic:    https://www.basketball-reference.com/players/j/jokicni01.html



In [5]:
p17 = pd.read_excel('bball_reference\\players_2017-2018.xlsx')
p18 = pd.read_excel('bball_reference\\players_2018-2019.xlsx')
p19 = pd.read_excel('bball_reference\\players_2019-2020.xlsx')
p20 = pd.read_excel('bball_reference\\players_2020-2021.xlsx')
p21 = pd.read_excel('bball_reference\\players_2021-2022.xlsx')

In [6]:
p17u = pd.DataFrame(p17.Player.unique())
p18u = pd.DataFrame(p18.Player.unique())
p19u = pd.DataFrame(p19.Player.unique())
p20u = pd.DataFrame(p20.Player.unique())
p21u = pd.DataFrame(p21.Player.unique())

In [7]:
p17_22 = p17u.append(p18u)
p17_22 = p17_22.append(p19u)
p17_22 = p17_22.append(p20u)
p17_22 = p17_22.append(p21u)

p17_22

Unnamed: 0,0
0,Álex Abrines\abrinal01
1,Quincy Acy\acyqu01
2,Steven Adams\adamsst01
3,Bam Adebayo\adebaba01
4,Arron Afflalo\afflaar01
...,...
600,Thaddeus Young\youngth01
601,Trae Young\youngtr01
602,Omer Yurtseven\yurtsom01
603,Cody Zeller\zelleco01


In [8]:
p17_22 = p17_22[0].str.split('\\', expand = True)
p17_22.head(3)

Unnamed: 0,0,1
0,Álex Abrines,abrinal01
1,Quincy Acy,acyqu01
2,Steven Adams,adamsst01


In [9]:
p17_22 = p17_22.rename(columns = {0 : 'player_name', 1 : 'player_id'})

In [10]:
p17_22['url'] = 'https://www.basketball-reference.com/players/' + p17_22['player_id'].str[:1] + '/' + p17_22['player_id'] + '.html'

In [11]:
p17_22.shape

(2744, 3)

In [12]:
p17_22 = p17_22.drop_duplicates()
p17_22

Unnamed: 0,player_name,player_id,url
0,Álex Abrines,abrinal01,https://www.basketball-reference.com/players/a...
1,Quincy Acy,acyqu01,https://www.basketball-reference.com/players/a...
2,Steven Adams,adamsst01,https://www.basketball-reference.com/players/a...
3,Bam Adebayo,adebaba01,https://www.basketball-reference.com/players/a...
4,Arron Afflalo,afflaar01,https://www.basketball-reference.com/players/a...
...,...,...,...
589,Ziaire Williams,willizi02,https://www.basketball-reference.com/players/w...
597,McKinley Wright IV,wrighmc01,https://www.basketball-reference.com/players/w...
598,Moses Wright,wrighmo01,https://www.basketball-reference.com/players/w...
599,Gabe York,yorkga01,https://www.basketball-reference.com/players/y...


In [None]:
driver = webdriver.Chrome()
for url in p17_22['url']:
    driver.get(url)
    time.sleep(5)
    search = driver.find_element_by_id("div_transactions")
    nm = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[1]/div/h1/span")
    time.sleep(7)
    yup = nm.text
    filename = 'player_contracts/' + nm.text + '.xlsx'
    data = search.text
    df = pd.DataFrame([x.split(';') for x in data.split('\n')])
    df = df.rename(columns= {0 : yup})
    df.to_excel(filename)

In [13]:
path = 'C:\\Users\\tmcro\\OneDrive\\0_NBA_Statistitcs_2021\\NBA Current Studies\\Flatiron Phase 3 Project\\player_contracts'
p = os.listdir(path)
pf = pd.DataFrame(p)
pf.shape

(985, 1)

In [14]:
pf['player_name'] = pf[0].str[:-5]
pf

Unnamed: 0,0,player_name
0,Aaron Brooks.xlsx,Aaron Brooks
1,Aaron Gordon.xlsx,Aaron Gordon
2,Aaron Harrison.xlsx,Aaron Harrison
3,Aaron Henry.xlsx,Aaron Henry
4,Aaron Holiday.xlsx,Aaron Holiday
...,...,...
980,Zion Williamson.xlsx,Zion Williamson
981,Zylan Cheatham.xlsx,Zylan Cheatham
982,Álex Abrines.xlsx,Álex Abrines
983,Ángel Delgado.xlsx,Ángel Delgado


In [15]:
miss = p17_22[~p17_22.player_name.isin(pf.player_name)]
miss

Unnamed: 0,player_name,player_id,url
177,Manu Ginóbili*,ginobma01,https://www.basketball-reference.com/players/g...
542,Isaiah Todd,toddis01,https://www.basketball-reference.com/players/t...
558,Franz Wagner,wagnefr01,https://www.basketball-reference.com/players/w...
560,Ish Wainright,wainris01,https://www.basketball-reference.com/players/w...
563,M.J. Walker,walkemj01,https://www.basketball-reference.com/players/w...
567,Duane Washington Jr.,washidu02,https://www.basketball-reference.com/players/w...
570,Lindy Waters III,waterli01,https://www.basketball-reference.com/players/w...
572,Trendon Watford,watfotr01,https://www.basketball-reference.com/players/w...
579,Joe Wieskamp,wieskjo01,https://www.basketball-reference.com/players/w...
580,Aaron Wiggins,wiggiaa01,https://www.basketball-reference.com/players/w...


In [16]:
# For Loop
driver = webdriver.Chrome()
for url in miss['url']:
    driver.get(url)
    time.sleep(7)
    search = driver.find_element_by_id("div_transactions")
    nm = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[1]/div/h1/span")
    time.sleep(7)
    yup = nm.text
    filename = 'player_contracts/' + nm.text + '.xlsx'
    data = search.text
    df = pd.DataFrame([x.split(';') for x in data.split('\n')])
    df = df.rename(columns= {0 : yup})
    df.to_excel(filename)

  search = driver.find_element_by_id("div_transactions")
  nm = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[1]/div/h1/span")


## Shooting - Closest Defender [From Shot Dashboard]

How players shoot when being closely guarded


In [5]:
years = ['2021-22', '2020-21', '2019-20', '2017-18', '2017-18']
types = ['', '&CloseDefDistRange=2-4%20Feet%20-%20Tight', '&CloseDefDistRange=4-6%20Feet%20-%20Open', '&CloseDefDistRange=6%2B Feet - Wide Open' ]
season_types = ['Playoffs', 'Regular%20Season']

In [6]:
sd_urlz = []

for year in years:
    for type in types:
        for s_types in season_types:
            url = 'https://www.nba.com/stats/players/shots-closest-defender/?Season=' + year + '&SeasonType=' + s_types  + '&SeasonYear=' + year + type
            sd_urlz.append(str(url))

In [7]:
grab_player_data(sd_urlz, 'player_shooting_closest_defender/')

player_shooting_closest_defender/shots-closest-defender__Season_2021-22_SeasonType_Playoffs_SeasonYear_2021-22.xlsx Completed Successfully! 1 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_SeasonType_Regular_20Season_SeasonYear_2021-22.xlsx Completed Successfully! 2 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_SeasonType_Playoffs_SeasonYear_2021-22_CloseDefDistRange_2-4_20Feet_20-_20Tight.xlsx Completed Successfully! 3 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_SeasonType_Regular_20Season_SeasonYear_2021-22_CloseDefDistRange_2-4_20Feet_20-_20Tight.xlsx Completed Successfully! 4 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_SeasonType_Playoffs_SeasonYear_2021-22_CloseDefDistRange_4-6_20Feet_20-_20Open.xlsx Completed Successfully! 5 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_Seaso

In [34]:
sd_urlz

['https://www.nba.com/stats/players/shots-closest-defender/?Season=2021-22&SeasonType=Playoffs&SeasonYear=2021-22',
 'https://www.nba.com/stats/players/shots-closest-defender/?Season=2021-22&SeasonType=Regular%20Season&SeasonYear=2021-22',
 'https://www.nba.com/stats/players/shots-closest-defender/?Season=2021-22&SeasonType=Playoffs&SeasonYear=2021-22&CloseDefDistRange=2-4%20Feet%20-%20Tight',
 'https://www.nba.com/stats/players/shots-closest-defender/?Season=2021-22&SeasonType=Regular%20Season&SeasonYear=2021-22&CloseDefDistRange=2-4%20Feet%20-%20Tight',
 'https://www.nba.com/stats/players/shots-closest-defender/?Season=2021-22&SeasonType=Playoffs&SeasonYear=2021-22&CloseDefDistRange=4-6%20Feet%20-%20Open',
 'https://www.nba.com/stats/players/shots-closest-defender/?Season=2021-22&SeasonType=Regular%20Season&SeasonYear=2021-22&CloseDefDistRange=4-6%20Feet%20-%20Open',
 'https://www.nba.com/stats/players/shots-closest-defender/?Season=2021-22&SeasonType=Playoffs&SeasonYear=2021-22&Clos

In [33]:
# This isnt working. Lets find out why. 

def grab_player_closest_defender(url_list, file_folder):
        driver = webdriver.Chrome()
        i = 0
        for u in url_list:
                driver.get(u)
                time.sleep(7)
                xpath_all = '/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select/option[1]' # click "all pages"
                driver.find_element(by=By.XPATH, value=xpath_all).click()
                time.sleep(7)
                src = driver.page_source
                parser = BeautifulSoup(src, "lxml")
                table = parser.find("div", attrs = {"class":"nba-stat-table__overflow"})
                headers = table.findAll('th')
                headerlist = [h.text.strip() for h in headers[0:]] 
                row_names = table.findAll('a')                             # find rows
                row_list = [b.text.strip() for b in row_names[0:]] 
                rows = table.findAll('tr')[0:]
                player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[0:]] for i in range(len(rows))]
                tot_cols = len(player_stats[2]) + 4                           #set the length to ignore hidden columns
                headerlist = headerlist[4:tot_cols]       
                stats = pd.DataFrame(player_stats, columns = headerlist)
                filename = file_folder + str(u[34:]).replace('/', '_') + '.xlsx'
                filename = filename.replace('%','_')
                filename = filename.replace('=','_')
                filename = filename.replace('?','_')
                filename = filename.replace('&','_')
                filename = filename.replace('SeasonType_Regular_20Season','Reg_Season')
                pd.DataFrame.to_excel(stats, filename)
                i += 1
                lu = len(url_list)
                print(f'{filename} Completed Successfully! {i} / {lu} Complete!')
        winsound.Beep(523, 500)

In [35]:
grab_player_closest_defender(sd_urlz, 'player_shooting_closest_defender/')

player_shooting_closest_defender/shots-closest-defender__Season_2021-22_SeasonType_Playoffs_SeasonYear_2021-22.xlsx Completed Successfully! 1 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_Reg_Season_SeasonYear_2021-22.xlsx Completed Successfully! 2 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_SeasonType_Playoffs_SeasonYear_2021-22_CloseDefDistRange_2-4_20Feet_20-_20Tight.xlsx Completed Successfully! 3 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_Reg_Season_SeasonYear_2021-22_CloseDefDistRange_2-4_20Feet_20-_20Tight.xlsx Completed Successfully! 4 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_SeasonType_Playoffs_SeasonYear_2021-22_CloseDefDistRange_4-6_20Feet_20-_20Open.xlsx Completed Successfully! 5 / 40 Complete!
player_shooting_closest_defender/shots-closest-defender__Season_2021-22_Reg_Season_SeasonYear_2021-22_CloseDefD