In [3]:
from pymongo import MongoClient
import pprint

import pandas as pd
import matplotlib.pyplot as plt

import requests

from bs4 import BeautifulSoup

import json
import datetime
import calendar
import time

## Request the webpage's raw HTML

In [4]:
url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-october.html'
r = requests.get(url)

In [5]:
r.status_code

200

In [4]:
#pprint.pprint(r.text)

## Save the Raw HTML into a MongoDB

In [6]:
client = MongoClient()
db = client.capstone1_bball_reference

In [7]:
one_season = db.testing

In [7]:
# commenting out - do not want multiples if I rerun this notebook
# one_season.insert_one({'link':url,'html':r.text})

<pymongo.results.InsertOneResult at 0x7f9664150c80>

## Parse the hypertext to get data with Beautiful Soup

In [8]:
soup = BeautifulSoup(r.text,'html.parser')

In [9]:
#print(soup.prettify())

In [10]:
soup.find_all('tr')[1] #get info for each game

<tr><th class="left" csk="201810160BOS" data-stat="date_game" scope="row"><a href="/boxscores/index.fcgi?month=10&amp;day=16&amp;year=2018">Tue, Oct 16, 2018</a></th><td class="right" data-stat="game_start_time">8:00p</td><td class="left" csk="PHI.201810160BOS" data-stat="visitor_team_name"><a href="/teams/PHI/2019.html">Philadelphia 76ers</a></td><td class="right" data-stat="visitor_pts">87</td><td class="left" csk="BOS.201810160BOS" data-stat="home_team_name"><a href="/teams/BOS/2019.html">Boston Celtics</a></td><td class="right" data-stat="home_pts">105</td><td class="center" data-stat="box_score_text"><a href="/boxscores/201810160BOS.html">Box Score</a></td><td class="center iz" data-stat="overtimes"></td><td class="right" data-stat="attendance">18,624</td><td class="left iz" data-stat="game_remarks"></td></tr>

In [11]:
soup.find_all('tr')[1].find('a').text[5:] 
#get date for each game - removed day (all dates have a three letter abbrev. with a space and ','
#five chars i.e. 'Tue, '

'Oct 16, 2018'

In [12]:
soup.find_all('tr')[1].find_all('td','left')[0].text #get name of visiting team

'Philadelphia 76ers'

In [13]:
soup.find_all('tr')[1].find_all('td','right')[1].text #get score of visiting team

'87'

In [14]:
soup.find_all('tr')[1].find_all('td','left')[1].text #get name of home team

'Boston Celtics'

In [15]:
soup.find_all('tr')[1].find_all('td','right')[2].text #get score of home team

'105'

## Testing other pages. Charts are not consistent - older seasons are missing 'Start (ET)' columns
Spot checked - there isn't some specific season that the column was adopted. Some are sporadic. Original spot check looked like 'Start(ET)' column starts 1985-86 and after, however, after clicking around some more, 1994-1995 did not have such column.

In [9]:
url = 'https://www.basketball-reference.com/leagues/NBA_1959_games-november.html'
x = requests.get(url)
x.status_code

200

In [17]:
#pprint.pprint(x.text)

In [18]:
# commenting out - do not want multiples if I rerun this notebook
# one_season.insert_one({'link':url,'html':x.text})

<pymongo.results.InsertOneResult at 0x7f96632a5870>

In [10]:
soup2 = BeautifulSoup(x.text,'html.parser')

In [20]:
#print(soup2.prettify())

In [21]:
soup2.find_all('tr')[1] #get info for each game

<tr><th class="left" csk="195811010NYK" data-stat="date_game" scope="row"><a href="/boxscores/index.fcgi?month=11&amp;day=1&amp;year=1958">Sat, Nov 1, 1958</a></th><td class="left" csk="PHW.195811010NYK" data-stat="visitor_team_name"><a href="/teams/PHW/1959.html">Philadelphia Warriors</a></td><td class="right" data-stat="visitor_pts">111</td><td class="left" csk="NYK.195811010NYK" data-stat="home_team_name"><a href="/teams/NYK/1959.html">New York Knicks</a></td><td class="right" data-stat="home_pts">92</td><td class="center" data-stat="box_score_text"><a href="/boxscores/195811010NYK.html">Box Score</a></td><td class="center iz" data-stat="overtimes"></td><td class="right iz" data-stat="attendance"></td><td class="left iz" data-stat="game_remarks"></td></tr>

In [22]:
soup2.find_all('tr')[1].find('a').text[5:] 
#get date for each game - removed day (all dates have a three letter abbrev. with a space and ','
#five chars i.e. 'Tue, 

'Nov 1, 1958'

In [23]:
soup2.find_all('tr')[1].find_all('td','left')[0].text #get name of visiting team

'Philadelphia Warriors'

In [24]:
soup2.find_all('tr')[1].find_all('td','right')[0].text #get score of visiting team - index at 0 instead of 1 because it does not have a Start (ET) column

'111'

In [25]:
soup2.find_all('tr')[1].find_all('td','left')[1].text #get name of home team

'New York Knicks'

In [26]:
soup2.find_all('tr')[1].find_all('td','right')[1].text #get score of home team

'92'

Spot checked - looks like there is either a Start (ET) column or there is not, there are no other changes to column names or number of columns. If this is the case, column counts will be either 9 or 10 depending on if it has the Start (ET) column. We can use this to dictate what index to pull from. This only affects pulling scores - team names are unaffected. (If there is a Start (ET) column the index will be 1 higher compared to when there is no Start (ET) column when pulling both visiting and home team scores)

In [27]:
#check
if len(soup.find_all('col')) == 9:
    print(soup.find_all('tr')[1].find_all('td','right')[0].text)
else:
    print(soup.find_all('tr')[1].find_all('td','right')[1].text)

if len(soup2.find_all('col')) == 9:
    print(soup2.find_all('tr')[1].find_all('td','right')[0].text)
else:
    print(soup2.find_all('tr')[1].find_all('td','right')[1].text)

87
111


## Ran into another issue. "Notes" column for some games denotes that the game was played elsewhere - not either teams' arenas

In [28]:
soup2.find_all('tr')[1].find_all('td','left')[-1].text #difference in columns does not affect the the notes column - it is always the last column

''

## Gather information from other rows/games - games for the month

In [11]:
date = []
visiting = []
visiting_score = []
home = []
home_score = []
season = []
notes = []

#2018-2019 season october example 
for i,game in enumerate(soup.find_all('tr')):
    if game.a != None: #skips the chart label rows
        datestring = game.find('a').text[5:]
        dt = datetime.datetime.strptime(datestring,'%b %d, %Y') #change datestring to datetime object
        date.append(dt)
        visiting.append(game.find_all('td','left')[0].text)
        home.append(game.find_all('td','left')[1].text)
        season.append(soup.find_all('div','inactive')[0].find('li','index').text)
        notes.append(game.find_all('td','left')[-1].text)
        if len(soup.find_all('col'))==10: #takes care of the different column counts 
            visiting_score.append(int(game.find_all('td','right')[1].text))
            home_score.append(int(game.find_all('td','right')[2].text))
        else: 
            visiting_score.append(int(game.find_all('td','right')[0].text))
            home_score.append(int(game.find_all('td','right')[1].text))
        

#1958-1959 season november example 
for i,game in enumerate(soup2.find_all('tr')):
    if game.a != None: #skips the chart label rows
        datestring = game.find('a').text[5:]
        dt = datetime.datetime.strptime(datestring,'%b %d, %Y') #change datestring to datetime object
        date.append(dt)
        visiting.append(game.find_all('td','left')[0].text)
        home.append(game.find_all('td','left')[1].text)
        season.append(soup2.find_all('div','inactive')[0].find('li','index').text)
        notes.append(game.find_all('td','left')[-1].text)
        if len(soup2.find_all('col'))==10: #takes care of the different column counts 
            visiting_score.append(int(game.find_all('td','right')[1].text))
            home_score.append(int(game.find_all('td','right')[2].text))
        else: 
            visiting_score.append(int(game.find_all('td','right')[0].text))
            home_score.append(int(game.find_all('td','right')[1].text))



## Need to account for games where schedules are out but have not been played yet so scores will be an empty string

Cannot cast empty string as an int - will get error. Change any empty strings to zeroes.

In [43]:
#2020-2021 season february example
url = 'https://www.basketball-reference.com/leagues/NBA_2021_games-february.html'
c = requests.get(url)
c.status_code

200

In [44]:
soup3 = BeautifulSoup(c.text,'html.parser')

In [50]:
for i,game in enumerate(soup3.find_all('tr')):
    if game.a != None: #skips the chart label rows
        datestring = game.find('a').text[5:]
        dt = datetime.datetime.strptime(datestring,'%b %d, %Y') #change datestring to datetime object
        date.append(dt)
        visiting.append(game.find_all('td','left')[0].text)
        home.append(game.find_all('td','left')[1].text)
        season.append(soup3.find_all('div','inactive')[0].find('li','index').text)
        notes.append(game.find_all('td','left')[-1].text)
        if len(soup3.find_all('col'))==10: #takes care of the different column counts 
            if (game.find_all('td','right')[1].text) != '':
                visiting_score.append(int(game.find_all('td','right')[1].text))
                home_score.append(int(game.find_all('td','right')[2].text))
            else:
                visiting_score.append(0)
                home_score.append(0)
        else: 
            if (game.find_all('td','right')[0].text) != '':
                visiting_score.append(int(game.find_all('td','right')[0].text))
                home_score.append(int(game.find_all('td','right')[1].text))
            else:
                visiting_score.append(0)
                home_score.append(0)


## Change columns of data to a DF to spot check

In [51]:
examples = pd.DataFrame({'date':date,'season':season,'visiting_team':visiting,'visiting_score':visiting_score,
                  'home_team':home, 'home_score':home_score, 'notes':notes})
# examples[examples.notes != '']

ValueError: arrays must all be same length

In [31]:
oct1819dec5859.dtypes #checking dtypes

date              datetime64[ns]
season                    object
visiting_team             object
visiting_score             int64
home_team                 object
home_score                 int64
notes                     object
dtype: object

## Gather games from other months of the season + games from other seasons

### Request all HTML pages and store in MongoDB

In [32]:
possible_links = []
months = ['october','november','december','january','february','march','april','may','june','july','august','september']
#most recent seasons run from oct - june (inclusive of playoffs) however, it used to run from oct - april with some seasons running into may (i.e. '69-70') 
link = 'https://www.basketball-reference.com/leagues/NBA_1950_games-october.html'
for year in range(1950,datetime.datetime.now().year+1):
    if year!=2020:
#2020 season had two october months (spanned across years) because of covid. current year can pull from all the months where possible
#(i.e no 404 error when requesting data), we can then drop rows where the game schedule is out but the game has not been played yet cause the scores will either be blank or zero
        for month in months:
            possible_links.append(f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html')
    else:
        for month in months:
            if month != 'october':
                possible_links.append(f'https://www.basketball-reference.com/leagues/NBA_2020_games-{month}.html')
        possible_links.append(f'https://www.basketball-reference.com/leagues/NBA_2020_games-october-2019.html')
        possible_links.append(f'https://www.basketball-reference.com/leagues/NBA_2020_games-october-2020.html')

In [33]:
test = requests.get('https://www.basketball-reference.com/leagues/NBA_1950_games-may.html') #may does not exist in this season

In [34]:
test.status_code #testing to see what a month that doesn't exist will provide - can use this to filter those months out

404

Code below accounts for the weird 2019-2020 season where there were two Octobers in the season (Oct '19 and Oct '20)
However, it does not account for April for which a schedule was out but games were never played (schedule and results page does not show an April button but when iterating through the urls April did not throw a 404 error hence the empty chart was pulled). Will need to drop 2019-2020 April data which is all blanks for scores.

In [35]:
actual_links = []
basketball_charts = db.all_seasons
for i,link in enumerate(possible_links):
    season_month = requests.get(link)
    if season_month.status_code != 404:
        # commenting out - do not want multiples if I rerun this notebook
#         basketball_charts.insert_one({'link':link,'html':season_month.text})
        print(f'Website link: {link}')
        actual_links.append(link)
        time.sleep(2)

Website link: https://www.basketball-reference.com/leagues/NBA_1950_games-october.html
Website link: https://www.basketball-reference.com/leagues/NBA_1950_games-november.html
Website link: https://www.basketball-reference.com/leagues/NBA_1950_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_1950_games-january.html
Website link: https://www.basketball-reference.com/leagues/NBA_1950_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_1950_games-march.html
Website link: https://www.basketball-reference.com/leagues/NBA_1950_games-april.html
Website link: https://www.basketball-reference.com/leagues/NBA_1951_games-october.html
Website link: https://www.basketball-reference.com/leagues/NBA_1951_games-november.html
Website link: https://www.basketball-reference.com/leagues/NBA_1951_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_1951_games-january.html
Website link: https://www.basketball-refer

Website link: https://www.basketball-reference.com/leagues/NBA_1963_games-april.html
Website link: https://www.basketball-reference.com/leagues/NBA_1964_games-october.html
Website link: https://www.basketball-reference.com/leagues/NBA_1964_games-november.html
Website link: https://www.basketball-reference.com/leagues/NBA_1964_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_1964_games-january.html
Website link: https://www.basketball-reference.com/leagues/NBA_1964_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_1964_games-march.html
Website link: https://www.basketball-reference.com/leagues/NBA_1964_games-april.html
Website link: https://www.basketball-reference.com/leagues/NBA_1965_games-october.html
Website link: https://www.basketball-reference.com/leagues/NBA_1965_games-november.html
Website link: https://www.basketball-reference.com/leagues/NBA_1965_games-december.html
Website link: https://www.basketball-referen

Website link: https://www.basketball-reference.com/leagues/NBA_1976_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_1976_games-january.html
Website link: https://www.basketball-reference.com/leagues/NBA_1976_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_1976_games-march.html
Website link: https://www.basketball-reference.com/leagues/NBA_1976_games-april.html
Website link: https://www.basketball-reference.com/leagues/NBA_1976_games-may.html
Website link: https://www.basketball-reference.com/leagues/NBA_1976_games-june.html
Website link: https://www.basketball-reference.com/leagues/NBA_1977_games-october.html
Website link: https://www.basketball-reference.com/leagues/NBA_1977_games-november.html
Website link: https://www.basketball-reference.com/leagues/NBA_1977_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_1977_games-january.html
Website link: https://www.basketball-reference.com

Website link: https://www.basketball-reference.com/leagues/NBA_1987_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_1987_games-january.html
Website link: https://www.basketball-reference.com/leagues/NBA_1987_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_1987_games-march.html
Website link: https://www.basketball-reference.com/leagues/NBA_1987_games-april.html
Website link: https://www.basketball-reference.com/leagues/NBA_1987_games-may.html
Website link: https://www.basketball-reference.com/leagues/NBA_1987_games-june.html
Website link: https://www.basketball-reference.com/leagues/NBA_1988_games-november.html
Website link: https://www.basketball-reference.com/leagues/NBA_1988_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_1988_games-january.html
Website link: https://www.basketball-reference.com/leagues/NBA_1988_games-february.html
Website link: https://www.basketball-reference.co

Website link: https://www.basketball-reference.com/leagues/NBA_1999_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_1999_games-march.html
Website link: https://www.basketball-reference.com/leagues/NBA_1999_games-april.html
Website link: https://www.basketball-reference.com/leagues/NBA_1999_games-may.html
Website link: https://www.basketball-reference.com/leagues/NBA_1999_games-june.html
Website link: https://www.basketball-reference.com/leagues/NBA_2000_games-november.html
Website link: https://www.basketball-reference.com/leagues/NBA_2000_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_2000_games-january.html
Website link: https://www.basketball-reference.com/leagues/NBA_2000_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_2000_games-march.html
Website link: https://www.basketball-reference.com/leagues/NBA_2000_games-april.html
Website link: https://www.basketball-reference.com/lea

Website link: https://www.basketball-reference.com/leagues/NBA_2010_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_2010_games-march.html
Website link: https://www.basketball-reference.com/leagues/NBA_2010_games-april.html
Website link: https://www.basketball-reference.com/leagues/NBA_2010_games-may.html
Website link: https://www.basketball-reference.com/leagues/NBA_2010_games-june.html
Website link: https://www.basketball-reference.com/leagues/NBA_2011_games-october.html
Website link: https://www.basketball-reference.com/leagues/NBA_2011_games-november.html
Website link: https://www.basketball-reference.com/leagues/NBA_2011_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_2011_games-january.html
Website link: https://www.basketball-reference.com/leagues/NBA_2011_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_2011_games-march.html
Website link: https://www.basketball-reference.com/l

Website link: https://www.basketball-reference.com/leagues/NBA_2021_games-december.html
Website link: https://www.basketball-reference.com/leagues/NBA_2021_games-january.html
Website link: https://www.basketball-reference.com/leagues/NBA_2021_games-february.html
Website link: https://www.basketball-reference.com/leagues/NBA_2021_games-march.html


In [37]:
actual_links # pulling links to reference for dataset - can serve as a starting point and always pull data from 2021 onwards and append if we ever need to update with more recent data

['https://www.basketball-reference.com/leagues/NBA_1950_games-october.html',
 'https://www.basketball-reference.com/leagues/NBA_1950_games-november.html',
 'https://www.basketball-reference.com/leagues/NBA_1950_games-december.html',
 'https://www.basketball-reference.com/leagues/NBA_1950_games-january.html',
 'https://www.basketball-reference.com/leagues/NBA_1950_games-february.html',
 'https://www.basketball-reference.com/leagues/NBA_1950_games-march.html',
 'https://www.basketball-reference.com/leagues/NBA_1950_games-april.html',
 'https://www.basketball-reference.com/leagues/NBA_1951_games-october.html',
 'https://www.basketball-reference.com/leagues/NBA_1951_games-november.html',
 'https://www.basketball-reference.com/leagues/NBA_1951_games-december.html',
 'https://www.basketball-reference.com/leagues/NBA_1951_games-january.html',
 'https://www.basketball-reference.com/leagues/NBA_1951_games-february.html',
 'https://www.basketball-reference.com/leagues/NBA_1951_games-march.html',