In [1]:
from pymongo import MongoClient
import pprint

import pandas as pd
import matplotlib.pyplot as plt

import requests

from bs4 import BeautifulSoup

import json
import datetime
import calendar
import time

## Request the webpage's raw HTML

In [2]:
# Schedule/results page for October of the 2018-19 season (the site files it under NBA_2019).
url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-october.html'
# Always pass a timeout: without one, requests waits indefinitely on an unresponsive server
# and the notebook hangs on this cell.
r = requests.get(url, timeout=10)

In [3]:
# HTTP status code of the schedule-page request above; 200 means the page was returned.
r.status_code

200

In [4]:
#pprint.pprint(r.text)

## Save the Raw HTML into a MongoDB

In [5]:
# Connect with MongoClient's default connection settings and select the project database.
client = MongoClient()
db = client['capstone1_bball_reference']

In [6]:
# Scratch collection used while prototyping the scrape on individual pages.
one_season = db['testing']

In [7]:
# Store the raw HTML next to its source link so the page can be re-parsed without re-requesting it.
one_season.insert_one(dict(link=url, html=r.text))

<pymongo.results.InsertOneResult at 0x7ffb214eb780>

## Parse the hypertext to get data with Beautiful Soup

In [8]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(r.text, features='html.parser')

In [9]:
#print(soup.prettify())

In [10]:
# Each game is one table row (<tr>) whose cells hold the date, teams, scores and notes.
soup.find_all('tr')[1] # index 1 is the first game row (index 0 is presumably the header row - confirm)

<tr><th class="left" csk="201810160BOS" data-stat="date_game" scope="row"><a href="/boxscores/index.fcgi?month=10&amp;day=16&amp;year=2018">Tue, Oct 16, 2018</a></th><td class="right" data-stat="game_start_time">8:00p</td><td class="left" csk="PHI.201810160BOS" data-stat="visitor_team_name"><a href="/teams/PHI/2019.html">Philadelphia 76ers</a></td><td class="right" data-stat="visitor_pts">87</td><td class="left" csk="BOS.201810160BOS" data-stat="home_team_name"><a href="/teams/BOS/2019.html">Boston Celtics</a></td><td class="right" data-stat="home_pts">105</td><td class="center" data-stat="box_score_text"><a href="/boxscores/201810160BOS.html">Box Score</a></td><td class="center iz" data-stat="overtimes"></td><td class="right" data-stat="attendance">18,624</td><td class="left iz" data-stat="game_remarks"></td></tr>

In [11]:
# Get the date for the game. The first link in the row holds the date text, and every date
# starts with a three-letter weekday abbreviation, a comma and a space (five characters,
# e.g. 'Tue, '), which is stripped off here.
soup.find_all('tr')[1].find('a').text[len('Tue, '):]

'Oct 16, 2018'

In [12]:
# Name of the visiting team: the first left-aligned <td> in the row.
soup.find_all('tr')[1].find_all('td', class_='left')[0].text

'Philadelphia 76ers'

In [13]:
# Score of the visiting team: the second right-aligned <td>
# (the first one on this page is the game start time).
soup.find_all('tr')[1].find_all('td', class_='right')[1].text

'87'

In [14]:
# Name of the home team: the second left-aligned <td> in the row.
soup.find_all('tr')[1].find_all('td', class_='left')[1].text

'Boston Celtics'

In [15]:
# Score of the home team: the third right-aligned <td>
# (start time and visitor score come before it on this page).
soup.find_all('tr')[1].find_all('td', class_='right')[2].text

'105'

## Testing other pages. Tables are not consistent - older seasons are missing the 'Start (ET)' column
Spot checked - there is no single season in which the column was adopted; its presence is sporadic. An initial spot check suggested that the 'Start (ET)' column appears from 1985-86 onward; however, after clicking around some more, 1994-95 turned out not to have the column.

In [16]:
# Older page (December of the 1983-84 season) used to compare table layouts.
# NOTE: this rebinds `url`, so cells above that read `url` now see this link if re-run.
url = 'https://www.basketball-reference.com/leagues/NBA_1984_games-december.html'
# Always pass a timeout: without one, requests waits indefinitely on an unresponsive server.
x = requests.get(url, timeout=10)
x.status_code

200

In [17]:
#pprint.pprint(x.text)

In [18]:
# Store the older page's raw HTML next to its source link.
one_season.insert_one(dict(link=url, html=x.text))

<pymongo.results.InsertOneResult at 0x7ffb218ad050>

In [19]:
# Parse the older page with Python's built-in HTML parser.
soup2 = BeautifulSoup(x.text, features='html.parser')

In [20]:
#print(soup2.prettify())

In [21]:
# Same inspection on the older page: this row has no game_start_time cell.
soup2.find_all('tr')[1] # index 1 is the first game row (index 0 is presumably the header row - confirm)

<tr><th class="left" csk="198312010GSW" data-stat="date_game" scope="row"><a href="/boxscores/index.fcgi?month=12&amp;day=1&amp;year=1983">Thu, Dec 1, 1983</a></th><td class="left" csk="KCK.198312010GSW" data-stat="visitor_team_name"><a href="/teams/KCK/1984.html">Kansas City Kings</a></td><td class="right" data-stat="visitor_pts">95</td><td class="left" csk="GSW.198312010GSW" data-stat="home_team_name"><a href="/teams/GSW/1984.html">Golden State Warriors</a></td><td class="right" data-stat="home_pts">106</td><td class="center" data-stat="box_score_text"><a href="/boxscores/198312010GSW.html">Box Score</a></td><td class="center iz" data-stat="overtimes"></td><td class="right" data-stat="attendance">4,463</td><td class="left iz" data-stat="game_remarks"></td></tr>

In [22]:
# Get the date for the game. The first link in the row holds the date text, and every date
# starts with a three-letter weekday abbreviation, a comma and a space (five characters,
# e.g. 'Tue, '), which is stripped off here.
soup2.find_all('tr')[1].find('a').text[len('Tue, '):]

'Dec 1, 1983'

In [23]:
# Name of the visiting team: the first left-aligned <td> in the row.
soup2.find_all('tr')[1].find_all('td', class_='left')[0].text

'Kansas City Kings'

In [60]:
# Score of the visiting team: index 0 instead of 1 because this page
# does not have a Start (ET) column ahead of the scores.
soup2.find_all('tr')[1].find_all('td', class_='right')[0].text

'95'

In [62]:
# Name of the home team: the second left-aligned <td> in the row.
soup2.find_all('tr')[1].find_all('td', class_='left')[1].text

'Golden State Warriors'

In [65]:
# Score of the home team: index 1 instead of 2, again because there is no Start (ET) column.
soup2.find_all('tr')[1].find_all('td', class_='right')[1].text

'106'

Spot checked - a page either has a Start (ET) column or it does not; there are no other changes to the column names or the number of columns. If this holds, the column count will be 9 without the Start (ET) column and 10 with it, and we can use that count to decide which index to pull from. This only affects pulling scores - team names are unaffected. (When there is a Start (ET) column, the index for both the visiting and home team scores is 1 higher than when there is no Start (ET) column.)

In [71]:
# Check the column-count rule on both pages: 9 <col> elements means there is no
# Start (ET) column, so the visiting score is the first right-aligned cell;
# otherwise it is the second.
for page in (soup, soup2):
    visitor_score_idx = 0 if len(page.find_all('col')) == 9 else 1
    print(page.find_all('tr')[1].find_all('td','right')[visitor_score_idx].text)

87
95


## Ran into another issue. The "Notes" column for some games indicates that the game was played elsewhere - not at either team's arena

Pull in the notes column. Since some games were played at an arena belonging to neither of the playing teams, we will drop those rows from our data. (e.g. visitor - Chicago Stags vs home - Boston Celtics at Philadelphia, PA)

## Gather information from other rows/games - games for the month

In [25]:
# Parallel lists, one entry per game row; they become the DataFrame columns later on.
date = []
visiting = []
visiting_score = []
home = []
home_score = []
season = []

# The season label (e.g. '2018-19 NBA Season') belongs to the page, not to a row,
# so look it up once instead of re-searching the whole document for every game.
season_label = soup.find_all('div','inactive')[0].find('li','index').text


def _cell_text(row, stat):
    """Return the text of the <td> in `row` whose data-stat attribute equals `stat`."""
    return row.find('td', {'data-stat': stat}).text


for game in soup.find_all('tr'):
    if game.a is None: # skips the chart label rows (they contain no link)
        continue
    # Drop the five-character weekday prefix (e.g. 'Tue, ') before parsing the date.
    datestring = game.find('a').text[5:]
    date.append(datetime.datetime.strptime(datestring,'%b %d, %Y'))
    # Look cells up by their data-stat attribute rather than by position among the
    # left/right-aligned cells: positional score indexes shift by one on pages that
    # have no Start (ET) column, while the data-stat names are the same on both layouts.
    visiting.append(_cell_text(game, 'visitor_team_name'))
    visiting_score.append(int(_cell_text(game, 'visitor_pts')))
    home.append(_cell_text(game, 'home_team_name'))
    home_score.append(int(_cell_text(game, 'home_pts')))
    season.append(season_label)

## Change columns of data to a DF to spot check

In [26]:
# Spot-check frame: the [28:40] slice of every parsed list, one column per list.
oct1819 = pd.DataFrame({
    column: values[28:40]
    for column, values in (
        ('date', date),
        ('season', season),
        ('visiting_team', visiting),
        ('visiting_score', visiting_score),
        ('home_team', home),
        ('home_score', home_score),
    )
})
oct1819

Unnamed: 0,date,season,visiting_team,visiting_score,home_team,home_score
0,2018-10-20,2018-19 NBA Season,Orlando Magic,115,Philadelphia 76ers,116
1,2018-10-20,2018-19 NBA Season,Detroit Pistons,118,Chicago Bulls,116
2,2018-10-20,2018-19 NBA Season,Charlotte Hornets,113,Miami Heat,112
3,2018-10-20,2018-19 NBA Season,Minnesota Timberwolves,136,Dallas Mavericks,140
4,2018-10-20,2018-19 NBA Season,Phoenix Suns,91,Denver Nuggets,119
5,2018-10-20,2018-19 NBA Season,San Antonio Spurs,108,Portland Trail Blazers,121
6,2018-10-20,2018-19 NBA Season,Houston Rockets,124,Los Angeles Lakers,115
7,2018-10-21,2018-19 NBA Season,Atlanta Hawks,133,Cleveland Cavaliers,111
8,2018-10-21,2018-19 NBA Season,Sacramento Kings,131,Oklahoma City Thunder,120
9,2018-10-21,2018-19 NBA Season,Golden State Warriors,98,Denver Nuggets,100


In [27]:
# Confirm the parsed types: dates should be datetimes and both score columns integers.
oct1819.dtypes #checking dtypes

date              datetime64[ns]
season                    object
visiting_team             object
visiting_score             int64
home_team                 object
home_score                 int64
dtype: object

## Gather games from other months of the season + games from other seasons

### Request all HTML pages and store in MongoDB

In [28]:
# Build one schedule-page URL per (season, month) from 1950 through the current year.
# Every calendar month is listed on purpose: recent seasons run October - June
# (including playoffs), older ones ran October - April with some reaching May
# (e.g. '69-70'). Months with no page return a 404 (checked in the cells below),
# so they can be filtered out when the pages are requested.
SCHEDULE_URL = 'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html'
# The 2020 season spanned two Octobers because of covid, so the site splits that
# month into two pages instead of a single 'october' page.
SPLIT_OCTOBER_SEASON = 2020

links = []
months = ['october','november','december','january','february','march','april','may','june','july','august','september']
for year in range(1950,datetime.datetime.now().year+1):
    if year != SPLIT_OCTOBER_SEASON:
        season_months = months
    else:
        # Same order as before: the eleven regular months first, then the two October pages.
        season_months = [m for m in months if m != 'october'] + ['october-2019','october-2020']
    # For the current year every month is still requested; rows for scheduled but
    # unplayed games (blank or zero scores) can be dropped after parsing.
    for month in season_months:
        links.append(SCHEDULE_URL.format(season=year, month=month))
len(links)

865

In [29]:
# May does not exist in this season - request it to see what a missing month returns.
# A timeout keeps the cell from hanging forever if the server does not respond.
test = requests.get('https://www.basketball-reference.com/leagues/NBA_1950_games-may.html', timeout=10)

In [30]:
# Testing what a month that doesn't exist returns - this status code can be used to filter those months out.
test.status_code

404

In [31]:
# basketball_charts = db.all_seasons
# for i,link in enumerate(links):
#     season_month = requests.get(link)
#     if season_month.status_code != 404:
#         basketball_charts.insert_one({'link':link,'html':season_month.text})
#         print(f'Website link: {link}')
#         time.sleep(2)