In [1]:
from pymongo import MongoClient
import pprint

import pandas as pd
import matplotlib.pyplot as plt

import requests

from bs4 import BeautifulSoup

import json
from datetime import datetime

## Request the webpage's raw HTML

In [2]:
# Monthly schedule/results page to scrape (the NBA_2019 season's October games).
url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-october.html'
# A timeout keeps the cell from hanging indefinitely if the server never
# responds; raise_for_status() stops the notebook here on an HTTP error
# instead of letting an error page be stored and parsed downstream.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [3]:
# HTTP status code of the response; 200 means the page was fetched successfully.
r.status_code

200

In [4]:
#pprint.pprint(r.text)

## Save the raw HTML in MongoDB

In [5]:
# Connect with MongoClient's default settings and select the database that
# holds the scraped pages.
client = MongoClient()
db = client['basketball_reference']

In [6]:
# Collection that stores one document per scraped page.
seasons = db['basketball_capstone1']

In [7]:
# Store the page's URL together with its raw HTML so it can be re-parsed later
# without another request.
# NOTE(review): insert_one adds a new document on every call, so re-running
# this cell stores the same page again - consider an upsert keyed on 'link'.
seasons.insert_one({'link':url,'html':r.text})

<pymongo.results.InsertOneResult at 0x7fdcedbe6280>

## Parse the hypertext to get data with Beautiful Soup

In [8]:
# Build a parse tree from the downloaded HTML using Python's built-in parser.
soup = BeautifulSoup(r.text, features='html.parser')

In [9]:
#print(soup.prettify())

In [10]:
soup.find_all('tr')[1] # first game row (row 0 holds the column labels); each <tr> carries one game's info

<tr><th class="left" csk="201810160BOS" data-stat="date_game" scope="row"><a href="/boxscores/index.fcgi?month=10&amp;day=16&amp;year=2018">Tue, Oct 16, 2018</a></th><td class="right" data-stat="game_start_time">8:00p</td><td class="left" csk="PHI.201810160BOS" data-stat="visitor_team_name"><a href="/teams/PHI/2019.html">Philadelphia 76ers</a></td><td class="right" data-stat="visitor_pts">87</td><td class="left" csk="BOS.201810160BOS" data-stat="home_team_name"><a href="/teams/BOS/2019.html">Boston Celtics</a></td><td class="right" data-stat="home_pts">105</td><td class="center" data-stat="box_score_text"><a href="/boxscores/201810160BOS.html">Box Score</a></td><td class="center iz" data-stat="overtimes"></td><td class="right" data-stat="attendance">18,624</td><td class="left iz" data-stat="game_remarks"></td></tr>

In [11]:
soup.find_all('tr')[1].find('a').text[5:] 
# Get the date of the game. The link text looks like 'Tue, Oct 16, 2018':
# every date starts with a three-letter weekday abbreviation, a comma and a
# space (five characters, e.g. 'Tue, '), so [5:] drops the weekday.

'Oct 16, 2018'

In [12]:
# Name of the visiting team. Select the cell by its data-stat attribute rather
# than by its position among the 'left'-aligned cells, so the lookup does not
# depend on the column layout.
soup.find_all('tr')[1].find('td', attrs={'data-stat': 'visitor_team_name'}).text

'Philadelphia 76ers'

In [13]:
# Score of the visiting team. Select the cell by its data-stat attribute: the
# positional form ('right'-aligned cell number 1) only works because the
# start-time cell happens to be the first 'right'-aligned cell in the row.
soup.find_all('tr')[1].find('td', attrs={'data-stat': 'visitor_pts'}).text

'87'

In [14]:
# Name of the home team. Select the cell by its data-stat attribute rather
# than by its position among the 'left'-aligned cells, so the lookup does not
# depend on the column layout.
soup.find_all('tr')[1].find('td', attrs={'data-stat': 'home_team_name'}).text

'Boston Celtics'

In [15]:
# Score of the home team. Select the cell by its data-stat attribute: the
# positional form ('right'-aligned cell number 2) silently picks a different
# column if the row's layout changes.
soup.find_all('tr')[1].find('td', attrs={'data-stat': 'home_pts'}).text

'105'

## Gather the same information from every row - all games for the month

In [79]:
# Column-wise accumulators, one entry per game. Later cells build a DataFrame
# from these lists, so their names are part of the notebook's interface.
date = []
visiting = []
visiting_score = []
home = []
home_score = []


def _stat_text(row, stat):
    """Return the text of the <td> in `row` whose data-stat attribute is `stat`."""
    return row.find('td', attrs={'data-stat': stat}).text


for game in soup.find_all('tr'):
    # Only game rows contain a link; the first row of the chart is just the
    # column labels and has none, so it is skipped.
    if game.a is None:
        continue
    # The first link in a game row is the date, e.g. 'Tue, Oct 16, 2018';
    # drop the five-character weekday prefix before parsing.
    datestring = game.a.text[5:]
    date.append(datetime.strptime(datestring, '%b %d, %Y'))  # store as datetime object
    # Look cells up by data-stat instead of by position among the 'left' /
    # 'right' aligned cells, so a change in column layout cannot silently
    # shift which value is read.
    visiting.append(_stat_text(game, 'visitor_team_name'))
    visiting_score.append(int(_stat_text(game, 'visitor_pts')))
    home.append(_stat_text(game, 'home_team_name'))
    home_score.append(int(_stat_text(game, 'home_pts')))
print(date)

[datetime.datetime(2018, 10, 16, 0, 0), datetime.datetime(2018, 10, 16, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 17, 0, 0), datetime.datetime(2018, 10, 18, 0, 0), datetime.datetime(2018, 10, 18, 0, 0), datetime.datetime(2018, 10, 18, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 10, 19, 0, 0), datetime.datetime(2018, 

## Load the columns of data into a DataFrame to spot-check

In [80]:
# Spot-check table: the first PREVIEW_ROWS games, one column per scraped list.
PREVIEW_ROWS = 10
preview_columns = {
    'date': date,
    'visiting_team': visiting,
    'visiting_score': visiting_score,
    'home_team': home,
    'home_score': home_score,
}
oct1819 = pd.DataFrame(
    {label: values[:PREVIEW_ROWS] for label, values in preview_columns.items()}
)
oct1819

Unnamed: 0,date,visiting_team,visiting_score,home_team,home_score
0,2018-10-16,Philadelphia 76ers,87,Boston Celtics,105
1,2018-10-16,Oklahoma City Thunder,100,Golden State Warriors,108
2,2018-10-17,Milwaukee Bucks,113,Charlotte Hornets,112
3,2018-10-17,Brooklyn Nets,100,Detroit Pistons,103
4,2018-10-17,Memphis Grizzlies,83,Indiana Pacers,111
5,2018-10-17,Miami Heat,101,Orlando Magic,104
6,2018-10-17,Atlanta Hawks,107,New York Knicks,126
7,2018-10-17,Cleveland Cavaliers,104,Toronto Raptors,116
8,2018-10-17,New Orleans Pelicans,131,Houston Rockets,112
9,2018-10-17,Minnesota Timberwolves,108,San Antonio Spurs,112


In [81]:
oct1819.dtypes # check column dtypes: dates were parsed with strptime when scraped, scores were cast with int()

date              datetime64[ns]
visiting_team             object
visiting_score             int64
home_team                 object
home_score                 int64
dtype: object