# CS 109A/STAT 121A/AC 209A/CSCI E-109A: 
# Final Project - 2017

**Harvard University**<br/>
**Fall 2017**<br/>
**Instructors**: Pavlos Protopapas, Kevin Rader, Rahul Dave, Margo Levine<br/>
**Leading TF**: Albert Wu<br/>
**Project Group #**: 16 (Sports)

---

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import IFrame, HTML
import requests
from bs4 import BeautifulSoup

In [39]:
# Play-by-play page of the game under study. The matching box-score page
# lives at the same address with the play-by-play path segment removed.
url_pbp = 'https://www.basketball-reference.com/boxscores/pbp/201612010GSW.html'
pbp_segment = '/pbp'
url_box = url_pbp.replace(pbp_segment, '')
url_box

'https://www.basketball-reference.com/boxscores/201612010GSW.html'

In [40]:
# Scrape the play-by-play table. `header=1` takes the second header row as
# the column names, and the first table parsed from the page is kept.
# NOTE: this must read `url_pbp`; a bare `url` is not defined anywhere in the
# notebook and only worked through leftover kernel state.
game_pd_raw = pd.read_html(url_pbp, header=1)[0]
game_pd_raw.head()

Unnamed: 0,Time,Houston,Unnamed: 2,Score,Unnamed: 4,Golden State
0,12:00.0,Start of 1st quarter,,,,
1,12:00.0,Jump ball: C. Capela vs. Z. Pachulia (R. Ander...,,,,
2,11:38.0,T. Ariza makes 2-pt shot from 1 ft,2.0,2-0,,
3,11:13.0,,,2-2,2.0,K. Durant makes 2-pt shot at rim (assist by Z....
4,11:01.0,C. Capela makes 2-pt shot from 3 ft,2.0,4-2,,


In [43]:
# Build a tidy, one-row-per-event play-by-play table from the raw scrape.
# Working on a fresh copy keeps the cell idempotent: re-running it never
# mutates game_pd_raw.
game_pd = game_pd_raw.copy()

# Set proper initial headers. The raw header row carries the two team names
# in the away-event (position 1) and home-event (position 5) columns.
away_team = game_pd.columns[1]
home_team = game_pd.columns[5]
game_pd.columns = ['time', 'away_events', 'away_pts', 'score', 'home_pts', 'home_events']
game_pd['away_team'] = away_team
game_pd['home_team'] = home_team

# Combine home and away team events into one column; an event is a home
# event exactly when the home column holds the text.
events = game_pd['home_events'].fillna(game_pd['away_events'])
game_pd['events'] = events
game_pd['is_home_event'] = game_pd['home_events'].notnull().astype(int)

# Fill invalid scores. Anything that is not of the form 'A-B' (e.g. the
# repeated 'Score' header row, or a missing value) is treated as missing and
# back-filled from the next reported score.
raw_score = game_pd['score']
has_score = raw_score.str.match(r'^\d+-\d+$', na=False)
score = raw_score.where(has_score).bfill()
# Rows that come before the first reported score would otherwise inherit the
# result of the first play, so pin them to 0-0. This replaces a hard-coded
# "first two rows" assumption with a data-driven mask.
before_first_score = (has_score.cumsum() == 0) & score.notnull()
game_pd['score'] = score.mask(before_first_score, '0-0')
# Rows still missing a score (nothing after them to back-fill from) are dropped.
game_pd = game_pd.drop(game_pd.index[game_pd['score'].isnull()], axis=0)

# Process scores: 'away-home' strings -> two integer arrays.
score_str = game_pd['score'].str.split('-').tolist()
away_score, home_score = np.array(score_str).T.astype(int)
game_pd['away_score'] = away_score
game_pd['home_score'] = home_score
game_pd['score_diff'] = home_score - away_score
# Same value on every row: 1 if the home team leads at the last event.
game_pd['home_win'] = int(home_score[-1] > away_score[-1])

# Drop useless columns
game_pd = game_pd.drop(['away_events', 'away_pts', 'score', 'home_pts', 'home_events'], axis=1)

# Calculate elapsed time (seconds since tip-off) from the per-period clock.
REGULATION_PERIOD_SEC = 720  # quarter length implied by the 12:00.0 starting clock
OVERTIME_PERIOD_SEC = 300    # length used for each overtime period


def _clock_to_seconds(clock):
    """Convert a 'MM:SS.s' clock string into seconds remaining in the period."""
    parts = clock.split(':')
    return float(parts[0]) * 60 + float(parts[1])


# Marker text that opens every period after the first one.
quarter_str = ['2nd Q', '3rd Q', '4th Q', '1st OT', '2nd OT', '3rd OT', '4th OT']
game_pd['t_elapsed'] = np.nan
last_ind = 0
# One extra iteration so that the final period is converted even when every
# marker in quarter_str is present.
for i in range(len(quarter_str) + 1):
    # Iteration i converts the period that ENDS at marker i: i = 0..3 are the
    # four quarters, i >= 4 are overtime periods. The overtime offset is
    # therefore (i - 3); using (i - 2) would shift the 4th quarter of an
    # overtime game by a full overtime period.
    period_end = (REGULATION_PERIOD_SEC * min(4, i + 1)
                  + OVERTIME_PERIOD_SEC * max(0, i - 3))

    ind_Q = None
    if i < len(quarter_str):
        is_marker = game_pd['time'].str.contains(quarter_str[i], na=False)
        if is_marker.any():
            ind_Q = game_pd.index[is_marker][0]
            # Remove the four rows around the period boundary: the row before
            # the marker, the marker itself, and the two rows after it.
            game_pd = game_pd.drop(np.arange(ind_Q - 1, ind_Q + 3, 1).tolist(), axis=0)

    # Label-based slice; an upper bound of None means "through the last row".
    remaining = game_pd.loc[last_ind:ind_Q, 'time'].apply(_clock_to_seconds)
    game_pd.loc[last_ind:ind_Q, 't_elapsed'] = period_end - remaining

    if ind_Q is None:
        # No further period marker: the rows just converted were the last period.
        break
    last_ind = ind_Q

# Drop the row labelled 0 (the first row of the scrape) and renumber.
game_pd = game_pd.drop(0, axis=0).reset_index(drop=True)

# Process event information: one 0/1 indicator column per keyword, set when
# the keyword occurs in the event text (missing text counts as no match).
event_type = ['makes', 'miss', '2-pt', '3-pt', 'free throw', 'assist', 'Defensive rebound',
              'Offensive rebound', 'Turnover', 'foul']
for e in event_type:
    game_pd[e.replace(' ', '_')] = game_pd['events'].str.contains(e, na=False).astype(int)

game_pd.head(10)

Unnamed: 0,time,away_team,home_team,events,is_home_event,away_score,home_score,score_diff,home_win,t_elapsed,makes,miss,2-pt,3-pt,free_throw,assist,Defensive_rebound,Offensive_rebound,Turnover,foul
0,12:00.0,Houston,Golden State,Jump ball: C. Capela vs. Z. Pachulia (R. Ander...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,11:38.0,Houston,Golden State,T. Ariza makes 2-pt shot from 1 ft,0,2,0,-2,0,22,1,0,1,0,0,0,0,0,0,0
2,11:13.0,Houston,Golden State,K. Durant makes 2-pt shot at rim (assist by Z....,1,2,2,0,0,47,1,0,1,0,0,1,0,0,0,0
3,11:01.0,Houston,Golden State,C. Capela makes 2-pt shot from 3 ft,0,4,2,-2,0,59,1,0,1,0,0,0,0,0,0,0
4,10:48.0,Houston,Golden State,S. Curry makes 2-pt shot from 12 ft (assist by...,1,4,4,0,0,72,1,0,1,0,0,1,0,0,0,0
5,10:26.0,Houston,Golden State,R. Anderson misses 2-pt shot from 3 ft (block ...,0,4,4,0,0,94,0,1,1,0,0,0,0,0,0,0
6,10:25.0,Houston,Golden State,Offensive rebound by R. Anderson,0,4,4,0,0,95,0,0,0,0,0,0,0,1,0,0
7,10:23.0,Houston,Golden State,R. Anderson makes 2-pt shot from 1 ft,0,6,4,-2,0,97,1,0,1,0,0,0,0,0,0,0
8,10:15.0,Houston,Golden State,Z. Pachulia misses 2-pt shot from 2 ft (block ...,1,6,4,-2,0,105,0,1,1,0,0,0,0,0,0,0
9,10:15.0,Houston,Golden State,Defensive rebound by Team,0,6,4,-2,0,105,0,0,0,0,0,0,1,0,0,0


In [33]:
# Inspect the combined event descriptions.
game_pd.loc[:, 'events']

0      Jump ball: C. Capela vs. Z. Pachulia (R. Ander...
1                     T. Ariza makes 2-pt shot from 1 ft
2      K. Durant makes 2-pt shot at rim (assist by Z....
3                    C. Capela makes 2-pt shot from 3 ft
4      S. Curry makes 2-pt shot from 12 ft (assist by...
5      R. Anderson misses 2-pt shot from 3 ft (block ...
6                       Offensive rebound by R. Anderson
7                  R. Anderson makes 2-pt shot from 1 ft
8      Z. Pachulia misses 2-pt shot from 2 ft (block ...
9                              Defensive rebound by Team
10     Loose ball foul by Z. Pachulia (drawn by P. Be...
11                   J. Harden makes 2-pt shot from 3 ft
12                 K. Durant misses 3-pt shot from 24 ft
13                        Defensive rebound by J. Harden
14     R. Anderson makes 3-pt shot from 26 ft (assist...
15      Personal foul by P. Beverley (drawn by S. Curry)
16                         Offensive foul by Z. Pachulia
17              Turnover by Z. 

In [44]:
# Download the box-score page and parse it with the built-in HTML parser.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page.
box1_req = requests.get(url_box, timeout=30)
box1_req.raise_for_status()
box1_soup = BeautifulSoup(box1_req.text, 'html.parser')

In [46]:
# Collect every <table> carrying the 'stats_table' CSS class.
# find_all's third positional parameter is `recursive`, not a second class
# name, so passing 'sortable' there only acted as a truthy flag. The class
# filter is spelled out explicitly here; the matched tables are the same.
box1_tables = box1_soup.find_all('table', class_='stats_table')

In [56]:
# Position, within box1_tables, of the table to inspect.
table_idx = 3
box1_tables[table_idx]

<table class="sortable stats_table" data-cols-to-freeze="1" id="box_gsw_advanced"><caption> Table</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr class="over_header"><th></th>
<th aria-label="" class=" over_header center" colspan="15" data-stat="header_tmp">Advanced Box Score Stats</th>
</tr>
<tr>
<th aria-label="Starters" class=" poptip sort_default_asc center" data-stat="player" scope="col">Starters</th>
<th aria-label="Minutes Played" class=" poptip center" data-over-header="Advanced Box Score Stats" data-stat="mp" data-tip="Minutes Played" scope="col">MP</th>
<th aria-label="True Shooting Percentage" class=" poptip center" data-over-header="Advanced Box Score Stats" data-stat="ts_pct" data-tip="&lt;strong&gt;True Shooting Percentage&lt;/strong&gt;&lt;br&gt;A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws." scope="col">TS%</th>

In [28]:
# Player names are the text of the leading <th> cell of each table row.
# NOTE(review): box1_rows is not defined in any visible cell; confirm where
# it is created so the notebook survives Restart & Run All.
players = [row.find('th').get_text() for row in box1_rows]
# 'Reserves' is a separator row, not a player.
players.remove('Reserves')

In [42]:
# Display the collected player names (the 'Reserves' entry was removed when the list was built).
players

['Jaylen Brown',
 'Kyrie Irving',
 'Jayson Tatum',
 'Al Horford',
 'Gordon Hayward',
 'Marcus Smart',
 'Terry Rozier',
 'Aron Baynes',
 'Semi Ojeleye',
 'Shane Larkin',
 'Abdel Nader',
 'Daniel Theis']