# CS 109A/STAT 121A/AC 209A/CSCI E-109A: 
# Final Project - 2017

**Harvard University**<br/>
**Fall 2017**<br/>
**Instructors**: Pavlos Protopapas, Kevin Rader, Rahul Dave, Margo Levine<br/>
**Leading TF**: Albert Wu<br/>
**Project Group #**: 16 (Sports)

---

In [162]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import IFrame, HTML
import requests
from bs4 import BeautifulSoup
import seaborn as sns

In [39]:
# Play-by-play page of the sample game used to develop the pipeline.
url_pbp = 'https://www.basketball-reference.com/boxscores/pbp/201612010GSW.html'
# The matching box-score page is the same address without the '/pbp' segment.
url_box = ''.join(url_pbp.split('/pbp'))
url_box

'https://www.basketball-reference.com/boxscores/201612010GSW.html'

In [40]:
# Preview the raw play-by-play table as pandas parses it from the page.
# Uses `url_pbp` (defined above): the bare name `url` only exists once the
# scraping loop further down has run, so it is undefined on a fresh kernel.
game_pd_raw = pd.read_html(url_pbp, header=1)[0]
game_pd_raw.head()

Unnamed: 0,Time,Houston,Unnamed: 2,Score,Unnamed: 4,Golden State
0,12:00.0,Start of 1st quarter,,,,
1,12:00.0,Jump ball: C. Capela vs. Z. Pachulia (R. Ander...,,,,
2,11:38.0,T. Ariza makes 2-pt shot from 1 ft,2.0,2-0,,
3,11:13.0,,,2-2,2.0,K. Durant makes 2-pt shot at rim (assist by Z....
4,11:01.0,C. Capela makes 2-pt shot from 3 ft,2.0,4-2,,


In [89]:
def _period_end_seconds(period):
    """Return the elapsed game time, in seconds, at the end of a period.

    `period` is 0-based: 0-3 are the four 12-minute quarters, 4 and above are
    5-minute overtime periods.
    """
    return 720 * min(4, period + 1) + 300 * max(0, period - 3)


def process_game_play(url):
    """Download one play-by-play table and turn it into a per-event frame.

    Parameters
    ----------
    url : str
        Address handed to `pd.read_html`; the first table on the page is used,
        with its second header row as the column names.

    Returns
    -------
    pandas.DataFrame
        One row per retained event (period markers and the opening
        "start of game" row are removed), indexed 0..n-1, with columns:
        `time`, `away_team`, `home_team`, `events`, `is_home_event`,
        `away_score`, `home_score`, `score_diff` (home minus away),
        `home_win` (1 if the home score is higher on the last row),
        `t_elapsed` (seconds since tip-off, float), one 0/1 flag per event
        keyword, `substitution`, and the running field-goal percentages
        `home_perc` / `away_perc`.

    Notes
    -----
    The parsing assumes the table layout shown in the notebook preview: six
    columns (time, away events, away points, score, home points, home events)
    and, at every period change, four consecutive non-play rows (end of
    period, period title such as '2nd Q', repeated column header, start of
    period). Verify against the live page if the site layout changes.
    """
    game_pd = pd.read_html(url, header=1)[0]

    # Set proper initial headers; the team names are the original headers of
    # the two event columns.
    away_team = game_pd.columns[1]
    home_team = game_pd.columns[5]
    game_pd.columns = ['time', 'away_events', 'away_pts', 'score', 'home_pts', 'home_events']
    game_pd['away_team'] = away_team
    game_pd['home_team'] = home_team

    # Combine home and away team events into a single column and remember
    # which side each event belongs to.
    game_pd['events'] = game_pd['home_events'].fillna(game_pd['away_events'])
    game_pd['is_home_event'] = game_pd['home_events'].notnull().astype(int)

    # Fill invalid scores: repeated header cells ('Score') and empty cells take
    # the next available score; rows after the last score are dropped.
    score = game_pd['score']
    game_pd['score'] = score.mask(score == 'Score').bfill()
    game_pd = game_pd.drop(game_pd.index[game_pd['score'].isnull()], axis=0)

    # Process scores ("away-home" strings) into integer arrays.
    score_str = game_pd['score'].str.split('-').tolist()
    away_score, home_score = np.transpose(np.array(score_str))
    away_score = away_score.astype(int)
    home_score = home_score.astype(int)
    # The first two rows precede any basket but were back-filled with the
    # first real score, so reset them to 0-0.
    away_score[:2] = 0
    home_score[:2] = 0
    game_pd['away_score'] = away_score
    game_pd['home_score'] = home_score
    game_pd['score_diff'] = home_score - away_score
    game_pd['home_win'] = int(home_score[-1] > away_score[-1])

    # Drop useless columns
    game_pd = game_pd.drop(['away_events', 'away_pts', 'score', 'home_pts', 'home_events'], axis=1)

    # Calculate elapsed time. `time` holds the clock remaining in the current
    # period, so each period is converted separately against its end time.
    game_pd['t_elapsed'] = game_pd['time'].str.split(':')

    def to_elapsed(clock_parts, period):
        period_end = _period_end_seconds(period)
        return clock_parts.apply(
            lambda s: period_end - (float(s[0]) * 60 + float(s[1])))

    quarter_str = ['2nd Q', '3rd Q', '4th Q', '1st OT', '2nd OT', '3rd OT', '4th OT']
    last_ind = 0
    period = 0
    for marker in quarter_str:
        is_marker = game_pd['time'].str.contains(marker, na=False).to_numpy(dtype=bool)
        if not is_marker.any():
            break
        ind_Q = game_pd.index[is_marker][0]
        # Remove the four non-play rows around the period title.
        game_pd = game_pd.drop(np.arange(ind_Q - 1, ind_Q + 3, 1).tolist(), axis=0)
        # Rows between the previous marker and this one belong to `period`.
        game_pd.loc[last_ind:ind_Q, 't_elapsed'] = to_elapsed(
            game_pd.loc[last_ind:ind_Q, 't_elapsed'], period)
        last_ind = ind_Q
        period += 1
    # Whatever follows the last marker found is the final period of the game
    # (this also covers a game in which every marker in the list was found).
    game_pd.loc[last_ind:, 't_elapsed'] = to_elapsed(
        game_pd.loc[last_ind:, 't_elapsed'], period)
    game_pd['t_elapsed'] = game_pd['t_elapsed'].astype(float)

    # Remove the opening "start of game" row and renumber from zero.
    game_pd = game_pd.drop(0, axis=0).reset_index(drop=True)

    # Process event information: one 0/1 column per keyword found in the text.
    event_type = ['makes', 'miss', '2-pt', '3-pt', 'free throw', 'assist', 'Defensive rebound',
                  'Offensive rebound', 'Turnover', 'foul']
    for e in event_type:
        game_pd[e.replace(' ', '_')] = game_pd['events'].str.contains(e, na=False).astype(int)

    game_pd['substitution'] = game_pd['events'].str.contains('enters the game for', na=False).astype(int)

    # Calculate shooting percentage: running field goals made / attempted,
    # free throws excluded; 0 until a side has attempted a field goal.
    fg_made = game_pd['makes'] * (1 - game_pd['free_throw'])
    fg_attempted = game_pd['makes'] + game_pd['miss'] - game_pd['free_throw']
    is_home = game_pd['is_home_event']
    home_perc = ((is_home * fg_made).cumsum() / (is_home * fg_attempted).cumsum()).fillna(0)
    away_perc = (((1 - is_home) * fg_made).cumsum() / ((1 - is_home) * fg_attempted).cumsum()).fillna(0)

    game_pd['home_perc'] = home_perc
    game_pd['away_perc'] = away_perc

    return game_pd


In [90]:
# Run the full pipeline on the sample game and preview the first rows.
game_pd = process_game_play(url=url_pbp)
game_pd.head(n=5)

Unnamed: 0,time,away_team,home_team,events,is_home_event,away_score,home_score,score_diff,home_win,t_elapsed,...,3-pt,free_throw,assist,Defensive_rebound,Offensive_rebound,Turnover,foul,substitution,home_perc,away_perc
0,12:00.0,Houston,Golden State,Jump ball: C. Capela vs. Z. Pachulia (R. Ander...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
1,11:38.0,Houston,Golden State,T. Ariza makes 2-pt shot from 1 ft,0,2,0,-2,0,22,...,0,0,0,0,0,0,0,0,0.0,1.0
2,11:13.0,Houston,Golden State,K. Durant makes 2-pt shot at rim (assist by Z....,1,2,2,0,0,47,...,0,0,1,0,0,0,0,0,1.0,1.0
3,11:01.0,Houston,Golden State,C. Capela makes 2-pt shot from 3 ft,0,4,2,-2,0,59,...,0,0,0,0,0,0,0,0,1.0,1.0
4,10:48.0,Houston,Golden State,S. Curry makes 2-pt shot from 12 ft (assist by...,1,4,4,0,0,72,...,0,0,1,0,0,0,0,0,1.0,1.0


In [93]:
# Sanity check on the end of the game: final clock, margin and outcome flag.
game_pd.loc[:, ['t_elapsed', 'score_diff', 'home_win']].tail(n=10)

Unnamed: 0,t_elapsed,score_diff,home_win
592,3472,-4,0
593,3472,-5,0
594,3472,-5,0
595,3472,-3,0
596,3472,-5,0
597,3472,-5,0
598,3474,-5,0
599,3477,-5,0
600,3478,-5,0
601,3479,-5,0


In [166]:
# Collect play-by-play URLs from the monthly schedule pages of the
# 2014-2015, 2015-2016 and 2016-2017 seasons.
# NOTE: the slices in the loops below currently restrict the crawl to the
# first entry of `years` and the first three entries of `months`; widen them
# to cover every season and month listed.

url_base = 'https://www.basketball-reference.com/leagues/NBA_year_games-month.html'

years = ['2015', '2016', '2017']
months = ['october', 'november', 'december', 'january', 'february', 'march',
          'april', 'may', 'june']

game_urls = []
for y in years[0:1]:
    for m in months[0:3]:
        url_schedule = url_base.replace('year', y).replace('month', m)
        # A timeout keeps one stalled request from hanging the whole crawl.
        response = requests.get(url_schedule, timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        # href=True skips anchors without an href attribute, for which
        # ref.get('href') would return None and break the startswith check.
        for ref in soup.find_all('a', href=True):
            link = ref['href']
            if link.startswith('/boxscores/2'):
                # Box-score links are rewritten to their play-by-play page.
                game_urls.append('https://www.basketball-reference.com' + link.replace('boxscores/', 'boxscores/pbp/'))

In [167]:
# Number of play-by-play URLs collected by the schedule crawl.
len(game_urls)

481

In [168]:
# Process every collected game, keeping only the three columns needed for
# the win-percentage grid (each iteration downloads one page).
game_pd_all = []
for url in game_urls:
    processed = process_game_play(url)
    game_pd_all.append(processed.loc[:, ['t_elapsed', 'score_diff', 'home_win']])

In [None]:
# Spot-check one processed game from the collection.
game_pd_all[2].head(n=5)

Unnamed: 0,t_elapsed,score_diff,home_win
0,0,0,1
1,12,0,1
2,13,0,1
3,28,0,1
4,29,0,1


In [None]:
def compute_win_perc(games, t_range, l_range, half_width=2):
    """Fraction of home wins among games that visited each (margin, time) cell.

    A game visits the cell (l_range[il], t_range[it]) when at least one of its
    rows has `t_elapsed` strictly within `half_width` of t_range[it] AND
    `score_diff` strictly within `half_width` of l_range[il]. Each game counts
    at most once per cell, and its outcome is the `home_win` value of its
    first row.

    Parameters
    ----------
    games : iterable of pandas.DataFrame
        Frames with `t_elapsed`, `score_diff` and `home_win` columns.
    t_range, l_range : array-like
        Grid centres for elapsed time and score difference.
    half_width : number, optional
        Half-size of the (open) matching window on both axes.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(l_range), len(t_range)); cells that no game
        visited are 0.
    """
    t_grid = np.asarray(t_range, dtype=float)
    l_grid = np.asarray(l_range, dtype=float)
    wins = np.zeros((len(l_grid), len(t_grid)))
    totals = np.zeros_like(wins)

    for game in games:
        if len(game) == 0:
            continue  # nothing to match, and no outcome row to read
        t = game['t_elapsed'].to_numpy(dtype=float)[:, None]
        diff = game['score_diff'].to_numpy(dtype=float)[:, None]
        # Row-by-grid membership tables, shapes (rows, n_t) and (rows, n_l).
        t_hit = (t < t_grid + half_width) & (t > t_grid - half_width)
        l_hit = (diff < l_grid + half_width) & (diff > l_grid - half_width)
        # Entry (il, it) counts the rows matching both windows; > 0 means the
        # game visited that cell. Counts are small integers, exact in float32.
        visited = (l_hit.T.astype(np.float32) @ t_hit.astype(np.float32)) > 0
        totals += visited
        wins += visited * game['home_win'].iloc[0]

    return np.divide(wins, totals, out=np.zeros_like(wins), where=totals > 0)


# Grid: one column per elapsed second, one row per score difference (home - away).
t_range = np.arange(0, 4000, 1)
l_range = np.arange(-40, 40, 1)
# Vectorised per game instead of filtering every frame once per grid cell;
# it also avoids rebinding the notebook-level `game_pd` as a loop variable.
win_perc = compute_win_perc(game_pd_all, t_range, l_range)

In [None]:
# Heatmap of the win-percentage grid (rows follow l_range, columns follow
# t_range), drawn on an explicit axes with the frame and ticks hidden.
fig, ax = plt.subplots(figsize=(27, 18))
ax.axis('off')
sns.heatmap(win_perc, cmap='bwr', ax=ax)