In [1]:
import numpy as np
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import stats
import pandas as pd

import seaborn as sns
from progressbar import ProgressBar

In [2]:
# getting links for box scores for each game
url0 = 'https://www.basketball-reference.com/leagues/NBA_2021_games-'
months = list(['december','january','february','march','april','may','june'])

box_score_url_list = list()

for i in range(len(months)):
    url = url0 + months[i] + '.html'
    r = requests.get(url)
    soup = BeautifulSoup(r.text,"lxml")
    soup_items = soup.findAll('td',attrs={"data-stat": "box_score_text"})
    bs0 = [element.find('a')['href'] for element in soup_items]
    
    [box_score_url_list.append(item) for item in bs0]
    

In [3]:
# go through box score for each game and get:
# - winner pts, loser pts, winner FTA, loser FTA, winner FTM


# because 2020-2021 season is short (72 games) discard items after the 1080th game (play-in/playoff games)
del box_score_url_list[1080:]

data_stat_list=['fg','fga','fg3','fg3a','ft','fta','pts']
data_win = np.zeros([1080,len(data_stat_list)])
data_lose = np.zeros([1080,len(data_stat_list)])

pbar=ProgressBar()
for i in pbar(range(len(box_score_url_list))):
    url = 'https://www.basketball-reference.com' + box_score_url_list[i]
    r = requests.get(url)
    soup = BeautifulSoup(r.text,"lxml")

    table_list = soup.findAll('table')
    data_temp=[np.zeros([1,len(data_stat_list)]),np.zeros([1,len(data_stat_list)])]
    k=-1
    for tt1 in table_list:
        is_summary_table=tt1['id'][-10:]=='game-basic'
        if is_summary_table:
            k=k+1
            k2=-1
            tt2=tt1.findAll('tfoot')[0]
            for data_stat in data_stat_list:
                k2=k2+1
                data_temp[k][0,k2]=float(tt2.findAll('td',attrs={'data-stat':data_stat})[0].string)

            if data_temp[0][:,-1]>data_temp[1][:,-1]:
                data_win[i,:]=data_temp[0]
                data_lose[i,:]=data_temp[1]
            else:
                data_win[i,:]=data_temp[1]
                data_lose[i,:]=data_temp[0]
    
    

df = pd.DataFrame(np.concatenate((data_win,data_lose),axis=1),columns=['fg_win','fga_win','fg3_win','fg3a_win','ft_win','fta_win','pts_win','fg_lose','fga_lose','fg3_lose','fg3a_lose','ft_lose','fta_lose','pts_lose'])
df['pt_diff'] = df.pts_win - df.pts_lose
df.to_csv('2021_season_games_winner_loser_shot_data.csv',index=False)



  2% |#                                                                       |

KeyboardInterrupt: 

In [None]:
print sum(df['ft_win']/df['fta_win'] > df['ft_lose']/df['fta_lose'])/float(len(df))
print sum(df['fta_win'] > df['fta_lose'])/float(len(df))
print sum(df['ft_win'] > df['ft_lose'])/float(len(df))
print sum(df['pt_diff'] < (df['fta_lose']-df['ft_lose']))/float(len(df))

In [None]:
fti_range = np.linspace(0,0.7,num=71)
games_flipped = np.zeros([len(fti_range),1])
for i,improv in enumerate(fti_range):
    extra_ft = np.amin([df['fta_lose']*improv,df['fta_lose']-df['ft_lose']],axis=0)
    games_flipped[i] = float(sum((df['pt_diff'] - extra_ft)<0))/len(df)*100
    
sns.despine()
plt.plot(fti_range*100,games_flipped)
plt.xlabel('losing team FT percentage increase (percent)')
plt.ylabel('percent of regular season games flipped')
plt.show()
    

# How many more games would a team win if their free-throw percentage was improved?

In [170]:
# function for getting table data
def get_current_opponent_tables(table_list,team_abbr):
    for i3,box_table in enumerate(table_list):
        if box_table['id'][-10:]=='game-basic' and box_table['id'][-14:-11]==team_abbr:
            current_table = box_table
        elif box_table['id'][-10:]=='game-basic' and box_table['id'][-14:-11]!=team_abbr:
            opponent_table = box_table
    
    return [current_table,opponent_table]

def get_table_data(table,stat_list):
    tt2=table.findAll('tfoot')
    data = list()
    for data_stat in stat_list:
        data.append(float(tt2[0].findAll('td',attrs={'data-stat':data_stat})[0].string))
    
    return np.array(data)


def get_list_of_teams(year):
    url = 'https://www.basketball-reference.com/leagues/NBA_' + str(year) + '_standings.html'
    r=requests.get(url)
    soup=BeautifulSoup(r.text,"lxml")
    t1=soup.findAll('table')
    t2a=list()
    [t2a.append(i.findAll('tbody')) for i in t1]
    team_url_list = list()
    for i in t2a:
        t3 = i[0].findAll('th',attrs={'data-stat':'team_name'})
        for k in t3: team_url_list.append(k.find('a')['href'][0:-9])
    team_url_list.sort()
    return team_url_list

def get_abbr_list_from_team_list(team_url_list):
    abbr_list = list()
    [abbr_list.append(item[7:-1]) for item in team_url_list]
    return abbr_list

In [178]:
# go through each game from last 5 seasons for each team
# get the box scores for each game
url0 = 'https://www.basketball-reference.com'
year_list = [2015,2016,2017,2018,2019]
data_stat_list=['fg','fga','fg3','fg3a','ft','fta','pts']
combined_data = list()
team_list_by_year = list()
for i0,year in enumerate(year_list):
    
    
    team_url_list = get_list_of_teams(year)
    team_abbr_list = get_abbr_list_from_team_list(team_url_list)
    team_list_by_year.append(team_url_list)
    
    all_team_data=list()
    print 'Year: ' + str(year) + '...'
    pbar=ProgressBar()
    for i1 in pbar(range(len(team_url_list))):
        team_url = team_url_list[i1]
        
        team_data = list()
        opponent_data = list()
        url = url0 + team_url + str(year) + '_games.html'
        r = requests.get(url)
        soup = BeautifulSoup(r.text,"lxml")
        
        t1=soup.find('table')
        t2=t1.findAll('td',attrs={'data-stat':'box_score_text'})
        t3=list()
        [t3.append(item.find('a')['href']) for item in t2]
        
        team_abbr = team_abbr_list[i1]
        ind_team_data = np.zeros([len(data_stat_list),len(t3),2])
        for i2,game_url in enumerate(t3):
            url2 = url0 + game_url
            r2 = requests.get(url2)
            soup = BeautifulSoup(r2.text,"lxml")
            qtable = get_current_opponent_tables(soup.findAll('table'),team_abbr)
            
            ind_team_data[:,i2,0]=get_table_data(qtable[0],data_stat_list)
            ind_team_data[:,i2,1]=get_table_data(qtable[1],data_stat_list)
        
        all_team_data.append(ind_team_data)
    combined_data.append(all_team_data)
                    

  0% |                                                                        |

Year: 2015...


100% |########################################################################|
  0% |                                                                        |

Year: 2016...


100% |########################################################################|
  0% |                                                                        |

Year: 2017...


100% |########################################################################|
  0% |                                                                        |

Year: 2018...


100% |########################################################################|
  0% |                                                                        |

Year: 2019...


100% |########################################################################|


In [177]:
for i0,year in enumerate(year_list): print year

2015
2016
2017
2018
2019


In [185]:
from tqdm import tqdm

for i in tqdm(range(100)):
    r=requests.get(url)

100%|██████████| 100/100 [00:08<00:00, 12.06it/s]


In [183]:
url_list

NameError: name 'url_list' is not defined

In [184]:
url

'https://www.basketball-reference.com/teams/WAS/2019_games.html'

In [186]:
!pip install pickle

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support pip 21.0 will remove support for this functionality.[0m
[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m
[31mERROR: No matching distribution found for pickle[0m
