In [81]:
## Imports 

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time


team_handles = {'Toronto Raptors': 'TOR',
                     'Boston Celtics': 'BOS',
                     'Philadelphia 76ers': 'PHI',
                     'Cleveland Cavaliers': 'CLE',
                     'Indiana Pacers': 'IND',
                     'Miami Heat': 'MIA',
                     'Milwaukee Bucks': 'MIL',
                     'Washington Wizards': 'WAS',
                     'Detroit Pistons': 'DET',
                     'Charlotte Hornets': 'CHO',
                     'New York Knicks': 'NYK',
                     'Brooklyn Nets': 'BRK',
                     'Chicago Bulls': 'CHI',
                     'Orlando Magic': 'ORL',
                     'Atlanta Hawks': 'ATL',
                     'Houston Rockets': 'HOU',
                     'Golden State Warriors': 'GSW',
                     'Portland Trail Blazers': 'POR',
                     'Oklahoma City Thunder': 'OKC',
                     'Utah Jazz': 'UTA',
                     'New Orleans Pelicans': 'NOP',
                     'San Antonio Spurs': 'SAS',
                     'Minnesota Timberwolves': 'MIN',
                     'Denver Nuggets': 'DEN',
                     'Los Angeles Clippers': 'LAC',
                     'Los Angeles Lakers': 'LAL',
                     'Sacramento Kings': 'SAC',
                     'Dallas Mavericks': 'DAL',
                     'Memphis Grizzlies': 'MEM',
                     'Phoenix Suns': 'PHO'}

# HELPER FUNCTIONS

# Clean date to calculated date format for webscraping
def clean_date(date):
    formatted_date = date.replace('-', '')
    return formatted_date

# INPUT: csv_file name as str, df that needs saved
def save_df(csv_name, df):
    compression_opts = dict(method='zip',
                        archive_name=csv_name+'.csv')

    df.to_csv(csv_name+'.zip', index=False, compression=compression_opts)
    return 'Saved'




In [16]:
# get the current season game info by importing team handle and year (2021)

def get_current_season_game_info(team_handle):
    season_page = requests.get(f'https://www.basketball-reference.com/teams/{team_handle}/2021_games.html')
    season_page = BeautifulSoup(season_page.text, 'html.parser')

    stats = ['date_game', 'game_start_time', 'network', 'opp_name', 'game_result', 'overtimes', 'pts', 'opp_pts', 'wins', 'losses', 'game_streak']
    stats_list = [[td.getText() for td in season_page.findAll('td', {'data-stat': stat})] for stat in stats]
    
    box_scores = []
    dates = []
    
    for row in season_page.find('table', {'id': 'games'}).tbody.find_all('tr'):
        _class = row.get("class")

        #skip table body header
        if _class is not None and "thead" == _class[0]:
            continue
            
        game_result = row.find('td', {'data-stat': 'game_result'}).getText()
        
        # if there isnt a game result yet, the game has not played and we dont need that info
        if game_result == '':
            return stats_list, box_scores, dates
        
        # only get every teams home game so we do not have duplicates
        game_loc = row.find('td', {'data-stat': 'game_location'}).getText()
        if game_loc == '':
            box_score = row.find('td', {'data-stat': 'box_score_text'}).find('a')['href']
            box_scores.append(box_score)
            date = row.find('td', {'data-stat': 'date_game'})['csk']
            dates.append(date)
    return stats_list, box_scores, dates


# function to get box score stats of advanced and basic with gid=/boxscores/201903010ATL.html
def get_box_score_stats(gid):
    box_score_page = requests.get(f'https://www.basketball-reference.com/{gid}')
    box_score_page = BeautifulSoup(box_score_page.text, 'html.parser')
    bs_page_teams = []
    bs_page_score = []
    
    # get team names 
    for item in box_score_page.find('div', attrs={'class', 'scorebox'}).find_all('strong'):
        team_slug = team_handles[item.text.replace('\n', '')]
        bs_page_teams.append(team_slug.lower())
    
    # get teams score
    for score in box_score_page.find('div', attrs={'class', 'scorebox'}).find_all('div', attrs={'class', 'score'}):
        bs_page_score.append(score.getText())
    
    box_score_data = []
    advanced_score_data = []
    tables = box_score_page.find_all("table")
    for i, table in enumerate(tables, start=1):
        for td in table.find_all('tfoot'):
            box_stats1 = ['mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']
            advanced_stats = ['ts_pct', 'efg_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct','usg_pct', 'off_rtg', 'def_rtg']
            data = [[td1.getText() for td1 in td.findAll('td', {'data-stat': stat})] for stat in box_stats1]
            box_score_data.append(data)
            advanced_score = [[td2.getText() for td2 in td.findAll('td', {'data-stat': a_stat})] for a_stat in advanced_stats]
            advanced_score_data.append(advanced_score)
    return bs_page_teams, box_score_data, advanced_score_data, bs_page_score


In [18]:
# Processing basketball reference data

all_dates = []
all_box_scores = []
for team in team_handles.values():
    stats_list, box_scores, dates = get_current_season_game_info(team)
    all_box_scores.append(box_scores)
    all_dates.append(dates)



final_list_basic = []
final_list_advanced = []
ordered_teams = []
ordered_scores = []
for team_box_score in all_box_scores:
    for box_score in team_box_score:
        teams, b_score_data, a_score_data, scores = get_box_score_stats(box_score)

        # order score data
        for score in scores:
            ordered_scores.append(score)

        # get the basic score data into dataframe ready format
        for b in b_score_data:
            if b[1] != []:
                final_list_basic.append(b)


        # get the teams in dataframe format
        for team in teams:
            ordered_teams.append(team)

        # finding data and appending to final_list of data (need to ignore empty columns) for advanced
        for a in a_score_data:
            if a[-1] != []:
                final_list_advanced.append(a)
            
print(len(ordered_scores), len(ordered_teams), len(final_list_basic), len(final_list_advanced))

210 210 1482 210


In [106]:
## Putting the data into dataframes

advanced_stats_cols = ['ts_pct', 'efg_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct','usg_pct', 'off_rtg', 'def_rtg']
box_stats_cols = ['mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']


# make dataframes have correct value type
df = pd.DataFrame(final_list_basic)
for i in range(0,19):
    df[i] = df[i].str[0]
    


df1 = pd.DataFrame(final_list_advanced)
for i in range(0,14):
    df1[i] = df1[i].str[0]
    

df.columns = box_stats_cols
df1.columns = advanced_stats_cols


df1['teams'] = ordered_teams
df1['score'] = ordered_scores

df_dates = []
for date in all_dates:
    for d in date:
        df_dates.append(clean_date(d))

        
df_full_game = df.loc[(df['mp'].astype(int)>=240)]
df_full_game['teams'] = ordered_teams
print(df_full_game.head())


# for full game join of basic statistics
df_full_game_join = df_full_game.join(df_full_game.shift(-1).add_prefix('away_'))
df_full_game_join[1::2] = ''
df_full_game_join = df_full_game_join[df_full_game_join.mp != '']
df_full_game_join['date'] = df_dates
df_full_game_join['date'] = df_full_game_join['date'].apply(str)
df_full_game_join['key'] = df_full_game_join['date'] + df_full_game_join['teams'] + df_full_game_join['away_teams']



# for full join of advanced statistics
df_join = df1.join(df1.shift(-1).add_prefix('away_'))
df_join[1::2] = ''
df_join = df_join[df_join.ts_pct != '']
df_join['date'] = df_dates
df_join['date'] = df_join['date'].apply(str)
df_join['key'] = df_join['date'] + df_join['teams'] + df_join['away_teams']



     mp  fg fga fg_pct fg3 fg3a fg3_pct  ft fta ft_pct orb drb trb ast stl  \
0   240  42  80   .525  19   42    .452  10  20   .500   8  37  45  30  13   
7   240  38  89   .427  14   46    .304   9  12   .750   7  28  35  26  10   
14  240  32  88   .364   3   36    .083  16  20   .800   9  39  48  21   6   
21  240  34  83   .410  17   52    .327  15  21   .714   9  44  53  21   7   
28  240  41  84   .488  15   29    .517  29  34   .853  12  44  56  19   3   

   blk tov  pf  pts teams  
0    7  24  17  113   nop  
7    5  20  22   99   tor  
14   3  15  21   83   nyk  
21   6  16  22  100   tor  
28   9  19  29  126   bos  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full_game['teams'] = ordered_teams


In [114]:
# save df for per quareter points
df_quarters = df.loc[(df['mp'].astype(int) != 25)]
count = 0

total_teams = []
quarters_halfs = []

for team in ordered_teams:
    total_teams.append(team)
    quarters_halfs.append('game')
    total_teams.append(team)
    quarters_halfs.append('q1')
    total_teams.append(team)
    quarters_halfs.append('q2')
    total_teams.append(team)
    quarters_halfs.append('first_half')
    total_teams.append(team)
    quarters_halfs.append('q3')
    total_teams.append(team)
    quarters_halfs.append('q4')
    total_teams.append(team)
    quarters_halfs.append('second_half')
        

df_quarters['quarter'] = quarters_halfs
df_quarters['team'] = total_teams

df_quarters.head(14)

   
    

save_df('per_quarter_stats', df_quarters)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_quarters['quarter'] = quarters_halfs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_quarters['team'] = total_teams


'Saved'

In [70]:
## Scrape Lines (can add getting current by not putting dates and can get 1st half, second half, and quarters)

# get spreads
def get_betting_spreads(date):
    info_list = []
    betting_page = requests.get(f'https://classic.sportsbookreview.com/betting-odds/nba-basketball/?date={date}')
    time.sleep(2)
    betting_page = BeautifulSoup(betting_page.text, 'html.parser')
    teams_list = []
    for row in betting_page.find_all('div', {'class': 'eventLine-value'}):
        teams_list.append(row.text)
    betting_lines = []
    for item in betting_page.find_all('div', {'class': 'event-holder holder-complete'}):
        for line in item.find('div', {'class': 'el-div eventLine-book'}):
            betting_lines.append(line.text)
    betting_lines = [line.replace('\xa0', ' ') for line in betting_lines]
    date_list = [date for item in betting_lines]
    zipped_teams_lines = zip(date_list, teams_list, betting_lines)
    return list(zipped_teams_lines)

# get overs unders
def get_betting_totals(date):
    info_list = []
    betting_page = requests.get(f'https://classic.sportsbookreview.com/betting-odds/nba-basketball/totals/?date={date}')
    time.sleep(2)
    betting_page = BeautifulSoup(betting_page.text, 'html.parser')
    teams_list = []
    for row in betting_page.find_all('div', {'class': 'eventLine-value'}):
        teams_list.append(row.text)
    betting_lines = []
    for item in betting_page.find_all('div', {'class': 'event-holder holder-complete'}):
        for line in item.find('div', {'class': 'el-div eventLine-book'}):
            betting_lines.append(line.text)
    betting_lines = [line.replace('\xa0', ' ') for line in betting_lines]
    date_list = [date for item in betting_lines]
    zipped_teams_lines = zip(date_list, teams_list, betting_lines)
    return list(zipped_teams_lines)

# Get money lines
def get_betting_money_lines(date):
    info_list = []
    betting_page = requests.get(f'https://classic.sportsbookreview.com/betting-odds/nba-basketball/money-line/?date={date}')
    time.sleep(2)
    betting_page = BeautifulSoup(betting_page.text, 'html.parser')
    teams_list = []
    for row in betting_page.find_all('div', {'class': 'eventLine-value'}):
        teams_list.append(row.text)
    betting_lines = []
    for item in betting_page.find_all('div', {'class': 'event-holder holder-complete'}):
        for line in item.find('div', {'class': 'el-div eventLine-book'}):
            betting_lines.append(line.text)
    betting_lines = [line.replace('\xa0', ' ') for line in betting_lines]
    date_list = [date for item in betting_lines]
    zipped_teams_lines = zip(date_list, teams_list, betting_lines)
    return list(zipped_teams_lines)




In [31]:
# Processing line data
clean_dates = []
for date in all_dates:
    for d in date:
        if clean_date(d) not in clean_dates:
            clean_dates.append(clean_date(d))
    

all_spreads = []
all_money_lines = []
all_totals = []
for date in clean_dates:
    all_spreads.append(get_betting_spreads(date))
    all_money_lines.append(get_betting_money_lines(date))
    all_totals.append(get_betting_totals(date))

['20201223']
['20201223', '20201231']
['20201223', '20201231', '20210104']
['20201223', '20201231', '20210104', '20201225']
['20201223', '20201231', '20210104', '20201225', '20201230']
['20201223', '20201231', '20210104', '20201225', '20201230', '20201229']
['20201223', '20201231', '20210104', '20201225', '20201230', '20201229', '20210102']
['20201223', '20201231', '20210104', '20201225', '20201230', '20201229', '20210102', '20201227']
['20201223', '20201231', '20210104', '20201225', '20201230', '20201229', '20210102', '20201227', '20210101']
['20201223', '20201231', '20210104', '20201225', '20201230', '20201229', '20210102', '20201227', '20210101', '20201226']
['20201223', '20201231', '20210104', '20201225', '20201230', '20201229', '20210102', '20201227', '20210101', '20201226', '20210103']
['20201223', '20201231', '20210104', '20201225', '20201230', '20201229', '20210102', '20201227', '20210101', '20201226', '20210103', '20201222']
['20201223', '20201231', '20210104', '20201225', '20

In [50]:
# Team abriviations
team_abr = {'Toronto': 'TOR',
                     'Boston': 'BOS',
                     'Philadelphia': 'PHI',
                     'Cleveland': 'CLE',
                     'Indiana': 'IND',
                     'Miami': 'MIA',
                     'Milwaukee': 'MIL',
                     'Washington': 'WAS',
                     'Detroit': 'DET',
                     'Charlotte': 'CHO',
                     'New York': 'NYK',
                     'Brooklyn': 'BRK',
                     'Chicago': 'CHI',
                     'Orlando': 'ORL',
                     'Atlanta': 'ATL',
                     'Houston': 'HOU',
                     'Golden State': 'GSW',
                     'Portland': 'POR',
                     'Oklahoma City': 'OKC',
                     'Utah': 'UTA',
                     'New Orleans': 'NOP',
                     'San Antonio': 'SAS',
                     'Minnesota': 'MIN',
                     'Denver': 'DEN',
                     'LA': 'LAC',
                     'L.A. Lakers': 'LAL',
                     'Sacramento': 'SAC',
                     'Dallas': 'DAL',
                     'Memphis': 'MEM',
                     'Phoenix': 'PHO'}

# Processing Totals Data
totals_date = []
totals_team = []
totals_line = []
totals_total = []
# holds 2 things in it, needs to split
total_line = []

for d in all_totals:
    for game in d:
        totals_date.append(game[0])
        totals_team.append(game[1])
        total_line.append(game[2])

# splits the 2 things       
for item in total_line:
    spl = item.split(' ')
    totals_total.append(spl[0])
    totals_line.append(spl[1])
    
totals_team_abr_list = []
for t in totals_team:
    abr = team_abr.get(t)
    totals_team_abr_list.append(abr.lower())
    
# Processing Spread Data
spreads_date = []
spreads_team = []
spreads_line = []
spread = []


for d in all_spreads:
    for game in d:
        spreads_date.append(game[0])
        spreads_team.append(game[1])
        spreads_line.append(game[2])

# splits the 2 things       
for item in spreads_line:
    spl = item.split(' ')
    spread.append(spl[0])
        
spreads_team_abr_list = []
for t in spreads_team:
    abr = team_abr.get(t)
    spreads_team_abr_list.append(abr.lower())
    
# Processing Money Line (ml) Data
ml_date = []
ml_team = []
ml_line = []


for d in all_money_lines:
    for game in d:
        ml_date.append(game[0])
        ml_team.append(game[1])
        ml_line.append(game[2])

ml_team_abr_list = []
for t in ml_team:
    abr = team_abr.get(t)
    ml_team_abr_list.append(abr.lower())
    
    

    

In [71]:
# Creating the dataframes

# Totals Dataframe
df_totals = pd.DataFrame(list(zip(totals_date, totals_team_abr_list, totals_line)), 
               columns =['date', 'team', 'line']) 

df_totals_join = df_totals.join(df_totals.shift(-1).add_prefix('away_'))
df_totals_join[1::2] = ''
df_totals_join['total'] = totals_total
df_totals_join = df_totals_join[df_totals_join.date != '']
df_totals_join['date'] = df_totals_join['date'].apply(str)
df_totals_join['key'] = df_totals_join['date'] + df_totals_join['team'] + df_totals_join['away_team']


# Spreads Dataframe
df_spreads = pd.DataFrame(list(zip(spreads_date, spreads_team_abr_list, spread)), 
               columns =['date', 'team', 'line']) 

df_spreads_join = df_spreads.join(df_spreads.shift(-1).add_prefix('away_'))
df_spreads_join[1::2] = ''
df_spreads_join = df_spreads_join[df_spreads_join.date != '']
df_spreads_join['date'] = df_spreads_join['date'].apply(str)
df_spreads_join['key'] = df_spreads_join['date'] + df_spreads_join['team'] + df_spreads_join['away_team']


# ML Dataframe
df_ml = pd.DataFrame(list(zip(ml_date,  ml_team_abr_list, ml_line)), 
               columns =['date', 'team', 'money_line']) 

df_ml_join = df_ml.join(df_ml.shift(-1).add_prefix('away_'))
df_ml_join[1::2] = ''
df_ml_join = df_ml_join[df_ml_join.date != ''] 
df_ml_join['date'] = df_ml_join['date'].apply(str)
df_ml_join['key'] = df_ml_join['date'] + df_ml_join['team'] + df_ml_join['away_team']




In [75]:
print(len(df_totals_join), len(df_spreads_join), len(df_ml_join), len(df_full_game_join), len(df_join))

105 105 105 105 105


In [80]:
# JOIN ALL THE DATA FRAMES INTO ONE MASSIVE DF WITH ALL LINES AND ALL STATS

merge_df = pd.merge(df_totals_join, df_spreads_join, how='left', left_on=['key'], right_on = ['key'])

merge_df1 = pd.merge(merge_df, df_ml_join, how='left', left_on=['key'], right_on = ['key'])

merge_df2 = pd.merge(merge_df1, df_full_game_join, how='left', left_on=['key'], right_on = ['key'])

merge_df3 = pd.merge(merge_df2, df_join, how='left', left_on=['key'], right_on = ['key'])

merge_df3.head()





Unnamed: 0,date_x,team_x,line_x,away_date_x,away_team_x,away_line_x,total,key,date_y,team_y,...,away_ast_pct,away_stl_pct,away_blk_pct,away_tov_pct,away_usg_pct,away_off_rtg,away_def_rtg,away_teams_y,away_score,date
0,20201223,cho,-107,20201223,cle,-104,217,20201223chocle,20201223,cho,...,73.9,11.8,6.5,15.8,100.0,119.2,112.3,cle,121,20201223
1,20201223,was,-113,20201223,phi,102,230½,20201223wasphi,20201223,was,...,53.7,10.4,13.8,13.8,100.0,106.9,101.2,phi,113,20201223
2,20201223,nyk,-104,20201223,ind,-106,215½,20201223nykind,20201223,nyk,...,60.9,5.7,15.8,10.9,100.0,114.2,101.0,ind,121,20201223
3,20201223,mia,-114,20201223,orl,103,218½,20201223miaorl,20201223,mia,...,54.8,14.3,4.8,14.4,100.0,107.7,102.0,orl,113,20201223
4,20201223,mil,104,20201223,bos,-114,223½,20201223milbos,20201223,mil,...,47.9,8.0,10.9,5.4,100.0,122.1,121.1,bos,122,20201223


In [85]:
# CLEAN THE DATA SET
print(merge_df3.columns)


final_df = merge_df3.drop(columns=['date_x', 'team_x', 'line_x', 'away_date_x', 'teams_x', 'away_line_x', 'date_y', 'team_y', 'line_y',
       'away_date_y', 'away_team_y', 'away_line_y', 'date_x', 'away_date', 'away_team', 'away_teams_x', 'teams_y', 'away_teams_y', 'away_mp', 'key'])

print(final_df.head())


Index(['date_x', 'team_x', 'line_x', 'away_date_x', 'away_team_x',
       'away_line_x', 'total', 'key', 'date_y', 'team_y', 'line_y',
       'away_date_y', 'away_team_y', 'away_line_y', 'date_x', 'team',
       'money_line', 'away_date', 'away_team', 'away_money_line', 'mp', 'fg',
       'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'teams_x',
       'away_mp', 'away_fg', 'away_fga', 'away_fg_pct', 'away_fg3',
       'away_fg3a', 'away_fg3_pct', 'away_ft', 'away_fta', 'away_ft_pct',
       'away_orb', 'away_drb', 'away_trb', 'away_ast', 'away_stl', 'away_blk',
       'away_tov', 'away_pf', 'away_pts', 'away_teams_x', 'date_y', 'ts_pct',
       'efg_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
       'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct',
       'off_rtg', 'def_rtg', 'teams_y', 'score', 'away_ts_pct', 'away_efg_pct',
       'away_fg3a_per_fga_pct', '

In [87]:
save_df('current_nba_data', final_df)

'Saved'

In [104]:
## Getting averages for each team and game




basic_cols = ['mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']
df_full_game[basic_cols] = df_full_game[basic_cols].apply(pd.to_numeric)

advanced_cols = ['ts_pct', 'efg_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct',
       'drb_pct', 'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct',
       'usg_pct', 'off_rtg', 'def_rtg', 'score']


df1[advanced_cols] = df1[advanced_cols].apply(pd.to_numeric)
df_avg_basic_stats = df_full_game.groupby('teams', axis=0).mean().reset_index()

df_avg_advanced_stats = df1.groupby('teams', axis=0).mean().reset_index()

df_avg_merge = pd.merge(df_avg_basic_stats, df_avg_advanced_stats, how='left', left_on=['teams'], right_on = ['teams'])

save_df('current_nba_team_avgs', df_avg_merge)





'Saved'