### Folder checks and import libraries

In [None]:
#check where the notebook resides
os.getcwd()

In [None]:
#check where the processed nbalogs reside - is it in same folder as notebook
os.listdir()

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
import collections
import datetime

pd.options.display.max_columns = 999

now = datetime.datetime.now()

#determine east and west teams (west auto excluded via list comprehension in working cells below)
ec = ['Atlanta', 'Boston', 'Brooklyn', 'Charlotte', 'Chicago','Cleveland','Detroit','Indiana',
      'Miami','Milwaukee','New York','Orlando','Philadelphia','Toronto','Washington']

### Scrap web and process file to nbalogs

In [2]:
#Define fixed variables
list_teams=['Boston Celtics','Brooklyn Nets','New York Knicks', 'Philadelphia 76ers', 'Toronto Raptors',
           'Chicago Bulls', 'Cleveland Cavaliers', 'Detroit Pistons', 'Indiana Pacers', 'Milwaukee Bucks',
           'Atlanta Hawks', 'Charlotte Hornets', 'Miami Heat', 'Orlando Magic', 'Washington Wizards',
           'Denver Nuggets', 'Minnesota Timberwolves', 'Oklahoma City Thunder', 'Portland Trail Blazers', 'Utah Jazz',
           'Golden State Warriors', 'Los Angeles Clippers', 'Los Angeles Lakers', 'Phoenix Suns', 'Sacramento Kings',
           'Dallas Mavericks', 'Houston Rockets', 'Memphis Grizzlies', 'New Orleans Pelicans', 'San Antonio Spurs']

dict_teams={'Boston Celtics': 'Boston',
 'Brooklyn Nets': 'Brooklyn',
 'New York Knicks': 'New York',
 'Philadelphia 76ers': 'Philadelphia',
 'Toronto Raptors': 'Toronto',
 'Chicago Bulls': 'Chicago',
 'Cleveland Cavaliers': 'Cleveland',
 'Detroit Pistons': 'Detroit',
 'Indiana Pacers': 'Indiana',
 'Milwaukee Bucks': 'Milwaukee',
 'Atlanta Hawks': 'Atlanta',
 'Charlotte Hornets': 'Charlotte',
 'Miami Heat': 'Miami',
 'Orlando Magic': 'Orlando',
 'Washington Wizards': 'Washington',
 'Denver Nuggets': 'Denver',
 'Minnesota Timberwolves': 'Minnesota',
 'Oklahoma City Thunder': 'Oklahoma City',
 'Portland Trail Blazers': 'Portland',
 'Utah Jazz': 'Utah',
 'Golden State Warriors': 'Golden State',
 'Los Angeles Clippers': 'LA Clippers',
 'Los Angeles Lakers': 'LA Lakers',
 'Phoenix Suns': 'Phoenix',
 'Sacramento Kings': 'Sacramento',
 'Dallas Mavericks': 'Dallas',
 'Houston Rockets': 'Houston',
 'Memphis Grizzlies': 'Memphis',
 'New Orleans Pelicans': 'New Orleans',
 'San Antonio Spurs': 'San Antonio'}

# years = [str(now.year -1), str(now.year-2), str(now.year-3), str(now.year-4)]
url_nba = 'https://www.oddsshark.com/nba/game-logs'

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

In [3]:
#Define function to count number of teams as sense-check (should be 30)
def count_teams(d):
    return sum([count_teams(v) if isinstance(v, dict) else 1 for v in d.values()])

In [4]:
#Define function to web scrap game logs (Take Note use below)
#Eg if I run From July to Mid Oct in year N during offseason, the results will be more season N-1/N
#(run during offseason - get data for previous season)

#Eg if I run from Mid Oct to Dec in year N of season, the results will be more season N/N+1
#Eg if I run from Jan to Jun in year N of season, the results will be more season N-1/N
#(run during season - get current data as season progress)

def read_nba_logs(url):
    urls_currentyr={}
    final_urls = collections.defaultdict(dict)
    
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')

    tags = soup('a')

    for tag in tags:
        
        try:
            tag.contents[0]
        except:
            continue
        
        if tag.contents[0] in list_teams:
            team = tag.contents[0]
            base_url = re.sub(url,tag.get('href', None),url)
            urls_currentyr[team] = base_url
            final_urls[str(now.year)] = urls_currentyr
    
    final_urls = dict(final_urls)
    
    print('Total number of records: ', count_teams(final_urls))
    return final_urls

final_urls = read_nba_logs(url_nba)

Total number of records:  30


In [5]:
#check final_urls
final_urls

{'2023': {'Boston Celtics': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20722',
  'Brooklyn Nets': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20749',
  'New York Knicks': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20747',
  'Philadelphia 76ers': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20731',
  'Toronto Raptors': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20742',
  'Chicago Bulls': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20732',
  'Cleveland Cavaliers': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20735',
  'Detroit Pistons': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20743',
  'Indiana Pacers': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20737',
  'Milwaukee Bucks': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20725',
  'Atlanta Hawks': 'https://www.oddsshark.com/stats/gamelog/basketball/nba/20734',
  'Charlotte Hornets': 'https://www.oddsshark.com/stats/ga

In [6]:
#Define function to read into rawlogs dataframe
def process_to_rawlogs(url_dict):
    combineddf = pd.DataFrame()

    for nested_dict in url_dict.values():
        for team,url in nested_dict.items():
            scores = pd.read_html(url)[0]
            scores['Team'] = team
            cols = [scores.columns[-1]] + [col for col in scores if col != scores.columns[-1]]
            scores = scores[cols]
            combineddf = pd.concat([combineddf,scores])

    combineddf.index.rename('Played_Games',inplace=True)
    combineddf.reset_index(inplace=True)
    
    return combineddf

In [7]:
#check combineddf
combineddf = process_to_rawlogs(final_urls)
combineddf.tail()

Unnamed: 0,Played_Games,Team,Date,Opponent,Game,Result,Score,ATS,Spread,OU,Total
2455,77,San Antonio Spurs,"Apr 2, 2023",@ Sacramento,REG,,,,,,
2456,78,San Antonio Spurs,"Apr 4, 2023",@ Phoenix,REG,,,,,,
2457,79,San Antonio Spurs,"Apr 6, 2023",vs Portland,REG,,,,,,
2458,80,San Antonio Spurs,"Apr 8, 2023",vs Minnesota,REG,,,,,,
2459,81,San Antonio Spurs,"Apr 9, 2023",@ Dallas,REG,,,,,,


In [8]:
#define 2regex functions for Score cleaning
def regex_win(string):
    str1 = re.search('\w+(?=-)',string)
    if str1:
        return int(str1.group())
    else:
        return np.nan
        
def regex_lose(string):
    str2 = re.search('(?<=-)\w+',string)
    if str2:
        return int(str2.group())
    else:
        return np.nan

In [9]:
#Define function to process rawlogs to nbalogs dataframe
def process_to_nbalogs(rawlogs_df):
    
    #change date column to datetime
    rawlogs_df['Date']=pd.to_datetime(rawlogs_df['Date'])

    #identify home and away
    rawlogs_df['Home_Away'] = np.where(rawlogs_df.Opponent.str.match('vs'),'Home','Away')

    #regex to split out opponents
    rawlogs_df['Opponent_clean'] = rawlogs_df['Opponent'].apply(lambda x:re.findall('(?<=@\s).*|(?<=vs\s).*',x)[0])

    rawlogs_df['Score']=rawlogs_df.Score.astype(str)

    #regex to split out Wpoints and Losepoints
    rawlogs_df['Wpoints'] = rawlogs_df.Score.apply(regex_win)
    rawlogs_df['Lpoints'] = rawlogs_df.Score.apply(regex_lose)

    #dealing with ATS
    rawlogs_df['ATS_margin_W'] = rawlogs_df[['Wpoints','Spread']].sum(axis=1).where(rawlogs_df['Result']=='W')
    rawlogs_df['ATS_margin_L'] = rawlogs_df[['Lpoints','Spread']].sum(axis=1).where(rawlogs_df['Result']=='L')
    rawlogs_df['ATS_combi'] = rawlogs_df['ATS_margin_W'].fillna(rawlogs_df['ATS_margin_L'])

    #process ATS
    rawlogs_df['minusL'] = -1 * rawlogs_df['Lpoints']
    rawlogs_df['minusATScombi'] = -1 * rawlogs_df['ATS_combi']
    rawlogs_df['ATS_clear_W'] = rawlogs_df[['ATS_combi','minusL']].sum(axis=1).where(rawlogs_df['Result']=='W')
    rawlogs_df['ATS_clear_L'] = rawlogs_df[['Wpoints','minusATScombi']].sum(axis=1).where(rawlogs_df['Result']=='L')
    rawlogs_df['ATS_clear_L'] = -1* rawlogs_df['ATS_clear_L']
    rawlogs_df['ATS_final'] = rawlogs_df['ATS_clear_W'].fillna(rawlogs_df['ATS_clear_L'])

    rawlogs_df.drop(['ATS_margin_W', 'ATS_margin_L', 'ATS_combi', 'minusL', 'minusATScombi', 
                   'ATS_clear_W', 'ATS_clear_L'], axis=1,inplace=True)

    rawlogs_df = rawlogs_df.replace('nan',np.nan)
    
    #process OU
    rawlogs_df.rename(columns={'Total': "Total_HC"},inplace=True)
    rawlogs_df['Total_actual'] = rawlogs_df['Wpoints'] + rawlogs_df['Lpoints']
    
    rawlogs_df['minus_OU'] = -1 * rawlogs_df['Total_HC']
    rawlogs_df['O_margin'] = rawlogs_df[['Total_actual','minus_OU']].sum(axis=1).where(rawlogs_df['OU']=='O')
    rawlogs_df['U_margin'] = rawlogs_df[['Total_actual','minus_OU']].sum(axis=1).where(rawlogs_df['OU']=='U')
    rawlogs_df['P_margin'] = rawlogs_df[['Total_actual','minus_OU']].sum(axis=1).where(rawlogs_df['OU']=='P')
    rawlogs_df['OU_final'] = rawlogs_df['O_margin'].fillna(rawlogs_df['U_margin'])
    rawlogs_df['OU_final'] = rawlogs_df['OU_final'].fillna(rawlogs_df['P_margin'])

    rawlogs_df.drop(['O_margin', 'U_margin', 'P_margin','minus_OU'], axis=1,inplace=True)
    
    #drop Playoff games
    rawlogs_df = rawlogs_df.loc[rawlogs_df.Game == 'REG']
    
    #further processing
    rawlogs_df = rawlogs_df[['Played_Games', 'Team', 'Date', 'Game', 'Result', 'Score', 'ATS', 
                   'Spread', 'OU', 'Total_HC', 'Home_Away', 'Opponent_clean', 'Wpoints', 'Lpoints', 
                   'Total_actual','ATS_final', 'OU_final']]
    
    rawlogs_df = rawlogs_df[~rawlogs_df.Result.isna()]
    
    rawlogs_df['Team'] = rawlogs_df['Team'].map(dict_teams)
    
    rawlogs_df.reset_index(drop=True,inplace=True)
    
    return rawlogs_df

In [10]:
#check processed nbalogs
nbalogs = process_to_nbalogs(combineddf)
nbalogs.head()

Unnamed: 0,Played_Games,Team,Date,Game,Result,Score,ATS,Spread,OU,Total_HC,Home_Away,Opponent_clean,Wpoints,Lpoints,Total_actual,ATS_final,OU_final
0,0,Boston,2022-10-18,REG,W,126-117,W,-3.0,O,216.5,Home,Philadelphia,126.0,117.0,243.0,6.0,26.5
1,1,Boston,2022-10-21,REG,W,111-104,W,-2.0,U,221.0,Away,Miami,111.0,104.0,215.0,5.0,-6.0
2,2,Boston,2022-10-22,REG,W,126-120,L,-8.5,O,215.5,Away,Orlando,126.0,120.0,246.0,-2.5,30.5
3,3,Boston,2022-10-24,REG,L,120-102,L,-5.5,U,226.0,Away,Chicago,120.0,102.0,222.0,-23.5,-4.0
4,4,Boston,2022-10-28,REG,L,132-123,L,-7.0,O,218.5,Home,Cleveland,132.0,123.0,255.0,-16.0,36.5


In [11]:
#export to csv
nbalogs.to_csv(f'nbalogs_{str(now.date())}.csv')

### Process nbalogs to streaklogs

In [12]:
#to kickstart the process if did the above halfway without going to streaklogs. Beware of filename (check folder)
# nbalogs = pd.read_csv('nbalogs_2022-10-18.csv',index_col=0)
nbalogs = pd.read_csv('nbalogs_2023-01-03.csv',index_col=0)

In [13]:
nbalogs

Unnamed: 0,Played_Games,Team,Date,Game,Result,Score,ATS,Spread,OU,Total_HC,Home_Away,Opponent_clean,Wpoints,Lpoints,Total_actual,ATS_final,OU_final
0,0,Boston,2022-10-18,REG,W,126-117,W,-3.0,O,216.5,Home,Philadelphia,126.0,117.0,243.0,6.0,26.5
1,1,Boston,2022-10-21,REG,W,111-104,W,-2.0,U,221.0,Away,Miami,111.0,104.0,215.0,5.0,-6.0
2,2,Boston,2022-10-22,REG,W,126-120,L,-8.5,O,215.5,Away,Orlando,126.0,120.0,246.0,-2.5,30.5
3,3,Boston,2022-10-24,REG,L,120-102,L,-5.5,U,226.0,Away,Chicago,120.0,102.0,222.0,-23.5,-4.0
4,4,Boston,2022-10-28,REG,L,132-123,L,-7.0,O,218.5,Home,Cleveland,132.0,123.0,255.0,-16.0,36.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,32,San Antonio,2022-12-26,REG,W,126-122,W,4.5,O,236.0,Home,Utah,126.0,122.0,248.0,8.5,12.0
1116,33,San Antonio,2022-12-27,REG,L,130-114,L,7.0,O,236.0,Away,Oklahoma City,130.0,114.0,244.0,-9.0,8.0
1117,34,San Antonio,2022-12-29,REG,W,122-115,W,4.5,O,222.0,Home,New York,122.0,115.0,237.0,11.5,15.0
1118,35,San Antonio,2022-12-31,REG,L,126-125,W,7.0,O,231.5,Home,Dallas,126.0,125.0,251.0,6.0,19.5


In [14]:
#Define function to process nbalogs to interim_nbalogs
def process_to_interim_nbalogs(nbalogs_df):
    
#     nbalogs_df.drop('Unnamed: 0', axis=1, inplace=True)
    nbalogs_df = nbalogs_df.loc[:,['Played_Games','Team','Date','ATS','OU','ATS_final','OU_final']]
    
    #assign binary values to ATS and OU
    conditions_ATS = [
    (nbalogs_df['ATS'] == 'W'),
    (nbalogs_df['ATS'] == 'L'),
    (nbalogs_df['ATS'] == 'P'),
    ]

    values_ATS = [1,2,3]

    conditions_OU = [
        (nbalogs_df['OU'] == 'O'),
        (nbalogs_df['OU'] == 'U'),
        (nbalogs_df['OU'] == 'P'),
    ]

    values_OU = [1,2,3]

    nbalogs_df['value_ATS']= np.select(conditions_ATS,values_ATS)
    nbalogs_df['value_OU']= np.select(conditions_OU,values_OU)
    
    return nbalogs_df

In [15]:
#test processing to interim_nbalogs
interim_nbalogs = process_to_interim_nbalogs(nbalogs)
interim_nbalogs.head()

Unnamed: 0,Played_Games,Team,Date,ATS,OU,ATS_final,OU_final,value_ATS,value_OU
0,0,Boston,2022-10-18,W,O,6.0,26.5,1,1
1,1,Boston,2022-10-21,W,U,5.0,-6.0,1,2
2,2,Boston,2022-10-22,L,O,-2.5,30.5,2,1
3,3,Boston,2022-10-24,L,U,-23.5,-4.0,2,2
4,4,Boston,2022-10-28,L,O,-16.0,36.5,2,1


In [16]:
#Define function to process interim_nbalogs to streaklogs_ATS
def process_to_streaklogs_ATS(interim_nbalogs):
    
    team_new=[]
    streak_final = "ATS_final"
    streak_avg = "ATS_avg"
    streak_min = "ATS_min"
    streak_max = "ATS_max"
    
    interim_nbalogs['flag_ATS'] = interim_nbalogs['value_ATS'].groupby \
    ([interim_nbalogs.Team, interim_nbalogs['value_ATS'].diff().ne(0).cumsum()]).transform('size').ge(3).astype(int) 
    #change to ge4 when needed - now is at streak 3
    interim_nbalogs = interim_nbalogs[interim_nbalogs['flag_ATS'] == 1]

    non_sequence = pd.Series(interim_nbalogs.Played_Games).diff() != 1
    non_sequence_1 = pd.Series(interim_nbalogs.value_ATS).diff() != 0
    merged = pd.concat([non_sequence,non_sequence_1],axis=1)
    merged = merged.any(axis=1)
    grouper = merged.cumsum().values

    newframe_ATS = pd.DataFrame()
            
    for group, frame in interim_nbalogs.groupby(['Team',grouper]):
        
        average = frame[streak_final].mean()
        minimum = frame[streak_final].min()
        maximum = frame[streak_final].max()
        frame[streak_avg] = average
        frame[streak_min] = minimum
        frame[streak_max] = maximum
        newframe_ATS = pd.concat([newframe_ATS,frame])
        
    newframe_ATS.reset_index(drop=True,inplace=True)
    
    return newframe_ATS

In [17]:
#test processing to streaklogs_ATS
streaklogs_ATS = process_to_streaklogs_ATS(interim_nbalogs)
streaklogs_ATS.head(20)

Unnamed: 0,Played_Games,Team,Date,ATS,OU,ATS_final,OU_final,value_ATS,value_OU,flag_ATS,ATS_avg,ATS_min,ATS_max
0,7,Atlanta,2022-11-02,W,U,15.5,-22.0,1,2,1,13.666667,1.0,24.5
1,8,Atlanta,2022-11-05,W,O,1.0,11.0,1,1,1,13.666667,1.0,24.5
2,9,Atlanta,2022-11-07,W,U,24.5,-5.0,1,2,1,13.666667,1.0,24.5
3,14,Atlanta,2022-11-16,L,U,-25.0,-7.5,2,2,1,-12.333333,-25.0,-3.5
4,15,Atlanta,2022-11-19,L,O,-3.5,20.0,2,1,1,-12.333333,-25.0,-3.5
5,16,Atlanta,2022-11-21,L,U,-8.5,-12.0,2,2,1,-12.333333,-25.0,-3.5
6,18,Atlanta,2022-11-25,L,O,-13.0,17.5,2,1,1,-8.5,-13.0,-0.5
7,19,Atlanta,2022-11-27,L,U,-12.0,-22.0,2,2,1,-8.5,-13.0,-0.5
8,20,Atlanta,2022-11-28,L,U,-0.5,-15.5,2,2,1,-8.5,-13.0,-0.5
9,26,Atlanta,2022-12-11,L,O,-2.5,10.0,2,1,1,-10.666667,-15.5,-2.5


In [18]:
#Define function to process interim_nbalogs to streaklogs_OU
def process_to_streaklogs_OU(interim_nbalogs):
    
    team_new=[]
    streak_final = "OU_final"
    streak_avg = "OU_avg"
    streak_min = "OU_min"
    streak_max = "OU_max"
    
    interim_nbalogs['flag_OU'] = interim_nbalogs['value_OU'].groupby \
    ([interim_nbalogs.Team, interim_nbalogs['value_OU'].diff().ne(0).cumsum()]).transform('size').ge(3).astype(int) 
    #change to ge4 when needed - now is at streak 3
    interim_nbalogs = interim_nbalogs[interim_nbalogs['flag_OU'] == 1]

    non_sequence = pd.Series(interim_nbalogs.Played_Games).diff() != 1
    non_sequence_1 = pd.Series(interim_nbalogs.value_OU).diff() != 0
    merged = pd.concat([non_sequence,non_sequence_1],axis=1)
    merged = merged.any(axis=1)
    grouper = merged.cumsum().values

    newframe_OU = pd.DataFrame()
            
    for group, frame in interim_nbalogs.groupby(['Team',grouper]):
        
        average = frame[streak_final].mean()
        minimum = frame[streak_final].min()
        maximum = frame[streak_final].max()
        frame[streak_avg] = average
        frame[streak_min] = minimum
        frame[streak_max] = maximum
        newframe_OU = pd.concat([newframe_OU,frame])
        
    newframe_OU.reset_index(drop=True,inplace=True)
    
    return newframe_OU

In [19]:
#test processing to streaklogs_OU
streaklogs_OU = process_to_streaklogs_OU(interim_nbalogs)
streaklogs_OU.head(20)

Unnamed: 0,Played_Games,Team,Date,ATS,OU,ATS_final,OU_final,value_ATS,value_OU,flag_ATS,flag_OU,OU_avg,OU_min,OU_max
0,2,Atlanta,2022-10-23,L,O,-27.0,8.0,2,1,0,1,12.6,2.0,25.5
1,3,Atlanta,2022-10-26,L,O,-2.5,2.0,2,1,0,1,12.6,2.0,25.5
2,4,Atlanta,2022-10-28,W,O,18.0,18.0,1,1,0,1,12.6,2.0,25.5
3,5,Atlanta,2022-10-29,L,O,-2.5,9.5,2,1,0,1,12.6,2.0,25.5
4,6,Atlanta,2022-10-31,L,O,-28.0,25.5,2,1,0,1,12.6,2.0,25.5
5,25,Atlanta,2022-12-09,W,O,3.5,3.5,1,1,0,1,14.25,3.5,33.0
6,26,Atlanta,2022-12-11,L,O,-2.5,10.0,2,1,1,1,14.25,3.5,33.0
7,27,Atlanta,2022-12-12,L,O,-15.5,10.5,2,1,1,1,14.25,3.5,33.0
8,28,Atlanta,2022-12-14,L,O,-14.0,33.0,2,1,1,1,14.25,3.5,33.0
9,6,Boston,2022-11-02,L,O,-2.5,7.0,2,1,0,1,16.5,7.0,24.0


In [20]:
#export streaklogs_ATS and streaklogs_OU to csv - fyi
streaklogs_ATS.to_csv(f'streaklogs_ATS_{str(now.date())}.csv')
streaklogs_OU.to_csv(f'streaklogs_OU_{str(now.date())}.csv')