In [261]:
from nba_api.stats.endpoints import leaguegamefinder  
import dataframe_image as dfi
import pandas as pd
import numpy as np
from threading import Thread, Event
import requests
import time
from bs4 import BeautifulSoup, Comment
from datetime import datetime
import matplotlib.pyplot as plt
from datetime import timedelta

In [263]:
def getAttendance():
    from timeit import default_timer as timer
    from datetime import timedelta
    import time

    start = timer()
    year = int(datetime.now().strftime('%Y')) + 1
    endMonth = datetime.now().strftime('%B').lower()
    nba_months = ['october','november','december','january','february','march','april']
    tables = pd.read_html(f"https://www.basketball-reference.com/leagues/NBA_{year}_games-october.html")
    columns = tables[0].columns
    attendances = pd.DataFrame({}, columns=columns) 
    for month in nba_months:
        try:
            url = f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
            print(url);
            tables = pd.read_html(f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html")
            attendances = pd.concat([attendances, tables[0]])
            time.sleep(12)
        except:
            time.sleep(12)
            continue
        if(month == endMonth):
            break;
    attendances['Date'] = [datetime.strptime(game['Date'], '%a, %b %d, %Y') for index, game in attendances.iterrows()]
    attendances.to_csv('nba-attendances-recent.csv')
    end = timer()
    print(timedelta(seconds=end-start))
    return attendances

In [264]:
def addAttendance(games):
    # Add Attendance to boxscore
    from timeit import default_timer as timer
    from datetime import timedelta
    import time
     
    #attendances = pd.read_csv('nba-attendances-recent.csv', index_col=0)
    attendances = getAttendance()
    start = timer()
    games_attendance = []
    
    for index, score in games.iterrows():
        game = attendances.loc[(attendances['Home/Neutral'] ==  score['HOME_TEAM_NAME']) 
                              & (attendances['Visitor/Neutral'] ==  score['AWAY_TEAM_NAME']) 
                              & (attendances['Date'] == score['HOME_GAME_DATE'])]
        if len(game) == 1:
            games_attendance.append(game.iloc[0, 8])
        else:
            games_attendance.append(np.nan)
    games['ATTENDANCE'] = games_attendance        
    end = timer()
    print(timedelta(seconds=end-start)) 
    games.to_csv('boxscores-recent.csv')
    return games 

In [265]:
def transformData(games):
    from timeit import default_timer as timer
    from datetime import timedelta
    import time
    
    start = timer()
    columns = games.columns.to_series().add_prefix("HOME_").index.to_list() + games.columns.to_series().add_prefix("AWAY_").index.to_list()
    transformed_games = pd.DataFrame({}, columns=columns)
    game_ids = games['GAME_ID'].unique()
    for id in game_ids:
        grouped_game = games.loc[games['GAME_ID'] == id]
        if 'vs' in grouped_game.iloc[0, 6]:
            home = grouped_game.iloc[[0]]
            home = home.add_prefix('HOME_')
            away = grouped_game.iloc[[1]]
            away = away.add_prefix('AWAY_')
        else:
            home = grouped_game.iloc[[1]]
            home = home.add_prefix('HOME_')
            away = grouped_game.iloc[[0]]
            away = away.add_prefix('AWAY_')
        game_played = pd.concat([home,away.set_index(home.index)], axis=1)
        transformed_games = pd.concat([transformed_games, game_played]) 
    transformed_games = transformed_games.reset_index(drop=True)
    
    transformed_games.to_csv('nba-transformed-recent.csv')
    end = timer()
    print(timedelta(seconds=end-start))
    return transformed_games 

In [266]:
def shouldUpdate():
    gamefinder = leaguegamefinder.LeagueGameFinder(league_id_nullable='00', season_nullable='2022-23', season_type_nullable="Regular Season")
    allGames = gamefinder.get_data_frames()[0]
    allGames = allGames.reindex(index=allGames.index[::-1]).reset_index(drop=True)
    allGames = allGames.dropna()
    currentGames = pd.read_csv('nba-transformed-current.csv', index_col=0)
    if not(allGames.iloc[-1]['GAME_DATE'][0:10] == currentGames.iloc[-1]['HOME_GAME_DATE'][0:10]):
        date = (datetime.strptime(currentGames.iloc[-1]['HOME_GAME_DATE'][0:10], '%Y-%m-%d') + timedelta(days = 1)).strftime('%m/%d/%Y')
        gamefinder = leaguegamefinder.LeagueGameFinder(league_id_nullable='00', season_nullable='2022-23', date_from_nullable=date, season_type_nullable="Regular Season")
        allGames = gamefinder.get_data_frames()[0]
        allGames = allGames.reindex(index=allGames.index[::-1]).reset_index(drop=True)
        allGames['WL'] = [1 if game['WL'] == 'W' else 0 for index, game in allGames.iterrows()]
        allGames = transformData(allGames)
        allGames = addAttendance(allGames)
        pd.concat([currentGames, allGames]).reset_index(drop=True).to_csv('nba-transformed-today.csv') 
        return pd.concat([currentGames, allGames]).reset_index(drop=True)
    return []

In [267]:
def createCumAvgs():
    #Calculate Cum AVGs
    from timeit import default_timer as timer
    from datetime import timedelta
    import time

    start = timer()
    gamefinder = leaguegamefinder.LeagueGameFinder(league_id_nullable='00', season_nullable='2022-23', season_type_nullable="Regular Season")
    games = gamefinder.get_data_frames()[0]
    games = games.reindex(index=games.index[::-1]).reset_index(drop=True)
    games['WL'] = [1 if game['WL'] == 'W' else 0 for index, game in games.iterrows()]
    newColumns = ['GAME_NUMBER', 'TEAM_NAME', 'GAME_ID', 'SEASON_ID', 'GAME_DATE', 'LAST_GAME_DAYS', 'HOME_STREAK', 'AWAY_STREAK', 'WL', 'WL_AVG', 'PTS_AVG', 'FGM_AVG', 'FGA_AVG', 'FG3M_AVG', 'FG3A_AVG', 'FTM_AVG',
                'FTA_AVG', 'OREB_AVG', 'DREB_AVG', 'REB_AVG', 'AST_AVG', 'STL_AVG', 'BLK_AVG', 
                  'TOV_AVG', 'PF_AVG']
    seasons = games['SEASON_ID'].unique()
    means   = []
    theGame = []
    for i, season in enumerate(seasons):
        scores = games[games['SEASON_ID'] == season]
        scores = scores.sort_values(by=['GAME_DATE'], ascending=False)
        teams  = scores['TEAM_NAME'].unique()
        for team in teams:
            homeStreak = 0
            awayStreak = 0
            lastGameDate = None
            teamGames = scores[(scores['TEAM_NAME'] == team)]
            teamGames = teamGames.sort_values(by=['GAME_DATE'], ascending=False)
            teamGames = teamGames.reindex(index=teamGames.index[::-1]).reset_index(drop=True)
            newDf = pd.DataFrame({}, columns=newColumns)
            for j, game in teamGames.iterrows():
                newList = []
                if 'vs' in game['MATCHUP']:
                    homeStreak += 1
                    awayStreak = 0
                else:
                    homeStreak = 0
                    awayStreak += 1
                if j == 0:
                    days = 0
                else:
                    d1 = datetime.strptime(lastGameDate, '%Y-%m-%d')
                    d2 = datetime.strptime(game['GAME_DATE'], '%Y-%m-%d')
                    delta = d2 - d1
                    days = delta.days 
                lastGameDate = game['GAME_DATE']
                teamAvg = teamGames.iloc[0:j][['WL', 'PTS', 'FGM', 'FGA', 'FG3M', 'FG3A',
                'FTM', 'FTA', 'OREB', 'DREB','REB', 'AST', 'STL', 'BLK', 'TOV',
                'PF']].mean()
                newList.append(j+1)
                newList.append(game['TEAM_NAME'])
                newList.append(game['GAME_ID'])
                newList.append(game['SEASON_ID'])
                newList.append(game['GAME_DATE'])
                newList.append(days)
                newList.append(homeStreak)
                newList.append(awayStreak)
                newList.append(game['WL'])
                newList.append(teamAvg['WL'] if j > 0 else 0)
                newList.append(teamAvg['PTS'] if j > 0 else 0)
                newList.append(teamAvg['FGM'] if j > 0 else 0)
                newList.append(teamAvg['FGA'] if j > 0 else 0)
                newList.append(teamAvg['FG3M'] if j > 0 else 0)
                newList.append(teamAvg['FG3A'] if j > 0 else 0)
                newList.append(teamAvg['FTM'] if j > 0 else 0)
                newList.append(teamAvg['FTA'] if j > 0 else 0)
                newList.append(teamAvg['OREB'] if j > 0 else 0)
                newList.append(teamAvg['DREB'] if j > 0 else 0)
                newList.append(teamAvg['REB'] if j > 0 else 0)
                newList.append(teamAvg['AST'] if j > 0 else 0)
                newList.append(teamAvg['STL'] if j > 0 else 0)
                newList.append(teamAvg['BLK'] if j > 0 else 0)
                newList.append(teamAvg['TOV'] if j > 0 else 0)
                newList.append(teamAvg['PF'] if j > 0 else 0)
                row   = pd.DataFrame([newList], columns=newColumns)
                newDf = pd.concat([newDf, row])
            newDf.to_csv(f"nba/computed_avgs_new/{team.replace(' ', '_').replace('/', '')}_cum_game_avg_{str(int(season) % 20000)}.csv") 
    end = timer()
    print(timedelta(seconds=end-start)) 

In [268]:
def createFinalTransformation(games):
    from timeit import default_timer as timer
    from datetime import timedelta
    import time

    start = timer()
    newColumns = ['HOME_TEAM_NAME', 'AWAY_TEAM_NAME','GAME_ID','SEASON_ID' ,'DATE','HOME_TEAM_GAME_NUMBER', 'HOME_LAST_GAME_DAYS', 'HOME_HOME_STREAK', 'HOME_AWAY_STREAK', 'HOME_WL_AVG', 'HOME_PTS_AVG', 'HOME_FGM_AVG', 'HOME_FGA_AVG', 'HOME_FG3M_AVG', 'HOME_FG3A_AVG', 'HOME_FTM_AVG',
                'HOME_FTA_AVG', 'HOME_OREB_AVG', 'HOME_DREB_AVG', 'HOME_REB_AVG', 'HOME_AST_AVG', 'HOME_STL_AVG', 'HOME_BLK_AVG', 'HOME_TOV_AVG', 'HOME_PF_AVG',
                'AWAY_TEAM_GAME_NUMBER', 'AWAY_LAST_GAME_DAYS', 'AWAY_HOME_STREAK', 'AWAY_AWAY_STREAK','AWAY_WL_AVG', 'AWAY_PTS_AVG', 'AWAY_FGM_AVG', 'AWAY_FGA_AVG', 'AWAY_FG3M_AVG', 'AWAY_FG3A_AVG',
                'AWAY_FTM_AVG', 'AWAY_FTA_AVG', 'AWAY_OREB_AVG', 'AWAY_DREB_AVG','AWAY_REB_AVG', 'AWAY_AST_AVG', 'AWAY_STL_AVG', 'AWAY_BLK_AVG', 'AWAY_TOV_AVG',
                'AWAY_PF_AVG','HOME_PTS', 'AWAY_PTS', 'HOME_WL', 'AWAY_WL','ATTENDANCE']
    
    newDf = pd.DataFrame({}, columns=newColumns)
    scores = games[games['HOME_SEASON_ID'] == games.iloc[-1]['HOME_SEASON_ID']]
    scores = scores.sort_values(by=['HOME_GAME_DATE'], ascending=False)
    scores = scores.reindex(index=scores.index[::-1]).reset_index(drop=True)    
    for i, score in scores.iterrows():
        homeTeam = pd.read_csv(f"nba/computed_avgs_new/{score['HOME_TEAM_NAME'].replace(' ', '_').replace('/', '')}_cum_game_avg_2022.csv")
        awayTeam = pd.read_csv(f"nba/computed_avgs_new/{score['AWAY_TEAM_NAME'].replace(' ', '_').replace('/', '')}_cum_game_avg_2022.csv")
        homeGame = homeTeam[homeTeam['GAME_ID'] == int(score['HOME_GAME_ID'])].iloc[0]
        awayGame = awayTeam[awayTeam['GAME_ID'] == int(score['HOME_GAME_ID'])].iloc[0]
        newList  = []
        newList.append(score['HOME_TEAM_NAME'])
        newList.append(score['AWAY_TEAM_NAME'])
        newList.append(score['HOME_GAME_ID'])
        newList.append(score['HOME_SEASON_ID'])
        newList.append(score['HOME_GAME_DATE'])
        newList.append(homeGame['GAME_NUMBER'])
        newList.append(homeGame['LAST_GAME_DAYS'])
        newList.append(homeGame['HOME_STREAK'])
        newList.append(homeGame['AWAY_STREAK'])
        newList.append(homeGame['WL_AVG'])
        newList.append(homeGame['PTS_AVG'])
        newList.append(homeGame['FGM_AVG'])
        newList.append(homeGame['FGA_AVG'])
        newList.append(homeGame['FG3M_AVG'])
        newList.append(homeGame['FG3A_AVG'])
        newList.append(homeGame['FTM_AVG'])
        newList.append(homeGame['FTA_AVG'])
        newList.append(homeGame['OREB_AVG'])
        newList.append(homeGame['DREB_AVG'])
        newList.append(homeGame['REB_AVG'])
        newList.append(homeGame['AST_AVG'])
        newList.append(homeGame['STL_AVG'])
        newList.append(homeGame['BLK_AVG'])
        newList.append(homeGame['TOV_AVG'])
        newList.append(homeGame['PF_AVG'])
        newList.append(awayGame['GAME_NUMBER'])
        newList.append(awayGame['LAST_GAME_DAYS'])
        newList.append(awayGame['HOME_STREAK'])
        newList.append(awayGame['AWAY_STREAK'])
        newList.append(awayGame['WL_AVG'])
        newList.append(awayGame['PTS_AVG'])
        newList.append(awayGame['FGM_AVG'])
        newList.append(awayGame['FGA_AVG'])
        newList.append(awayGame['FG3M_AVG'])
        newList.append(awayGame['FG3A_AVG'])
        newList.append(awayGame['FTM_AVG'])
        newList.append(awayGame['FTA_AVG'])
        newList.append(awayGame['OREB_AVG'])
        newList.append(awayGame['DREB_AVG'])
        newList.append(awayGame['REB_AVG'])
        newList.append(awayGame['AST_AVG'])
        newList.append(awayGame['STL_AVG'])
        newList.append(awayGame['BLK_AVG'])
        newList.append(awayGame['TOV_AVG'])
        newList.append(awayGame['PF_AVG'])
        newList.append(score['HOME_PTS'])
        newList.append(score['AWAY_PTS'])
        newList.append(homeGame['WL'])
        newList.append(awayGame['WL'])
        newList.append(score['ATTENDANCE'])
        row   = pd.DataFrame([newList], columns=newColumns)
        newDf = pd.concat([newDf, row])
    cumeGameAvgs = pd.read_csv('league_cum_game_avgs_current.csv', index_col=0)
    pd.concat([cumeGameAvgs, newDf]).reset_index(drop=True).to_csv(f"league_cum_game_avgs_recent.csv") 
    end = timer()
    print(timedelta(seconds=end-start)) 
    return pd.concat([cumeGameAvgs, newDf]).reset_index(drop=True)

In [269]:
def createCumAvgsPreds():
    from datetime import timedelta
    attendance = pd.read_csv('nba-attendances-current-season.csv', index_col=0)
    attendance['Date'] = [datetime.strptime(game['Date'], '%Y-%m-%d') for index, game in attendance.iterrows()]
    teams = attendance['Home/Neutral'].unique()
    newColumns = ['GAME_NUMBER', 'TEAM_NAME', 'GAME_ID', 'SEASON_ID', 'GAME_DATE', 'LAST_GAME_DAYS', 'HOME_STREAK', 'AWAY_STREAK', 'WL_AVG', 'PTS_AVG', 'FGM_AVG', 'FGA_AVG', 'FG3M_AVG', 'FG3A_AVG', 'FTM_AVG',
                'FTA_AVG', 'OREB_AVG', 'DREB_AVG', 'REB_AVG', 'AST_AVG', 'STL_AVG', 'BLK_AVG', 
                  'TOV_AVG', 'PF_AVG']
    currentGames = pd.read_csv('nba-transformed-current.csv', index_col=0)
    date = datetime.strptime(currentGames.iloc[0]['HOME_GAME_DATE'][0:10], '%Y-%m-%d')
    for team in teams:
        teamName = 'LA Clippers' if team == 'Los Angeles Clippers' else team 
        teamCumAvgs = pd.read_csv(f"nba/computed_avgs_new/{teamName.replace(' ', '_').replace('/', '')}_cum_game_avg_2022.csv").iloc[-1]
        schedule = attendance[((attendance['Home/Neutral'] == team) | (attendance['Visitor/Neutral'] == team)) & (attendance['Date'] >= date.strftime('%Y-%m-%d'))]
        schedule = schedule.reset_index(drop=True)
        homeStreak = teamCumAvgs['HOME_STREAK']
        awayStreak = teamCumAvgs['AWAY_STREAK']
        lastGameDate = None
        newDf = pd.DataFrame({}, columns=newColumns)
        for i, game in schedule.iterrows():
            newList = []
            if team == game['Home/Neutral']:
                    homeStreak += 1
                    awayStreak = 0
            else:
                homeStreak = 0
                awayStreak += 1
            if i == 0:
                days = 0
            else:
                delta = game['Date'] - lastGameDate
                days = delta.days
            newList.append(teamCumAvgs['GAME_NUMBER']+(i+1))
            newList.append(team)
            newList.append(teamCumAvgs['GAME_ID'])
            newList.append(teamCumAvgs['SEASON_ID'])
            newList.append(game['Date'])
            newList.append(days)
            newList.append(homeStreak)
            newList.append(awayStreak)
            newList.append(teamCumAvgs['WL_AVG'])
            newList.append(teamCumAvgs['PTS_AVG'])
            newList.append(teamCumAvgs['FGM_AVG'])
            newList.append(teamCumAvgs['FGA_AVG'])
            newList.append(teamCumAvgs['FG3M_AVG'])
            newList.append(teamCumAvgs['FG3A_AVG'])
            newList.append(teamCumAvgs['FTM_AVG'])
            newList.append(teamCumAvgs['FTA_AVG'])
            newList.append(teamCumAvgs['OREB_AVG'])
            newList.append(teamCumAvgs['DREB_AVG'])
            newList.append(teamCumAvgs['REB_AVG'])
            newList.append(teamCumAvgs['AST_AVG'])
            newList.append(teamCumAvgs['STL_AVG'])
            newList.append(teamCumAvgs['BLK_AVG'])
            newList.append(teamCumAvgs['TOV_AVG'])
            newList.append(teamCumAvgs['PF_AVG'])
            row   = pd.DataFrame([newList], columns=newColumns)
            newDf = pd.concat([newDf, row])
            newDf.reset_index(drop=True).to_csv(f"nba/computed_avgs_new_preds/{team.replace(' ', '_').replace('/', '')}_cum_game_avg_2022.csv")
            lastGameDate = game['Date']

In [270]:
def createNewPredictionSet(): 
    attendance = pd.read_csv('nba-attendances-current-season.csv', index_col=0)
    attendance['Date'] = [datetime.strptime(game['Date'], '%Y-%m-%d') for index, game in attendance.iterrows()]
    newColumns = ['HOME_TEAM_NAME', 'AWAY_TEAM_NAME','GAME_ID','SEASON_ID' ,'DATE','HOME_TEAM_GAME_NUMBER', 'HOME_LAST_GAME_DAYS', 'HOME_HOME_STREAK', 'HOME_AWAY_STREAK', 'HOME_WL_AVG', 'HOME_PTS_AVG', 'HOME_FGM_AVG', 'HOME_FGA_AVG', 'HOME_FG3M_AVG', 'HOME_FG3A_AVG', 'HOME_FTM_AVG',
                'HOME_FTA_AVG', 'HOME_OREB_AVG', 'HOME_DREB_AVG', 'HOME_REB_AVG', 'HOME_AST_AVG', 'HOME_STL_AVG', 'HOME_BLK_AVG', 'HOME_TOV_AVG', 'HOME_PF_AVG',
                'AWAY_TEAM_GAME_NUMBER', 'AWAY_LAST_GAME_DAYS', 'AWAY_HOME_STREAK', 'AWAY_AWAY_STREAK','AWAY_WL_AVG', 'AWAY_PTS_AVG', 'AWAY_FGM_AVG', 'AWAY_FGA_AVG', 'AWAY_FG3M_AVG', 'AWAY_FG3A_AVG',
                'AWAY_FTM_AVG', 'AWAY_FTA_AVG', 'AWAY_OREB_AVG', 'AWAY_DREB_AVG','AWAY_REB_AVG', 'AWAY_AST_AVG', 'AWAY_STL_AVG', 'AWAY_BLK_AVG', 'AWAY_TOV_AVG',
                'AWAY_PF_AVG']
    newDf = pd.DataFrame({}, columns=newColumns)
    currentGames = pd.read_csv('nba-transformed-current.csv', index_col=0)
    date = datetime.strptime(currentGames.iloc[0]['HOME_GAME_DATE'][0:10], '%Y-%m-%d')
    schedule = attendance[attendance['Date'] >= date]
    for i, game in schedule.iterrows():
        newList  = []
        homeTeam = pd.read_csv(f"nba/computed_avgs_new_preds/{game['Home/Neutral'].replace(' ', '_').replace('/', '')}_cum_game_avg_2022.csv", index_col=0)
        awayTeam = pd.read_csv(f"nba/computed_avgs_new_preds/{game['Visitor/Neutral'].replace(' ', '_').replace('/', '')}_cum_game_avg_2022.csv", index_col=0)
        homeTeam = homeTeam.loc[homeTeam['GAME_DATE'] == game['Date'].strftime('%Y-%m-%d')].iloc[0]
        awayTeam = awayTeam.loc[awayTeam['GAME_DATE'] == game['Date'].strftime('%Y-%m-%d')].iloc[0]
        newList  = []
        newList.append(homeTeam['TEAM_NAME'])
        newList.append(awayTeam['TEAM_NAME'])
        newList.append(homeTeam['GAME_ID'])
        newList.append(homeTeam['SEASON_ID'])
        newList.append(game['Date'])
        newList.append(homeTeam['GAME_NUMBER'])
        newList.append(homeTeam['LAST_GAME_DAYS'])
        newList.append(homeTeam['HOME_STREAK'])
        newList.append(homeTeam['AWAY_STREAK'])
        newList.append(homeTeam['WL_AVG'])
        newList.append(homeTeam['PTS_AVG'])
        newList.append(homeTeam['FGM_AVG'])
        newList.append(homeTeam['FGA_AVG'])
        newList.append(homeTeam['FG3M_AVG'])
        newList.append(homeTeam['FG3A_AVG'])
        newList.append(homeTeam['FTM_AVG'])
        newList.append(homeTeam['FTA_AVG'])
        newList.append(homeTeam['OREB_AVG'])
        newList.append(homeTeam['DREB_AVG'])
        newList.append(homeTeam['REB_AVG'])
        newList.append(homeTeam['AST_AVG'])
        newList.append(homeTeam['STL_AVG'])
        newList.append(homeTeam['BLK_AVG'])
        newList.append(homeTeam['TOV_AVG'])
        newList.append(homeTeam['PF_AVG'])
        newList.append(awayTeam['GAME_NUMBER'])
        newList.append(awayTeam['LAST_GAME_DAYS'])
        newList.append(awayTeam['HOME_STREAK'])
        newList.append(awayTeam['AWAY_STREAK'])
        newList.append(awayTeam['WL_AVG'])
        newList.append(awayTeam['PTS_AVG'])
        newList.append(awayTeam['FGM_AVG'])
        newList.append(awayTeam['FGA_AVG'])
        newList.append(awayTeam['FG3M_AVG'])
        newList.append(awayTeam['FG3A_AVG'])
        newList.append(awayTeam['FTM_AVG'])
        newList.append(awayTeam['FTA_AVG'])
        newList.append(awayTeam['OREB_AVG'])
        newList.append(awayTeam['DREB_AVG'])
        newList.append(awayTeam['REB_AVG'])
        newList.append(awayTeam['AST_AVG'])
        newList.append(awayTeam['STL_AVG'])
        newList.append(awayTeam['BLK_AVG'])
        newList.append(awayTeam['TOV_AVG'])
        newList.append(awayTeam['PF_AVG'])
        row   = pd.DataFrame([newList], columns=newColumns)
        newDf = pd.concat([newDf, row])
    nbaCurrent = pd.read_csv('nba-transformed-today.csv', index_col=0) 
    nbaCurrent.to_csv('nba-transformed-current.csv')
    newDf.reset_index(drop=True).to_csv(f"league_cum_game_avgs_new_preds.csv") 
    return newDf.reset_index(drop=True)

In [271]:
def trainAttendance():
    from joblib import dump, load
    from sklearn.ensemble import GradientBoostingRegressor
    # Attendance Features Response Splits
    cumeGameAvgs = pd.read_csv("league_cum_game_avgs_recent.csv", index_col=0)
    preds = pd.read_csv("league_cum_game_avgs_new_preds.csv", index_col=0) 
    cumeGameAvgs = cumeGameAvgs[(cumeGameAvgs['ATTENDANCE'] > 0) & ((cumeGameAvgs['HOME_TEAM_GAME_NUMBER'] > 1) & (cumeGameAvgs['AWAY_TEAM_GAME_NUMBER'] > 1))]

    box_score_avgs = cumeGameAvgs.dropna()
    box_score_avgs = box_score_avgs.reset_index(drop=True)
    box_score_avgs = box_score_avgs.drop(columns=['HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'GAME_ID', 'SEASON_ID',
                                                      'DATE', 'HOME_AWAY_STREAK','AWAY_HOME_STREAK',
                                                      'HOME_WL', 'AWAY_WL','HOME_TEAM_GAME_NUMBER', 'AWAY_TEAM_GAME_NUMBER', 'HOME_PTS', 'AWAY_PTS'])

    features = box_score_avgs.drop(columns=['ATTENDANCE'])
    response = box_score_avgs['ATTENDANCE']
    
    box_score_avgs_preds = preds.dropna()
    box_score_avgs_preds = preds.reset_index(drop=True)
    box_score_avgs_preds = preds.drop(columns=['HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'GAME_ID', 'SEASON_ID',
                                              'DATE', 'HOME_AWAY_STREAK','AWAY_HOME_STREAK',
                                              'HOME_TEAM_GAME_NUMBER', 'AWAY_TEAM_GAME_NUMBER'])

    
    # GradientBoostingRegressor Train
    gbr = GradientBoostingRegressor(learning_rate=.1, max_depth=8, n_estimators=1000)
    gbr.fit(features, response)
    
    dump(gbr, 'nba-attendance-preds-gbr-new.joblib')
    
    preds = gbr.predict(box_score_avgs_preds)
    pd.DataFrame(preds, columns=['ATTENDANCE']).to_csv('attendances_preds_new.csv')
    return pd.DataFrame(preds, columns=['ATTENDANCE'])

In [272]:
def trainScore():
    from joblib import dump, load
    from sklearn.multioutput import MultiOutputRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.neighbors import KNeighborsRegressor
    
    cumeGameAvgs = pd.read_csv('league_cum_game_avgs_recent.csv', index_col=0)
    cumeGameAvgsPreds = pd.read_csv("league_cum_game_avgs_new_preds.csv", index_col=0) 
    cumeGameAvgs = cumeGameAvgs[(cumeGameAvgs['HOME_TEAM_GAME_NUMBER'] > 1) & (cumeGameAvgs['AWAY_TEAM_GAME_NUMBER'] > 1)]

    box_score_avgs = cumeGameAvgs.dropna()
    box_score_avgs = box_score_avgs.reset_index(drop=True)
    box_score_avgs = box_score_avgs.drop(columns=['HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'GAME_ID', 'SEASON_ID',
                                                      'DATE', 'HOME_AWAY_STREAK','AWAY_HOME_STREAK',
                                                      'HOME_WL', 'AWAY_WL', 'ATTENDANCE','HOME_TEAM_GAME_NUMBER', 'AWAY_TEAM_GAME_NUMBER'])
    

    features = box_score_avgs.drop(columns=['HOME_PTS', 'AWAY_PTS'])
    response = box_score_avgs[['HOME_PTS', 'AWAY_PTS']]
    
    box_score_avgs_preds = cumeGameAvgsPreds.drop(columns=['HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'GAME_ID', 'SEASON_ID',
                                                  'DATE', 'HOME_AWAY_STREAK','AWAY_HOME_STREAK',
                                                  'HOME_TEAM_GAME_NUMBER', 'AWAY_TEAM_GAME_NUMBER']).reset_index(drop=True)
    
    
    knn = KNeighborsRegressor(n_neighbors = 50, weights='distance', n_jobs=-1)
    knn.fit(features, response)
    
    rfr = RandomForestRegressor(min_samples_split=10, max_depth=7, n_estimators=1000, n_jobs=-1)
    rfr.fit(features, response)
    
    gbr = MultiOutputRegressor(GradientBoostingRegressor(min_samples_split = 2, max_depth=8, n_estimators=1000, learning_rate=.1), n_jobs=-1)
    gbr.fit(features, response)
    
    dump(knn, 'nba-preds-knn-new.joblib')
    dump(gbr, 'nba-preds-gbr-new.joblib')
    dump(rfr, 'nba-preds-rfr-new.joblib')
    
    knnPreds = knn.predict(box_score_avgs_preds)
    gbrPreds = gbr.predict(box_score_avgs_preds)
    rfrPreds = rfr.predict(box_score_avgs_preds)
    
    preds = (knnPreds + gbrPreds + rfrPreds) / 3

    predictionDf = pd.DataFrame(preds, columns=['HOME_PTS', 'AWAY_PTS'])
    predictionDf['DIFF'] = predictionDf['HOME_PTS'] - predictionDf['AWAY_PTS']
    predictionDf['HOME_WL'] = [1 if row['HOME_PTS'] > row['AWAY_PTS'] else 0 for i, row in predictionDf.iterrows()]
    cumeGameAvgsPreds['HOME_PTS'] = predictionDf['HOME_PTS']
    cumeGameAvgsPreds['AWAY_PTS'] = predictionDf['AWAY_PTS']
    cumeGameAvgsPreds['DIFF']     = predictionDf['DIFF']
    cumeGameAvgsPreds['HOME_WL']  = predictionDf['HOME_WL']
    cumeGameAvgsPreds.to_csv('nba_scores_predictions_2022_2023_new.csv')
    return cumeGameAvgsPreds.copy()

In [273]:
def trainScore2():
    from joblib import dump, load
    from sklearn.multioutput import MultiOutputRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.neighbors import KNeighborsRegressor
    cumeGameAvgs = pd.read_csv('league_cum_game_avgs_recent.csv', index_col=0)
    cumeGameAvgsPreds = pd.read_csv("league_cum_game_avgs_new_preds.csv", index_col=0) 
    cumeGameAvgs = cumeGameAvgs[(cumeGameAvgs['HOME_TEAM_GAME_NUMBER'] > 1) & (cumeGameAvgs['AWAY_TEAM_GAME_NUMBER'] > 1)]

    box_score_avgs = cumeGameAvgs.dropna()
    box_score_avgs = box_score_avgs.reset_index(drop=True)
    box_score_avgs = box_score_avgs.drop(columns=['HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'GAME_ID', 'SEASON_ID',
                                                      'DATE', 'HOME_AWAY_STREAK','AWAY_HOME_STREAK',
                                                      'HOME_WL', 'AWAY_WL', 'ATTENDANCE','HOME_TEAM_GAME_NUMBER', 'AWAY_TEAM_GAME_NUMBER'])
    

    features = box_score_avgs.drop(columns=['HOME_PTS', 'AWAY_PTS'])
    response = box_score_avgs[['HOME_PTS', 'AWAY_PTS']]
    
    box_score_avgs_preds = cumeGameAvgsPreds.drop(columns=['HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'GAME_ID', 'SEASON_ID',
                                                  'DATE', 'HOME_AWAY_STREAK','AWAY_HOME_STREAK',
                                                  'HOME_TEAM_GAME_NUMBER', 'AWAY_TEAM_GAME_NUMBER']).reset_index(drop=True)
    
    knn = load('nba-preds-knn-new.joblib')
    gbr = load('nba-preds-gbr-new.joblib')
    rfr = load('nba-preds-rfr-new.joblib')
    
    knnPreds = knn.predict(box_score_avgs_preds)
    gbrPreds = gbr.predict(box_score_avgs_preds)
    rfrPreds = rfr.predict(box_score_avgs_preds)
    
    preds = (knnPreds + gbrPreds + rfrPreds) / 3

    predictionDf = pd.DataFrame(preds, columns=['HOME_PTS', 'AWAY_PTS'])
    predictionDf['DIFF'] = predictionDf['HOME_PTS'] - predictionDf['AWAY_PTS']
    predictionDf['HOME_WL'] = [1 if row['HOME_PTS'] > row['AWAY_PTS'] else 0 for i, row in predictionDf.iterrows()]
    cumeGameAvgsPreds['HOME_PTS'] = predictionDf['HOME_PTS']
    cumeGameAvgsPreds['AWAY_PTS'] = predictionDf['AWAY_PTS']
    cumeGameAvgsPreds['DIFF']     = predictionDf['DIFF']
    cumeGameAvgsPreds['HOME_WL']  = predictionDf['HOME_WL']
    cumeGameAvgsPreds.to_csv('nba_scores_predictions_2022_2023_new.csv')
    return cumeGameAvgsPreds.copy()

In [341]:
def retrainModel():
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.model_selection import train_test_split
    from sklearn import preprocessing
    from sklearn.neighbors import KNeighborsClassifier
    import seaborn as sns
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    from sklearn.model_selection import ParameterGrid
    from sklearn import metrics
    from joblib import dump, load
    from sklearn.ensemble import RandomForestRegressor
    
    #attendance = pd.read_csv('attendances_preds_new.csv', index_col=0)
    attendance = trainAttendance()
    curPreds = pd.read_csv('nba_predictions_2022_2023.csv', index_col=0)
    preds = trainScore()
    transformed_games = pd.read_csv('nba-transformed-current.csv', index_col=0)    
    preds['ATTENDANCE'] = attendance['ATTENDANCE']
    m = (preds['DATE'] >= curPreds.iloc[0]['DATE'] ) & (preds['DATE'] < datetime.today().strftime('%Y-%m-%d'))
    preds[['ACTUAL_HOME', 'ACTUAL_AWAY', 'ACTUAL_ATTENDANCE']] = 0
    preds.loc[m, ['HOME_PTS', 'AWAY_PTS', 'DIFF', 'HOME_WL', 'ATTENDANCE']] = curPreds[(curPreds['DATE'] < datetime.today().strftime('%Y-%m-%d'))][['HOME_PTS', 'AWAY_PTS', 'DIFF', 'HOME_WL', 'ATTENDANCE']].to_numpy()
    preds[['ACTUAL_HOME', 'ACTUAL_AWAY', 'ACTUAL_ATTENDANCE']] = 0
    
    for i, pred in preds.iterrows():
        if pred['DATE'] <= transformed_games.iloc[-1]['HOME_GAME_DATE']:
            homeTeam  = pred['HOME_TEAM_NAME'] if pred['HOME_TEAM_NAME'] != 'Los Angeles Clippers' else 'LA Clippers' 
            awayTeam  = pred['AWAY_TEAM_NAME'] if pred['AWAY_TEAM_NAME'] != 'Los Angeles Clippers' else 'LA Clippers' 
            game = transformed_games.loc[(transformed_games['HOME_GAME_DATE'] == pred['DATE']) & (transformed_games['HOME_TEAM_NAME'] == homeTeam) & (transformed_games['AWAY_TEAM_NAME'] == awayTeam)].iloc[0]
            pred['ACTUAL_HOME'] = game['HOME_PTS'] 
            pred['ACTUAL_AWAY'] = game['AWAY_PTS']
            pred['ACTUAL_ATTENDANCE'] = game['ATTENDANCE'] 
            preds.at[i , ['ACTUAL_HOME', 'ACTUAL_AWAY', 'ACTUAL_ATTENDANCE']] = game[['HOME_PTS', 'AWAY_PTS', 'ATTENDANCE']].to_numpy()
            
    preds['ACTUAL_WL'] = [1 if row['ACTUAL_HOME'] > row['ACTUAL_AWAY'] else 0 for i, row in preds.iterrows()]
    preds.to_csv('nba_predictions_2022_2023_new.csv')
    return preds

In [336]:
def autoUpdate():
    transformed_games = shouldUpdate()
    if len(transformed_games) > 0:
        createCumAvgs()
        createFinalTransformation(transformed_games)
        createCumAvgsPreds()
        createNewPredictionSet()
        retrainModel()

In [340]:
pd.read_csv('nba_predictions_2022_2023_new.csv', index_col=0).head()

Unnamed: 0,HOME_TEAM_NAME,AWAY_TEAM_NAME,GAME_ID,SEASON_ID,DATE,HOME_TEAM_GAME_NUMBER,HOME_LAST_GAME_DAYS,HOME_HOME_STREAK,HOME_AWAY_STREAK,HOME_WL_AVG,...,AWAY_PF_AVG,HOME_PTS,AWAY_PTS,DIFF,HOME_WL,ATTENDANCE,ACTUAL_HOME,ACTUAL_AWAY,ACTUAL_ATTENDANCE,ACTUAL_WL
0,Charlotte Hornets,Los Angeles Clippers,22200377,22022,2022-12-05,27,0,2,0,0.28,...,20.0,110.578774,109.447433,1.131341,1.0,16366.0,117,119,,0
1,Orlando Magic,Milwaukee Bucks,22200379,22022,2022-12-05,28,0,4,0,0.230769,...,19.0,110.05845,115.294815,-5.236365,0.0,17009.0,102,109,16174.0,0
2,Atlanta Hawks,Oklahoma City Thunder,22200380,22022,2022-12-05,27,0,1,0,0.52,...,21.52,117.403598,116.091496,1.312102,1.0,16728.0,114,121,16301.0,0
3,Toronto Raptors,Boston Celtics,22200379,22022,2022-12-05,27,0,1,0,0.52,...,20.5,112.138627,112.119533,0.019094,1.0,18613.0,110,116,19800.0,0
4,Houston Rockets,Philadelphia 76ers,22200375,22022,2022-12-05,26,0,1,0,0.291667,...,20.083333,111.225864,111.868984,-0.64312,0.0,15933.0,132,123,15331.0,1
