In [104]:
import time
import pandas as pd
from nba_api.stats.endpoints import leaguegamefinder, commonteamroster

def determine_winning_team(row):
    if row['PTS_HOME'] > row['PTS_AWAY']:
        return row['HOME_TEAM']
    elif row['PTS_HOME'] < row['PTS_AWAY']:
        return row['AWAY_TEAM']

def calculate_k_factor(winner_elo, loser_elo, margin_of_victory):
    elo_difference = winner_elo - loser_elo
    k_factor = 20 * ((margin_of_victory + 3) ** 0.08) / (7.5 + 0.006 * abs(elo_difference))
    return k_factor

def calculate_elo_rating(prev_elo, k_factor, result, opponent_elo, margin_of_victory):
    expected_win_probability = 1 / (1 + 10 ** ((prev_elo - opponent_elo) / 400))
    elo_change = k_factor * (result - expected_win_probability)
    elo_change = elo_change.real
    new_elo = round(float(prev_elo) + elo_change, 2)
    return new_elo

seasons = ['2014-15', '2015-16', '2016-17', 
           '2017-18', '2018-19', '2019-20', '2020-21', 
           '2021-22', '2022-23', '2023-24']

combined_games_df = pd.DataFrame()

team_names = ['Atlanta Hawks', 'Boston Celtics', 'Brooklyn Nets', 'Charlotte Hornets', 'Chicago Bulls', 
              'Cleveland Cavaliers', 'Dallas Mavericks', 'Denver Nuggets', 'Detroit Pistons', 
              'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers', 'LA Clippers', 'Los Angeles Clippers',  
              'Los Angeles Lakers', 'Memphis Grizzlies', 'Miami Heat', 'Milwaukee Bucks', 
              'Minnesota Timberwolves', 'New Orleans Pelicans', 'New York Knicks', 
              'Oklahoma City Thunder', 'Orlando Magic', 'Philadelphia 76ers', 'Phoenix Suns', 
              'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs', 'Toronto Raptors', 
              'Utah Jazz', 'Washington Wizards']

for season in seasons:
    gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=season, league_id_nullable='00', 
                                                   season_type_nullable='Regular Season', 
                                                   date_from_nullable='10/01/2014', date_to_nullable='04/30/2024')
    games = gamefinder.get_data_frames()[0]
    games = games.sort_values(by=['GAME_DATE'])

    games['TEAM_NAME'] = games['TEAM_NAME'].replace({'Los Angeles Clippers': 'LA Clippers'})
    games = games[games['TEAM_NAME'].str.lower().isin([name.lower() for name in team_names])]

    home_team_data = games[games['MATCHUP'].str.contains('vs.')].copy()
    away_team_data = games[games['MATCHUP'].str.contains('@')].copy()

    aggregated_games = pd.merge(home_team_data, away_team_data, on=['GAME_ID'], suffixes=('_HOME', '_AWAY'))

    home_cols = ['SEASON_ID_HOME', 'GAME_ID', 'GAME_DATE_HOME', 'TEAM_ABBREVIATION_HOME', 'PTS_HOME', 'MIN_HOME',
                 'FGM_HOME', 'FGA_HOME', 'FG_PCT_HOME', 'FG3M_HOME', 'FG3A_HOME', 'FG3_PCT_HOME',
                 'FTM_HOME', 'FTA_HOME', 'FT_PCT_HOME', 'OREB_HOME', 'DREB_HOME', 'REB_HOME',
                 'AST_HOME', 'STL_HOME', 'BLK_HOME', 'TOV_HOME', 'PF_HOME', 'PLUS_MINUS_HOME', 'WL_HOME']

    away_cols = ['SEASON_ID_AWAY', 'GAME_ID', 'GAME_DATE_AWAY', 'TEAM_ABBREVIATION_AWAY', 'PTS_AWAY', 'MIN_AWAY',
                 'FGM_AWAY', 'FGA_AWAY', 'FG_PCT_AWAY', 'FG3M_AWAY', 'FG3A_AWAY', 'FG3_PCT_AWAY',
                 'FTM_AWAY', 'FTA_AWAY', 'FT_PCT_AWAY', 'OREB_AWAY', 'DREB_AWAY', 'REB_AWAY',
                 'AST_AWAY', 'STL_AWAY', 'BLK_AWAY', 'TOV_AWAY', 'PF_AWAY', 'PLUS_MINUS_AWAY', 'WL_AWAY']

    games_df = pd.merge(aggregated_games[home_cols], aggregated_games[away_cols], on='GAME_ID', 
                        suffixes=('_HOME', '_AWAY'))

    games_df['SEASON_ID'] = games_df['SEASON_ID_HOME'].combine_first(games_df['SEASON_ID_AWAY'])
    games_df = games_df.drop(columns=['SEASON_ID_HOME', 'SEASON_ID_AWAY'])

    games_df['HOME_TEAM'] = games_df['TEAM_ABBREVIATION_HOME']
    games_df['AWAY_TEAM'] = games_df['TEAM_ABBREVIATION_AWAY']

    games_df['WINNER'] = games_df.apply(determine_winning_team, axis=1)
    column_names = [
        'SEASON_ID', 'GAME_ID', 'GAME_DATE_HOME', 'HOME_TEAM', 'PTS_HOME', 'MIN_HOME',
        'FGM_HOME', 'FGA_HOME', 'FG_PCT_HOME', 'FG3M_HOME', 'FG3A_HOME', 'FG3_PCT_HOME',
        'FTM_HOME', 'FTA_HOME', 'FT_PCT_HOME', 'OREB_HOME', 'DREB_HOME', 'REB_HOME',
        'AST_HOME', 'STL_HOME', 'BLK_HOME', 'TOV_HOME', 'PF_HOME', 'PLUS_MINUS_HOME', 'WL_HOME',
        'GAME_DATE_AWAY', 'AWAY_TEAM', 'PTS_AWAY', 'MIN_AWAY', 'FGM_AWAY', 'FGA_AWAY',
        'FG_PCT_AWAY', 'FG3M_AWAY', 'FG3A_AWAY', 'FG3_PCT_AWAY', 'FTM_AWAY', 'FTA_AWAY',
        'FT_PCT_AWAY', 'OREB_AWAY', 'DREB_AWAY', 'REB_AWAY', 'AST_AWAY', 'STL_AWAY',
        'BLK_AWAY', 'TOV_AWAY', 'PF_AWAY', 'PLUS_MINUS_AWAY', 'WL_AWAY'
    ]

    games_df = games_df[column_names]

    games_df.rename(columns={'GAME_DATE_HOME': 'GAME_DATE'}, inplace=True)
    games_df.drop(columns=['GAME_DATE_AWAY'], inplace=True)
    games_df.drop(columns=['WL_HOME'], inplace=True)
    games_df.drop(columns=['WL_AWAY'], inplace=True)
    games_df['WINNING_TEAM'] = games_df.apply(determine_winning_team, axis=1)
    games_df = games_df.sort_values(by=['GAME_DATE'])
    combined_games_df = pd.concat([combined_games_df, games_df], ignore_index=True)

data = pd.read_csv("games.csv", index_col=0)
initial_elo = 1500
elo_ratings = {team: initial_elo for team in set(data['HOME_TEAM'])}

for index, row in data.iterrows():
    home_team = row['HOME_TEAM']
    away_team = row['AWAY_TEAM']

    home_team_elo = elo_ratings[home_team]
    away_team_elo = elo_ratings[away_team]

    if row['PTS_HOME'] > row['PTS_AWAY']:
        winner_elo, loser_elo = home_team_elo, away_team_elo
        result, margin_of_victory = 1, row['PTS_HOME'] - row['PTS_AWAY']
    else:
        winner_elo, loser_elo = away_team_elo, home_team_elo
        result, margin_of_victory = 0, row['PTS_AWAY'] - row['PTS_HOME']

    home_team_elo = calculate_elo_rating(home_team_elo, calculate_k_factor(winner_elo, loser_elo, margin_of_victory), result, loser_elo, margin_of_victory)
    away_team_elo = calculate_elo_rating(away_team_elo, calculate_k_factor(winner_elo, loser_elo, -margin_of_victory), 1 - result, winner_elo, -margin_of_victory)
    data.at[index, 'HOME_TEAM_ELO'] = home_team_elo
    data.at[index, 'AWAY_TEAM_ELO'] = away_team_elo
    elo_ratings[home_team] = home_team_elo
    elo_ratings[away_team] = away_team_elo

columns_to_keep = ["GAME_ID", "HOME_TEAM_ELO", "AWAY_TEAM_ELO"]
data = data[columns_to_keep]

data['GAME_ID'] = data['GAME_ID'].astype(int)
combined_games_df['GAME_ID'] = combined_games_df['GAME_ID'].astype(int)
combined_games_df = pd.merge(combined_games_df, data, on=['GAME_ID'])

combined_games_df.to_csv("games.csv")


df = combined_games_df
home_data = df.groupby(['HOME_TEAM', 'SEASON_ID']).agg({
    'PTS_HOME': 'mean',
    'MIN_HOME': 'mean',
    'FGM_HOME': 'mean',
    'FGA_HOME': 'mean',
    'FG_PCT_HOME': 'mean',
    'FG3M_HOME': 'mean',
    'FG3A_HOME': 'mean',
    'FG3_PCT_HOME': 'mean',
    'FTM_HOME': 'mean',
    'FTA_HOME': 'mean',
    'FT_PCT_HOME': 'mean',
    'OREB_HOME': 'mean',
    'DREB_HOME': 'mean',
    'REB_HOME': 'mean',
    'AST_HOME': 'mean',
    'STL_HOME': 'mean',
    'BLK_HOME': 'mean',
    'TOV_HOME': 'mean',
    'PF_HOME': 'mean',
    'PLUS_MINUS_HOME': 'mean',
}).reset_index()

home_data.rename(columns={
    'PTS_HOME': 'Average_PTS_Home',
    'MIN_HOME': 'Average_MIN_Home',
    'FGM_HOME': 'Average_FGM_Home',
    'FGA_HOME': 'Average_FGA_Home',
    'FG_PCT_HOME': 'Average_FG_PCT_Home',
    'FG3M_HOME': 'Average_FG3M_Home',
    'FG3A_HOME': 'Average_FG3A_Home',
    'FG3_PCT_HOME': 'Average_FG3_PCT_Home',
    'FTM_HOME': 'Average_FTM_Home',
    'FTA_HOME': 'Average_FTA_Home',
    'FT_PCT_HOME': 'Average_FT_PCT_Home',
    'OREB_HOME': 'Average_OREB_Home',
    'DREB_HOME': 'Average_DREB_Home',
    'REB_HOME': 'Average_REB_Home',
    'AST_HOME': 'Average_AST_Home',
    'STL_HOME': 'Average_STL_Home',
    'BLK_HOME': 'Average_BLK_Home',
    'TOV_HOME': 'Average_TOV_Home',
    'PF_HOME': 'Average_PF_Home',
    'PLUS_MINUS_HOME': 'Average_PLUS_MINUS_Home',  
}, inplace=True)

away_data = df.groupby(['AWAY_TEAM', 'SEASON_ID']).agg({
    'PTS_AWAY': 'mean',
    'MIN_AWAY': 'mean',
    'FGM_AWAY': 'mean',
    'FGA_AWAY': 'mean',
    'FG_PCT_AWAY': 'mean',
    'FG3M_AWAY': 'mean',
    'FG3A_AWAY': 'mean',
    'FG3_PCT_AWAY': 'mean',
    'FTM_AWAY': 'mean',
    'FTA_AWAY': 'mean',
    'FT_PCT_AWAY': 'mean',
    'OREB_AWAY': 'mean',
    'DREB_AWAY': 'mean',
    'REB_AWAY': 'mean',
    'AST_AWAY': 'mean',
    'STL_AWAY': 'mean',
    'BLK_AWAY': 'mean',
    'TOV_AWAY': 'mean',
    'PF_AWAY': 'mean',
    'PLUS_MINUS_AWAY': 'mean', 
}).reset_index()

away_data.rename(columns={
    'PTS_AWAY': 'Average_PTS_Away',
    'MIN_AWAY': 'Average_MIN_Away',
    'FGM_AWAY': 'Average_FGM_Away',
    'FGA_AWAY': 'Average_FGA_Away',
    'FG_PCT_AWAY': 'Average_FG_PCT_Away',
    'FG3M_AWAY': 'Average_FG3M_Away',
    'FG3A_AWAY': 'Average_FG3A_Away',
    'FG3_PCT_AWAY': 'Average_FG3_PCT_Away',
    'FTM_AWAY': 'Average_FTM_Away',
    'FTA_AWAY': 'Average_FTA_Away',
    'FT_PCT_AWAY': 'Average_FT_PCT_Away',
    'OREB_AWAY': 'Average_OREB_Away',
    'DREB_AWAY': 'Average_DREB_Away',
    'REB_AWAY': 'Average_REB_Away',
    'AST_AWAY': 'Average_AST_Away',
    'STL_AWAY': 'Average_STL_Away',
    'BLK_AWAY': 'Average_BLK_Away',
    'TOV_AWAY': 'Average_TOV_Away',
    'PF_AWAY': 'Average_PF_Away',
    'PLUS_MINUS_AWAY': 'Average_PLUS_MINUS_Away', 
}, inplace=True)


average = pd.merge(home_data, away_data, left_on=['HOME_TEAM', 'SEASON_ID'], 
                   right_on=['AWAY_TEAM', 'SEASON_ID'], suffixes=('_HOME', '_AWAY'))

cols_to_avg = ['PTS', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 
               'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 
               'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

for col in cols_to_avg:
    average[f'Average_{col}'] = (average[f'Average_{col}_Home'] + average[f'Average_{col}_Away']) / 2
    
average = average[['HOME_TEAM', 'SEASON_ID'] + [f'Average_{col}' for col in cols_to_avg]]

average.rename(columns={
    'HOME_TEAM' : 'TEAM' 
}, inplace=True)


average.to_csv("game_averages.csv")


In [2]:
# performance for each player's performance this season + moving average

from nba_api.stats.endpoints import leaguegamefinder, commonteamroster, PlayerGameLog
import pandas as pd
from requests.exceptions import ReadTimeout
from datetime import datetime
import time

def retry_request(request_func, max_retries=3, timeout=60, delay=0):
    retries = 0
    while retries < max_retries:
        try:
            return request_func()
        except ReadTimeout:
            print(f"delay")
            time.sleep(delay)
            retries += 1
    raise Exception("can't retry")


season = "2023-24"
game_finder = leaguegamefinder.LeagueGameFinder(season_nullable="2023-24")
games_data = retry_request(game_finder.get_data_frames)[0]

all_team_ids = list(set(games_data['TEAM_ID'].unique()))

all_rosters = pd.DataFrame()

batch_size = 10

for i in range(0, len(all_team_ids), batch_size):
    batch_team_ids = all_team_ids[i:i+batch_size]
    for team_id in batch_team_ids:
        roster_data = retry_request(lambda: commonteamroster.CommonTeamRoster(team_id=team_id, season=season).get_data_frames()[0])
        all_rosters = all_rosters.append(roster_data, ignore_index=True)
        time.sleep(.125)

unique_players = all_rosters['PLAYER_ID'].unique()

all_player_logs = pd.DataFrame()
last_10_player_logs = pd.DataFrame()

for player_id in unique_players:
    try:
        player_game_log = PlayerGameLog(player_id=player_id, season=season, season_type_all_star='Regular Season')
        player_game_log_data = player_game_log.get_data_frames()[0]
        player_game_log_data = player_game_log_data.sort_values(by='GAME_DATE', ascending=True)
        all_player_logs = all_player_logs.append(player_game_log_data, ignore_index=True)
        last_10_games_data = player_game_log_data.head(10)
        last_10_player_logs = last_10_player_logs.append(last_10_games_data, ignore_index=True)
    except Exception as e:
        print(f"Error for player_id {player_id}: {str(e)}")
    time.sleep(.125)

all_player_logs.to_csv("all_games.csv", index=False)
last_10_player_logs.to_csv("last10games.csv", index=False)

# Calculate averages for last 10 games
moving_average = pd.read_csv("last10games.csv", index_col=[0])
columns_to_drop = ["Game_ID", "GAME_DATE", "MATCHUP", "WL", "VIDEO_AVAILABLE"]
moving_average = moving_average.drop(columns=columns_to_drop)
moving_average.to_csv("player_avg_last10.csv", index=False)

In [97]:
## highest_accuracy model yet
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

ml_data = pd.read_csv("games.csv")
ml_data = ml_data.drop(['GAME_DATE'], axis=1)
target = ml_data['WINNING_TEAM']

features = ml_data.copy()
home_cols = ['HOME_TEAM_ELO', 'PTS_HOME', 'MIN_HOME', 'FGM_HOME', 'FGA_HOME', 'FG_PCT_HOME', 'FG3M_HOME', 'FG3A_HOME', 'FG3_PCT_HOME',
             'FTM_HOME', 'FTA_HOME', 'FT_PCT_HOME', 'OREB_HOME', 'DREB_HOME', 'REB_HOME', 'AST_HOME', 'STL_HOME', 'BLK_HOME', 'TOV_HOME', 'PF_HOME']

away_cols = ['AWAY_TEAM_ELO', 'PTS_AWAY', 'MIN_AWAY', 'FGM_AWAY', 'FGA_AWAY', 'FG_PCT_AWAY', 'FG3M_AWAY', 'FG3A_AWAY', 'FG3_PCT_AWAY',
             'FTM_AWAY', 'FTA_AWAY', 'FT_PCT_AWAY', 'OREB_AWAY', 'DREB_AWAY', 'REB_AWAY', 'AST_AWAY', 'STL_AWAY', 'BLK_AWAY', 'TOV_AWAY', 'PF_AWAY']

for col in home_cols:
    features[col + '_DIFF'] = features[col] - features[col.replace('HOME', 'AWAY')]

features = features.drop(home_cols + away_cols, axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
categorical_cols = ['HOME_TEAM', 'AWAY_TEAM']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols),
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter = 100000))
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
# print(f'Confusion Matrix:\n{conf_matrix}')
# print(f'Classification Report:\n{classification_rep}')


Accuracy: 0.5682


In [7]:
# USE THIS

def calculate_elo(df):
    before_elo_ratings = {}  
    after_elo_ratings = {}   
    before_elo_list = []
    after_elo_list = []

    for _, row in df.iterrows():
        team = row['TEAM']
        opponent_team = row['OPP']
        before_elo_team = before_elo_ratings.get(team, 1500)
        before_elo_opponent = before_elo_ratings.get(opponent_team, 1500)

    # Check if it's the first game in the dataset
        if row.name == 0:
        # Calculate ELO using the specified equation for the first game
            k_factor = 20 * ((abs(row['PLUS_MINUS']) + 3) ** 0.08) / (7.5 + 0.006 * abs(before_elo_team - before_elo_opponent))
            expected_win = 1 / (1 + 10**((before_elo_opponent - before_elo_team) / 400))
            after_elo_game = before_elo_team + k_factor * (int(row['WL'] == 'W') - expected_win)
        else:
            k_factor = 20 * ((abs(row['PLUS_MINUS']) + 3) ** 0.08) / (7.5 + 0.006 * abs(before_elo_team - before_elo_opponent))
            expected_win = 1 / (1 + 10**((before_elo_opponent - before_elo_team) / 400))
            after_elo_game = before_elo_team + k_factor * (int(row['WL'] == 'W') - expected_win)

        # Update before_elo_ratings with after_elo_game
        before_elo_ratings[team] = after_elo_game
        after_elo_ratings[team] = after_elo_game
        after_elo_list.append(after_elo_game)
        before_elo_list.append(before_elo_team)

    df['BEFORE_ELO'] = before_elo_list
    df['AFTER_ELO'] = after_elo_list

    return df



## def calculate_moving_average(df, team_name, date_of_game, stats, max_window_size=10):
#    team_df = df[df['TEAM'] == team_name].sort_values(by='GAME_DATE')
#    team_df = team_df[team_df['GAME_DATE'] < date_of_game]
#    window_size = min(len(team_df), max_window_size)
#    if window_size <= 0:
#        return df[(df['TEAM'] == team_name) & (df['GAME_DATE'] == date_of_game)][stats].iloc[0]
#    moving_averages = team_df[stats].rolling(window=window_size, min_periods=1).mean().shift(1).iloc[-1]
#    return moving_averages


import time
import pandas as pd
from nba_api.stats.endpoints import leaguegamefinder, commonteamroster

seasons = ['2014-15', '2015-16', '2016-17', 
           '2017-18', '2018-19', '2019-20', '2020-21', 
           '2021-22', '2022-23', '2023-24']

games_df = pd.DataFrame()

selected_stats = ['PTS', 'FGM', 'FGA', 'FG_PCT',
                  'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
                  'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB',
                  'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']



team_names = ['Atlanta Hawks', 'Boston Celtics', 'Brooklyn Nets', 'Charlotte Hornets', 'Chicago Bulls', 
              'Cleveland Cavaliers', 'Dallas Mavericks', 'Denver Nuggets', 'Detroit Pistons', 
              'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers', 'LA Clippers', 'Los Angeles Clippers',  
              'Los Angeles Lakers', 'Memphis Grizzlies', 'Miami Heat', 'Milwaukee Bucks', 
              'Minnesota Timberwolves', 'New Orleans Pelicans', 'New York Knicks', 
              'Oklahoma City Thunder', 'Orlando Magic', 'Philadelphia 76ers', 'Phoenix Suns', 
              'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs', 'Toronto Raptors', 
              'Utah Jazz', 'Washington Wizards']

for season in seasons:
    gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=season, league_id_nullable='00', 
                                                   season_type_nullable='Regular Season', 
                                                   date_from_nullable='10/01/2014', date_to_nullable='04/30/2024')
    games = gamefinder.get_data_frames()[0]
    games = games.sort_values(by=['GAME_DATE'])

    games['TEAM_NAME'] = games['TEAM_NAME'].replace({'Los Angeles Clippers': 'LA Clippers'})
    games = games[games['TEAM_NAME'].str.lower().isin([name.lower() for name in team_names])]
    
    games_df = pd.concat([games_df, games], ignore_index=True)

games_df['MATCHUP'] = games_df['MATCHUP'].str[-3:]
games_df = games_df.rename(columns={'MATCHUP': 'OPP', "TEAM_ABBREVIATION": "TEAM"})
games_df = games_df.drop(columns=['TEAM_NAME', 'MIN'])
games_df = round(calculate_elo(games_df),2)

games_df['GAME_DATE'] = pd.to_datetime(games_df['GAME_DATE'])

for stat in selected_stats:
    games_df[f"SLIDING_{stat}"] = games_df.groupby('TEAM')[stat].transform(lambda x: x.rolling(window=10,                                                                       min_periods=1).mean().shift())
for stat in selected_stats:
    games_df.loc[games_df.groupby('TEAM').cumcount() <= 1, f"SLIDING_{stat}"] = games_df[stat]
games_df = games_df.round({'SLIDING_' + stat: 2 for stat in selected_stats})

games_df.to_csv("newGames.csv")

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load your dataset
df = pd.read_csv("newGames.csv")

df_filtered = df[df['WL'] != 2]

# Define features and label
features = ['BEFORE_ELO', 'SLIDING_PTS', 'SLIDING_FGM', 'SLIDING_FGA', 'SLIDING_FG_PCT',
            'SLIDING_FG3M', 'SLIDING_FG3A', 'SLIDING_FG3_PCT', 'SLIDING_FTM', 'SLIDING_FTA',
            'SLIDING_FT_PCT', 'SLIDING_OREB', 'SLIDING_DREB', 'SLIDING_REB', 'SLIDING_AST',
            'SLIDING_STL', 'SLIDING_BLK', 'SLIDING_TOV', 'SLIDING_PF', 'SLIDING_PLUS_MINUS']

label = 'WL'  # Assuming 'WL' is your target label

train_size = int(0.8 * len(df))
train_data, test_data = df[:train_size], df[train_size:]

X_train = train_data[features]
y_train = train_data[label].apply(lambda x: 1 if x == 'W' else 0)

X_test = test_data[features]
y_test = test_data[label].apply(lambda x: 1 if x == 'W' else 0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

logreg = LogisticRegression(random_state=42)

grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_

best_logreg = LogisticRegression(**best_params, random_state=42)
best_logreg.fit(X_train_scaled, y_train)

predictions = best_logreg.predict(X_test_scaled)

accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy with Best Hyperparameters: {accuracy}")
print(f"Best Hyperparameters: {best_params}")


Model Accuracy with Best Hyperparameters: 0.5838277301691935
Best Hyperparameters: {'C': 0.1}


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("games.csv")
df_filtered = df[df['WL'] != 2]

features = ['BEFORE_ELO', 'SLIDING_PTS', 'SLIDING_FGM', 'SLIDING_FGA', 'SLIDING_FG_PCT',
            'SLIDING_FG3M', 'SLIDING_FG3A', 'SLIDING_FG3_PCT', 'SLIDING_FTM', 'SLIDING_FTA',
            'SLIDING_FT_PCT', 'SLIDING_OREB', 'SLIDING_DREB', 'SLIDING_REB', 'SLIDING_AST',
            'SLIDING_STL', 'SLIDING_BLK', 'SLIDING_TOV', 'SLIDING_PF', 'SLIDING_PLUS_MINUS']

label = 'WL'  
le = LabelEncoder()
df[label] = le.fit_transform(df[label])
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_data, validation_data = train_test_split(train_data, test_size=0.25, random_state=42)

X_train = train_data[features]
y_train = train_data[label]

X_validation = validation_data[features]
y_validation = validation_data[label]

X_test = test_data[features]
y_test = test_data[label]

param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [8, 11, 14],
    'min_samples_split': [8, 11, 14],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=1))

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)



KeyboardInterrupt: 

In [41]:
## USE THIS 

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("newGames.csv")
df = df[df['WL'].isin(['W', 'L'])]

features = ['BEFORE_ELO', 'SLIDING_PTS', 'SLIDING_FGM', 'SLIDING_FGA', 'SLIDING_FG_PCT',
            'SLIDING_FG3M', 'SLIDING_FG3A', 'SLIDING_FG3_PCT', 'SLIDING_FTM', 'SLIDING_FTA',
            'SLIDING_FT_PCT', 'SLIDING_OREB', 'SLIDING_DREB', 'SLIDING_REB', 'SLIDING_AST',
            'SLIDING_STL', 'SLIDING_BLK', 'SLIDING_TOV', 'SLIDING_PF', 'SLIDING_PLUS_MINUS']

label = 'WL'  
le = LabelEncoder()
df[label] = le.fit_transform(df[label])
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_data, validation_data = train_test_split(train_data, test_size=0.25, random_state=42)

X_train = train_data[features]
y_train = train_data[label]

X_validation = validation_data[features]
y_validation = validation_data[label]

X_test = test_data[features]
y_test = test_data[label]

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}



rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))


Accuracy: 0.5884

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.54      0.57      2293
           1       0.58      0.64      0.61      2260

    accuracy                           0.59      4553
   macro avg       0.59      0.59      0.59      4553
weighted avg       0.59      0.59      0.59      4553





Predicted Probabilities for Team 1: [0.30763766 0.69236234]
Predicted Class for Team 1: W
Predicted Probabilities for Team 2: [0.40238306 0.59761694]
Predicted Class for Team 2: W




[[0.50447191 0.49552809]]


[0 1]


In [42]:
team_name = 'GSW'  # Replace 'YourTeam' with the actual team name
opp_name = 'NOP'
# Filter the dataset for the specified team
team_data1 = df[df['TEAM'] == team_name]
team_data2 = df[df['TEAM'] == opp_name]
team_data1 = team_data1[features]
team_data2 = team_data2[features]

team_data1_array = team_data1[features].values[-1].reshape(1, -1)
team_data2_array = team_data2[features].values[-1].reshape(1, -1)

# Set feature names for RandomForestClassifier
best_rf.feature_names_in_ = features

# Predict probabilities
probabilities_team1 = best_rf.predict_proba(team_data1_array)[0][1]  # Probability for class 1 (win)
probabilities_team2 = best_rf.predict_proba(team_data2_array)[0][1]  # Probability for class 1 (win)

# Determine the winner based on the higher probability
winner = team_name if probabilities_team1 > probabilities_team2 else opp_name

# Display the results
#print(f"Predicted Probability for {team_name}: {probabilities_team1:.4f}")
#print(f"Predicted Probability for {opp_name}: {probabilities_team2:.4f}")
print(f"The winner is: {winner}")


The winner is: GSW


