#### CSC 180 Intelligent Systems 

#### William Lorence, Ajaydeep Singh, Romin Akoliya, Abdurraziq Paikur

#### California State University, Sacramento

# Final Project: NBA Outcome Predictions

NBA statistics can be very complex, so first, we will break down exactly what we are looking for.



## Fetch and Preprocess the data

In [3]:
import pandas as pd
import numpy as np
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import playercareerstats
from sklearn.preprocessing import LabelEncoder

# Fetch data
nba_teams = teams.get_teams()
team_abbr_to_id = {team['abbreviation']: team['id'] for team in nba_teams}
games_info = pd.DataFrame()

# Display all the columns
for team in nba_teams:
    team_id = team['id']
    gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable = team_id)
    games = gamefinder.get_data_frames()[0]
    games_info = pd.concat([games_info, games], ignore_index=True)
print(games_info.columns)

# Preprocess data
games_info['GAME_DATE'] = pd.to_datetime(games_info['GAME_DATE'])
games_info['WIN'] = games_info['WL'].apply(lambda x: 1 if x == 'W' else 0)
games_info['PTS'] = games_info['PTS'].astype(float)
games_info['Points_Per_Game'] = games_info.groupby('TEAM_ID')['PTS'].transform('mean')


Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')


In [4]:
# GET the OPPONENT_TEAM_ID from the MATCHUP
def get_opponent_team_id(matchup, team_abbr_to_id, team_id):
    if '@' in matchup:
        opponent_abbr = matchup.split(' @ ')[-1]
    else:
        opponent_abbr = matchup.split(' vs. ')[-1]
    return team_abbr_to_id.get(opponent_abbr, team_id)

games_info['OPPONENT_TEAM_ID'] = games_info.apply(
    lambda row: get_opponent_team_id(row['MATCHUP'], team_abbr_to_id, row['TEAM_ID']), axis=1
)

# Add HOME_GAME feature
games_info['HOME_GAME'] = games_info['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)

# Add LAST_GAME_RESULT feature
games_info['LAST_GAME_RESULT'] = games_info.groupby('TEAM_ID')['WIN'].shift(1).fillna(0)


print(games_info.columns)

le = LabelEncoder()
games_info['TEAM_ID'] = le.fit_transform(games_info['TEAM_ID'])
games_info['OPPONENT_TEAM_ID'] = le.fit_transform(games_info['OPPONENT_TEAM_ID'])

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS', 'WIN',
       'Points_Per_Game', 'OPPONENT_TEAM_ID', 'HOME_GAME', 'LAST_GAME_RESULT'],
      dtype='object')


## Normalization and Split Data

In [11]:
# Normalization
def normalize_columns(df, columns = []):
    for col in columns:
        df[col] = (df[col] - df[col].mean()) / df[col].std()
    return df

games_info = normalize_columns(games_info, ['PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF'])

games_info

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,STL,BLK,TOV,PF,PLUS_MINUS,WIN,Points_Per_Game,OPPONENT_TEAM_ID,HOME_GAME,LAST_GAME_RESULT
0,22024,0,ATL,Atlanta Hawks,0022400030,2024-11-22,ATL @ CHI,L,240,1.406138,...,1.009785,-0.374917,-0.840809,-1.212852,-14.0,0,101.432026,4,0,0.0
1,22024,0,ATL,Atlanta Hawks,0022400258,2024-11-20,ATL @ GSW,L,238,-0.370060,...,0.356642,0.001795,0.570972,-1.008434,-23.0,0,101.432026,7,0,0.0
2,22024,0,ATL,Atlanta Hawks,0022400250,2024-11-18,ATL @ SAC,W,239,0.482515,...,-0.296500,0.755219,-0.134918,-0.599599,1.0,1,101.432026,21,0,0.0
3,22024,0,ATL,Atlanta Hawks,0022400239,2024-11-17,ATL @ POR,L,240,0.553563,...,0.683213,0.755219,2.453348,-0.599599,-4.0,0,101.432026,20,0,1.0
4,22024,0,ATL,Atlanta Hawks,0022400012,2024-11-15,ATL vs. WAS,W,240,1.903473,...,0.683213,1.885356,0.335675,-1.008434,12.0,1,101.432026,27,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106621,21988,29,CHH,Charlotte Hornets,0028800062,1988-11-12,CHH @ ATL,L,238,0.624611,...,1.336356,-1.128342,1.041566,0.831324,,0,99.926358,0,0,0.0
106622,21988,29,CHH,Charlotte Hornets,0028800052,1988-11-11,CHH @ WAS,L,240,-1.080539,...,0.356642,-1.505054,1.982754,0.831324,,0,99.926358,27,0,0.0
106623,21988,29,CHH,Charlotte Hornets,0028800024,1988-11-08,CHH vs. LAC,W,240,1.050898,...,0.356642,-1.505054,0.570972,1.853412,,1,99.926358,9,1,0.0
106624,21988,29,CHH,Charlotte Hornets,0028800015,1988-11-05,CHH @ DET,L,240,-1.222635,...,0.030071,0.378507,-0.840809,-0.190764,,0,99.926358,28,0,1.0


In [15]:
# Split data into the respective team files so that we can 
# refer to them to feed the appropriate data to the model

def create_team_csv_files(df, column = 'TEAM_ABBREVIATION'):
    for team in df[column].unique():
        team_df = df.loc[df[column] == team]
        team_df.to_csv('team_data/' + team + '.csv', index=False)

create_team_csv_files(games_info)

### With our csv files now created to refer to, we can read from them and feed the appropriate data to the model. However, some manipulation needs to be done first that would cause the size of the CSV files to be unnecessarily large.

In [16]:
team1 = "LAL"
df_team1 = pd.read_csv('team_data/' + team1 + '.csv')

df_team1

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,STL,BLK,TOV,PF,PLUS_MINUS,WIN,Points_Per_Game,OPPONENT_TEAM_ID,HOME_GAME,LAST_GAME_RESULT
0,22024,10,LAL,Los Angeles Lakers,22400270,2024-11-23,LAL vs. DEN,L,239,-0.014820,...,1.009785,-0.751630,0.100378,-1.212852,-25.0,0,105.09995,6,1,0.0
1,22024,10,LAL,Los Angeles Lakers,22400263,2024-11-21,LAL vs. ORL,L,240,1.121946,...,-1.276214,0.378507,-1.076106,-1.417269,-1.0,0,105.09995,16,1,0.0
2,22024,10,LAL,Los Angeles Lakers,22400026,2024-11-19,LAL vs. UTA,W,240,1.548234,...,0.356642,0.001795,-0.134918,-1.212852,6.0,1,105.09995,25,1,0.0
3,22024,10,LAL,Los Angeles Lakers,22400231,2024-11-16,LAL @ NOP,W,240,0.127275,...,0.683213,-0.751630,-0.840809,-1.621687,5.0,1,105.09995,3,0,1.0
4,22024,10,LAL,Los Angeles Lakers,22400015,2024-11-15,LAL @ SAS,W,241,1.264042,...,0.683213,-0.751630,-1.076106,-2.234940,5.0,1,105.09995,22,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3987,21983,10,LAL,Los Angeles Lakers,28300066,1983-11-08,LAL @ DEN,W,240,2.187665,...,1.009785,1.131931,1.982754,1.853412,,1,105.09995,6,0,1.0
3988,21983,10,LAL,Los Angeles Lakers,28300053,1983-11-05,LAL @ DAL,L,240,-0.014820,...,-0.296500,1.131931,1.276863,1.035742,,0,105.09995,5,0,1.0
3989,21983,10,LAL,Los Angeles Lakers,28300035,1983-11-02,LAL @ SDC,L,240,0.269371,...,1.989499,0.755219,2.218051,0.422489,,0,105.09995,10,0,0.0
3990,21983,10,LAL,Los Angeles Lakers,28300010,1983-10-29,LAL @ UTH,W,240,1.264042,...,0.683213,1.885356,1.276863,3.897588,,1,105.09995,10,0,0.0


In [17]:
# Split Data into X and y, where y is the target var (win/loss)
from sklearn.model_selection import train_test_split

X = games_info[['TEAM_ID', 'OPPONENT_TEAM_ID', 'Points_Per_Game', 'HOME_GAME', 'LAST_GAME_RESULT']]
y = games_info['WIN']  # Target variable (win/loss)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the shape of training and test sets
print("Training features shape:", X_train.shape)
print("Test features shape:", X_test.shape)

Training features shape: (74638, 5)
Test features shape: (31988, 5)


## Initilize and train the Random Forest Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

## Evaluate the Model

In [21]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5869388520695261
              precision    recall  f1-score   support

           0       0.58      0.60      0.59     15921
           1       0.59      0.57      0.58     16067

    accuracy                           0.59     31988
   macro avg       0.59      0.59      0.59     31988
weighted avg       0.59      0.59      0.59     31988



In [22]:
# Fetching team abreviations for reference

from nba_api.stats.static import teams

nba_teams = teams.get_teams()

# Extract team abbreviations
team_abbreviations = [team['abbreviation'] for team in nba_teams]
print("Team Abbreviations:", team_abbreviations)

Team Abbreviations: ['ATL', 'BOS', 'CLE', 'NOP', 'CHI', 'DAL', 'DEN', 'GSW', 'HOU', 'LAC', 'LAL', 'MIA', 'MIL', 'MIN', 'BKN', 'NYK', 'ORL', 'IND', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'OKC', 'TOR', 'UTA', 'MEM', 'WAS', 'DET', 'CHA']


## Feature Importance
This will help us gain insights into which factors most influence game outcomes. Now, the below will extract features importances from the trained Random Forest Classifier. 

In [24]:
feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
# Each feature will be ranked based on its influence 
print("Feature Importances:\n", feature_importances)
     

Feature Importances:
                   importance
OPPONENT_TEAM_ID    0.509600
HOME_GAME           0.273959
Points_Per_Game     0.094148
TEAM_ID             0.082507
LAST_GAME_RESULT    0.039785


## Predictions and Results

In [67]:
team_abbr = 'LAL'
opponent_abbr = 'BOS'
average_points_per_game = 110.5  # Replace with the actual average points per game

new_data = pd.DataFrame({
    'TEAM_ID': [le.transform([team_abbr_to_id[team_abbr]])[0]],
    'OPPONENT_TEAM_ID': [le.transform([team_abbr_to_id[opponent_abbr]])[0]],
    'Points_Per_Game': [average_points_per_game],
    'HOME_GAME': [1],  # Replace with 1 if home game, else 0
    'LAST_GAME_RESULT': [1]  # Replace with last game result (1 if win, 0 if loss)
})

predictions = model.predict(new_data)
prediction_probabilities = model.predict_proba(new_data)

print("Predictions: ", predictions) # Outputs 1 if our team wins

# Convert probabilities to percentages
percentages = prediction_probabilities * 100

# Format and print the percentages
for idx, prediction in enumerate(percentages):
    print(f"Prediction Probabilities: [Loss - {prediction[0]:.2f}%], [Win - {prediction[1]:.2f}%]")


Predictions:  [0]
Prediction Probabilities: [Loss - 53.22%], [Win - 46.78%]
