In [2]:
#import required libraries

import platform
import numpy as np
import pandas as pd
import sklearn as sk
import os

In [3]:
#Load the 2019-2020 NBA dataset

df = pd.read_csv('./datasets/NBA Regular Season Results 2019-2020.csv')

In [4]:
from sklearn.metrics import f1_score, make_scorer, classification_report

scorer = make_scorer(f1_score, pos_label = None, average = 'weighted')

In [5]:
#View the dataset

df.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,"Tue, Oct 22, 2019",8:00p,New Orleans Pelicans,122,Toronto Raptors,130,Box Score,OT,20787.0,
1,"Tue, Oct 22, 2019",10:30p,Los Angeles Lakers,102,Los Angeles Clippers,112,Box Score,,19068.0,
2,"Wed, Oct 23, 2019",7:00p,Chicago Bulls,125,Charlotte Hornets,126,Box Score,,15424.0,
3,"Wed, Oct 23, 2019",7:00p,Detroit Pistons,119,Indiana Pacers,110,Box Score,,17923.0,
4,"Wed, Oct 23, 2019",7:00p,Cleveland Cavaliers,85,Orlando Magic,94,Box Score,,18846.0,


In [6]:
#Read the data as an object field

df.dtypes

Date                object
Start (ET)          object
Visitor/Neutral     object
PTS                  int64
Home/Neutral        object
PTS.1                int64
Unnamed: 6          object
Unnamed: 7          object
Attend.            float64
Notes               object
dtype: object

In [7]:
df = pd.read_csv('./datasets/NBA Regular Season Results 2019-2020.csv', parse_dates = ['Date'])

In [8]:
df.columns = ['Date', 'Start Time (ET)', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Box Score', 
              'OT', 'Attendance', 'Notes']

In [9]:
#View the dimensions of the dataset

df.shape

(1230, 10)

In [10]:
#View the updated dataset

df.head()

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes
0,2019-10-22,8:00p,New Orleans Pelicans,122,Toronto Raptors,130,Box Score,OT,20787.0,
1,2019-10-22,10:30p,Los Angeles Lakers,102,Los Angeles Clippers,112,Box Score,,19068.0,
2,2019-10-23,7:00p,Chicago Bulls,125,Charlotte Hornets,126,Box Score,,15424.0,
3,2019-10-23,7:00p,Detroit Pistons,119,Indiana Pacers,110,Box Score,,17923.0,
4,2019-10-23,7:00p,Cleveland Cavaliers,85,Orlando Magic,94,Box Score,,18846.0,


In [11]:
df.dtypes

Date               datetime64[ns]
Start Time (ET)            object
Visitor Team               object
Visitor Score               int64
Home Team                  object
Home Score                  int64
Box Score                  object
OT                         object
Attendance                float64
Notes                      object
dtype: object

In [12]:
# Create feature that identifies home wins

df['Home Win'] = df['Visitor Score'] < df['Home Score']

y_true = df['Home Win'].values

In [13]:
#Validate that field is correctly populated

df.head()

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win
0,2019-10-22,8:00p,New Orleans Pelicans,122,Toronto Raptors,130,Box Score,OT,20787.0,,True
1,2019-10-22,10:30p,Los Angeles Lakers,102,Los Angeles Clippers,112,Box Score,,19068.0,,True
2,2019-10-23,7:00p,Chicago Bulls,125,Charlotte Hornets,126,Box Score,,15424.0,,True
3,2019-10-23,7:00p,Detroit Pistons,119,Indiana Pacers,110,Box Score,,17923.0,,False
4,2019-10-23,7:00p,Cleveland Cavaliers,85,Orlando Magic,94,Box Score,,18846.0,,True


In [14]:
#Baseline: It is typically assumed that Home Teams win more due to Home Court advantage

In [15]:
# Baseline Win % for Home Teams

n_games = df['Home Win'].count()
n_homewins = df['Home Win'].sum()
win_percentage = n_homewins / n_games

print('Home Team Win Rate : {0:.2f}%'.format(100 * win_percentage))

Home Team Win Rate : 54.96%


In [16]:
# Predicting the baseline for Home Teams with a classifier

from sklearn.metrics import f1_score

y_pred = [1] * len(y_true)
print('F1 : {0:.4f}%'.format(f1_score(y_true, y_pred, pos_label = None, average = 'weighted') * 100))

F1 : 38.9848%


In [17]:
# Creating columns for Home Team's and Visitor Team's Last Wins

df['Home Last Win'] = False
df['Visitor Last Win'] = False

In [18]:
# Identify if the home and visitor teams won their last games
# This will update the Home Last Win and Visitor Last Win columns

from collections import defaultdict

won_last = defaultdict(int)

for index, row in df.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    row['Home Last Win'] = won_last[home_team]
    row['Visitor Last Win'] = won_last[visitor_team]
    # Set current win
    won_last[home_team] = row['Home Win']
    won_last[visitor_team] = not row['Home Win']
df.loc[20:25]

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win,Home Last Win,Visitor Last Win
20,2019-10-25,8:00p,Dallas Mavericks,123,New Orleans Pelicans,116,Box Score,,17027.0,,False,False,False
21,2019-10-25,8:00p,Washington Wizards,97,Oklahoma City Thunder,85,Box Score,,18203.0,,False,False,False
22,2019-10-25,9:00p,Phoenix Suns,107,Denver Nuggets,108,Box Score,OT,19557.0,,True,False,False
23,2019-10-25,10:00p,Portland Trail Blazers,122,Sacramento Kings,112,Box Score,,17583.0,,False,False,False
24,2019-10-25,10:30p,Utah Jazz,86,Los Angeles Lakers,95,Box Score,,18997.0,,True,False,False
25,2019-10-26,5:00p,Miami Heat,131,Milwaukee Bucks,126,Box Score,OT,17467.0,,False,False,False


In [19]:
df.loc[90:95][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Last Win', 'Visitor Last Win']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Last Win,Visitor Last Win
90,2019-11-04,Detroit Pistons,99,Washington Wizards,115,True,False,False
91,2019-11-04,New Orleans Pelicans,125,Brooklyn Nets,135,True,False,False
92,2019-11-04,Houston Rockets,107,Memphis Grizzlies,100,False,False,False
93,2019-11-04,Milwaukee Bucks,134,Minnesota Timberwolves,106,False,False,False
94,2019-11-04,Philadelphia 76ers,109,Phoenix Suns,114,True,False,False
95,2019-11-04,Portland Trail Blazers,118,Golden State Warriors,127,True,False,False


In [20]:
# Basic Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Remove random_state to get non-replicable results

dtc = DecisionTreeClassifier(random_state = 14)

In [21]:
from sklearn.model_selection import cross_val_score

# Use selected features as target
X_previouswins = df[['Home Last Win', 'Visitor Last Win']].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_previouswins, y_true, scoring = scorer)

# Display Outcome
print("Using The Last Result from Home and Visitor Teams : \n")
print('F1 : {0:.4f}%'.format(np.mean(scores) * 100))

Using The Last Result from Home and Visitor Teams : 

F1 : 38.9849%


In [22]:
# Adding Winning Streaks into the Equation
df["Home Win Streak"] = 0
df["Visitor Win Streak"] = 0

from collections import defaultdict
win_streak = defaultdict(int)

for index, row in df.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["Home Win Streak"] = win_streak[home_team]
    row["Visitor Win Streak"] = win_streak[visitor_team]
    df.loc[index] = row    
    
    # Set current Win Streak
    if row["Home Win"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1

In [23]:
df.loc[50:60][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Win Streak', 'Visitor Win Streak']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Win Streak,Visitor Win Streak
50,2019-10-28,Charlotte Hornets,96,Los Angeles Clippers,111,True,0,0
51,2019-10-29,Atlanta Hawks,97,Miami Heat,112,True,0,0
52,2019-10-29,Dallas Mavericks,109,Denver Nuggets,106,False,3,0
53,2019-10-29,Memphis Grizzlies,91,Los Angeles Lakers,120,True,2,1
54,2019-10-30,Chicago Bulls,111,Cleveland Cavaliers,117,True,0,0
55,2019-10-30,New York Knicks,83,Orlando Magic,95,True,0,1
56,2019-10-30,Minnesota Timberwolves,95,Philadelphia 76ers,117,True,3,3
57,2019-10-30,Milwaukee Bucks,105,Boston Celtics,116,True,2,1
58,2019-10-30,Indiana Pacers,118,Brooklyn Nets,108,False,0,0
59,2019-10-30,Detroit Pistons,113,Toronto Raptors,125,True,2,1


In [24]:
# Use selected features as target
X_winstreak = df[["Home Last Win", "Visitor Last Win", "Home Win Streak", "Visitor Win Streak"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_winstreak, y_true, scoring = scorer)

# Display Outcome
print("Based On Current Team Seed : ")
print("F1 : {0:.4f}%".format(np.mean(scores) * 100))

Based On Current Team Seed : 
F1 : 49.1123%


In [25]:
# Taking Previous Seasons Seeds into Account
# Load the seed dataset

rank = pd.read_csv('./datasets/2018-2019 Regular Season Standings.csv', index_col = 'Team')

In [26]:
rank.head(3)

Unnamed: 0_level_0,Rk,Overall,Home,Road,E,W,A,C,SE,NW,...,Post,?3,?10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Milwaukee Bucks,1,60-22,33-8,27-14,40-12,20-Oct,13-May,14-Feb,13-May,06-Apr,...,17-Aug,05-Jun,45-5,7-0,08-Jun,10-Apr,12-Mar,10-Jan,10-Jun,03-Feb
Toronto Raptors,2,58-24,32-9,26-15,36-16,22-Aug,12-Apr,10-Aug,14-Apr,06-Apr,...,15-Aug,11-Jul,33-9,07-Jan,12-Mar,08-Jul,10-May,08-Jan,09-Jun,04-Jan
Golden State Warriors,3,57-25,30-Nov,27-14,22-Aug,35-17,06-Apr,08-Feb,08-Feb,12-Jun,...,16-Sep,07-Jul,34-10,08-Jan,07-Jul,10-May,11-Feb,07-Apr,09-May,05-Jan


In [27]:
# Rename columns in the seed DataFrame

rank.columns = ['Rank', 'Overall', 'Home', 'Road', 'Eastern Conference', 'Western Conference', 'Atlantic Div', 'Central Div', 'Southeast Div', 'Northwest Div', 'Pacific Div', 'Southwest Div', 'Pre All-Star', 'Post All-Star', 'Margin ≤3', 'Margin ≥10', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr']

In [28]:
rank.head(3)

Unnamed: 0_level_0,Rank,Overall,Home,Road,Eastern Conference,Western Conference,Atlantic Div,Central Div,Southeast Div,Northwest Div,...,Post All-Star,Margin ≤3,Margin ≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Milwaukee Bucks,1,60-22,33-8,27-14,40-12,20-Oct,13-May,14-Feb,13-May,06-Apr,...,17-Aug,05-Jun,45-5,7-0,08-Jun,10-Apr,12-Mar,10-Jan,10-Jun,03-Feb
Toronto Raptors,2,58-24,32-9,26-15,36-16,22-Aug,12-Apr,10-Aug,14-Apr,06-Apr,...,15-Aug,11-Jul,33-9,07-Jan,12-Mar,08-Jul,10-May,08-Jan,09-Jun,04-Jan
Golden State Warriors,3,57-25,30-Nov,27-14,22-Aug,35-17,06-Apr,08-Feb,08-Feb,12-Jun,...,16-Sep,07-Jul,34-10,08-Jan,07-Jul,10-May,11-Feb,07-Apr,09-May,05-Jan


In [29]:
# Create Home Team Ranks Higher to denote a team with a higher seed

def home_team_ranks_higher(row):
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    
    home_rank = rank.loc[home_team]["Rank"]
    visitor_rank = rank.loc[visitor_team]["Rank"]
    
    return home_rank < visitor_rank   # The higher seed will be the lower value

In [30]:
# Create new column by using .apply(home_team_ranks_higher) across the DataFrame
df["Home Team Ranks Higher"] = df.apply(home_team_ranks_higher, axis = 1)

# Display the selected columns on the sliced DataFrame (first 5 rows)
df[:5][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Ranks Higher']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Team Ranks Higher
0,2019-10-22,New Orleans Pelicans,122,Toronto Raptors,130,True,True
1,2019-10-22,Los Angeles Lakers,102,Los Angeles Clippers,112,True,True
2,2019-10-23,Chicago Bulls,125,Charlotte Hornets,126,True,True
3,2019-10-23,Detroit Pistons,119,Indiana Pacers,110,False,True
4,2019-10-23,Cleveland Cavaliers,85,Orlando Magic,94,True,True


In [31]:
# Decision Tree Classifier based on if Home Team has a Higher Seed

# Use selected features as input for the classifier (target)
X_homehigher = df[["Home Last Win", "Visitor Last Win", "Home Team Ranks Higher"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_homehigher, y_true, scoring = scorer)

# Display Outcome
print("If the Home Team has a Higher Seed : \n")
print("F1 : {0:.4f}%".format(np.mean(scores) * 100))

If the Home Team has a Higher Seed : 

F1 : 57.3390%


In [32]:
# Adjusting the parameters of the algorithm using GridSearchCV to test if there is an improvement

from sklearn.model_selection import GridSearchCV

parameter_space = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
}
dtc = DecisionTreeClassifier(random_state = 14)

grid = GridSearchCV(dtc, parameter_space, scoring = scorer)
grid.fit(X_homehigher, y_true)

print('F1 : {0:.4f}%'.format(grid.best_score_ * 100))

F1 : 57.3390%


In [33]:
# Build a function that determines whether a team won the last matchup
# This does not take into consideration the home/visitor teams

last_game_winner = defaultdict(int)

def home_team_won_last(row):
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    
    # Sort for ordering
    teams = tuple(sorted([home_team, visitor_team]))
    
    # Parse the row for which team won the last matchup, then add a 1 if the Home Team won
    result = 1 if last_game_winner[teams] == row['Home Team'] else 0
    
    # Update record for next matchup
    winner = row['Home Team'] if row['Home Win'] else row['Visitor Team']
    last_game_winner[teams] = winner
    
    return result

In [34]:
#The above function id working but is not being applied to the entire DataFrame

In [35]:
# Add a feature without using function -> Determines whether a team won the last matchup
# This does not take into consideration the home/visitor teams

last_game_winner = defaultdict(int)
df["Home Team Won Last"] = 0

for index, row in df.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    
    # Set in the row, who won the last match
    row["Home Team Won Last"] = 1 if last_game_winner[teams] == row["Home Team"] else 0
    df.loc[index] = row
    
    winner = row["Home Team"] if row["Home Win"] else row["Visitor Team"]
    last_game_winner[teams] = winner

In [36]:
df[90:100][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Won Last']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Team Won Last
90,2019-11-04,Detroit Pistons,99,Washington Wizards,115,True,0
91,2019-11-04,New Orleans Pelicans,125,Brooklyn Nets,135,True,0
92,2019-11-04,Houston Rockets,107,Memphis Grizzlies,100,False,0
93,2019-11-04,Milwaukee Bucks,134,Minnesota Timberwolves,106,False,0
94,2019-11-04,Philadelphia 76ers,109,Phoenix Suns,114,True,0
95,2019-11-04,Portland Trail Blazers,118,Golden State Warriors,127,True,0
96,2019-11-05,Indiana Pacers,120,Charlotte Hornets,122,True,0
97,2019-11-05,Boston Celtics,119,Cleveland Cavaliers,113,False,0
98,2019-11-05,San Antonio Spurs,100,Atlanta Hawks,108,True,0
99,2019-11-05,Los Angeles Lakers,118,Chicago Bulls,112,False,0


In [37]:
# Use selected features as target
X_home_higher = df[['Home Last Win', 'Visitor Last Win', "Home Team Ranks Higher", "Home Team Won Last"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_home_higher, y_true, scoring = scorer)

# Display Outcome
print("If the Home Team won the previous Matchup : \n")
print("F1 : {0:.4f}%".format(np.mean(scores) * 100))

If the Home Team won the previous Matchup : 

F1 : 58.1951%


In [38]:
df['Home Team'].values

array(['Toronto Raptors', 'Los Angeles Clippers', 'Charlotte Hornets',
       ..., 'Portland Trail Blazers', 'Sacramento Kings',
       'San Antonio Spurs'], dtype=object)

In [39]:
# Assigning Numbers to the Teams
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoding = LabelEncoder()
encoding.fit(df["Home Team"].values)

LabelEncoder()

In [40]:
# Numerical values assigned to the home and visitor teams
home_teams = encoding.transform(df["Home Team"].values)
visitor_teams = encoding.transform(df["Visitor Team"].values)

# Stacks arrays in a sequence vertically and transpose the data
X_teams = np.vstack([home_teams, visitor_teams]).T

In [41]:
# OneHotEncoder takes each of the features of the numerical valued-teams and identifies if it was the home or visitor team

onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

In [42]:
#View data size of X_teams

X_teams.shape

(1230, 60)

In [43]:
# Print the first 30 elements in X_teams

print('Home:', X_teams[0:30])
print('Visitor:', X_teams[0,30:])

Home: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]
Visitor: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]


In [44]:
# Run the classifier on the new teams

dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_teams, y_true, scoring = scorer)

# Display Outcome
print("F1 : {0:.4f}%".format(np.mean(scores) * 100))

F1 : 56.9505%


In [45]:
# Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 14)
scores = cross_val_score(rfc, X_teams, y_true, scoring = scorer)

# Display Outcome
print("F1 : {0:.4f}%".format(np.mean(scores) * 100))

F1 : 59.1875%


In [46]:
# Set up more detailed parameters for the Random Forest Classifier in an attempt to improve accuracy score
# Adjusting the parameters of the algorithm using GridSearchCV

parameter_space = {
    "max_features": [2, 10, 50, 'auto'],
    "n_estimators": [50, 100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 2, 4, 6],
}

# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 14)
grid = GridSearchCV(rfc, parameter_space, scoring = scorer)
grid.fit(X_teams, y_true)

# Display Outcome
print("F1 : {0:.4f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)

F1 : 62.4693%
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=14, verbose=0,
                       warm_start=False)


In [47]:
# Combine the team features with the other previously determined features
X_all = np.hstack([X_home_higher, X_teams])
print(X_all.shape)

(1230, 64)


In [48]:
# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 14)
scores = cross_val_score(rfc, X_all, y_true, scoring = scorer)

# Display Outcome
print("F1 : {0:.4f}%".format(np.mean(scores) * 100))

F1 : 60.3828%


In [50]:
# Attempt to improve accuracy using more detailed features
# Adjusting the parameters of the algorithm using GridSearchCV

parameter_space = {
    "max_features": [2, 10, 50, 'auto'],
    "n_estimators": [50, 100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 2, 4, 6],
}

# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 14)
grid = GridSearchCV(rfc, parameter_space, scoring = scorer)
grid.fit(X_all, y_true)

# Display Outcome
print("F1 : {0:.4f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)

F1 : 63.0377%
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features=50,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=14, verbose=0,
                       warm_start=False)


In [None]:
#Testing on the 2020-2021 NBA Season

In [None]:
# Load the data for the 2020-21 Season
df_21 = pd.read_csv('./datasets/NBA Regular Season Results 2020-2021.csv', parse_dates = ['Date'], dtype = {'Attendance':int})

# Correct column names
df_21.columns = ['Date', 'Start Time (ET)', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Box Score', 'OT', 'Attendance', 'Notes']

df_21.head(7)

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes
0,2020-12-22,7:00p,Golden State Warriors,99,Brooklyn Nets,125,Box Score,,0.0,
1,2020-12-22,10:00p,Los Angeles Clippers,116,Los Angeles Lakers,109,Box Score,,0.0,
2,2020-12-23,7:00p,Charlotte Hornets,114,Cleveland Cavaliers,121,Box Score,,300.0,
3,2020-12-23,7:00p,New York Knicks,107,Indiana Pacers,121,Box Score,,0.0,
4,2020-12-23,7:00p,Miami Heat,107,Orlando Magic,113,Box Score,,3396.0,
5,2020-12-23,7:00p,Washington Wizards,107,Philadelphia 76ers,113,Box Score,,0.0,
6,2020-12-23,7:30p,New Orleans Pelicans,113,Toronto Raptors,99,Box Score,,3800.0,


In [None]:
# Load the previous season's standings
rank_20 = pd.read_csv('./datasets/2019-2020 Regular Season Standings.csv', index_col = 'Team')

# Rename columns
rank_20.columns = ['Rank', 'Overall', 'Home', 'Road', 'Eastern Conference', 'Western Conference', 
                'Atlantic Div', 'Central Div', 'Southeast Div', 'Northwest Div', 'Pacific Div', 'Southwest Div', 
                'Pre All-Star', 'Post All-Star', 'Margin ≤3', 'Margin ≥10', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 
                'Mar', 'Jul','Aug']

rank_20.head()

Unnamed: 0_level_0,Rank,Overall,Home,Road,Eastern Conference,Western Conference,Atlantic Div,Central Div,Southeast Div,Northwest Div,...,Margin ≤3,Margin ≥10,Oct,Nov,Dec,Jan,Feb,Mar,Jul,Aug
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Milwaukee Bucks,1,56-17,30-May,26-Dec,37-7,19-Oct,10-Apr,13-Jan,14-Feb,07-Mar,...,03-Feb,38-8,02-Feb,15-Jan,13-Feb,11-Feb,10-Jan,02-Apr,1-0,02-May
Toronto Raptors,2,53-19,26-Oct,27-Sep,34-11,19-Aug,09-May,13-Mar,12-Mar,07-Mar,...,06-Apr,23-Oct,04-Jan,10-Mar,09-Jul,12-Mar,07-Mar,04-Jan,,07-Jan
Los Angeles Lakers,3,52-19,25-Oct,27-Sep,16-Sep,36-10,05-May,04-Mar,07-Jan,12-Mar,...,07-Mar,25-Nov,03-Jan,14-Jan,09-May,10-Apr,09-Feb,04-Jan,1-0,02-May
Los Angeles Clippers,4,49-23,27-Sep,22-14,17-Jul,32-16,05-Mar,04-Mar,08-Jan,11-May,...,08-Mar,32-8,04-Feb,10-Apr,10-May,09-Apr,07-Apr,04-Jan,0-1,05-Feb
Boston Celtics,5,48-24,26-Oct,22-14,30-13,18-Nov,09-Jun,09-Apr,12-Mar,06-Mar,...,06-May,29-Jul,03-Jan,10-Apr,10-Mar,09-Jul,09-Mar,02-Mar,0-1,05-Feb


In [None]:
# Determine whether the home team won
df_21['Home Win'] = df_21['Visitor Score'] < df_21['Home Score']

# Assign class values to the Home Win column
y_true = df_21['Home Win'].values

In [None]:
df_21.head()

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win
0,2020-12-22,7:00p,Golden State Warriors,99,Brooklyn Nets,125,Box Score,,0.0,,True
1,2020-12-22,10:00p,Los Angeles Clippers,116,Los Angeles Lakers,109,Box Score,,0.0,,False
2,2020-12-23,7:00p,Charlotte Hornets,114,Cleveland Cavaliers,121,Box Score,,300.0,,True
3,2020-12-23,7:00p,New York Knicks,107,Indiana Pacers,121,Box Score,,0.0,,True
4,2020-12-23,7:00p,Miami Heat,107,Orlando Magic,113,Box Score,,3396.0,,True


In [None]:
df_21['Home Last Win'] = False
df_21['Visitor Last Win'] = False

# Determining whether the home and visitor teams won their last games

from collections import defaultdict

won_last = defaultdict(int)

for index, row in df_21.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    row['Home Last Win'] = won_last[home_team]
    row['Visitor Last Win'] = won_last[visitor_team]
    
    # Set current win
    won_last[home_team] = row['Home Win']
    won_last[visitor_team] = not row['Home Win']
df_21.sample(10)

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win,Home Last Win,Visitor Last Win
716,2021-04-02,8:00p,Minnesota Timberwolves,108,Memphis Grizzlies,120,Box Score,,2987.0,,True,False,False
77,2021-01-02,5:00p,Sacramento Kings,94,Houston Rockets,102,Box Score,,3065.0,,True,False,False
755,2021-04-07,7:30p,New Orleans Pelicans,111,Brooklyn Nets,139,Box Score,,1773.0,,True,False,False
75,2021-01-01,9:00p,Los Angeles Clippers,100,Utah Jazz,106,Box Score,,1932.0,,True,False,False
572,2021-03-15,8:00p,New York Knicks,112,Brooklyn Nets,117,Box Score,,1637.0,,True,False,False
1029,2021-05-11,8:00p,Brooklyn Nets,115,Chicago Bulls,107,Box Score,,3434.0,,False,False,False
532,2021-03-04,10:00p,Sacramento Kings,119,Portland Trail Blazers,123,Box Score,,0.0,,True,False,False
1190,2021-05-09,10:00p,Phoenix Suns,110,Los Angeles Lakers,123,Box Score,,3144.0,,True,False,False
1238,2021-05-16,1:00p,Boston Celtics,92,New York Knicks,96,Box Score,,1981.0,,True,False,False
1116,2021-05-31,7:00p,Philadelphia 76ers,114,Washington Wizards,122,Box Score,,10665.0,,True,False,False


In [None]:
# Creating columns for winning streaks

df_21["Home Win Streak"] = 0
df_21["Visitor Win Streak"] = 0

from collections import defaultdict

win_streak = defaultdict(int)

for index, row in df_21.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["Home Win Streak"] = win_streak[home_team]
    row["Visitor Win Streak"] = win_streak[visitor_team]
    df_21.loc[index] = row    
    
    # Set current win streak
    if row["Home Win"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1
        
df_21.sample(10)

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win,Home Last Win,Visitor Last Win,Home Win Streak,Visitor Win Streak
209,2021-01-21,7:30p,Los Angeles Lakers,113,Milwaukee Bucks,106,Box Score,,0.0,,False,False,False,0,0
279,2021-01-30,8:30p,Los Angeles Lakers,96,Boston Celtics,95,Box Score,,0.0,,False,False,False,0,0
715,2021-04-02,8:00p,Charlotte Hornets,114,Indiana Pacers,97,Box Score,,0.0,,False,False,False,0,0
118,2021-01-07,10:00p,Dallas Mavericks,124,Denver Nuggets,117,Box Score,OT,0.0,,False,False,False,2,1
58,2020-12-30,8:30p,Los Angeles Lakers,121,San Antonio Spurs,107,Box Score,,0.0,,False,False,False,0,0
897,2021-04-25,1:00p,Boston Celtics,104,Charlotte Hornets,125,Box Score,,4493.0,,True,False,False,1,0
523,2021-03-03,10:00p,Los Angeles Lakers,120,Sacramento Kings,123,Box Score,,0.0,,True,False,False,0,0
1035,2021-05-11,10:00p,Oklahoma City Thunder,106,Sacramento Kings,122,Box Score,,0.0,,True,False,False,1,0
1121,2021-05-01,8:00p,Miami Heat,124,Cleveland Cavaliers,107,Box Score,,4148.0,,False,False,False,0,0
441,2021-02-21,7:00p,Minnesota Timberwolves,99,New York Knicks,103,Box Score,,0.0,,True,False,False,0,0


In [None]:
# Build function to add Home Team Ranks Higher feature
def home_team_ranks_higher_2015(row):
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    
    home_rank = rank_20.loc[home_team]['Rank']
    visitor_rank = rank_20.loc[visitor_team]['Rank']
    
    # The higher ranking will be the lower value
    return home_rank < visitor_rank

In [None]:
# Create new column by using .apply(home_team_ranks_higher_2015)

df_21["Home Team Ranks Higher"] = df_21.apply(home_team_ranks_higher_2015, axis = 1)
df_21.head()

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win,Home Last Win,Visitor Last Win,Home Win Streak,Visitor Win Streak,Home Team Ranks Higher
0,2020-12-22,7:00p,Golden State Warriors,99,Brooklyn Nets,125,Box Score,,0.0,,True,False,False,0,0,True
1,2020-12-22,10:00p,Los Angeles Clippers,116,Los Angeles Lakers,109,Box Score,,0.0,,False,False,False,0,0,True
2,2020-12-23,7:00p,Charlotte Hornets,114,Cleveland Cavaliers,121,Box Score,,300.0,,True,False,False,0,0,False
3,2020-12-23,7:00p,New York Knicks,107,Indiana Pacers,121,Box Score,,0.0,,True,False,False,0,0,True
4,2020-12-23,7:00p,Miami Heat,107,Orlando Magic,113,Box Score,,3396.0,,True,False,False,0,0,False


In [None]:
# Add a new feature without using function -> Determines whether a team won the last matchup
# This does not take into consideration the home/visitor teams

last_match_winner = defaultdict(int)
df_21["Home Team Won Last"] = 0

for index, row in df_21.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))
    
    row["Home Team Won Last"] = 1 if last_match_winner[teams] == row["Home Team"] else 0
    df_21.loc[index] = row
    
    winner = row["Home Team"] if row["Home Win"] else row["Visitor Team"]
    last_match_winner[teams] = winner
    
df_21.sample(7)

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win,Home Last Win,Visitor Last Win,Home Win Streak,Visitor Win Streak,Home Team Ranks Higher,Home Team Won Last
672,2021-03-27,9:00p,Boston Celtics,111,Oklahoma City Thunder,94,Box Score,,0.0,,False,False,False,0,1,False,0
1067,2021-05-16,1:00p,Charlotte Hornets,110,Washington Wizards,115,Box Score,,5333.0,,True,False,False,1,0,True,0
127,2021-01-08,8:00p,Utah Jazz,131,Milwaukee Bucks,118,Box Score,,0.0,,False,False,False,3,0,True,0
1208,2021-05-11,10:00p,Oklahoma City Thunder,106,Sacramento Kings,122,Box Score,,0.0,,True,False,False,1,0,False,1
1178,2021-05-08,7:00p,Detroit Pistons,104,Philadelphia 76ers,118,Box Score,,5119.0,,True,False,False,4,1,True,1
92,2021-01-04,7:00p,Charlotte Hornets,101,Philadelphia 76ers,118,Box Score,,0.0,,True,False,False,3,0,True,1
340,2021-02-07,1:00p,Utah Jazz,103,Indiana Pacers,95,Box Score,,0.0,,False,False,False,0,3,True,0


In [None]:
# Validate the added features in the DataFrame

df_21[['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Last Win', 
       'Visitor Last Win', 'Home Team Ranks Higher', 'Home Win Streak', 'Visitor Win Streak', 
       'Home Team Won Last']][:5]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Last Win,Visitor Last Win,Home Team Ranks Higher,Home Win Streak,Visitor Win Streak,Home Team Won Last
0,2020-12-22,Golden State Warriors,99,Brooklyn Nets,125,True,False,False,True,0,0,0
1,2020-12-22,Los Angeles Clippers,116,Los Angeles Lakers,109,False,False,False,True,0,0,0
2,2020-12-23,Charlotte Hornets,114,Cleveland Cavaliers,121,True,False,False,False,0,0,0
3,2020-12-23,New York Knicks,107,Indiana Pacers,121,True,False,False,True,0,0,0
4,2020-12-23,Miami Heat,107,Orlando Magic,113,True,False,False,False,0,0,0


In [None]:
# Use selected features as the target variable if the home team has a higher seed

X_home_higher_15 = df_21[['Home Last Win', 'Visitor Last Win', 'Home Team Ranks Higher', 'Home Team Won Last']].values

In [None]:
home_teams_15 = encoding.transform(df_21['Home Team'].values)
visitor_teams_15 = encoding.transform(df_21['Visitor Team'].values)

X_teams_15 = np.vstack([home_teams_15, visitor_teams_15]).T
X_teams_15 = onehot.transform(X_teams_15).todense()

In [None]:
X_all_15 = np.hstack([X_home_higher_15, X_teams_15])
X_all_15.shape

(1299, 64)

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

print('The Home Team wins {:.2f}% of thier Matches\n'.format(100 * np.mean(y_true)))

y_pred = [1] * len(y_true)
grid.fit(X_all,y_true)

#y_pred = grid.predict(X_all_15)

print('F1 : {:.4f}'.format(f1_score(y_true, y_pred, pos_label = None, average = 'weighted')))
print(classification_report(y_true, y_pred))

The Home Team wins 54.89% of thier Matches



ValueError: Found input variables with inconsistent numbers of samples: [1230, 1299]

In [None]:
grid.fit(X_all, y_true)

y_pred = grid.predict(X_all_15)

ValueError: Found input variables with inconsistent numbers of samples: [1230, 1299]