# Predicting NBA Game Winners

In [1]:
# Load libraries

import numpy as np
import pandas as pd
import sklearn as sk
import os

In [6]:
# Load the dataset: NBA statistics from the 2013-2014 basketball season
df = pd.read_csv('./datasets/NBA Regular Season Results 2013-2014.csv')

In [10]:
from sklearn.metrics import f1_score, make_scorer, classification_report

model_scorer = make_scorer(f1_score, pos_label = None, average = 'weighted')

In [8]:
# Data Exploration
df.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Tue Oct 29 2013,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,18165,
1,Tue Oct 29 2013,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,18997,
2,Tue Oct 29 2013,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,19964,
3,Wed Oct 30 2013,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,20562,
4,Wed Oct 30 2013,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,19834,


In [11]:
df.dtypes

# Date is read as an object file

Date               object
Start (ET)         object
Visitor/Neutral    object
PTS                 int64
Home/Neutral       object
PTS.1               int64
Unnamed: 6         object
Unnamed: 7         object
Attend.             int64
Notes              object
dtype: object

In [12]:
# Parsing date column

df = pd.read_csv('./datasets/NBA Regular Season Results 2013-2014.csv', parse_dates = ['Date'])

# zChanging column names
df.columns = ['Date', 'Start Time (ET)', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Box Score', 
              'OT', 'Attendance', 'Notes']

In [13]:
df.shape

(1230, 10)

In [15]:
# updated dataset
df.head()

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,18165,
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,18997,
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,19964,
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,20562,
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,19834,


In [16]:

df.dtypes

Date               datetime64[ns]
Start Time (ET)            object
Visitor Team               object
Visitor Score               int64
Home Team                  object
Home Score                  int64
Box Score                  object
OT                         object
Attendance                  int64
Notes                      object
dtype: object

In [22]:
# Create new feature that notes whether the home team won
df['Home Win'] = df['Visitor Score'] < df['Home Score']

# Assign "class values" to the Home Win column

### This will also be the value that we want to predict ###
y_true = df['Home Win'].values

In [23]:
# Review new column with correctly populated fields
df.head()

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,18165,,True
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,18997,,True
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,19964,,True
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,20562,,True
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,19834,,True


In [24]:
# Establishing the baseline win percentage for the Home Teams overall
n_games = df['Home Win'].count()
n_homewins = df['Home Win'].sum()
win_percentage = n_homewins / n_games

print('Home Win percentage: {0:.2f}%'.format(100 * win_percentage))

Home Win percentage: 58.05%


The home teams' win percentage is 58.0%. If a model is considered useful, it must (at minimum) have a better score than the established baseline

In [25]:
# Predicting the baseline for Home Teams wins with simple classifier

from sklearn.metrics import f1_score

y_pred = [1] * len(y_true)
print('F1: {0:.4f}%'.format(f1_score(y_true, y_pred, pos_label = None, average = 'weighted') * 100))

F1: 42.6408%


The calculated F1 score of 42.6%. This is the baseline score our model needs to beat.

## Creating More Features

In [26]:
# Create two new columns, with all of the values set to False
df['Home Last Win'] = False
df['Visitor Last Win'] = False

In [27]:
# Determining whether the home and visitor teams won their last games
# This will update the Home Last Win & Visitor Last Win columns to either remain False or change to True

from collections import defaultdict

won_last = defaultdict(int)   # The default value of int is 0

# Inefficient method to deterime previous wins
for index, row in df.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    row['Home Last Win'] = won_last[home_team]
    row['Visitor Last Win'] = won_last[visitor_team]
    # Set current win
    won_last[home_team] = row['Home Win']
    won_last[visitor_team] = not row['Home Win']
df.loc[20:25]

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win,Home Last Win,Visitor Last Win
20,2013-11-01,7:30 pm,Milwaukee Bucks,105,Boston Celtics,98,Box Score,,18624,,False,False,False
21,2013-11-01,8:00 pm,Miami Heat,100,Brooklyn Nets,101,Box Score,,17732,,True,False,False
22,2013-11-01,7:00 pm,Cleveland Cavaliers,84,Charlotte Bobcats,90,Box Score,,18017,,True,False,False
23,2013-11-01,9:00 pm,Portland Trail Blazers,113,Denver Nuggets,98,Box Score,,19155,,False,False,False
24,2013-11-01,8:00 pm,Dallas Mavericks,105,Houston Rockets,113,Box Score,,18142,,True,False,False
25,2013-11-01,10:30 pm,San Antonio Spurs,91,Los Angeles Lakers,85,Box Score,,18997,,False,False,False


In [28]:
df.loc[90:95][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 
               'Home Win', 'Home Last Win', 'Visitor Last Win']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Last Win,Visitor Last Win
90,2013-11-09,Dallas Mavericks,91,Milwaukee Bucks,83,False,False,False
91,2013-11-09,Portland Trail Blazers,96,Sacramento Kings,85,False,False,False
92,2013-11-09,Utah Jazz,91,Toronto Raptors,115,True,False,False
93,2013-11-10,Minnesota Timberwolves,113,Los Angeles Lakers,90,False,False,False
94,2013-11-10,San Antonio Spurs,120,New York Knicks,89,False,False,False
95,2013-11-10,Washington Wizards,105,Oklahoma City Thunder,106,True,False,False


# Basic Classification with a Decision Tree

The default settings for the Decision Tree will be used for the home wins, winning streaks, and rankings

In [29]:
# Basic Decision Tree Classifier set up
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state = 14) # Remove random_state to get non-replicable results

In [35]:
from sklearn.model_selection import cross_val_score

# Use selected features as input for the classifier (target)
X_previouswins = df[['Home Last Win', 'Visitor Last Win']].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
decision_tree_scores = cross_val_score(dtc, X_previouswins, y_true, scoring = model_scorer)

# Print results
print("Using just the last result from the home and visitor teams")
print('F1: {0:.4f}%'.format(np.mean(decision_tree_scores) * 100))

Using just the last result from the home and visitor teams
F1: 42.6409%


No improvement from baseline score of 42.6%

### Decision Tree Classifier: Considerating Winning Streaks

In [37]:
# Taking into consideration winning streaks - What are the teams' win streaks coming into the game?
df["Home Win Streak"] = 0
df["Visitor Win Streak"] = 0

# Did the home and visitor teams win their last game?
from collections import defaultdict
win_streak = defaultdict(int)

for index, row in df.iterrows():  # Note that this is not the most efficient method
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["Home Win Streak"] = win_streak[home_team]
    row["Visitor Win Streak"] = win_streak[visitor_team]
    df.loc[index] = row    
    
    # Set current win streak
    if row["Home Win"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1

In [38]:
df.loc[50:60][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 
               'Home Win', 'Home Win Streak', 'Visitor Win Streak']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Win Streak,Visitor Win Streak
50,2013-11-05,Utah Jazz,88,Brooklyn Nets,104,True,0,0
51,2013-11-05,Los Angeles Lakers,104,Dallas Mavericks,123,True,1,1
52,2013-11-05,San Antonio Spurs,102,Denver Nuggets,94,False,0,0
53,2013-11-05,Indiana Pacers,99,Detroit Pistons,91,False,1,3
54,2013-11-05,Phoenix Suns,104,New Orleans Pelicans,98,False,1,0
55,2013-11-05,Charlotte Bobcats,102,New York Knicks,97,False,0,0
56,2013-11-05,Houston Rockets,116,Portland Trail Blazers,101,False,2,0
57,2013-11-05,Atlanta Hawks,105,Sacramento Kings,100,False,0,0
58,2013-11-05,Miami Heat,104,Toronto Raptors,95,False,1,1
59,2013-11-06,Utah Jazz,87,Boston Celtics,97,True,0,0


In [39]:
# Use selected features as input for the classifier (target)
X_winstreak = df[["Home Last Win", "Visitor Last Win", "Home Win Streak", "Visitor Win Streak"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_winstreak, y_true, scoring = scorer)

# Print results
print("Using whether the home team is ranked higher")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
F1: 53.0399%


When taking into consideration whether a team is on a win streak, the decision tree model's results show 52.5%

### Decision Tree Classifier: Based on Previous Season Standings

In [40]:
# Identify which team is higher in the standings, based on the previous year's regular season final standings
# Load the standings data file

rank = pd.read_csv('./datasets/2012-2013 Regular Season Standings.csv', index_col = 'Team')

In [41]:
rank.head(3)

Unnamed: 0_level_0,Rk,Overall,Home,Road,E,W,A,C,SE,NW,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Miami Heat,1,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,8-2,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
Oklahoma City Thunder,2,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,10-6,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
San Antonio Spurs,3,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,9-9,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6


In [42]:
# Rename columns in the rank DataFrame
rank.columns = ['Rank', 'Overall', 'Home', 'Road', 'Eastern Conference', 'Western Conference', 
                'Atlantic Div', 'Central Div', 'Southeast Div', 'Northwest Div', 'Pacific Div', 'Southwest Div', 
                'Pre All-Star', 'Post All-Star', 'Margin ≤3', 'Margin ≥10', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 
                'Mar', 'Apr']

In [43]:
rank.head(3)

Unnamed: 0_level_0,Rank,Overall,Home,Road,Eastern Conference,Western Conference,Atlantic Div,Central Div,Southeast Div,Northwest Div,...,Post All-Star,Margin ≤3,Margin ≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Miami Heat,1,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,8-2,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
Oklahoma City Thunder,2,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,10-6,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
San Antonio Spurs,3,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,9-9,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6


In [44]:
# Create a new feature -> Home Team Ranks Higher
# First create a function that iterates through the df to determine if the home team has a higher rank based on rank df

def home_team_ranks_higher(row):
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    
    # Adjusting the New Orleans team names due to off-season league changes between 12-13 & 13-14
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    if visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    
    home_rank = rank.loc[home_team]["Rank"]
    visitor_rank = rank.loc[visitor_team]["Rank"]
    
    return home_rank < visitor_rank   # The higher ranking will be the lower number

In [45]:
# Create new column by using .apply(home_team_ranks_higher) across the DataFrame
df["Home Team Ranks Higher"] = df.apply(home_team_ranks_higher, axis = 1)

# Display the selected columns on the sliced DataFrame (first 5 rows)
df[:5][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Ranks Higher']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Team Ranks Higher
0,2013-10-29,Orlando Magic,87,Indiana Pacers,97,True,True
1,2013-10-29,Los Angeles Clippers,103,Los Angeles Lakers,116,True,False
2,2013-10-29,Chicago Bulls,95,Miami Heat,107,True,True
3,2013-10-30,Brooklyn Nets,94,Cleveland Cavaliers,98,True,False
4,2013-10-30,Atlanta Hawks,109,Dallas Mavericks,118,True,False


______________________________________________________________________________________________________________________

In [28]:
# # Add new column without using a function & .apply
# df["Home Team Ranks Higher"] = 0

# for index, row in df.iterrows():
#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
   
#     # Adjusting the New Orleans team names due to off-season league changes between 12-13 & 13-14
#     if home_team == "New Orleans Pelicans":
#         home_team = "New Orleans Hornets"
#     elif visitor_team == "New Orleans Pelicans":
#         visitor_team = "New Orleans Hornets"
    
#     home_rank = rank[rank["Team"] == home_team]["Rank"].values[0]
#     visitor_rank = rank[rank["Team"] == visitor_team]["Rank"].values[0]
#     row["Home Team Ranks Higher"] = int(home_rank > visitor_rank)
#     df.loc[index] = row

In [29]:
# df[:5][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win']]

______________________________________________________________________________________________________________________

In [62]:
# Decision Tree Classifier based on if Home Team has a Higher Ranking

# Use selected features as input for the classifier (target)
X_homehigher = df[["Home Last Win", "Visitor Last Win", "Home Team Ranks Higher"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_homehigher, y_true, scoring = scorer)

# Print results
print("Using whether the home team is ranked higher")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
F1: 55.0177%


# Decision Tree Classifier based on if Home Team has a Higher Ranking

# Use selected features as input for the classifier (target)
X_homehigher = df[["Home Last Win", "Visitor Last Win", "Home Team Ranks Higher"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_homehigher, y_true, scoring = scorer)

# Print results
print("Using whether the home team is ranked higher")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

When taking into consideration whether the home team has a higher rank in the previous season's standings, the model's results show 59.45% .


In [63]:
# Adjusting the parameters of the algorithm using GridSearchCV to test if there is an improvement in the model's score

from sklearn.model_selection import GridSearchCV

parameter_space = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
}
dtc = DecisionTreeClassifier(random_state = 14)

grid = GridSearchCV(dtc, parameter_space, scoring = scorer)
grid.fit(X_homehigher, y_true)

print('F1: {0:.4f}%'.format(grid.best_score_ * 100))

F1: 55.0177%


By adjusting the parameters (tuning) of the Decision Tree, there is no change in the models

## Decision Tree Classifier: Based on whether the Home Team Won the Last Matchup

______________________________________________________________________________________________________________________

In [64]:
# Build a function that determines whether a team won the last matchup between the 2 teams
# This does not take into consideration the home/visitor teams

last_game_winner = defaultdict(int)

def home_team_won_last(row):
    home_team = row['Home Team']
    visitor_team = row['Visistor Team']
    
    # Sort for a consistent ordering
    teams = tuple(sorted([home_team, visitor_team]))
    # Parse the row for which team won the last matchup, then add a 1 if the Home Team won
    result = 1 if last_game_winner[teams] == row['Home Team'] else 0
    
    # Update record for next matchup
    winner = row['Home Team'] if row['Home Win'] else row['Visitor Team']
    last_game_winner[teams] = winner
    
    return result

# Function works, but is not applying across DataFrame (see cell below)

In [33]:
# Create new column by using .apply(home_team_won_last) across the DataFrame
# df['Home Team Won Last'] = df.apply(home_team_won_last, axis = 1)

In [34]:
# Display the selected columns on the sliced DataFrame (5 random rows)
# df[90:100][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Won Last']]

______________________________________________________________________________________________________________________

In [65]:
# Add a new feature without using function -> Determines whether a team won the last matchup between the 2 teams
# This does not take into consideration the home/visitor teams

last_game_winner = defaultdict(int)
df["Home Team Won Last"] = 0

for index, row in df.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    
    # Set in the row, who won the last encounter
    row["Home Team Won Last"] = 1 if last_game_winner[teams] == row["Home Team"] else 0
    df.loc[index] = row
    
    # Who won this one?
    winner = row["Home Team"] if row["Home Win"] else row["Visitor Team"]
    last_game_winner[teams] = winner

In [66]:
# Display the selected columns on the sliced DataFrame (10 random rows)
df[90:100][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Won Last']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Team Won Last
90,2013-11-09,Dallas Mavericks,91,Milwaukee Bucks,83,False,0
91,2013-11-09,Portland Trail Blazers,96,Sacramento Kings,85,False,0
92,2013-11-09,Utah Jazz,91,Toronto Raptors,115,True,0
93,2013-11-10,Minnesota Timberwolves,113,Los Angeles Lakers,90,False,0
94,2013-11-10,San Antonio Spurs,120,New York Knicks,89,False,0
95,2013-11-10,Washington Wizards,105,Oklahoma City Thunder,106,True,0
96,2013-11-10,New Orleans Pelicans,94,Phoenix Suns,101,True,1
97,2013-11-11,Orlando Magic,105,Boston Celtics,120,True,1
98,2013-11-11,Atlanta Hawks,103,Charlotte Bobcats,94,False,0
99,2013-11-11,Cleveland Cavaliers,81,Chicago Bulls,96,True,0


In [67]:
# Use selected features as input for the classifier (target)
X_home_higher = df[['Home Last Win', 'Visitor Last Win', "Home Team Ranks Higher", "Home Team Won Last"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_home_higher, y_true, scoring = scorer)

# Print results
print("Using whether the home team won the last matchup")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

Using whether the home team won the last matchup
F1: 60.1919%


After adding the feature that takes into account whether the home team won the last matchup between the teams, the model's accuracy increased to 60.17%

*** This is about .09 lower than the lecture ***

### Decision Tree Classifier: Encoding Teams as Features

In [68]:
df['Home Team'].values

array(['Indiana Pacers', 'Los Angeles Lakers', 'Miami Heat', ...,
       'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs'],
      dtype=object)

In [69]:
# Use LabelEncoder to transform the teams by assigning numbers to them
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoding = LabelEncoder()
encoding.fit(df["Home Team"].values)

LabelEncoder()

In [70]:
# Numerical values assigned to the home and visitor teams
home_teams = encoding.transform(df["Home Team"].values)
visitor_teams = encoding.transform(df["Visitor Team"].values)

# Stacks arrays in a sequence vertically (row-wise) & transposes the data
X_teams = np.vstack([home_teams, visitor_teams]).T

In [71]:
# OneHotEncoder takes each of the features of the numerical valued-teams & identify if it was the home or visitor team
# Will display a 1 if True, 0 if False
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

In [72]:
# View the data size of X_teams
X_teams.shape

(1230, 60)

In [73]:
# Print the first 30 elements in X_teams
print('Home:', X_teams[0:30])
print('Visitor:', X_teams[0,30:])

Home: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
Visitor: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0.]]


In [77]:
# Run the classifier on the new teams as features
# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_teams, y_true, scoring = model_scorer)

# Print results
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

F1: 59.9327%




When encoding the teams as individual features, the accuracy of the model resulted in an F1 score of 59.94%.


## Random Forest Classifiers

### Random Forest Classifier: Use ONLY the Teams Encoded as Features

In [78]:
# Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 14)
scores = cross_val_score(rfc, X_teams, y_true, scoring = scorer)

# Print results
print("Using full team labels is ranked higher")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))



Using full team labels is ranked higher
F1: 61.1738%


*** This result is .4 higher than the lecture ***

### Random Forest Classifier: Adjusting Parameters with GridSearchCV for Encoded Teams

In [None]:
# Set up more detailed parameters for the Random Forest Classifier in an attempt to improve accuracy score
# Adjusting the parameters of the algorithm using GridSearchCV

parameter_space = {
                   "max_features": [2, 10, 50, 'auto'],
                   "n_estimators": [50, 100, 200],
                   "criterion": ["gini", "entropy"],
                   "min_samples_leaf": [1, 2, 4, 6],
                   }

# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 14)
grid = GridSearchCV(rfc, parameter_space, scoring = scorer)
grid.fit(X_teams, y_true)

# Print results
print("F1: {0:.4f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)





































































After adjusting the parameters when using the encoded teams as features, the model's results improved to 64.36%

### Random Forest Classifier: Use ALL Features for Data

In [None]:
# Combine the team features (from one hot endoder) with the other previously determined features
X_all = np.hstack([X_home_higher, X_teams])
print(X_all.shape)

In [None]:
# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 14)
scores = cross_val_score(rfc, X_all, y_true, scoring = scorer)

# Print results
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

There was a slight improvement to 61.49% with the additional features used in addition to the team features

*** This result is 1.1 higher than the lecture results ***

In [None]:
# Set up more detailed parameters for the Random Forest Classifier in an attempt to improve accuracy score
# Adjusting the parameters of the algorithm using GridSearchCV

parameter_space = {
                   "max_features": [2, 10, 50, 'auto'],
                   "n_estimators": [50, 100, 200],
                   "criterion": ["gini", "entropy"],
                   "min_samples_leaf": [1, 2, 4, 6],
                   }

# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 14)
grid = GridSearchCV(rfc, parameter_space, scoring = scorer)
grid.fit(X_all, y_true)

# Print results
print("F1: {0:.4f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)

After modifying the parameters of the Random Forest Classifier and using all of the features, the result was 62.9%. While this wasn't the best performing RFC, it did perform better than the initial baseline and the majority of the Decision Tree Classifiers.


*** INSERT GRAPHS COMPARING MODEL SCORE RESULTS ***

# Testing on 2014-2015 Season

In [None]:
# Load the data for the 2014-15 season
df_15 = pd.read_csv('./datasets/NBA Regular Season Results 2014-2015.csv', parse_dates = ['Date'], 
                    dtype = {'Attendance':int}) # Attendence is still showing up as a float (HOW TO FIX?)

# Correct column names
df_15.columns = ['Date', 'Start Time (ET)', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Box Score', 
              'OT', 'Attendance', 'Notes']

df_15.head(7)

In [None]:
# Load the previous season's standings
rank_14 = pd.read_csv('./datasets/2013-2014 Regular Season Standings.csv', index_col = 'Team')

# Rename columns in the rank DataFrame
rank_14.columns = ['Rank', 'Overall', 'Home', 'Road', 'Eastern Conference', 'Western Conference', 
                'Atlantic Div', 'Central Div', 'Southeast Div', 'Northwest Div', 'Pacific Div', 'Southwest Div', 
                'Pre All-Star', 'Post All-Star', 'Margin ≤3', 'Margin ≥10', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 
                'Mar', 'Apr']

rank_14.head()

## Add features to the DataFrame

**Home Win**

In [None]:
# Determine whether the home team won
df_15['Home Win'] = df_15['Visitor Score'] < df_15['Home Score']

# Assign "class values" to the Home Win column
y_true = df_15['Home Win'].values

In [None]:
# Review new column with correctly populated fields
df.head()

**Home Last Win & Visitor Last Win**

In [None]:
# Create two new columns, with all of the values set to False
df_15['Home Last Win'] = False
df_15['Visitor Last Win'] = False

# Determining whether the home and visitor teams won their last games
# This will update the Home Last Win & Visitor Last Win columns to either remain False or change to True

from collections import defaultdict

won_last = defaultdict(int)   # The default value of int is 0

# Inefficient method to deterime previous wins
for index, row in df_15.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    row['Home Last Win'] = won_last[home_team]
    row['Visitor Last Win'] = won_last[visitor_team]
    # Set current win
    won_last[home_team] = row['Home Win']
    won_last[visitor_team] = not row['Home Win']
df_15.sample(10)

**Home Win Streak & Visitor Win Streak**

In [None]:
# Creating columns for winning streaks
df_15["Home Win Streak"] = 0
df_15["Visitor Win Streak"] = 0

# Did the home and visitor teams win their last game?

from collections import defaultdict

win_streak = defaultdict(int)

for index, row in df_15.iterrows():  # Note that this is not the most efficient method
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["Home Win Streak"] = win_streak[home_team]
    row["Visitor Win Streak"] = win_streak[visitor_team]
    df_15.loc[index] = row    
    
    # Set current win streak
    if row["Home Win"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1
        
df_15.sample(10)

______________________________________________________________________________________________________________________

**Home Team Ranks Higher**

In [None]:
# Build function to add Home Team Ranks Higher feature
def home_team_ranks_higher_2015(row):
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    
    # Adjusting the Charlotte team names due to off-season league changes between 13-14 & 14-15
    if home_team == "Charlotte Hornets":
        home_team = "Charlotte Bobcats"
    if visitor_team == "Charlotte Hornets":
        visitor_team = "Charlotte Bobcats"
    
    home_rank = rank_14.loc[home_team]['Rank']
    visitor_rank = rank_14.loc[visitor_team]['Rank']
    
    # The higher ranking will be the lower number
    return home_rank < visitor_rank

In [None]:
# Create new column by using .apply(home_team_ranks_higher_2015) across the DataFrame
df_15["Home Team Ranks Higher"] = df_15.apply(home_team_ranks_higher_2015, axis = 1)
df_15.head()

In [None]:
# Add new column without using a function & .apply
# df_15["Home Team Ranks Higher"] = 0

# for index, row in df_15.iterrows():
#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
   
    # Adjusting the New Orleans team names due to off-season league changes between 12-13 & 13-14
#     if home_team == "Charlotte Hornets":
#         home_team = "Charlotte Bobcats"
#     elif visitor_team == "Charlotte Hornets":
#         visitor_team = "Charlotte Bobcats"
    
#     home_rank = rank_14[rank_14["Team"] == home_team]["Rank"].values[0]
#     visitor_rank = rank_14[rank_14["Team"] == visitor_team]["Rank"].values[0]
#     row["Home Team Ranks Higher"] = int(home_rank > visitor_rank)
#     df_15.loc[index] = row

______________________________________________________________________________________________________________________

**Home Team Won Last**

In [None]:
# Add a new feature without using function -> Determines whether a team won the last matchup between the 2 teams
# This does not take into consideration the home/visitor teams

last_match_winner = defaultdict(int)
df_15["Home Team Won Last"] = 0

for index, row in df_15.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    
    # Set in the row, who won the last encounter
    row["Home Team Won Last"] = 1 if last_match_winner[teams] == row["Home Team"] else 0
    df_15.loc[index] = row
    
    # Who won this one?
    winner = row["Home Team"] if row["Home Win"] else row["Visitor Team"]
    last_match_winner[teams] = winner
    
df_15.sample(7)

In [None]:
# Review the added features in the 2014-2015 season DataFrame
df_15[['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Last Win', 
       'Visitor Last Win', 'Home Team Ranks Higher', 'Home Win Streak', 'Visitor Win Streak', 
       'Home Team Won Last']][:5]

In [None]:
# Use selected features as input for the classifier (target variable) for if the home team is ranked higher
X_home_higher_15 = df_15[['Home Last Win', 'Visitor Last Win', 'Home Team Ranks Higher', 'Home Team Won Last']].values

______________________________________________________________________________________________________________________

***There was a team name change in the test set of data, causing the previous encoder to not carry over.***


Options:
1. Re-clean the data so that the team names are the same. This will create an inaccuracy in seasonal team names, but keep the encoded data formatted as is
2. Pass an explicit mapping to ordinal encoding and encode the two versions of each name as the same category. Then  pass the now a encoded integer column into any other method. This would keep the integrity of the data across the seasons, but still allow for further analysis

In [None]:
# Map the previously encoded standings for the Charlotte Hornets (new) to be assigned to the same encoding as 
# the Charlotte Bobcats (old)


In [None]:
# Reuse the trained team encoding and onehot classifier
# encoding = LabelEncoder()
# encoding.fit(df['Home Team].values)

home_teams_15 = encoding.transform(df_15['Home Team'].values)
visitor_teams_15 = encoding.transform(df_15['Visitor Team'].values)

X_teams_15 = np.vstack([home_teams_15, visitor_teams_15]).T
X_teams_15 = onehot.transform(X_teams_15).todense()

In [None]:
X_all_15 = np.hstack([X_home_higher_15, X_teams_15])
X_all_15.shape

## Setting the Baseline

In [None]:
print('The home team wins {:.2f}% of the games'.format(100 * np.mean(y_true)))

y_pred = [1] * len(y_test)

print('F1: {:.4f}'.format(f1_score(y_test, y_pred, pos_label = None, average = 'weighted')))
print(classification_report(y_test, y_pred))

In [None]:
grid.fit(X_all, y_true)

y_pred = grid.predict(X_all_15)

In [None]:
from sklearn import classification_report

print(classification_report(y_test, y_pred))

In [None]:
print('This results in getting {:.1f}% of predictions correct!'.format(100 * np.mean(y_pred == y_test)))

______________________________________________________________________________________________________________________