In [30]:
# Load necessary libraries

import platform
import numpy as np
import pandas as pd
import sklearn as sk
import os
from sklearn.model_selection import cross_val_score

In [12]:
# Load the dataset: NBA statistics from the 2013-2014 basketball season
df = pd.read_csv('./datasets/NBA Regular Season Results 2013-2014.csv')

In [13]:
from sklearn.metrics import f1_score, make_scorer, classification_report

scorer = make_scorer(f1_score, pos_label = None, average = 'weighted')

In [14]:
# Examine the dataset with its initial formatting
df.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Tue Oct 29 2013,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,18165,
1,Tue Oct 29 2013,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,18997,
2,Tue Oct 29 2013,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,19964,
3,Wed Oct 30 2013,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,20562,
4,Wed Oct 30 2013,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,19834,


In [15]:
df.dtypes

Date               object
Start (ET)         object
Visitor/Neutral    object
PTS                 int64
Home/Neutral       object
PTS.1               int64
Unnamed: 6         object
Unnamed: 7         object
Attend.             int64
Notes              object
dtype: object

In [17]:
# Don't read the first row because it is blank, and read the date column as a date

df = pd.read_csv('./datasets/NBA Regular Season Results 2013-2014.csv', parse_dates = ['Date'])

# Correct column names
df.columns = ['Date', 'Start Time (ET)', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Box Score', 
              'OT', 'Attendance', 'Notes']

In [18]:
df.shape

(1230, 10)

In [19]:
# Create new feature that notes whether the home team won
df['Home Win'] = df['Visitor Score'] < df['Home Score']

# Assign "class values" to the Home Win column

### This will also be the value that we want to predict ###
y_true = df['Home Win'].values

In [20]:
df.head()

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,18165,,True
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,18997,,True
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,19964,,True
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,20562,,True
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,19834,,True


In [21]:
# Establishing the baseline win percentage for the Home Teams overall
n_games = df['Home Win'].count()
n_homewins = df['Home Win'].sum()
win_percentage = n_homewins / n_games

print('Home Win percentage: {0:.2f}%'.format(100 * win_percentage))

Home Win percentage: 58.05%


In [22]:
# Predicting the baseline for Home Teams wins with simple classifier

from sklearn.metrics import f1_score

y_pred = [1] * len(y_true)
print('F1: {0:.4f}%'.format(f1_score(y_true, y_pred, pos_label = None, average = 'weighted') * 100))

F1: 42.6408%


In [23]:
# Create two new columns, with all of the values set to False
df['Home Last Win'] = False
df['Visitor Last Win'] = False

In [24]:
# Determining whether the home and visitor teams won their last games
# This will update the Home Last Win & Visitor Last Win columns to either remain False or change to True

from collections import defaultdict

won_last = defaultdict(int)   # The default value of int is 0

# Inefficient method to deterime previous wins
for index, row in df.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    row['Home Last Win'] = won_last[home_team]
    row['Visitor Last Win'] = won_last[visitor_team]
    # Set current win
    won_last[home_team] = row['Home Win']
    won_last[visitor_team] = not row['Home Win']
df.loc[20:25]

Unnamed: 0,Date,Start Time (ET),Visitor Team,Visitor Score,Home Team,Home Score,Box Score,OT,Attendance,Notes,Home Win,Home Last Win,Visitor Last Win
20,2013-11-01,7:30 pm,Milwaukee Bucks,105,Boston Celtics,98,Box Score,,18624,,False,False,False
21,2013-11-01,8:00 pm,Miami Heat,100,Brooklyn Nets,101,Box Score,,17732,,True,False,False
22,2013-11-01,7:00 pm,Cleveland Cavaliers,84,Charlotte Bobcats,90,Box Score,,18017,,True,False,False
23,2013-11-01,9:00 pm,Portland Trail Blazers,113,Denver Nuggets,98,Box Score,,19155,,False,False,False
24,2013-11-01,8:00 pm,Dallas Mavericks,105,Houston Rockets,113,Box Score,,18142,,True,False,False
25,2013-11-01,10:30 pm,San Antonio Spurs,91,Los Angeles Lakers,85,Box Score,,18997,,False,False,False


In [25]:
df.loc[90:95][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 
               'Home Win', 'Home Last Win', 'Visitor Last Win']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Last Win,Visitor Last Win
90,2013-11-09,Dallas Mavericks,91,Milwaukee Bucks,83,False,False,False
91,2013-11-09,Portland Trail Blazers,96,Sacramento Kings,85,False,False,False
92,2013-11-09,Utah Jazz,91,Toronto Raptors,115,True,False,False
93,2013-11-10,Minnesota Timberwolves,113,Los Angeles Lakers,90,False,False,False
94,2013-11-10,San Antonio Spurs,120,New York Knicks,89,False,False,False
95,2013-11-10,Washington Wizards,105,Oklahoma City Thunder,106,True,False,False


In [26]:
# Basic Decision Tree Classifier set up
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state = 14) # Remove random_state to get non-replicable results

In [34]:
# Use selected features as input for the classifier (target)
X_previouswins = df[['Home Last Win', 'Visitor Last Win']].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_previouswins, y_true, scoring = scorer)

# Print results
print("Using just the last result from the home and visitor teams")
print('F1: {0:.4f}%'.format(np.mean(scores) * 100))

Using just the last result from the home and visitor teams
F1: 42.6409%


In [35]:
# Taking into consideration winning streaks - What are the teams' win streaks coming into the game?
df["Home Win Streak"] = 0
df["Visitor Win Streak"] = 0

# Did the home and visitor teams win their last game?
from collections import defaultdict
win_streak = defaultdict(int)

for index, row in df.iterrows():  # Note that this is not the most efficient method
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["Home Win Streak"] = win_streak[home_team]
    row["Visitor Win Streak"] = win_streak[visitor_team]
    df.loc[index] = row    
    
    # Set current win streak
    if row["Home Win"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1

In [36]:
df.loc[50:60][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 
               'Home Win', 'Home Win Streak', 'Visitor Win Streak']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Win Streak,Visitor Win Streak
50,2013-11-05,Utah Jazz,88,Brooklyn Nets,104,True,0,0
51,2013-11-05,Los Angeles Lakers,104,Dallas Mavericks,123,True,1,1
52,2013-11-05,San Antonio Spurs,102,Denver Nuggets,94,False,0,0
53,2013-11-05,Indiana Pacers,99,Detroit Pistons,91,False,1,3
54,2013-11-05,Phoenix Suns,104,New Orleans Pelicans,98,False,1,0
55,2013-11-05,Charlotte Bobcats,102,New York Knicks,97,False,0,0
56,2013-11-05,Houston Rockets,116,Portland Trail Blazers,101,False,2,0
57,2013-11-05,Atlanta Hawks,105,Sacramento Kings,100,False,0,0
58,2013-11-05,Miami Heat,104,Toronto Raptors,95,False,1,1
59,2013-11-06,Utah Jazz,87,Boston Celtics,97,True,0,0


In [37]:
# Use selected features as input for the classifier (target)
X_winstreak = df[["Home Last Win", "Visitor Last Win", "Home Win Streak", "Visitor Win Streak"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_winstreak, y_true, scoring = scorer)

# Print results
print("Using whether the home team is ranked higher")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
F1: 53.0399%


In [38]:
# Identify which team is higher in the standings, based on the previous year's regular season final standings
# Load the standings data file

rank = pd.read_csv('./datasets/2012-2013 Regular Season Standings.csv', index_col = 'Team')

In [41]:
rank.head()

Unnamed: 0_level_0,Rk,Overall,Home,Road,E,W,A,C,SE,NW,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Miami Heat,1,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,8-2,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
Oklahoma City Thunder,2,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,10-6,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
San Antonio Spurs,3,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,9-9,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6
Denver Nuggets,4,57-25,38-3,19-22,19-11,38-14,5-5,10-0,4-6,11-5,...,24-4,11-7,28-8,0-1,8-8,9-6,12-3,8-4,13-2,7-1
Los Angeles Clippers,5,56-26,32-9,24-17,21-9,35-17,7-3,8-2,6-4,12-6,...,17-9,3-5,38-12,1-0,8-6,16-0,9-7,8-5,7-7,7-1


In [43]:
# Rename columns in the rank DataFrame
rank.columns = ['Rank', 'Overall', 'Home', 'Road', 'Eastern Conference', 'Western Conference', 
                'Atlantic Div', 'Central Div', 'Southeast Div', 'Northwest Div', 'Pacific Div', 'Southwest Div', 
                'Pre All-Star', 'Post All-Star', 'Margin ≤3', 'Margin ≥10', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 
                'Mar', 'Apr']
rank.head(3)

Unnamed: 0_level_0,Rank,Overall,Home,Road,Eastern Conference,Western Conference,Atlantic Div,Central Div,Southeast Div,Northwest Div,...,Post All-Star,Margin ≤3,Margin ≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Miami Heat,1,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,8-2,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
Oklahoma City Thunder,2,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,10-6,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
San Antonio Spurs,3,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,9-9,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6


In [44]:
# Create a new feature -> Home Team Ranks Higher
# First create a function that iterates through the df to determine if the home team has a higher rank based on rank df

def home_team_ranks_higher(row):
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    
    # Adjusting the New Orleans team names due to off-season league changes between 12-13 & 13-14
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    if visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    
    home_rank = rank.loc[home_team]["Rank"]
    visitor_rank = rank.loc[visitor_team]["Rank"]
    
    return home_rank < visitor_rank   # The higher ranking will be the lower number

In [45]:
# Create new column by using .apply(home_team_ranks_higher) across the DataFrame
df["Home Team Ranks Higher"] = df.apply(home_team_ranks_higher, axis = 1)

# Display the selected columns on the sliced DataFrame (first 5 rows)
df[:5][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Ranks Higher']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Team Ranks Higher
0,2013-10-29,Orlando Magic,87,Indiana Pacers,97,True,True
1,2013-10-29,Los Angeles Clippers,103,Los Angeles Lakers,116,True,False
2,2013-10-29,Chicago Bulls,95,Miami Heat,107,True,True
3,2013-10-30,Brooklyn Nets,94,Cleveland Cavaliers,98,True,False
4,2013-10-30,Atlanta Hawks,109,Dallas Mavericks,118,True,False


In [46]:
# # Add new column without using a function & .apply
# df["Home Team Ranks Higher"] = 0

# for index, row in df.iterrows():
#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
   
#     # Adjusting the New Orleans team names due to off-season league changes between 12-13 & 13-14
#     if home_team == "New Orleans Pelicans":
#         home_team = "New Orleans Hornets"
#     elif visitor_team == "New Orleans Pelicans":
#         visitor_team = "New Orleans Hornets"
    
#     home_rank = rank[rank["Team"] == home_team]["Rank"].values[0]
#     visitor_rank = rank[rank["Team"] == visitor_team]["Rank"].values[0]
#     row["Home Team Ranks Higher"] = int(home_rank > visitor_rank)
#     df.loc[index] = row

In [47]:
# df[:5][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win']]

In [54]:
# Decision Tree Classifier based on if Home Team has a Higher Ranking

# Use selected features as input for the classifier (target)
X_homehigher = df[["Home Last Win", "Visitor Last Win", "Home Team Ranks Higher"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_homehigher, y_true, scoring = scorer)

# Print results
print("Using whether the home team is ranked higher")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
F1: 55.0177%


In [50]:
# Adjusting the parameters of the algorithm using GridSearchCV to test if there is an improvement in the model's score

from sklearn.model_selection import GridSearchCV

parameter_space = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
}
dtc = DecisionTreeClassifier(random_state = 14)

grid = GridSearchCV(dtc, parameter_space, scoring = scorer)
grid.fit(X_homehigher, y_true)

print('F1: {0:.4f}%'.format(grid.best_score_ * 100))

F1: 55.0177%


In [51]:
# Build a function that determines whether a team won the last matchup between the 2 teams
# This does not take into consideration the home/visitor teams

last_game_winner = defaultdict(int)

def home_team_won_last(row):
    home_team = row['Home Team']
    visitor_team = row['Visistor Team']
    
    # Sort for a consistent ordering
    teams = tuple(sorted([home_team, visitor_team]))
    # Parse the row for which team won the last matchup, then add a 1 if the Home Team won
    result = 1 if last_game_winner[teams] == row['Home Team'] else 0
    
    # Update record for next matchup
    winner = row['Home Team'] if row['Home Win'] else row['Visitor Team']
    last_game_winner[teams] = winner
    
    return result

# Function works, but is not applying across DataFrame (see cell below)

In [52]:
# Create new column by using .apply(home_team_won_last) across the DataFrame
# df['Home Team Won Last'] = df.apply(home_team_won_last, axis = 1)

In [53]:
# Display the selected columns on the sliced DataFrame (5 random rows)
# df[90:100][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Won Last']]

In [55]:
# Add a new feature without using function -> Determines whether a team won the last matchup between the 2 teams
# This does not take into consideration the home/visitor teams

last_game_winner = defaultdict(int)
df["Home Team Won Last"] = 0

for index, row in df.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    
    # Set in the row, who won the last encounter
    row["Home Team Won Last"] = 1 if last_game_winner[teams] == row["Home Team"] else 0
    df.loc[index] = row
    
    # Who won this one?
    winner = row["Home Team"] if row["Home Win"] else row["Visitor Team"]
    last_game_winner[teams] = winner

In [56]:
# Display the selected columns on the sliced DataFrame (10 random rows)
df[90:100][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Won Last']]

Unnamed: 0,Date,Visitor Team,Visitor Score,Home Team,Home Score,Home Win,Home Team Won Last
90,2013-11-09,Dallas Mavericks,91,Milwaukee Bucks,83,False,0
91,2013-11-09,Portland Trail Blazers,96,Sacramento Kings,85,False,0
92,2013-11-09,Utah Jazz,91,Toronto Raptors,115,True,0
93,2013-11-10,Minnesota Timberwolves,113,Los Angeles Lakers,90,False,0
94,2013-11-10,San Antonio Spurs,120,New York Knicks,89,False,0
95,2013-11-10,Washington Wizards,105,Oklahoma City Thunder,106,True,0
96,2013-11-10,New Orleans Pelicans,94,Phoenix Suns,101,True,1
97,2013-11-11,Orlando Magic,105,Boston Celtics,120,True,1
98,2013-11-11,Atlanta Hawks,103,Charlotte Bobcats,94,False,0
99,2013-11-11,Cleveland Cavaliers,81,Chicago Bulls,96,True,0


In [57]:
#Use selected features as input for the classifier (target)
X_home_higher = df[['Home Last Win', 'Visitor Last Win', "Home Team Ranks Higher", "Home Team Won Last"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_home_higher, y_true, scoring = scorer)

# Print results
print("Using whether the home team won the last matchup")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

Using whether the home team won the last matchup
F1: 60.1919%
