In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score

Load boxscore_df

In [2]:
# Read the box-score table from the file named 'boxscore_df' (resolved relative
# to the notebook's working directory) and preview the first five rows.
boxscore_df = pd.read_csv(filepath_or_buffer='boxscore_df')
boxscore_df.head(5)

Unnamed: 0.1,Unnamed: 0,id_x,name,id_y,game_id,team_id,player_id,position,player_name,goals,...,caused_turnovers,faceoffs_won,faceoffs_taken,penalties,penalty_time,goalie_seconds,goals_allowed,goalie_saves,created_at,updated_at
0,0,2,Binghamton,4712,1,2,41.0,D,Chris Bechle,0,...,0,0,0,1,30,0,0,0,2023-10-24 16:37:17.319077,2023-10-24 16:37:17.319077
1,1,2,Binghamton,4713,1,2,48.0,D,George Diegnan,0,...,0,1,2,0,0,0,0,0,2023-10-24 16:37:17.319077,2023-10-24 16:37:17.319077
2,2,2,Binghamton,4714,1,2,51.0,D,Sean Finnigan,0,...,0,0,0,0,0,0,0,0,2023-10-24 16:37:17.319077,2023-10-24 16:37:17.319077
3,3,2,Binghamton,4715,1,2,57.0,M,Matt Kaser,0,...,0,0,0,0,0,0,0,0,2023-10-24 16:37:17.319077,2023-10-24 16:37:17.319077
4,4,2,Binghamton,4716,1,2,63.0,M,Anthony Lombardo,0,...,0,0,0,0,0,0,0,0,2023-10-24 16:37:17.319077,2023-10-24 16:37:17.319077


In [3]:
# (rows, columns) of the raw box-score table.
boxscore_df.shape

(1180290, 28)

Let's look at total goals by position

In [4]:
# Total goals recorded for each position code.
(
    boxscore_df
    .groupby(by='position')['goals']
    .sum()
)

position
A    116019
D      2530
G       127
M     55449
Name: goals, dtype: int64

This makes sense. Most of the goals are scored by Attackers and Midfielders with a handful by Defenders and a couple by Goalies. I wonder what the counts by position are.

In [5]:
# Number of box-score rows with a non-null player_id for each position code.
(
    boxscore_df
    .groupby(by='position')['player_id']
    .count()
)

position
A    69311
D    73686
G    32902
M    72669
Name: player_id, dtype: int64

This also makes sense. Each team typically fields three A's, three M's, and three D's, along with one G.

Create a new df for the classifier to include only the stats that are being taken into consideration. Then remove all blank entries.

In [3]:
# Keep the target label ('position') plus the stat columns that the
# classifier will use as features.
df = boxscore_df.loc[:, [
    'position',
    'goals', 'assists', 'points', 'shots', 'shots_on_goal',
    'ground_balls', 'turnovers', 'caused_turnovers',
    'faceoffs_won', 'faceoffs_taken',
    'penalties', 'penalty_time',
    'goalie_seconds', 'goals_allowed', 'goalie_saves',
]]

# Discard rows whose position is an empty string, then rows that have a
# missing value in any of the selected columns.
# NOTE(review): read_csv normally loads empty fields as NaN, so dropna() is
# presumably the step that removes most rows -- confirm.
df_noblanks = df.loc[df['position'] != ''].dropna()

Compare the shapes of the overall df to the one without blanks.

In [7]:
# Shapes before and after removing blank/missing rows, shown side by side.
df.shape, df_noblanks.shape

((1180290, 16), (248616, 16))

Only ~21% of the rows are not blank. Although that is a low percentage, it is based on roughly 1.18 million rows of data, so there are still plenty of data points (about 249,000) to train and test on.

In [4]:
# Feature matrix: every selected stat column except the label.
X = df_noblanks.drop(columns=['position']).to_numpy()
# Target vector: the position code for each remaining row.
y = df_noblanks['position'].to_numpy()

Create training and testing sets with default parameters

In [5]:
# Hold out a test set using the library's default split proportions; the
# fixed random_state keeps the shuffle reproducible between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

I wonder if different train/test sizes will make a difference.

In [6]:
# Baseline model: Gaussian Naive Bayes trained on the training split.
gauclf = GaussianNB()
gauclf.fit(X_train, y_train)
# Accuracy reporting for this single split is currently disabled:
#print('Gaussian training accuracy: {:.4f}'.format(gauclf.score(X_train,y_train)))
#print('Gaussian test accuracy: {:.4f}'.format(gauclf.score(X_test, y_test)))

In [7]:
# 5-fold cross-validation over the full feature matrix and labels.
# NOTE: cross_val_score refits a clone of the estimator on each fold, so the
# earlier fit on X_train is not reused here.
cvs = cross_val_score(estimator=gauclf, X=X, y=y, cv=5)
print(
    "5 cross val score: %0.4f accuracy with a standard deviation of %0.4f"
    % (cvs.mean(), cvs.std())
)

5 cross val score: 0.7165 accuracy with a standard deviation of 0.0162


In [10]:
# Sweep the training fraction from 0.60 to 0.95 in steps of 0.05 and report
# Gaussian NB accuracy on both sides of each split.
# NOTE(review): every iteration rebinds X_train/X_test/y_train/y_test and
# gauclf, so after this cell they refer to the last (0.95) split.
for r in range(60, 96, 5):
    train_fraction = r / 100
    print(train_fraction)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_fraction, random_state=42
    )
    gauclf = GaussianNB()
    gauclf.fit(X_train, y_train)
    train_accuracy = gauclf.score(X_train, y_train)
    test_accuracy = gauclf.score(X_test, y_test)
    print('Gaussian training accuracy: {:.4f}'.format(train_accuracy))
    print('Gaussian test accuracy: {:.4f}'.format(test_accuracy))
    

0.6
Gaussian training accuracy: 0.7159
Gaussian test accuracy: 0.7154
0.65
Gaussian training accuracy: 0.7156
Gaussian test accuracy: 0.7159
0.7
Gaussian training accuracy: 0.7165
Gaussian test accuracy: 0.7147
0.75
Gaussian training accuracy: 0.7161
Gaussian test accuracy: 0.7158
0.8
Gaussian training accuracy: 0.7159
Gaussian test accuracy: 0.7169
0.85
Gaussian training accuracy: 0.7161
Gaussian test accuracy: 0.7164
0.9
Gaussian training accuracy: 0.7164
Gaussian test accuracy: 0.7153
0.95
Gaussian training accuracy: 0.7166
Gaussian test accuracy: 0.7103


Next, remove the same features that were removed in the best-scoring decision tree model.

Remove goals, assists, penalty time, shots on goal, goalie saves, goals allowed, faceoffs won

In [11]:
# Reduced feature set: besides the label, drop goals, assists, faceoffs won,
# penalty time, shots on goal, goalie saves and goals allowed.
dropped_columns = [
    'position',
    'goals', 'assists', 'faceoffs_won', 'penalty_time',
    'shots_on_goal', 'goalie_saves', 'goals_allowed',
]
X2 = df_noblanks.drop(columns=dropped_columns).to_numpy()
# The target is unchanged: the position code for each row.
y2 = df_noblanks['position'].to_numpy()

In [12]:
# Repeat the training-fraction sweep (0.60 to 0.95 in steps of 0.05) on the
# reduced feature matrix X2 / y2.
for r in range(60, 96, 5):
    train_fraction = r / 100
    print(train_fraction)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(
        X2, y2, train_size=train_fraction, random_state=42
    )
    gauclf2 = GaussianNB()
    gauclf2.fit(X_train2, y_train2)
    train_accuracy2 = gauclf2.score(X_train2, y_train2)
    test_accuracy2 = gauclf2.score(X_test2, y_test2)
    print('Gaussian training accuracy: {:.4f}'.format(train_accuracy2))
    print('Gaussian test accuracy: {:.4f}'.format(test_accuracy2))

0.6
Gaussian training accuracy: 0.7201
Gaussian test accuracy: 0.7200
0.65
Gaussian training accuracy: 0.7198
Gaussian test accuracy: 0.7205
0.7
Gaussian training accuracy: 0.7199
Gaussian test accuracy: 0.7185
0.75
Gaussian training accuracy: 0.7192
Gaussian test accuracy: 0.7188
0.8
Gaussian training accuracy: 0.7190
Gaussian test accuracy: 0.7198
0.85
Gaussian training accuracy: 0.7195
Gaussian test accuracy: 0.7190
0.9
Gaussian training accuracy: 0.7195
Gaussian test accuracy: 0.7185
0.95
Gaussian training accuracy: 0.7197
Gaussian test accuracy: 0.7129
