In [35]:
# libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [36]:
# read in data
all_stats_df = pd.DataFrame(pd.read_csv('../resources/transformed_data/all_stats.csv'))
all_stats_df = all_stats_df.drop(columns=['WTeamID'])

# drop all NaNs
all_stats_df = all_stats_df.dropna(how='any')

all_stats_df.head()

Unnamed: 0,Season,DayNum,Team1,Team2,Tourney,WLoc,ScoreDiff,Team1Seed,Team2Seed,Team1FirstYear,...,AvgStl_Team1,AvgStl_Team2,AvgBlk_Team1,AvgBlk_Team2,AvgPF_Team1,AvgPF_Team2,FG%_Team1,FG%_Team2,FG3%_Team1,FG3%_Team2
74048,2003,10,1104,1328,0,0,6,10,1,1985,...,6.607143,6.933333,3.785714,3.766667,18.035714,18.6,0.420362,0.446934,0.320144,0.393673
74049,2003,10,1272,1393,0,0,7,7,3,1985,...,7.37931,8.310345,5.068966,7.275862,18.758621,16.586207,0.437931,0.470067,0.348797,0.330435
74050,2003,11,1266,1437,0,0,12,3,30,1985,...,6.0,7.5,3.642857,3.4,18.642857,20.9,0.48381,0.420429,0.379391,0.34904
74051,2003,11,1296,1457,0,0,6,30,30,1985,...,7.612903,7.607143,3.612903,5.392857,19.806452,19.642857,0.458967,0.432107,0.383104,0.351687
74052,2003,11,1208,1400,0,0,6,30,1,1985,...,7.62963,6.392857,4.37037,3.857143,17.185185,20.357143,0.464135,0.448513,0.380252,0.348936


In [37]:
starter_df = all_stats_df[['Season', 
                           'Team1', 'Team2',
                           'WLoc','WTeam',
                           'Tourney','Team1Seed', 'Team2Seed', 
                           'Team1RankMean','Team2RankMean',
                           'Win%_Team1','Win%_Team2', 
                           'GameCount_Team1', 'GameCount_Team2',
                           'LoseCount_Team1', 'LoseCount_Team2',


]]
starter_df

Unnamed: 0,Season,Team1,Team2,WLoc,WTeam,Tourney,Team1Seed,Team2Seed,Team1RankMean,Team2RankMean,Win%_Team1,Win%_Team2,GameCount_Team1,GameCount_Team2,LoseCount_Team1,LoseCount_Team2
74048,2003,1104,1328,0,1,0,10,1,27.655502,15.730233,36.862245,64.000000,28.0,30.0,11.0,6.0
74049,2003,1272,1393,0,1,0,7,3,42.000000,25.596154,62.901308,68.489893,29.0,29.0,6.0,5.0
74050,2003,1266,1437,0,1,0,3,30,18.967442,60.385000,67.474490,25.000000,28.0,30.0,5.0,15.0
74051,2003,1296,1457,0,1,0,30,30,147.512500,209.452500,30.072841,41.326531,31.0,28.0,14.0,10.0
74052,2003,1208,1400,0,0,0,30,1,19.261283,9.416279,49.519890,61.734694,27.0,28.0,8.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168791,2014,1163,1277,0,1,1,7,4,26.517978,10.352550,58.477509,58.477509,34.0,34.0,8.0,8.0
168792,2014,1246,1276,0,1,1,8,2,20.905134,22.406109,49.826990,57.392103,34.0,33.0,10.0,8.0
168793,2014,1163,1196,0,1,1,7,1,26.517978,11.246120,58.477509,88.581315,34.0,34.0,8.0,2.0
168794,2014,1246,1458,0,1,1,8,2,20.905134,9.019978,49.826990,62.075298,34.0,33.0,10.0,7.0


In [38]:
other_cols = ['Team1FirstYear', 'Team2FirstYear', 
#              'Team1RankMean', 'Team2RankMean', 
             'WinCount_Team1', 'WinCount_Team2', 
#              'GameCount_Team1', 'GameCount_Team2', 
             'AvgScore_Team1', 'AvgScore_Team2',                              
#              'Win%_Team1','Win%_Team2', 
#              'LoseCount_Team1', 'LoseCount_Team2',
             'AvgFGM_Team1',  'AvgFGM_Team2', 
             'AvgFGA_Team1', 'AvgFGA_Team2', 
             'AvgFGM3_Team1', 'AvgFGM3_Team2', 
             'AvgFGA3_Team1', 'AvgFGA3_Team2', 
             'AvgOR_Team1', 'AvgOR_Team2', 
             'AvgDR_Team1', 'AvgDR_Team2', 
             'FG%_Team1', 'FG%_Team2', 
             'FG3%_Team1', 'FG3%_Team2']

In [39]:
# One hot encoder and scaler for pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object))
)

In [40]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn import metrics

mlc = MLPClassifier(alpha=0.001, hidden_layer_sizes=(168), 
              random_state=1, solver='sgd',
              nesterovs_momentum=True)

pipe = make_pipeline(ct, mlc)

In [41]:
# select features and target
features_df = starter_df.copy()
target = features_df.pop('WTeam')
selected_features = features_df

X_train, X_test, y_train, y_test = train_test_split(selected_features, 
                                                    target, stratify=target, 
                                                    random_state=42)

pipe.fit(X_train, y_train)

preds = pipe.predict(X_test)
loss_score = metrics.log_loss(y_test, preds)

print('\n-------------------')
print(f'Log loss: {loss_score}')
print('---------------------')
print(metrics.confusion_matrix(y_test,preds))
print('\n-------------------')
print(metrics.classification_report(y_test,preds))
print('-------------------\n')


-------------------
Log loss: 8.314055601531692
---------------------
[[9226 2794]
 [2831 8517]]

-------------------
              precision    recall  f1-score   support

           0       0.77      0.77      0.77     12020
           1       0.75      0.75      0.75     11348

    accuracy                           0.76     23368
   macro avg       0.76      0.76      0.76     23368
weighted avg       0.76      0.76      0.76     23368

-------------------



log loss 1: 8.520980999224033

log loss 2: 8.39830485868785

log loss 3: 8.367263816334052

Current log loss: Log loss: 8.314055601531692

In [42]:
effective_features = {}

for col in other_cols[::2]:
    iteration_df = starter_df.copy()
    team1_stat = col
    team2_stat = other_cols[other_cols.index(col)+1]
    print(f'{team1_stat} - {team2_stat}')
    iteration_df[team1_stat] = all_stats_df[team1_stat]
    iteration_df[team2_stat] = all_stats_df[team2_stat]

    # select features and target
    target = iteration_df.pop('WTeam')
    selected_features = iteration_df

    X_train, X_test, y_train, y_test = train_test_split(selected_features, 
                                                        target, stratify=target, 
                                                        random_state=42)

    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_test)
    new_loss_score = metrics.log_loss(y_test, preds)

    print(f'Log loss: {new_loss_score}')
    print('---------------------')
    print(metrics.confusion_matrix(y_test,preds))
    print('-------------------')
    print(metrics.classification_report(y_test,preds))
    print('-------------------\n\n')
    
    if new_loss_score < loss_score:
#         loss_score = new_loss_score
        effective_features[team1_stat] = new_loss_score - loss_score

Team1FirstYear - Team2FirstYear
Log loss: 8.364308323367881
---------------------
[[9242 2778]
 [2881 8467]]
-------------------
              precision    recall  f1-score   support

           0       0.76      0.77      0.77     12020
           1       0.75      0.75      0.75     11348

    accuracy                           0.76     23368
   macro avg       0.76      0.76      0.76     23368
weighted avg       0.76      0.76      0.76     23368

-------------------


WinCount_Team1 - WinCount_Team2
Log loss: 8.334747508274887
---------------------
[[9244 2776]
 [2863 8485]]
-------------------
              precision    recall  f1-score   support

           0       0.76      0.77      0.77     12020
           1       0.75      0.75      0.75     11348

    accuracy                           0.76     23368
   macro avg       0.76      0.76      0.76     23368
weighted avg       0.76      0.76      0.76     23368

-------------------


AvgScore_Team1 - AvgScore_Team2
Log loss: 8.

In [43]:
effective_features

{}

In [44]:
import operator
min(effective_features.items(), key=operator.itemgetter(1))[0]

ValueError: min() arg is an empty sequence