# NCAA ML contest - Model testing

In [22]:
# libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Data

In [25]:
# Load the transformed stats table, minus the WTeamID column
all_stats_df = (
    pd.read_csv('../resources/transformed_data/all_stats.csv')
    .drop(columns=['WTeamID'])
)
all_stats_df.columns

Index(['Season', 'DayNum', 'Team1', 'Team2', 'Tourney', 'WLoc', 'ScoreDiff',
       'Team1Seed', 'Team2Seed', 'Team1FirstYear', 'Team1LastYear',
       'Team2FirstYear', 'Team2LastYear', 'WTeam', 'Team1RankMean',
       'Team2RankMean', 'WinCount_Team1', 'GameCount_Team1', 'AvgScore_Team1',
       'Win%_Team1', 'WinCount_Team2', 'GameCount_Team2', 'AvgScore_Team2',
       'Win%_Team2', 'LoseCount_Team1', 'AvgFGM_Team1', 'LoseCount_Team2',
       'AvgFGM_Team2', 'AvgFGA_Team1', 'AvgFGA_Team2', 'AvgFGM3_Team1',
       'AvgFGM3_Team2', 'AvgFGA3_Team1', 'AvgFGA3_Team2', 'AvgFTM_Team1',
       'AvgFTM_Team2', 'AvgFTA_Team1', 'AvgFTA_Team2', 'AvgOR_Team1',
       'AvgOR_Team2', 'AvgDR_Team1', 'AvgDR_Team2', 'AvgAst_Team1',
       'AvgAst_Team2', 'AvgTO_Team1', 'AvgTO_Team2', 'AvgStl_Team1',
       'AvgStl_Team2', 'AvgBlk_Team1', 'AvgBlk_Team2', 'AvgPF_Team1',
       'AvgPF_Team2', 'FG%_Team1', 'FG%_Team2', 'FG3%_Team1', 'FG3%_Team2'],
      dtype='object')

## Feature selection

In [27]:
# Columns kept for modelling. Comment a line in or out to toggle a feature.
# 'WTeam' must stay in the list: it is split off as the target further down.
feature_columns = [
    'Season',
#     'DayNum',
    'Team1', 'Team2',
    'Tourney',
    'WLoc',
#     'ScoreDiff',
    'Team1RankMean', 'Team2RankMean',
    'Team1Seed', 'Team2Seed',
#     'Team1FirstYear', 'Team2FirstYear',
#     'Team1LastYear', 'Team2LastYear',
#     'WinCount_Team1', 'WinCount_Team2',
    'LoseCount_Team1', 'LoseCount_Team2',
    'GameCount_Team1', 'GameCount_Team2',
    'Win%_Team1', 'Win%_Team2',
#     'AvgScore_Team1', 'AvgScore_Team2',
#     'AvgFGM_Team1', 'AvgFGM_Team2',
#     'AvgFGA_Team1', 'AvgFGA_Team2',
#     'AvgFGM3_Team1', 'AvgFGM3_Team2',
#     'AvgFGA3_Team1', 'AvgFGA3_Team2',
#     'AvgFTM_Team1', 'AvgFTM_Team2',
#     'AvgFTA_Team1', 'AvgFTA_Team2',
#     'AvgOR_Team1', 'AvgOR_Team2',
#     'AvgDR_Team1', 'AvgDR_Team2',
#     'AvgAst_Team1', 'AvgAst_Team2',
#     'AvgTO_Team1', 'AvgTO_Team2',
#     'AvgStl_Team1', 'AvgStl_Team2',
#     'AvgBlk_Team1', 'AvgBlk_Team2',
#     'AvgPF_Team1', 'AvgPF_Team2',
#     'FG%_Team1', 'FG%_Team2',
#     'FG3%_Team1', 'FG3%_Team2',
    'WTeam',
]
features_df = all_stats_df[feature_columns]

## attempt at weighting (disabled experiments)
# features_df['Win%_Team1'] = (features_df['Win%_Team1']+1)**2
# features_df['Win%_Team2'] = (features_df['Win%_Team2']+1)**2

# features_df['Win%_Team1'] = features_df['Win%_Team1']*100
# features_df['Win%_Team2'] = features_df['Win%_Team2']*100

# features_df['Team1'] = features_df['Team1']*100
# features_df['Team2'] = features_df['Team2']*100

# Discard every row that has a missing value in any kept column
features_df = features_df.dropna(how='any')


# # change column type to test one hot encoder and scaler
# features_df = features_df.astype({
#                                   'Season':'str',
#                                   'Team1':'str',
#                                   'Team2':'str',
#                                   'Team1FirstYear':'str',
#                                   'Team2FirstYear':'str',
#                                  })

# Split off the target; pop() removes 'WTeam' from features_df in place,
# so selected_features is the same frame without the target column.
target = features_df.pop('WTeam')
selected_features = features_df
print(selected_features.shape, target.shape, sep='\n')

(93472, 15)
(93472,)


## Feature engineering

In [28]:
# Preprocessing for the model pipelines: standardise numeric columns and
# one-hot encode object-typed columns.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import (
    ColumnTransformer,
    make_column_selector,
    make_column_transformer,
)

# Column selectors pick their columns by dtype at fit time
numeric_columns = make_column_selector(dtype_include=np.number)
object_columns = make_column_selector(dtype_include=object)

ct = make_column_transformer(
    (StandardScaler(), numeric_columns),
    (OneHotEncoder(handle_unknown='ignore'), object_columns),
)

# Fit on the full feature table and display the transformed matrix
ct.fit_transform(selected_features)

array([[-1.70216942, -1.40780681, -0.19004846, ..., -0.09477752,
         0.44560985,  1.82504033],
       [-1.70216942,  0.54764181,  0.56935645, ..., -0.56072422,
         1.80632575,  2.06050018],
       [-1.70216942,  0.47780436,  1.08341516, ..., -0.09477752,
         2.04530521, -0.22020534],
       ...,
       [ 0.44323025, -0.72107188, -1.73222459, ...,  1.76900926,
         1.57515253,  3.11413847],
       [ 0.44323025,  0.24501286,  1.32876136, ...,  1.30306257,
         1.12310482,  1.72410475],
       [ 0.44323025, -0.72107188, -1.14806697, ...,  1.76900926,
         1.57515253,  1.08177653]])

## MLP testing

#### Grid search

In [19]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.pipeline import make_pipeline
# from sklearn.model_selection import GridSearchCV, LeaveOneOut
# from sklearn import metrics

# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# mlc = MLPClassifier(activation = 'relu', random_state=1, nesterovs_momentum=True)
# loo = LeaveOneOut()
# pipe = make_pipeline(ct, mlc)

# params = {
#           "mlpclassifier__hidden_layer_sizes":[(168,),(126,),(498,),(166,)],
#           "mlpclassifier__solver" : ('sgd','adam'), 
#           "mlpclassifier__alpha": [0.001,0.0001],
#           "mlpclassifier__learning_rate_init":[0.005,0.001]
#          }

# clf = GridSearchCV(pipe, params, n_jobs=-1, verbose=3)

# X_train, X_test, y_train, y_test = train_test_split(selected_features, 
#                                                     target, stratify=target, 
#                                                     random_state=42)

# clf.fit(X_train, y_train)

# model = clf.best_estimator_
# print("the best model and parameters are the following: {} ".format(model))

In [20]:
# model.predict_proba(X_test)
# preds = model.predict(X_test)
# train_score = model.score(X_train, y_train)
# test_score = model.score(X_test, y_test)
# loss_score = metrics.log_loss(y_test, preds)*.1

# print(f'Train score: {train_score}')
# print(f'Test score: {test_score}')
# print(f'Log loss: {loss_score}')

MLPClassifier(alpha=0.001, hidden_layer_sizes=(168,), random_state=1, solver='sgd')

Test score: 0.7564

Log loss: 0.841374060213598

MLPClassifier(alpha=0.001, hidden_layer_sizes=(168,), random_state=1, solver='sgd')

Test score: 0.7552

Log loss: 0.84551868139709

MLPClassifier(alpha=0.001, hidden_layer_sizes=(168,), random_state=1, solver='sgd')
 
Test score: 0.7595148169668797

Log loss: 0.8306158860092259

In [29]:
# Train an MLP with the settings recorded from the earlier grid-search runs
# and evaluate it on a stratified hold-out split.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(selected_features,
                                                    target, stratify=target,
                                                    random_state=42)

# hidden_layer_sizes takes one entry per hidden layer, so a single layer is
# written as the one-element tuple (168,) rather than the bare int (168).
mlc = MLPClassifier(alpha=0.001, hidden_layer_sizes=(168,),
                    random_state=1, solver='sgd',
                    nesterovs_momentum=True, verbose=True)

pipe = make_pipeline(ct, mlc)

pipe.fit(X_train, y_train)

preds = pipe.predict(X_test)
probs = pipe.predict_proba(X_test)
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
# Log loss is defined on predicted probabilities; passing hard class labels
# (as before) scores every miss as a maximally confident wrong answer.
loss_score = metrics.log_loss(y_test, probs, labels=pipe.classes_)

print('\n-------------------')
print(f'Train score: {train_score}')
print(f'Test score: {test_score}')
print(f'Log loss: {loss_score}')
print('\n-------------------\n')
print(metrics.confusion_matrix(y_test,preds))
print('\n-------------------\n')
print(metrics.classification_report(y_test,preds))

Iteration 1, loss = 0.55926628
Iteration 2, loss = 0.50914301
Iteration 3, loss = 0.50383336
Iteration 4, loss = 0.50140025
Iteration 5, loss = 0.49974254
Iteration 6, loss = 0.49851944
Iteration 7, loss = 0.49749014
Iteration 8, loss = 0.49660954
Iteration 9, loss = 0.49584680
Iteration 10, loss = 0.49519513
Iteration 11, loss = 0.49459855
Iteration 12, loss = 0.49404590
Iteration 13, loss = 0.49356915
Iteration 14, loss = 0.49312288
Iteration 15, loss = 0.49272520
Iteration 16, loss = 0.49234013
Iteration 17, loss = 0.49200833
Iteration 18, loss = 0.49170435
Iteration 19, loss = 0.49137676
Iteration 20, loss = 0.49115669
Iteration 21, loss = 0.49088284
Iteration 22, loss = 0.49064096
Iteration 23, loss = 0.49039901
Iteration 24, loss = 0.49020310
Iteration 25, loss = 0.48999609
Iteration 26, loss = 0.48979848
Iteration 27, loss = 0.48960862
Iteration 28, loss = 0.48942753
Iteration 29, loss = 0.48925219
Iteration 30, loss = 0.48907936
Iteration 31, loss = 0.48891552
Iteration 32, los

## Gaussian testing

In [65]:
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.pipeline import make_pipeline
# from sklearn.model_selection import train_test_split
# from sklearn import metrics
# import os

# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# X_train, X_test, y_train, y_test = train_test_split(selected_features, 
#                                                     target, 
#                                                     random_state=42)

# # clf = GaussianProcessClassifier(1.0 * RBF(1.0))
# clf = GaussianProcessClassifier()
# pipe = make_pipeline(ct, clf)
# pipe.fit(X_train, y_train)

# pipe.predict_proba(X_test)
# preds = pipe.predict(X_test)
# train_score = pipe.score(X_train, y_train)
# test_score = pipe.score(X_test, y_test)
# loss_score = metrics.log_loss(y_test, preds)*.1

# print(f'Train score: {train_score}')
# print(f'Test score: {test_score}')
# print(f'Log loss: {loss_score}')

## Random forest testing

In [None]:
# from sklearn.model_selection import GridSearchCV, LeaveOneOut
# from sklearn.pipeline import make_pipeline
# from sklearn.datasets import make_classification
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(selected_features, 
#                                                     target, 
#                                                     random_state=42)

# rfc = RandomForestClassifier(n_jobs=-1, max_features= 'sqrt', n_estimators=50, oob_score = True) 

# pipe = make_pipeline(ct, rfc)

# param_grid = {
#     'randomforestclassifier__n_estimators': [100, 500, 1000],
#     'randomforestclassifier__max_features': ['log2', 'sqrt','auto'],
# }

# CV_rfc = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1000)
# CV_rfc.fit(X_train, y_train)
# print(CV_rfc.best_params_)

In [None]:
# rfc = RandomForestClassifier(n_jobs=-1, max_features='log2', n_estimators=500, oob_score = True) 

# pipe = make_pipeline(ct, rfc)
# pipe.fit(X_train, y_train)

# pipe.predict_proba(X_test)
# preds = pipe.predict(X_test)
# train_score = pipe.score(X_train, y_train)
# test_score = pipe.score(X_test, y_test)
# loss_score = metrics.log_loss(y_test, preds)*.1

# print(f'Train score: {train_score}')
# print(f'Test score: {test_score}')
# print(f'Log loss: {loss_score}')

## Format submission

In [37]:
def format_submit(model, df, output_path='../output/ml_march_madness_submission.csv'):
    '''Build the submission table from a fitted model and write it to CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``; the second column of its
        output is used as the prediction.
    df : pandas.DataFrame
        Rows to score. Must contain 'Season', 'Team1' and 'Team2' (used to
        build the ID) and is passed unchanged to ``model.predict_proba``.
    output_path : str, optional
        Where the CSV is written. Defaults to the original hard-coded
        location. Missing parent directories are created.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'ID' ("<Season>_<Team1>_<Team2>") with a single
        'Pred' column rounded to 2 decimals.
    '''
    # Take the second probability column in one slice instead of row by row
    prob = np.asarray(model.predict_proba(df))[:, 1]

    predict_df = pd.DataFrame({'Season': df['Season'],
                               'Team1': df['Team1'],
                               'Team2': df['Team2'],
                               'Pred': prob}).round(2)

    predict_df['ID'] = (predict_df['Season'].astype(str) + '_' +
                        predict_df['Team1'].astype(str) + '_' +
                        predict_df['Team2'].astype(str))

    submit_df = predict_df[['ID', 'Pred']].set_index('ID')

    # NOTE: rounding to 2 decimals can produce exactly 0.0 or 1.0. An earlier
    # experiment nudged those values away from the extremes; left disabled.
#     submit_df['Pred'] = np.where(submit_df['Pred']==0.0, .01, submit_df['Pred'])
#     submit_df['Pred'] = np.where(submit_df['Pred']==1.0, .99, submit_df['Pred'])

    # Make sure the target directory exists so the write cannot fail on a
    # fresh checkout.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    submit_df.to_csv(output_path)

    return submit_df

In [38]:
# Load the submission matchups and keep the same columns, in the same order,
# as the feature table used for the train/test split
all_stats_submit_df = pd.read_csv('../resources/transformed_data/all_stats_submit.csv')
submit_df = all_stats_submit_df.loc[:, selected_features.columns]
submit_df

Unnamed: 0,Season,Team1,Team2,Tourney,WLoc,Team1RankMean,Team2RankMean,Team1Seed,Team2Seed,LoseCount_Team1,LoseCount_Team2,GameCount_Team1,GameCount_Team2,Win%_Team1,Win%_Team2
0,2015,1107,1112,1,0,154.890566,7.155889,14,2,8.0,3.0,32.0,34.0,0.750000,0.911765
1,2015,1107,1116,1,0,154.890566,26.339623,14,5,8.0,8.0,32.0,34.0,0.750000,0.764706
2,2015,1107,1124,1,0,154.890566,17.802834,14,3,8.0,9.0,32.0,32.0,0.750000,0.718750
3,2015,1107,1125,1,0,154.890566,132.294340,14,15,8.0,10.0,32.0,31.0,0.750000,0.677419
4,2015,1107,1129,1,0,154.890566,63.752179,14,11,8.0,8.0,32.0,31.0,0.750000,0.741935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2019,1449,1459,1,0,43.404278,48.756124,9,7,8.0,4.0,34.0,30.0,0.764706,0.866667
11386,2019,1449,1463,1,0,43.404278,91.541037,9,14,8.0,7.0,34.0,28.0,0.764706,0.750000
11387,2019,1458,1459,1,0,19.631308,48.756124,5,7,10.0,4.0,33.0,30.0,0.696970,0.866667
11388,2019,1458,1463,1,0,19.631308,91.541037,5,14,10.0,7.0,33.0,28.0,0.696970,0.750000


In [39]:
# Score the submission matchups with the fitted pipeline; also writes the CSV
format_submit(model=pipe, df=submit_df)

Unnamed: 0_level_0,Pred
ID,Unnamed: 1_level_1
2015_1107_1112,0.06
2015_1107_1116,0.11
2015_1107_1124,0.13
2015_1107_1125,0.43
2015_1107_1129,0.22
...,...
2019_1449_1459,0.56
2019_1449_1463,0.78
2019_1458_1459,0.58
2019_1458_1463,0.79
