# NCAA ML contest - Model testing

In [1]:
# libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Data

In [2]:
# read in data
all_stats_df = pd.DataFrame(pd.read_csv('../resources/transformed_data/all_stats.csv'))
all_stats_df = all_stats_df.drop(columns=['WTeamID'])
all_stats_df.columns

Index(['Season', 'DayNum', 'Team1', 'Team2', 'Tourney', 'WLoc', 'ScoreDiff',
       'Team1Seed', 'Team2Seed', 'Team1FirstYear', 'Team1LastYear',
       'Team2FirstYear', 'Team2LastYear', 'WTeam', 'Team1RankMean',
       'Team2RankMean', 'WinCount_Team1', 'GameCount_Team1', 'AvgScore_Team1',
       'Win%_Team1', 'WinCount_Team2', 'GameCount_Team2', 'AvgScore_Team2',
       'Win%_Team2', 'LoseCount_Team1', 'AvgFGM_Team1', 'LoseCount_Team2',
       'AvgFGM_Team2', 'AvgFGA_Team1', 'AvgFGA_Team2', 'AvgFGM3_Team1',
       'AvgFGM3_Team2', 'AvgFGA3_Team1', 'AvgFGA3_Team2', 'AvgFTM_Team1',
       'AvgFTM_Team2', 'AvgFTA_Team1', 'AvgFTA_Team2', 'AvgOR_Team1',
       'AvgOR_Team2', 'AvgDR_Team1', 'AvgDR_Team2', 'AvgAst_Team1',
       'AvgAst_Team2', 'AvgTO_Team1', 'AvgTO_Team2', 'AvgStl_Team1',
       'AvgStl_Team2', 'AvgBlk_Team1', 'AvgBlk_Team2', 'AvgPF_Team1',
       'AvgPF_Team2', 'FG%_Team1', 'FG%_Team2', 'FG3%_Team1', 'FG3%_Team2'],
      dtype='object')

## Feature selection

In [38]:
# easy way to comment out unwanted features
features_df = all_stats_df[['Season', 
#                             'DayNum', 
                            'Team1', 'Team2', 
#                             'Tourney', 
                            'WLoc', 
#                             'ScoreDiff',
                            'Team1RankMean', 'Team2RankMean',
                            'Team1Seed', 'Team2Seed',
#                             'Team1FirstYear', 'Team2FirstYear',
#                             'Team1LastYear', 'Team2LastYear',
#                             'WinCount_Team1', 'WinCount_Team2',
                            'LoseCount_Team1', 'LoseCount_Team2',
                            'GameCount_Team1', 'GameCount_Team2',
                            'Win%_Team1', 'Win%_Team2',
#                             'AvgScore_Team1', 'AvgScore_Team2',
#                             'AvgFGM_Team1', 'AvgFGM_Team2', 
#                             'AvgFGA_Team1', 'AvgFGA_Team2',
#                             'AvgFGM3_Team1', 'AvgFGM3_Team2', 
#                             'AvgFGA3_Team1', 'AvgFGA3_Team2',
#                             'AvgFTM_Team1', 'AvgFTM_Team2', 
#                             'AvgFTA_Team1', 'AvgFTA_Team2',
#                             'AvgOR_Team1', 'AvgOR_Team2', 
#                             'AvgDR_Team1', 'AvgDR_Team2',
#                             'AvgAst_Team1', 'AvgAst_Team2', 
#                             'AvgTO_Team1', 'AvgTO_Team2',
#                             'AvgStl_Team1', 'AvgStl_Team2', 
#                             'AvgBlk_Team1', 'AvgBlk_Team2',
#                             'AvgPF_Team1', 'AvgPF_Team2',
#                             'FG%_Team1', 'FG%_Team2',
#                             'FG3%_Team1', 'FG3%_Team2',
                            'WTeam',
                           ]] 

# drop all NaNs
features_df = features_df.dropna(how='any')
# drop overcorrected rank 
features_df = features_df.loc[features_df.Team1RankMean < 500]
features_df = features_df.loc[features_df.Team2RankMean < 500]

# # shuffle and limit dataset (for time-consuming model training)
# features_df = features_df.sample(frac=1).reset_index(drop=True)[:50000]
# features_df = features_df.sample(frac=1).reset_index(drop=True)[:50000]


# # change column type to test one hot encoder and scaler
# features_df = features_df.astype({
#                                   'Season':'str',
#                                   'Team1':'str',
#                                   'Team2':'str',
#                                  })

# select features and target
target = features_df.pop('WTeam')
selected_features = features_df
print(selected_features.shape)
print(target.shape)

(93200, 14)
(93200,)


## Feature engineering

In [39]:
# One hot encoder and scaler for pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object))
)

ct.fit_transform(selected_features)

array([[-1.70146472, -1.40911253, -0.1899856 , ..., -0.09400181,
         0.44542821,  1.8236377 ],
       [-1.70146472,  0.54719759,  0.56965234, ..., -0.55951554,
         1.80511893,  2.05889961],
       [-1.70146472,  0.47732938,  1.08386878, ..., -0.09400181,
         2.04391834, -0.21988858],
       ...,
       [ 0.44899565, -0.72207505, -1.73263494, ...,  1.76805313,
         1.57411988,  3.11165213],
       [ 0.44899565,  0.24443531,  1.32929027, ...,  1.30253939,
         1.12241275,  1.72278698],
       [ 0.44899565, -0.72207505, -1.14829807, ...,  1.76805313,
         1.57411988,  1.08099875]])

## MLP testing

#### Grid search

In [32]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.pipeline import make_pipeline
# from sklearn.model_selection import GridSearchCV, LeaveOneOut, RandomizedSearchCV
# from sklearn import metrics

# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# mlc = MLPClassifier(activation = 'relu', random_state=1, nesterovs_momentum=True)
# loo = LeaveOneOut()
# pipe = make_pipeline(ct, mlc)

# params = {
#           "mlpclassifier__hidden_layer_sizes":[(168,),(126,),(498,),(166,)],
#           "mlpclassifier__solver" : ('sgd','adam'), 
#           "mlpclassifier__alpha": [0.001,0.0001],
#           "mlpclassifier__learning_rate_init":[0.005,0.001]
#          }

# clf = RandomizedSearchCV(pipe, params, n_jobs=-1, verbose=3)

# X_train, X_test, y_train, y_test = train_test_split(selected_features, 
#                                                     target, stratify=target, 
#                                                     random_state=42)

# clf.fit(X_train, y_train)

# model = clf.best_estimator_
# print("the best model and parameters are the following: {} ".format(model))

Best model: 

MLPClassifier(alpha=0.001, hidden_layer_sizes=(168,),
              learning_rate_init=0.005, random_state=1))]) 

MLPClassifier(alpha=0.001, hidden_layer_sizes=(168,), random_state=1, solver='sgd')
 
Test score: 0.7595148169668797

Log loss: 0.8306158860092259

In [40]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(selected_features, 
                                                    target, 
                                                    random_state=42)

mlc = MLPClassifier(alpha=0.01, hidden_layer_sizes=(16,16), 
                    random_state=1, 
                    solver='sgd', 
#                     learning_rate='invscaling', 
#                     learning_rate_init=0.01,
#                     power_t=.1,
#                     max_iter=200, 
                    nesterovs_momentum=True, 
#                     batch_size=75,
                    verbose=True)

# mlc = MLPClassifier(alpha=0.001, hidden_layer_sizes=(14,14), 
#                     random_state=1, solver='sgd', learning_rate_init=0.01,)


pipe = make_pipeline(ct, mlc)

pipe.fit(X_train, y_train)

pipe.predict_proba(X_test)
preds = pipe.predict(X_test)
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
loss_score = metrics.log_loss(y_test, preds)

print('\n-------------------')
print(f'Test score: {test_score}')
print(f'Log loss: {loss_score}')
print('\n-------------------\n')
print(metrics.confusion_matrix(y_test,preds))
print('\n-------------------\n')
print(metrics.classification_report(y_test,preds))


-------------------
Test score: 0.7608154506437769
Log loss: 8.261239575120412

-------------------

[[9128 2853]
 [2720 8599]]

-------------------

              precision    recall  f1-score   support

           0       0.77      0.76      0.77     11981
           1       0.75      0.76      0.76     11319

    accuracy                           0.76     23300
   macro avg       0.76      0.76      0.76     23300
weighted avg       0.76      0.76      0.76     23300



## Format submission

In [41]:
def format_submit(model, df):
    '''Creates and formats submission '''
    preds = model.predict(df)
    prob = [x[1] for x in model.predict_proba(df)]
    
    predict_df = pd.DataFrame({'Season':df['Season'],
                  'Team1':df['Team1'],
                  'Team2':df['Team2'],
                  'Guess':preds,
                  'Pred':prob}).round(3)

    predict_df['ID'] = predict_df['Season'].astype(str) + '_' +\
                       predict_df['Team1'].astype(str) + '_' +\
                       predict_df['Team2'].astype(str)
    
    submit_df = predict_df[['ID','Pred']].set_index('ID')
    
#     submit_df['Pred'] = np.where(submit_df['Pred']<0.1, submit_df['Pred']-.1, submit_df['Pred'])
#     submit_df['Pred'] = np.where(submit_df['Pred']>.9, submit_df['Pred']+.1, submit_df['Pred'])

    submit_df.to_csv('../output/ml_march_madness_submission.csv')
    
    return submit_df

In [42]:
all_stats_submit_df.columns

Index(['Season', 'Team1', 'Team2', 'Tourney', 'WLoc', 'Team1Seed', 'Team2Seed',
       'Team1FirstYear', 'Team1LastYear', 'Team2FirstYear', 'Team2LastYear',
       'Team1RankMean', 'Team2RankMean', 'WinCount_Team1', 'GameCount_Team1',
       'AvgScore_Team1', 'Win%_Team1', 'WinCount_Team2', 'GameCount_Team2',
       'AvgScore_Team2', 'Win%_Team2', 'LoseCount_Team1', 'AvgFGM_Team1',
       'LoseCount_Team2', 'AvgFGM_Team2', 'AvgFGA_Team1', 'AvgFGA_Team2',
       'AvgFGM3_Team1', 'AvgFGM3_Team2', 'AvgFGA3_Team1', 'AvgFGA3_Team2',
       'AvgFTM_Team1', 'AvgFTM_Team2', 'AvgFTA_Team1', 'AvgFTA_Team2',
       'AvgOR_Team1', 'AvgOR_Team2', 'AvgDR_Team1', 'AvgDR_Team2',
       'AvgAst_Team1', 'AvgAst_Team2', 'AvgTO_Team1', 'AvgTO_Team2',
       'AvgStl_Team1', 'AvgStl_Team2', 'AvgBlk_Team1', 'AvgBlk_Team2',
       'AvgPF_Team1', 'AvgPF_Team2', 'FG%_Team1', 'FG%_Team2', 'FG3%_Team1',
       'FG3%_Team2'],
      dtype='object')

In [43]:
all_stats_submit_df = pd.DataFrame(pd.read_csv('../resources/transformed_data/all_stats_submit.csv'))
submit_df = all_stats_submit_df[selected_features.columns]
submit_df.shape

(11390, 14)

In [44]:
format_submit(pipe, submit_df)

Unnamed: 0_level_0,Pred
ID,Unnamed: 1_level_1
2015_1107_1112,0.224
2015_1107_1116,0.204
2015_1107_1124,0.233
2015_1107_1125,0.320
2015_1107_1129,0.207
...,...
2019_1449_1459,0.665
2019_1449_1463,0.731
2019_1458_1459,0.719
2019_1458_1463,0.776


In [19]:
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc9093e7590>),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc9093e77d0>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.001, hidden_layer_sizes=(16, 16),
                               learning_rate_init=0.01, random_state=1,
                               solver='sgd'))])

In [None]:
# import pickle
# filename = '../models/mlp_model_1.sav'
# pickle.dump(pipe, open(filename, 'wb'))

## Gaussian testing

In [5]:
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.pipeline import make_pipeline
# from sklearn.model_selection import train_test_split
# from sklearn import metrics
# import os

# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# X_train, X_test, y_train, y_test = train_test_split(selected_features, 
#                                                     target, 
#                                                     random_state=42)

# # clf = GaussianProcessClassifier(1.0 * RBF(1.0))
# clf = GaussianProcessClassifier(n_jobs=-1)
# pipe = make_pipeline(ct, clf)
# pipe.fit(X_train, y_train)

# pipe.predict_proba(X_test)
# preds = pipe.predict(X_test)
# train_score = pipe.score(X_train, y_train)
# test_score = pipe.score(X_test, y_test)
# loss_score = metrics.log_loss(y_test, preds)*.1

# print(f'Train score: {test_score}')
# print(f'Test score: {test_score}')
# print(f'Log loss: {loss_score}')

Best log loss: Log loss: 0.8206507730540533

## Random forest testing

In [6]:
# from sklearn.model_selection import GridSearchCV, LeaveOneOut
# from sklearn.pipeline import make_pipeline
# from sklearn.datasets import make_classification
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(selected_features, 
#                                                     target, 
#                                                     random_state=42)

# rfc = RandomForestClassifier(n_jobs=-1, max_features= 'sqrt', n_estimators=50, oob_score = True) 

# pipe = make_pipeline(ct, rfc)

# param_grid = {
#     'randomforestclassifier__n_estimators': [100, 500, 1000],
#     'randomforestclassifier__max_features': ['log2', 'sqrt','auto'],
# }

# CV_rfc = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1000)
# CV_rfc.fit(X_train, y_train)
# print(CV_rfc.best_params_)

In [7]:
# rfc = RandomForestClassifier(n_jobs=-1, max_features='log2', n_estimators=500, oob_score = True) 

# pipe = make_pipeline(ct, rfc)
# pipe.fit(X_train, y_train)

# pipe.predict_proba(X_test)
# preds = pipe.predict(X_test)
# train_score = pipe.score(X_train, y_train)
# test_score = pipe.score(X_test, y_test)
# loss_score = metrics.log_loss(y_test, preds)*.1

# print(f'Train score: {test_score}')
# print(f'Test score: {test_score}')
# print(f'Log loss: {loss_score}')