# NCAA ML contest - Model testing

In [1]:
# libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
# Suppress every Python warning for the rest of the session. This keeps cell output
# short, but it also hides any warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

## Data

In [2]:
# Load the transformed per-game stats table and discard the WTeamID column straight away.
all_stats_df = (
    pd.read_csv('../resources/transformed_data/all_stats.csv')
    .drop(columns=['WTeamID'])
)
all_stats_df.columns

Index(['Season', 'DayNum', 'Team1', 'Team2', 'Tourney', 'WLoc', 'ScoreDiff',
       'Team1Seed', 'Team2Seed', 'Team1FirstYear', 'Team1LastYear',
       'Team2FirstYear', 'Team2LastYear', 'WTeam', 'Team1RankMean',
       'Team2RankMean', 'WinCount_Team1', 'GameCount_Team1', 'AvgScore_Team1',
       'Win%_Team1', 'WinCount_Team2', 'GameCount_Team2', 'AvgScore_Team2',
       'Win%_Team2', 'LoseCount_Team1', 'AvgFGM_Team1', 'LoseCount_Team2',
       'AvgFGM_Team2', 'AvgFGA_Team1', 'AvgFGA_Team2', 'AvgFGM3_Team1',
       'AvgFGM3_Team2', 'AvgFGA3_Team1', 'AvgFGA3_Team2', 'AvgFTM_Team1',
       'AvgFTM_Team2', 'AvgFTA_Team1', 'AvgFTA_Team2', 'AvgOR_Team1',
       'AvgOR_Team2', 'AvgDR_Team1', 'AvgDR_Team2', 'AvgAst_Team1',
       'AvgAst_Team2', 'AvgTO_Team1', 'AvgTO_Team2', 'AvgStl_Team1',
       'AvgStl_Team2', 'AvgBlk_Team1', 'AvgBlk_Team2', 'AvgPF_Team1',
       'AvgPF_Team2', 'FG%_Team1', 'FG%_Team2', 'FG3%_Team1', 'FG3%_Team2'],
      dtype='object')

## Feature selection

In [3]:
# Columns kept for modelling. Comment a line in or out to toggle a feature.
# 'WTeam' has to stay in the list because it is split off as the target below.
feature_columns = [
    'Season',
    # 'DayNum',
    'Team1', 'Team2',
    # 'Tourney',
    'WLoc',
    # 'ScoreDiff',
    'Team1RankMean', 'Team2RankMean',
    # 'Team1Seed', 'Team2Seed',
    'Team1FirstYear', 'Team2FirstYear',
    # 'Team1LastYear', 'Team2LastYear',
    'WinCount_Team1', 'WinCount_Team2',
    # 'LoseCount_Team1', 'LoseCount_Team2',
    'GameCount_Team1', 'GameCount_Team2',
    # 'Win%_Team1', 'Win%_Team2',
    # 'AvgScore_Team1', 'AvgScore_Team2',
    # 'AvgFGM_Team1', 'AvgFGM_Team2',
    # 'AvgFGA_Team1', 'AvgFGA_Team2',
    # 'AvgFGM3_Team1', 'AvgFGM3_Team2',
    # 'AvgFGA3_Team1', 'AvgFGA3_Team2',
    # 'AvgFTM_Team1', 'AvgFTM_Team2',
    # 'AvgFTA_Team1', 'AvgFTA_Team2',
    # 'AvgOR_Team1', 'AvgOR_Team2',
    # 'AvgDR_Team1', 'AvgDR_Team2',
    # 'AvgAst_Team1', 'AvgAst_Team2',
    # 'AvgTO_Team1', 'AvgTO_Team2',
    # 'AvgStl_Team1', 'AvgStl_Team2',
    # 'AvgBlk_Team1', 'AvgBlk_Team2',
    # 'AvgPF_Team1', 'AvgPF_Team2',
    # 'FG%_Team1', 'FG%_Team2',
    # 'FG3%_Team1', 'FG3%_Team2',
    'WTeam',
]
features_df = all_stats_df[feature_columns]

# Keep only rows with no missing values.
features_df = features_df.dropna(how='any')

# Drop games where either team's mean rank is 500 or more (overcorrected ranks).
rank_in_range = (features_df['Team1RankMean'] < 500) & (features_df['Team2RankMean'] < 500)
features_df = features_df.loc[rank_in_range]

# Optional: shuffle and cap the dataset (for time-consuming model training).
# features_df = features_df.sample(frac=1).reset_index(drop=True)[:50000]

# Optional: change column types to test the one-hot encoder and scaler.
# features_df = features_df.astype({'Season': 'str', 'Team1': 'str', 'Team2': 'str'})

# Split off the target. pop() removes 'WTeam' from features_df in place, so
# selected_features is that same frame without the target column.
target = features_df.pop('WTeam')
selected_features = features_df
print(selected_features.shape)
print(target.shape)

(97049, 12)
(97049,)


## Feature engineering

In [4]:
# One hot encoder and scaler for pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Route columns by dtype: numeric columns are standardised, object columns are one-hot encoded.
numeric_columns = make_column_selector(dtype_include=np.number)
object_columns = make_column_selector(dtype_include=object)

ct = make_column_transformer(
    (StandardScaler(), numeric_columns),
    (OneHotEncoder(handle_unknown='ignore'), object_columns),
)

# Fit on the full feature frame and display the transformed matrix as a quick check.
ct.fit_transform(selected_features)

array([[-1.70175732, -1.4099379 , -0.19242587, ...,  1.40594384,
        -0.71502874,  0.03298801],
       [-1.70175732,  0.5441819 ,  0.56725334, ...,  1.40594384,
        -0.33564018, -0.34534499],
       [-1.70175732,  0.47439191,  1.08149773, ..., -0.05430278,
        -0.71502874,  0.03298801],
       ...,
       [ 0.3616575 , -0.72366964, -1.73515903, ...,  2.70394083,
         1.56130266,  1.54632002],
       [ 0.3616575 ,  0.2417586 ,  1.32693255, ...,  1.73044309,
         1.56130266,  1.16798702],
       [ 0.3616575 , -0.72366964, -1.15079041, ...,  1.40594384,
         1.56130266,  1.54632002]])

## MLP testing

#### Grid search

In [32]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.pipeline import make_pipeline
# from sklearn.model_selection import GridSearchCV, LeaveOneOut, RandomizedSearchCV
# from sklearn import metrics

# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# mlc = MLPClassifier(activation = 'relu', random_state=1, nesterovs_momentum=True)
# loo = LeaveOneOut()
# pipe = make_pipeline(ct, mlc)

# params = {
#           "mlpclassifier__hidden_layer_sizes":[(168,),(126,),(498,),(166,)],
#           "mlpclassifier__solver" : ('sgd','adam'), 
#           "mlpclassifier__alpha": [0.001,0.0001],
#           "mlpclassifier__learning_rate_init":[0.005,0.001]
#          }

# clf = RandomizedSearchCV(pipe, params, n_jobs=-1, verbose=3)

# X_train, X_test, y_train, y_test = train_test_split(selected_features, 
#                                                     target, stratify=target, 
#                                                     random_state=42)

# clf.fit(X_train, y_train)

# model = clf.best_estimator_
# print("the best model and parameters are the following: {} ".format(model))

Best model: 

MLPClassifier(alpha=0.001, hidden_layer_sizes=(168,),
              learning_rate_init=0.005, random_state=1)

MLPClassifier(alpha=0.001, hidden_layer_sizes=(168,), random_state=1, solver='sgd')
 
Test score: 0.7595148169668797

Log loss: 0.8306158860092259

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn import metrics

# Hold out a test split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(selected_features,
                                                    target,
                                                    random_state=42)

mlc = MLPClassifier(alpha=0.01, hidden_layer_sizes=(16,16),
                    random_state=1,
                    solver='sgd',
#                     learning_rate='invscaling',
#                     learning_rate_init=0.01,
#                     power_t=.1,
#                     max_iter=200,
                    nesterovs_momentum=True,
#                     batch_size=75,
                    verbose=True)

# mlc = MLPClassifier(alpha=0.001, hidden_layer_sizes=(14,14),
#                     random_state=1, solver='sgd', learning_rate_init=0.01,)


# Preprocessing and classifier are fitted together on the training split only.
pipe = make_pipeline(ct, mlc)

pipe.fit(X_train, y_train)

# Hard labels feed the confusion matrix and classification report;
# class probabilities feed the log loss.
preds = pipe.predict(X_test)
probs = pipe.predict_proba(X_test)
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
# Log loss must be computed from predicted probabilities. Passing 0/1 labels
# treats every prediction as fully confident, so each miss gets the maximum
# (clipped) penalty and the metric stops reflecting probability quality.
# `labels` pins the column order of `probs` to the fitted classes.
loss_score = metrics.log_loss(y_test, probs, labels=pipe.classes_)

print('\n-------------------')
print(f'Train score: {train_score}')
print(f'Test score: {test_score}')
print(f'Log loss: {loss_score}')
print('\n-------------------\n')
print(metrics.confusion_matrix(y_test,preds))
print('\n-------------------\n')
print(metrics.classification_report(y_test,preds))

Iteration 1, loss = 0.61217232
Iteration 2, loss = 0.53085917
Iteration 3, loss = 0.51382161
Iteration 4, loss = 0.50918211
Iteration 5, loss = 0.50653723
Iteration 6, loss = 0.50464102
Iteration 7, loss = 0.50318430
Iteration 8, loss = 0.50200659
Iteration 9, loss = 0.50102367
Iteration 10, loss = 0.50018767
Iteration 11, loss = 0.49947350
Iteration 12, loss = 0.49885607
Iteration 13, loss = 0.49832531
Iteration 14, loss = 0.49783135
Iteration 15, loss = 0.49740570
Iteration 16, loss = 0.49705519
Iteration 17, loss = 0.49669926
Iteration 18, loss = 0.49638214
Iteration 19, loss = 0.49609890
Iteration 20, loss = 0.49580649
Iteration 21, loss = 0.49558074
Iteration 22, loss = 0.49537413
Iteration 23, loss = 0.49516708
Iteration 24, loss = 0.49496091
Iteration 25, loss = 0.49477736
Iteration 26, loss = 0.49458150
Iteration 27, loss = 0.49443392
Iteration 28, loss = 0.49428567
Iteration 29, loss = 0.49412690
Iteration 30, loss = 0.49399714
Iteration 31, loss = 0.49386135
Iteration 32, los

## Format submission

In [13]:
def format_submit(model, df, output_path='../output/ml_march_madness_submission.csv'):
    '''Create the submission table from a fitted model and optionally write it to CSV.

    Parameters
    ----------
    model : object exposing ``predict(df)`` and ``predict_proba(df)``.
    df : DataFrame holding at least 'Season', 'Team1' and 'Team2', plus whatever
        columns ``model`` needs for prediction.
    output_path : CSV destination. Defaults to the original hard-coded location.
        Missing parent directories are created. Pass ``None`` to skip writing.

    Returns
    -------
    DataFrame indexed by 'ID' ("<Season>_<Team1>_<Team2>") with columns
    'Guess' (predicted label) and 'Pred' (probability rounded to 3 decimals).
    '''
    preds = model.predict(df)
    # Column 1 of predict_proba, i.e. the second class reported by the model.
    prob = np.asarray(model.predict_proba(df))[:, 1]

    predict_df = pd.DataFrame({'Season': df['Season'],
                               'Team1': df['Team1'],
                               'Team2': df['Team2'],
                               'Guess': preds,
                               'Pred': prob}).round(3)

    predict_df['ID'] = (predict_df['Season'].astype(str) + '_'
                        + predict_df['Team1'].astype(str) + '_'
                        + predict_df['Team2'].astype(str))

    submit_df = predict_df[['ID', 'Guess', 'Pred']].set_index('ID')

    if output_path is not None:
        # to_csv does not create directories, so make sure the target folder exists.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        submit_df.to_csv(output_path)

    return submit_df

In [14]:
# all_stats_submit_df.columns

In [15]:
# Load the submission matchups and keep the same columns, in the same order, as selected_features.
all_stats_submit_df = pd.read_csv('../resources/transformed_data/all_stats_submit.csv')
submit_df = all_stats_submit_df.loc[:, selected_features.columns]
submit_df.shape

(2278, 12)

In [16]:
# Score every submission matchup with the fitted pipeline; format_submit also writes the CSV.
prediction_df = format_submit(model=pipe, df=submit_df)
prediction_df

Unnamed: 0_level_0,Guess,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
2021_1101_1104,0,0.166
2021_1101_1111,1,0.782
2021_1101_1116,0,0.179
2021_1101_1124,0,0.105
2021_1101_1140,0,0.198
...,...,...
2021_1452_1457,1,0.608
2021_1452_1458,1,0.668
2021_1455_1457,0,0.398
2021_1455_1458,0,0.438


In [11]:
# Display the pipeline's steps and their hyperparameters.
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff382394910>),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff382394d90>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.01, hidden_layer_sizes=(16, 16),
                               random_state=1, solver='sgd', verbose=True))])

In [12]:
# import pickle
# filename = '../models/mlp_model_1.sav'
# pickle.dump(pipe, open(filename, 'wb'))