In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
import sklearn.metrics as metrics
from scipy.stats import norm

In [2]:
'''
'Season', 'T1', 'T2','ScoreDif',
       'T1MFAc', 'T1MFAc3', 'T1MSAc', 'T1MRC', 'T1MAst', 'T1MStl', 'T1MBlk',
       'T1MScoreDif', 'T1VFAc', 'T1VFAc3', 'T1VSAc', 'T1VRC', 'T1Seed',
       'T2MFAc','T2MFAc3', 'T2MSAc', 'T2MRC', 'T2MAst', 'T2MStl', 'T2MBlk',
       'T2MScoreDif', 'T2VFAc', 'T2VFAc3', 'T2VSAc', 'T2VRC', 
       'T2Seed', 'SeedDif'

'''
games = np.genfromtxt('processed_np.csv', delimiter=',')
np.set_printoptions(suppress=True)

In [3]:
# train test split
train = (games[:,0]<2015)
X_train,X_test = games[train,4:], games[~train,4:]
y_train,y_test = games[train,3],games[~train,3]

In [4]:
def log_loss(alg,test_input,test_result):
    preds = alg.predict(test_input)
    # to prob
    test_result[test_result>=0] = 1
    test_result[test_result<0] = 0
    loss = -np.mean(test_result*np.log(preds)+(1-test_result)*np.log(1-preds))
    print("Log_Loss : %f" % loss)

In [61]:
def rmse(alg,test_input,test_result):
    preds = alg.predict(test_input)
    error = np.sqrt(np.mean((test_result*preds)**2))
    print("RMSE : %f" % error)

In [19]:
def modelfit(alg,X,y,useCV= True,cv_folds=5, early_stopping_rounds=50):
    if useCV:
        xgb_param = alg.get_params()
        xgtrain = xgb.DMatrix(X,y)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='rmse', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    alg.fit(X,y,eval_metric='rmse')

In [91]:
# xgboost
import xgboost as xgb
dtrain = xgb.DMatrix(X_train,label = y_train)
dtest = xgb.DMatrix(X_test,label = y_test)
model = xgb.XGBRegressor(colsample_bytree=0.9,
                         gamma=0,
                         learning_rate=0.001,
                         max_depth=2,
                         min_child_weight=1,
                         n_estimators=20,  
                         reg_alpha=1e-05,
                         reg_lambda=0,
                         subsample=0.7,
                         seed=42)
t = time.time()
modelfit(model,X_train,y_train,useCV= True,cv_folds=5, early_stopping_rounds=50)
#model.fit(X_train,y_train)
print("runtime : %f" % (time.time()-t))
rmse(model,X_test,y_test)

runtime : 0.081017
RMSE : 0.513872


In [None]:
feat_imp = pd.Series(model.feature_importances_).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')

In [80]:
# tuning tree param
# try different depth and weight value to find optimal 
from sklearn.model_selection import GridSearchCV
param_test = {'max_depth':range(1,5),
             'min_child_weight':range(1,5)}
gsearch = GridSearchCV(estimator = model,
                      param_grid = param_test, scoring='neg_root_mean_squared_error',n_jobs=4,iid=False, cv=5)
gsearch.fit(X_train,y_train)
gsearch.best_score_,gsearch.best_params_

(-12.633021974907601, {'max_depth': 2, 'min_child_weight': 1})

In [81]:
param_test2 = {'gamma':[i/10.0 for i in range(0,5)]}
gsearch = GridSearchCV(estimator = model,
                      param_grid = param_test2, scoring='neg_root_mean_squared_error',n_jobs=4,iid=False, cv=5)
gsearch.fit(X_train,y_train)
gsearch.best_score_,gsearch.best_params_

(-12.633021974907601, {'gamma': 0.0})

In [84]:
param_test3 = {'colsample_bytree':[i/10.0 for i in range(6,10)],
              'n_estimators': range(16,25)}
gsearch = GridSearchCV(estimator = model,
                      param_grid = param_test3, scoring='neg_root_mean_squared_error',n_jobs=4,iid=False, cv=5)
gsearch.fit(X_train,y_train)
gsearch.best_score_,gsearch.best_params_

(-12.573635988889842, {'colsample_bytree': 0.9, 'n_estimators': 20})

In [85]:
param_test4 = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
              'reg_lambda': [0, 0.001, 0.005, 0.01, 0.05]}
gsearch = GridSearchCV(estimator = model,
                      param_grid = param_test4, scoring='neg_root_mean_squared_error',n_jobs=4,iid=False, cv=5)
gsearch.fit(X_train,y_train)
gsearch.best_score_,gsearch.best_params_

(-12.618871043282931, {'reg_alpha': 1e-05, 'reg_lambda': 0})

In [92]:
## output submission
#read and predict
sub = np.genfromtxt('submission_input.csv', delimiter=',')
sub_preds = model.predict(sub[:,3:])

In [93]:
# write and to_prob
submission = pd.read_csv('C:\\Users\\luciu\\Box Sync\\ncaa-march-madness-2020\\data\\MSampleSubmissionStage1_2020.csv')
mu, std = norm.fit(sub_preds)
sub_prob = norm.cdf(sub_preds,mu,std)
submission['Pred'] = sub_prob
submission.to_csv(r'C:\\Users\\luciu\\Box Sync\\ncaa-march-madness-2020\\code\submission1.csv', index = False)

In [None]:
# neural network

In [None]:
# glmm