In [1]:
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
raw_data_dir = 'data/raw/DataFiles/'
seeds = pd.read_csv(raw_data_dir + 'NCAATourneySeeds.csv')
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [3]:
tour = pd.read_csv(raw_data_dir + 'NCAATourneyCompactResults.csv')
tour.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [4]:
def seed_to_int(seed_str):
    return int(seed_str[1:3])
seeds['seed_int'] = seeds.Seed.apply(seed_to_int)
seeds.drop(labels=['Seed'], inplace=True, axis=1) # This is the string label
seeds.head()

Unnamed: 0,Season,TeamID,seed_int
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [5]:
tour.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True, axis=1)
tour.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325


In [6]:
win_seeds = seeds.rename(columns={'TeamID':'WTeamID', 'seed_int':'WSeed'})
loss_seeds = seeds.rename(columns={'TeamID':'LTeamID', 'seed_int':'LSeed'})
temp = tour.merge(win_seeds, how='left', on=['Season', 'WTeamID'])
tour = temp.merge(loss_seeds, how='left', on=['Season', 'LTeamID'])
tour['diff'] = tour.WSeed - tour.LSeed
tour.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,diff
0,1985,1116,1234,9,8,1
1,1985,1120,1345,11,6,5
2,1985,1207,1250,1,16,-15
3,1985,1229,1425,9,8,1
4,1985,1242,1325,3,14,-11


In [7]:
wins = pd.DataFrame()
wins['diff'] = tour['diff']
wins['result'] = 1

losses = pd.DataFrame()
losses['diff'] = -tour['diff']
losses['result'] = 0

seeds_train = pd.concat((wins, losses))
seeds_train.head()

Unnamed: 0,diff,result
0,1,1
1,5,1
2,-15,1
3,1,1
4,-11,1


In [8]:
X_train = seeds_train['diff'].reshape(-1,1)
y_train = seeds_train.result.values
X_train, y_train = shuffle(X_train, y_train)

## Train a Basic LR model

In [9]:
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_['C']))

Best log_loss: -0.5534, with best C: 0.01


###### Training XGBoost Model using the same data

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
gbm = GradientBoostingClassifier()
gbm_params = {
            'n_estimators': [100, 150, 200, 300], 
            'max_depth': [2, 3, 5, 7], 
            'min_samples_leaf': [5, 8, 10],
            'max_features': ['auto']
         }
clf = GridSearchCV(gbm, gbm_params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
print('Best log_loss: {:.4} with params {}'.format(clf.best_score_, clf.best_params_))

Best log_loss: -0.5527 with params {'max_features': 'auto', 'n_estimators': 100, 'max_depth': 2, 'min_samples_leaf': 8}


#### Creating Baseline Submission

In [3]:
test = pd.read_csv('data/raw/SampleSubmissionStage1.csv')
test.head()
#test = test.drop('Pred', axis=1)

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.5
1,2014_1107_1112,0.5
2,2014_1107_1113,0.5
3,2014_1107_1124,0.5
4,2014_1107_1140,0.5


In [11]:
def get_year_t1_t2(ID):
    return (int(x) for x in ID.split('_'))

In [12]:
X_test = np.zeros(shape=(len(test), 1))
for ii, row in test.iterrows():
    year, t1, t2 = get_year_t1_t2(row.ID)
    t1_seed = seeds[(seeds.TeamID == t1) & (seeds.Season == year)].seed_int.values[0]
    t2_seed = seeds[(seeds.TeamID == t2) & (seeds.Season == year)].seed_int.values[0]
    diff_seed = t1_seed - t2_seed
    X_test[ii, 0] = diff_seed

In [20]:
preds = clf.predict_proba(X_test)[:,1]
clipped_preds = np.clip(preds, 0.05, 0.95)
test['Pred'] = clipped_preds
test.head()

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.458264
1,2014_1107_1112,0.075159
2,2014_1107_1113,0.268154
3,2014_1107_1124,0.157979
4,2014_1107_1140,0.268154


In [21]:
test.to_csv('baseline.csv', index=False)