In [1]:
import pandas as pd

In [2]:
# assuming ncaa2019 package is installed 
from ncaa2019 import DataSet

In [3]:
ds = DataSet()

default path used C:\Users\Huang\kaggle-ncaa-2019-data


In [4]:
ds.list_raw_keys()

['city',
 'conf',
 'conftour',
 'gamecity',
 'massey',
 't_compact_result',
 't_detail_result',
 't_seed_slot',
 't_seed',
 't_slot',
 'r_compact_result',
 'r_detail_result',
 'season',
 'st_compact_result',
 'st_team',
 'team',
 'coache',
 'team_conf',
 'team_spelling']

# Team Score Difference

In [5]:
regular_bs = ds.get_raw_data('r_compact_result')

In [6]:
regular_bs['DayNum'].max()

132

In [7]:
regular_bs.head().style

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [8]:
# Long-format score margins: every regular-season game contributes two rows,
# one from the winner's point of view (positive margin) and one from the
# loser's point of view (negative margin).
team_score_season_hist = pd.concat(
    [
        regular_bs.assign(
            relative_score=regular_bs[own_score] - regular_bs[opp_score],
            TeamID=regular_bs[own_id],
        ).loc[:, ['Season', 'TeamID', 'relative_score']]
        for own_id, own_score, opp_score in (
            ('WTeamID', 'WScore', 'LScore'),  # winner's perspective
            ('LTeamID', 'LScore', 'WScore'),  # loser's perspective
        )
    ],
    axis=0,
    ignore_index=True,
)
    

In [9]:
team_score_season_avg = team_score_season_hist.groupby(['Season', 'TeamID'])[['relative_score']].mean()
team_score_season_avg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,relative_score
Season,TeamID,Unnamed: 2_level_1
1985,1102,-5.791667
1985,1103,-3.043478
1985,1104,7.8
1985,1106,-3.791667
1985,1108,7.96


In [10]:
team_score_season_avg.columns

Index(['relative_score'], dtype='object')

# Rank Feature

In [11]:
rank_df = ds.get_raw_data('massey')

In [12]:
from scipy import stats

In [13]:
rank_df.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [14]:
# Mean ordinal rank per (Season, TeamID); because only these two keys are
# grouped on, the mean runs over every other row dimension of the table
# (ranking system and ranking day).
team_avg_rank = rank_df.groupby(['Season', 'TeamID'])[['OrdinalRank']].mean()

In [15]:
team_avg_rank.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,OrdinalRank
Season,TeamID,Unnamed: 2_level_1
2003,1102,144.2875
2003,1103,182.205
2003,1104,27.655502
2003,1105,305.3775
2003,1106,243.265


# Labels

In [16]:
# Stack tournament ('T') and regular-season ('R') compact results into one
# frame, tagging each row with the table it came from.
wl_records = pd.concat(
    [
        ds.get_raw_data(raw_key).assign(GameType=game_tag)
        for raw_key, game_tag in (
            ('t_compact_result', 'T'),
            ('r_compact_result', 'R'),
        )
    ],
    axis=0,
    ignore_index=True,
)


In [17]:
# Symmetrize the records: each game appears twice, once with the winner in
# the TeamID1 slot (Winner == 'Team1') and once with the loser in the
# TeamID1 slot (Winner == 'Team2'), so the label is balanced by construction.
wl_records_symmetric = pd.concat(
    [
        (wl_records
         .rename(columns={team1_source: 'TeamID1', team2_source: 'TeamID2'})
         .assign(Winner=winner_tag)
         .loc[:, ['Season', 'GameType', 'TeamID1', 'TeamID2', 'Winner']])
        for team1_source, team2_source, winner_tag in (
            ('WTeamID', 'LTeamID', 'Team1'),
            ('LTeamID', 'WTeamID', 'Team2'),
        )
    ],
    axis=0,
    ignore_index=True,
)

# Feature and Label Data Frame


In [18]:
# Attach each side's season-average score margin to every game row.
# Both joins use the pandas default (inner) join on (team id, Season).
feature_label = (
    wl_records_symmetric
    .merge(
        team_score_season_avg.reset_index().rename(
            columns={'TeamID': 'TeamID1', 'relative_score': 'Team1_season_score'}
        ),
        on=['TeamID1', 'Season'],
    )
    .merge(
        team_score_season_avg.reset_index().rename(
            columns={'TeamID': 'TeamID2', 'relative_score': 'Team2_season_score'}
        ),
        on=['TeamID2', 'Season'],
    )
)

Top 5 rows of the feature and label data frame

In [19]:
feature_label.head().style

Unnamed: 0,Season,GameType,TeamID1,TeamID2,Winner,Team1_season_score,Team2_season_score
0,1985,T,1116,1234,Team1,3.63636,10.4667
1,1985,R,1116,1234,Team2,3.63636,10.4667
2,1985,R,1326,1234,Team2,4.67857,10.4667
3,1985,R,1326,1234,Team2,4.67857,10.4667
4,1985,R,1228,1234,Team1,10.871,10.4667


# Training and Test Data
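We use only each team's season-average score margin (own score minus opponent score) as the feature.
All regular-season results, plus tournament (playoff) results from seasons before 2011, are available for training.
Tournament results from 2011 onward (2011 included) are set aside as test data.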

In [20]:
from sklearn.utils import shuffle

In [21]:
def train_test_df_split(feature_label, test_start_season=2011):
    """Split game rows into a training frame and a held-out test frame.

    Training rows are every regular-season game (``GameType == 'R'``) from
    any season, plus every game from seasons before ``test_start_season``.
    Test rows are tournament games (``GameType == 'T'``) from
    ``test_start_season`` onward.

    Parameters
    ----------
    feature_label : pandas.DataFrame
        Must contain ``Season`` and ``GameType`` columns.
    test_start_season : int, default 2011
        First season whose tournament games are held out for testing.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)``, row subsets of ``feature_label``.
    """
    season = feature_label['Season']
    game_type = feature_label['GameType']
    train_data = feature_label.loc[(season < test_start_season) |
                                   (game_type == 'R'), :]
    test_data = feature_label.loc[(season >= test_start_season) &
                                  (game_type == 'T'), :]
    return train_data, test_data


def train_test_split(feature_label,
                     feature_cols=('Team1_season_score', 'Team2_season_score'),
                     test_start_season=2011):
    """Build numpy feature matrices and 0/1 label vectors for train and test.

    NOTE: despite the shared name, this is unrelated to
    ``sklearn.model_selection.train_test_split``; the split is by season and
    game type (see ``train_test_df_split``), not random.

    Parameters
    ----------
    feature_label : pandas.DataFrame
        Must contain ``Season``, ``GameType``, ``Winner`` and every column
        named in ``feature_cols``.
    feature_cols : sequence of str
        Columns used as model features, in output column order.
    test_start_season : int, default 2011
        Forwarded to ``train_test_df_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Labels are 1 where ``Winner == 'Team1'`` and 0 otherwise.
    """
    train_df, test_df = train_test_df_split(feature_label, test_start_season)
    columns = list(feature_cols)
    X_train = train_df[columns].values
    # Vectorized comparison instead of a per-row Python lambda.
    y_train = (train_df['Winner'] == 'Team1').astype(int).values
    X_test = test_df[columns].values
    y_test = (test_df['Winner'] == 'Team1').astype(int).values
    return X_train, X_test, y_train, y_test

In [22]:
X_train, X_test, y_train, y_test = train_test_split(feature_label)
print(X_train.shape)

(315474, 2)


Preview of the training features and labels

In [23]:
print(f"""
X_train:
{X_train}
y_train
{y_train}
""")



X_train:
[[  3.63636364  10.46666667]
 [  3.63636364  10.46666667]
 [  4.67857143  10.46666667]
 ...
 [-13.71875      0.26666667]
 [-13.71875      0.26666667]
 [-15.26666667   0.26666667]]
y_train
[1 0 0 ... 0 0 0]



# Logistic Regression
We fit a logistic regression and evaluate it on the held-out test data.

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
clf = LogisticRegression(fit_intercept=False)
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [26]:
import numpy as np

In [27]:
def KL_score(y_value, p_predict, eps=1e-15):
    """Mean negative log-likelihood (binary log loss) of predicted probabilities.

    For each sample with label 1 the term is ``-log(p)``; for each sample with
    label 0 it is ``-log(1 - p)``. The sum is divided by ``len(y_value)``.
    Samples whose label is neither 0 nor 1 contribute nothing to the sum but
    still count in the denominator.

    Parameters
    ----------
    y_value : array-like of {0, 1}
        True labels.
    p_predict : array-like of float
        Predicted probability of label 1, aligned with ``y_value``.
    eps : float, default 1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` so that a prediction
        of exactly 0 or 1 yields a large finite penalty instead of inf/nan.

    Returns
    -------
    float
        The average log loss; lower is better (0.693... for constant 0.5).
    """
    # Accept lists as well as arrays; boolean-mask indexing needs ndarrays.
    y_value = np.asarray(y_value)
    p_predict = np.clip(np.asarray(p_predict, dtype=float), eps, 1 - eps)
    log_lik_pos = np.sum(np.log(p_predict[y_value == 1]))
    log_lik_neg = np.sum(np.log(1 - p_predict[y_value == 0]))
    score = -(log_lik_pos + log_lik_neg) / len(y_value)
    return score

In [28]:
def predict_prob(clf, X):
    """Return the second column of ``clf.predict_proba(X)`` as a 1-D array.

    With the 0/1 labels used in this notebook this is presumably the
    predicted probability of label 1 -- verify against ``clf.classes_``.
    """
    return clf.predict_proba(X)[:, 1]

In [29]:
print(
f"""
KL Score for dummy estimator: {KL_score(y_test, np.repeat(0.5, len(y_test)))}
KL Score in training data: {KL_score(y_train, predict_prob(clf, X_train))}
KL Score in testing data: {KL_score(y_test, predict_prob(clf, X_test))}
"""
)


KL Score for dummy estimator: 0.6931471805599453
KL Score in training data: 0.5291835615488899
KL Score in testing data: 0.6194114258739689



# Train only on postseason (tournament) games

In [30]:
feature_label.groupby('GameType').count()

Unnamed: 0_level_0,Season,TeamID1,TeamID2,Winner,Team1_season_score,Team2_season_score
GameType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
R,312178,312178,312178,312178,312178,312178
T,4368,4368,4368,4368,4368,4368


In [31]:
def train_test_df_split(feature_label):
    """Split tournament games only: seasons before 2011 train, 2011 onward test.

    Regular-season rows (``GameType != 'T'``) end up in neither frame.
    """
    tournament = feature_label['GameType'] == 'T'
    before_2011 = feature_label['Season'] < 2011
    from_2011 = feature_label['Season'] >= 2011
    train_data = feature_label.loc[before_2011 & tournament, :]
    test_data = feature_label.loc[from_2011 & tournament, :]
    return train_data, test_data


def train_test_split(feature_label):
    """Return ``X_train, X_test, y_train, y_test`` as numpy arrays.

    Features are the two season score-margin columns; the label is 1 when
    ``Winner == 'Team1'`` and 0 otherwise.
    """
    feature_columns = ['Team1_season_score', 'Team2_season_score']

    def as_arrays(frame):
        features = frame[feature_columns].values
        labels = frame['Winner'].apply(lambda w: 1 if w == 'Team1' else 0).values
        return features, labels

    train_df, test_df = train_test_df_split(feature_label)
    X_train, y_train = as_arrays(train_df)
    X_test, y_test = as_arrays(test_df)
    return X_train, X_test, y_train, y_test

In [32]:
X_train, X_test, y_train, y_test = train_test_split(feature_label)

In [33]:
X_train.shape

(3296, 2)

In [34]:
clf = LogisticRegression(fit_intercept=False)
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [35]:
print(
f"""
KL Score for dummy estimator: {KL_score(y_test, np.repeat(0.5, len(y_test)))}
KL Score in training data: {KL_score(y_train, predict_prob(clf, X_train))}
KL Score in testing data: {KL_score(y_test, predict_prob(clf, X_test))}
"""
)


KL Score for dummy estimator: 0.6931471805599453
KL Score in training data: 0.5973824388937066
KL Score in testing data: 0.6180925730126637



In [36]:
from sklearn.linear_model import LogisticRegressionCV

In [37]:
from sklearn import metrics

In [38]:
metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

In [39]:
clf = LogisticRegressionCV(fit_intercept=False,scoring =  'neg_log_loss')
clf.fit(X_train, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=False, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='lbfgs', tol=0.0001, verbose=0)

In [40]:
print(
f"""
KL Score for dummy estimator: {KL_score(y_test, np.repeat(0.5, len(y_test)))}
KL Score in training data: {KL_score(y_train, predict_prob(clf, X_train))}
KL Score in testing data: {KL_score(y_test, predict_prob(clf, X_test))}
"""
)


KL Score for dummy estimator: 0.6931471805599453
KL Score in training data: 0.5973984453130364
KL Score in testing data: 0.6179979325204425



# Rank + Score

In [41]:
# NOTE(review): this rebuilds the same score-only frame as the earlier
# "Feature and Label Data Frame" cell, and `feature_label` is reassigned
# two cells below before anything reads it, so this cell looks redundant.
feature_label = (
    wl_records_symmetric
    .merge(
        team_score_season_avg.reset_index().rename(
            columns={'TeamID': 'TeamID1', 'relative_score': 'Team1_season_score'}
        ),
        on=['TeamID1', 'Season'],
    )
    .merge(
        team_score_season_avg.reset_index().rename(
            columns={'TeamID': 'TeamID2', 'relative_score': 'Team2_season_score'}
        ),
        on=['TeamID2', 'Season'],
    )
)

In [42]:
# One row per (Season, TeamID) holding both the average score margin and the
# average ordinal rank. The join is inner (the pandas default), so a
# team-season missing from either table is dropped.
score_rank = (
    team_score_season_avg
    .reset_index()
    .merge(team_avg_rank.reset_index(), on=['Season', 'TeamID'])
)

In [43]:
# Attach season score margin and average ordinal rank for both sides of each
# game. Both joins are inner joins on (team id, Season).
feature_label = (
    wl_records_symmetric
    .merge(
        score_rank.rename(columns={
            'TeamID': 'TeamID1',
            'relative_score': 'Team1_season_score',
            'OrdinalRank': 'Team1_OrdinalRank',
        }),
        on=['TeamID1', 'Season'],
    )
    .merge(
        score_rank.rename(columns={
            'TeamID': 'TeamID2',
            'relative_score': 'Team2_season_score',
            'OrdinalRank': 'Team2_OrdinalRank',
        }),
        on=['TeamID2', 'Season'],
    )
)

In [64]:
def train_test_df_split(feature_label):
    """Split rows into training and held-out test frames.

    Training: all regular-season games plus every game before 2011.
    Test: tournament games from 2011 onward.
    """
    season = feature_label['Season']
    game_type = feature_label['GameType']
    in_train = (season < 2011) | (game_type == 'R')
    in_test = (season >= 2011) & (game_type == 'T')
    return feature_label.loc[in_train, :], feature_label.loc[in_test, :]


def train_test_split(feature_label):
    """Return ``X_train, X_test, y_train, y_test`` as numpy arrays.

    Features are both teams' season score margins and average ordinal ranks;
    the label is 1 when ``Winner == 'Team1'`` and 0 otherwise.
    """
    feature_columns = [
        'Team1_season_score',
        'Team2_season_score',
        'Team1_OrdinalRank',
        'Team2_OrdinalRank',
    ]

    def as_arrays(frame):
        features = frame[feature_columns].values
        labels = frame['Winner'].apply(lambda w: 1 if w == 'Team1' else 0).values
        return features, labels

    train_df, test_df = train_test_df_split(feature_label)
    X_train, y_train = as_arrays(train_df)
    X_test, y_test = as_arrays(test_df)
    return X_train, X_test, y_train, y_test

In [65]:
X_train, X_test, y_train, y_test = train_test_split(feature_label)

In [66]:
from sklearn import preprocessing
from sklearn import pipeline

In [67]:
clf = pipeline.Pipeline([('SS', preprocessing.StandardScaler()),
                ('LR', LogisticRegressionCV(fit_intercept=False,scoring =  'neg_log_loss'))])
clf.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('SS', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=False, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='lbfgs', tol=0.0001, verbose=0))])

In [68]:
clf.named_steps['LR'].coef_

array([[ 0.22991145, -0.22981372, -1.33141786,  1.33152004]])

In [69]:
print(
f"""
KL Score for dummy estimator: {KL_score(y_test, np.repeat(0.5, len(y_test)))}
KL Score in training data: {KL_score(y_train, predict_prob(clf, X_train))}
KL Score in testing data: {KL_score(y_test, predict_prob(clf, X_test))}
"""
)


KL Score for dummy estimator: 0.6931471805599453
KL Score in training data: 0.4948252308285958
KL Score in testing data: 0.5736330128092295



In [49]:
def train_test_df_split(feature_label):
    """Split tournament games only: seasons before 2011 train, 2011 onward test.

    Regular-season rows (``GameType != 'T'``) end up in neither frame.

    Parameters
    ----------
    feature_label : pandas.DataFrame
        Must contain ``Season`` and ``GameType`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)``.
    """
    # Combine boolean masks with `&`, not arithmetic `*`. Multiplying boolean
    # Series happens to select the same rows, but on large frames pandas
    # warns that `*` is unsupported for bool dtype and falls back to a
    # slower path.
    train_data = feature_label.loc[(feature_label['Season']<2011) &
                                   (feature_label['GameType']=='T'), :]
    test_data = feature_label.loc[(feature_label['Season']>=2011) &
                                  (feature_label['GameType']=='T'), :]
    return train_data, test_data


def train_test_split(feature_label):
    """Return ``X_train, X_test, y_train, y_test`` as numpy arrays.

    Features are both teams' season score margins and average ordinal ranks;
    the label is 1 when ``Winner == 'Team1'`` and 0 otherwise.
    """
    train_df, test_df = train_test_df_split(feature_label)
    X_train = train_df[['Team1_season_score','Team2_season_score','Team1_OrdinalRank','Team2_OrdinalRank']].values
    y_train = train_df['Winner'].apply(lambda x: 1 if x=='Team1' else 0).values
    X_test = test_df[['Team1_season_score','Team2_season_score','Team1_OrdinalRank','Team2_OrdinalRank']].values
    y_test = test_df['Winner'].apply(lambda x: 1 if x=='Team1' else 0).values    
    return X_train, X_test, y_train, y_test

In [50]:
X_train, X_test, y_train, y_test = train_test_split(feature_label)

  .format(op=op_str, alt_op=unsupported[op_str]))


In [61]:
clf = pipeline.Pipeline([('SS', preprocessing.StandardScaler()),
                ('LR', LogisticRegressionCV(fit_intercept=False,scoring =  'neg_log_loss'))])
clf.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('SS', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=False, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='lbfgs', tol=0.0001, verbose=0))])

In [63]:
print(
f"""
KL Score for dummy estimator: {KL_score(y_test, np.repeat(0.5, len(y_test)))}
KL Score in training data: {KL_score(y_train, predict_prob(clf, X_train))}
KL Score in testing data: {KL_score(y_test, predict_prob(clf, X_test))}
"""
)


KL Score for dummy estimator: 0.6931471805599453
KL Score in training data: 0.49633304853756893
KL Score in testing data: 0.5769814327460678



# Rank Only

In [53]:
def train_test_df_split(feature_label):
    """Split rows into training and held-out test frames.

    Training: all regular-season games plus every game before 2011.
    Test: tournament games from 2011 onward.
    """
    season = feature_label['Season']
    game_type = feature_label['GameType']
    in_train = (season < 2011) | (game_type == 'R')
    in_test = (season >= 2011) & (game_type == 'T')
    return feature_label.loc[in_train, :], feature_label.loc[in_test, :]


def train_test_split(feature_label):
    """Return ``X_train, X_test, y_train, y_test`` as numpy arrays.

    Features are the two average-ordinal-rank columns only; the label is 1
    when ``Winner == 'Team1'`` and 0 otherwise.
    """
    feature_columns = ['Team1_OrdinalRank', 'Team2_OrdinalRank']

    def as_arrays(frame):
        features = frame[feature_columns].values
        labels = frame['Winner'].apply(lambda w: 1 if w == 'Team1' else 0).values
        return features, labels

    train_df, test_df = train_test_df_split(feature_label)
    X_train, y_train = as_arrays(train_df)
    X_test, y_test = as_arrays(test_df)
    return X_train, X_test, y_train, y_test

In [54]:
X_train, X_test, y_train, y_test = train_test_split(feature_label)

In [55]:
train_data, test_data = train_test_df_split(feature_label)

In [56]:
train_data.corr()

Unnamed: 0,Season,TeamID1,TeamID2,Team1_season_score,Team1_OrdinalRank,Team2_season_score,Team2_OrdinalRank
Season,1.0,-8.3e-05,-8.3e-05,-0.00796,0.047904,-0.00796,0.047904
TeamID1,-8.3e-05,1.0,0.004305,0.066632,-0.092424,0.028389,-0.047744
TeamID2,-8.3e-05,0.004305,1.0,0.028389,-0.047744,0.066632,-0.092424
Team1_season_score,-0.00796,0.066632,0.028389,1.0,-0.871584,0.110839,-0.216249
Team1_OrdinalRank,0.047904,-0.092424,-0.047744,-0.871584,1.0,-0.216249,0.381897
Team2_season_score,-0.00796,0.028389,0.066632,0.110839,-0.216249,1.0,-0.871584
Team2_OrdinalRank,0.047904,-0.047744,-0.092424,-0.216249,0.381897,-0.871584,1.0


In [57]:
X_train.shape

(165106, 2)

In [58]:
clf = LogisticRegressionCV(fit_intercept=False,scoring =  'neg_log_loss')
clf.fit(X_train, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=False, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='lbfgs', tol=0.0001, verbose=0)

In [70]:
# NOTE(review): the output recorded for this cell is digit-for-digit identical
# to the "Rank + Score" results, and its execution count directly follows that
# section's final cell. It was most likely run against the earlier pipeline
# and 4-feature arrays rather than the rank-only model fitted just above, so
# re-run it in order before trusting the numbers.
# A stray backtick that was printed at the start of the message is removed.
print(
f"""
KL Score for dummy estimator: {KL_score(y_test, np.repeat(0.5, len(y_test)))}
KL Score in training data: {KL_score(y_train, predict_prob(clf, X_train))}
KL Score in testing data: {KL_score(y_test, predict_prob(clf, X_test))}
"""
)

`
KL Score for dummy estimator: 0.6931471805599453
KL Score in training data: 0.4948252308285958
KL Score in testing data: 0.5736330128092295



In [71]:
def train_test_df_split(feature_label):
    """Split tournament games only: seasons before 2011 train, 2011 onward test.

    Regular-season rows (``GameType != 'T'``) end up in neither frame.
    """
    tournament = feature_label['GameType'] == 'T'
    before_2011 = feature_label['Season'] < 2011
    from_2011 = feature_label['Season'] >= 2011
    train_data = feature_label.loc[before_2011 & tournament, :]
    test_data = feature_label.loc[from_2011 & tournament, :]
    return train_data, test_data


def train_test_split(feature_label):
    """Return ``X_train, X_test, y_train, y_test`` as numpy arrays.

    Features are the two average-ordinal-rank columns only; the label is 1
    when ``Winner == 'Team1'`` and 0 otherwise.
    """
    feature_columns = ['Team1_OrdinalRank', 'Team2_OrdinalRank']

    def as_arrays(frame):
        features = frame[feature_columns].values
        labels = frame['Winner'].apply(lambda w: 1 if w == 'Team1' else 0).values
        return features, labels

    train_df, test_df = train_test_df_split(feature_label)
    X_train, y_train = as_arrays(train_df)
    X_test, y_test = as_arrays(test_df)
    return X_train, X_test, y_train, y_test

In [72]:
X_train, X_test, y_train, y_test = train_test_split(feature_label)

In [73]:
X_train

array([[241.37      , 236.2175    ],
       [241.37      ,   9.41627907],
       [ 31.08726415,   9.41627907],
       ...,
       [  3.95454545, 146.05320814],
       [ 38.87575758, 113.44444444],
       [ 16.92898551, 109.01095462]])

In [74]:
X_train.shape

(1024, 2)

In [75]:
clf = LogisticRegressionCV(fit_intercept=False,scoring =  'neg_log_loss')
clf.fit(X_train, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=False, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='lbfgs', tol=0.0001, verbose=0)

In [76]:
# Report log loss for a constant-0.5 baseline and for the fitted model on the
# current train/test arrays. A stray backtick that was printed at the start of
# the message is removed.
print(
f"""
KL Score for dummy estimator: {KL_score(y_test, np.repeat(0.5, len(y_test)))}
KL Score in training data: {KL_score(y_train, predict_prob(clf, X_train))}
KL Score in testing data: {KL_score(y_test, predict_prob(clf, X_test))}
"""
)

`
KL Score for dummy estimator: 0.6931471805599453
KL Score in training data: 0.5309060958025951
KL Score in testing data: 0.5855290376895113



## Observation
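Rank-only performance appears to be about the same as combining the score difference with rank, and it is better than using the score difference alone. Note, however, that the full-data rank-only scores printed above are digit-for-digit identical to the Rank + Score results, so that cell should be re-run against the rank-only model before relying on this comparison. We now check the correlation between rank and score difference, which turns out to be very strong (about -0.87).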

In [77]:
score_rank[['relative_score','OrdinalRank']].corr()

Unnamed: 0,relative_score,OrdinalRank
relative_score,1.0,-0.86982
OrdinalRank,-0.86982,1.0


In [78]:
score_rank[['relative_score','OrdinalRank']].corr('spearman')

Unnamed: 0,relative_score,OrdinalRank
relative_score,1.0,-0.88029
OrdinalRank,-0.88029,1.0
