In [1]:
import pandas as pd

In [2]:
# assuming ncaa2019 package is installed 
from ncaa2019 import DataSet

In [3]:
# Build the data accessor with no arguments; the cell output reports the
# default data path it falls back to.
ds = DataSet()

default path used C:\Users\Huang\kaggle-ncaa-2019-data


In [4]:
# Show the available raw-table keys (the same strings are passed to
# ds.get_raw_data() in later cells).
ds.list_raw_keys()

['city',
 'conf',
 'conftour',
 'gamecity',
 'massey',
 't_compact_result',
 't_detail_result',
 't_seed_slot',
 't_seed',
 't_slot',
 'r_compact_result',
 'r_detail_result',
 'season',
 'st_compact_result',
 'st_team',
 'team',
 'coache',
 'team_conf',
 'team_spelling']

# Team Score Difference

In [5]:
# Load the table stored under the 'r_compact_result' key
# (presumably the regular-season compact results; it is tagged GameType='R' later).
regular_bs = ds.get_raw_data('r_compact_result')

In [6]:
# Largest DayNum value present in the table.
regular_bs['DayNum'].max()

132

In [7]:
# Preview the first five rows, rendered through the pandas Styler.
regular_bs.head().style

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [8]:
# Long-format score margins: every game contributes two rows, one per team.
# The winner's row carries the positive margin (WScore - LScore) and the
# loser's row carries the negated margin (LScore - WScore).
team_score_season_hist = pd.concat(
    [
        # Winner's point of view.
        pd.DataFrame({
            'Season': regular_bs['Season'],
            'TeamID': regular_bs['WTeamID'],
            'relative_score': regular_bs['WScore'] - regular_bs['LScore'],
        }),
        # Loser's point of view.
        pd.DataFrame({
            'Season': regular_bs['Season'],
            'TeamID': regular_bs['LTeamID'],
            'relative_score': regular_bs['LScore'] - regular_bs['WScore'],
        }),
    ],
    axis=0,
    ignore_index=True,
)
    

In [9]:
# Mean score margin per (Season, TeamID). The double brackets keep the result
# a one-column DataFrame indexed by (Season, TeamID) rather than a Series.
team_score_season_avg = (
    team_score_season_hist
    .groupby(['Season', 'TeamID'])[['relative_score']]
    .mean()
)
team_score_season_avg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,relative_score
Season,TeamID,Unnamed: 2_level_1
1985,1102,-5.791667
1985,1103,-3.043478
1985,1104,7.8
1985,1106,-3.791667
1985,1108,7.96


In [10]:
# Check which columns the aggregated frame carries (Season and TeamID live in the index).
team_score_season_avg.columns

Index(['relative_score'], dtype='object')

# Rank Feature

In [11]:
# Load the table stored under the 'massey' key; the preview in a later cell shows
# one OrdinalRank per (Season, RankingDayNum, SystemName, TeamID) row.
rank_df = ds.get_raw_data('massey')

In [12]:
from scipy import stats

In [13]:
# Preview the first five rows of the ranking table.
rank_df.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [14]:
# Average OrdinalRank per (Season, TeamID). Only Season and TeamID are grouping
# keys, so rows from every SystemName and RankingDayNum are pooled together.
# NOTE(review): team_avg_rank is not referenced by any later cell visible here.
team_avg_rank = rank_df.groupby(['Season', 'TeamID'])[['OrdinalRank']].mean()

In [15]:
# Preview the first five (Season, TeamID) average ranks.
team_avg_rank.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,OrdinalRank
Season,TeamID,Unnamed: 2_level_1
2003,1102,144.2875
2003,1103,182.205
2003,1104,27.655502
2003,1105,305.3775
2003,1106,243.265


# Labels

In [16]:
# Stack the 't_compact_result' and 'r_compact_result' tables into one frame,
# tagging each row with the table it came from ('T' or 'R') in GameType.
wl_records = pd.concat(
    [
        ds.get_raw_data(raw_key).assign(GameType=game_type)
        for raw_key, game_type in (
            ('t_compact_result', 'T'),
            ('r_compact_result', 'R'),
        )
    ],
    axis=0,
    ignore_index=True,
)


In [17]:
# Emit every game twice, once per team ordering, so the label is balanced:
#   * winner listed as TeamID1 -> Winner == 'Team1'
#   * loser  listed as TeamID1 -> Winner == 'Team2'
wl_records_symmetric = pd.concat(
    [
        (wl_records
         .rename(columns={first_col: 'TeamID1', second_col: 'TeamID2'})
         .assign(Winner=winner_tag)
         .loc[:, ['Season', 'GameType', 'TeamID1', 'TeamID2', 'Winner']])
        for first_col, second_col, winner_tag in (
            ('WTeamID', 'LTeamID', 'Team1'),
            ('LTeamID', 'WTeamID', 'Team2'),
        )
    ],
    axis=0,
    ignore_index=True,
)

# Feature and Label Data Frame


In [18]:
# Attach each side's season-average score margin to every game record.
# The per-(Season, TeamID) averages are joined twice: first keyed on TeamID1,
# then on TeamID2. Both joins use the default merge behaviour (inner join),
# so a game row is kept only if both teams have an average for that season.
feature_label = (
    wl_records_symmetric
    .merge(
        team_score_season_avg
        .reset_index()
        .rename(columns={'TeamID': 'TeamID1',
                         'relative_score': 'Team1_season_score'}),
        on=['TeamID1', 'Season'],
    )
    .merge(
        team_score_season_avg
        .reset_index()
        .rename(columns={'TeamID': 'TeamID2',
                         'relative_score': 'Team2_season_score'}),
        on=['TeamID2', 'Season'],
    )
)

Top 5 rows of the feature and label data frame

In [19]:
# Preview the first five rows, rendered through the pandas Styler.
feature_label.head().style

Unnamed: 0,Season,GameType,TeamID1,TeamID2,Winner,Team1_season_score,Team2_season_score
0,1985,T,1116,1234,Team1,3.63636,10.4667
1,1985,R,1116,1234,Team2,3.63636,10.4667
2,1985,R,1326,1234,Team2,4.67857,10.4667
3,1985,R,1326,1234,Team2,4.67857,10.4667
4,1985,R,1228,1234,Team1,10.871,10.4667


# Training and Test Data
We only use each team's season-average score difference (its mean points margin over the season's regular games) as the feature.
All regular-season results, plus playoff games before 2011, are available for training.
Playoff games from 2011 onward (2011 included) are set aside as test data.

In [20]:
from sklearn.utils import shuffle

In [21]:
def train_test_df_split(feature_label, test_start_season=2011):
    """Split the feature/label frame into training and test rows.

    Parameters
    ----------
    feature_label : pandas.DataFrame
        Must contain the columns 'Season' and 'GameType'.
    test_start_season : int, default 2011
        First season whose GameType == 'T' rows are held out for testing.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        train_data holds rows with Season < test_start_season, plus every
        row with GameType == 'R' regardless of season.
        test_data holds rows with Season >= test_start_season and
        GameType == 'T'.
    """
    season = feature_label['Season']
    game_type = feature_label['GameType']
    train_mask = (season < test_start_season) | (game_type == 'R')
    test_mask = (season >= test_start_season) & (game_type == 'T')
    return feature_label.loc[train_mask, :], feature_label.loc[test_mask, :]


def _features_and_labels(df):
    """Return (X, y): the two season-score columns and the 0/1 'Team1 won' label."""
    X = df[['Team1_season_score', 'Team2_season_score']].values
    # Vectorised comparison instead of a per-row Python lambda.
    y = (df['Winner'] == 'Team1').astype('int64').values
    return X, y


def train_test_split(feature_label, test_start_season=2011):
    """Produce numpy train/test arrays from the feature/label frame.

    Note: this name matches sklearn.model_selection.train_test_split; importing
    that function into the same namespace would shadow one with the other.

    Parameters
    ----------
    feature_label : pandas.DataFrame
        Must contain 'Season', 'GameType', 'Winner', 'Team1_season_score'
        and 'Team2_season_score'.
    test_start_season : int, default 2011
        Forwarded to train_test_df_split.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        X_* have two columns (Team1_season_score, Team2_season_score);
        y_* are 1 where Winner == 'Team1' and 0 otherwise.
    """
    train_df, test_df = train_test_df_split(feature_label, test_start_season)
    X_train, y_train = _features_and_labels(train_df)
    X_test, y_test = _features_and_labels(test_df)
    return X_train, X_test, y_train, y_test

In [22]:
# Materialise the train/test feature matrices and 0/1 label vectors.
X_train, X_test, y_train, y_test = train_test_split(feature_label)

Preview of the training features and labels

In [23]:
# Print the training feature matrix and label vector (numpy abbreviates long arrays).
print(f"""
X_train:
{X_train}
y_train
{y_train}
""")



X_train:
[[  3.63636364  10.46666667]
 [  3.63636364  10.46666667]
 [  4.67857143  10.46666667]
 ...
 [-13.71875      0.26666667]
 [-13.71875      0.26666667]
 [-15.26666667   0.26666667]]
y_train
[1 0 0 ... 0 0 0]



# Logistic Regression
We fit a logistic regression on the training data and score it on the held-out test data.

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
# Fit a logistic regression with library-default hyperparameters on the
# two-feature training set.
# NOTE(review): the repr in this cell's output shows solver='warn' and
# multi_class='warn'; consider pinning the solver explicitly so results do not
# shift between scikit-learn versions.
clf = LogisticRegression()
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [26]:
import numpy as np

In [27]:
def KL_score(y_value, p_predict, eps=1e-15):
    """Mean binary cross-entropy (log loss) of predicted probabilities.

    Despite the name, the quantity computed is
    ``-(sum(log p[y == 1]) + sum(log(1 - p[y == 0]))) / len(y)``.

    Parameters
    ----------
    y_value : array-like
        Labels; entries equal to 1 are scored with log(p), entries equal to 0
        with log(1 - p). Any other value adds nothing to the sum but still
        counts in the denominator.
    p_predict : array-like of float
        Predicted probability of label 1, same length as ``y_value``.
    eps : float, default 1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` so that a prediction of
        exactly 0 or 1 yields a large finite penalty instead of inf/nan.
        Probabilities already inside that interval are left untouched.

    Returns
    -------
    float
        The mean log loss (lower is better; a constant 0.5 prediction gives ln 2).
    """
    # Accept lists/tuples as well as arrays; boolean masking needs ndarrays.
    y_value = np.asarray(y_value)
    p_predict = np.clip(np.asarray(p_predict, dtype=float), eps, 1 - eps)
    positive_term = np.sum(np.log(p_predict[y_value == 1]))
    negative_term = np.sum(np.log(1 - p_predict[y_value == 0]))
    score = -(positive_term + negative_term) / len(y_value)
    return score

In [28]:
def predict_prob(clf, X):
    """Return column 1 of ``clf.predict_proba(X)`` as a 1-D array.

    With the 0/1 labels used in this notebook that column is presumably the
    probability of label 1 (Team1 wins) -- this relies on the classifier's
    class ordering.
    """
    return clf.predict_proba(X)[:, 1]

In [29]:
# Report the mean log loss (what KL_score computes) for three cases:
#   * a baseline that always predicts 0.5, evaluated on the test labels,
#   * the fitted classifier on the training data,
#   * the fitted classifier on the test data.
print(
f"""
KL Score for dummy estimator: {KL_score(y_test, np.repeat(0.5, len(y_test)))}
KL Score in training data: {KL_score(y_train, predict_prob(clf, X_train))}
KL Score in testing data: {KL_score(y_test, predict_prob(clf, X_test))}
"""
)


KL Score for dummy estimator: 0.6931471805599453
KL Score in training data: 0.5291835615488899
KL Score in testing data: 0.619411425873969

