# 2021 March Madess ML contest

In [2]:
# libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sqlalchemy 

## Import data

Data source: https://www.kaggle.com/c/ncaam-march-mania-2021/data

In [3]:
# read in all CSV files
datasets = os.listdir('../resources/kaggle_data/')
dfs = {}
for file in datasets:
    dfs[file[:-4]] = pd.DataFrame(pd.read_csv(f'../resources/kaggle_data/{file}', encoding='cp1252'))

In [4]:
# # preview all DFs
# df_list = list(dfs.keys())
# for x in df_list:
#     print(x)
#     print(dfs[x])
#     print('\n-----\n')

In [5]:
# create relevant dataframes
seed_df = dfs['MNCAATourneySeeds']
reg_short_df = dfs['MRegularSeasonCompactResults']
tourney_short_df = dfs['MNCAATourneyCompactResults']
reg_long_df = dfs['MRegularSeasonDetailedResults']
tourney_long_df = dfs['MNCAATourneyDetailedResults']
massey_df = dfs['MMasseyOrdinals']
teams_df = dfs['MTeams']
game_cities_df = dfs['MGameCities']
secondary_short_df = dfs['MSecondaryTourneyCompactResults']
coaches_df = dfs['MTeamCoaches']
submission_example_df = dfs['MSampleSubmissionStage1']

In [6]:
teams_df.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2021
1,1102,Air Force,1985,2021
2,1103,Akron,1985,2021
3,1104,Alabama,1985,2021
4,1105,Alabama A&M,2000,2021


In [7]:
reg_short_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [8]:
coaches_df.head()

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName
0,1985,1102,0,154,reggie_minton
1,1985,1103,0,154,bob_huggins
2,1985,1104,0,154,wimp_sanderson
3,1985,1106,0,154,james_oliver
4,1985,1108,0,154,davey_whitney


## Transform and clean

In [9]:
# make boolean columns for Tourney
reg_short_df['Tourney'] = 0
tourney_short_df['Tourney'] = 1

# append reg season and tourney DFs
combined_df = reg_short_df.append(tourney_short_df)
combined_df['ScoreDiff'] = combined_df['WScore'] - combined_df['LScore']

# create team 1 and 2 columns based on ID
combined_df['Team1'] = np.where(combined_df['WTeamID']<combined_df['LTeamID'],
                                combined_df['WTeamID'],
                                combined_df['LTeamID'])
combined_df['Team2'] = np.where(combined_df['WTeamID']>combined_df['LTeamID'],
                                combined_df['WTeamID'],
                                combined_df['LTeamID'])

combined_df = combined_df[['Season','DayNum','Team1','Team2','Tourney','WLoc','WTeamID','ScoreDiff']]

# add team 1 tourney seed column
merged_df = pd.merge(combined_df, seed_df, 
                     how='left', 
                     left_on=['Season','Team1'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

merged_df = merged_df.rename(columns={'Seed':'Team1Seed'})

# add team 2 tourney seed column
merged_df2 = pd.merge(merged_df, seed_df, 
                     how='left', 
                     left_on=['Season','Team2'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

merged_df2 = merged_df2.rename(columns={'Seed':'Team2Seed'})

# merged_df2['Team1Seed'] = np.where(merged_df2['Tourney']==0, 'N/A', merged_df2['Team1Seed'])
# merged_df2['Team2Seed'] = np.where(merged_df2['Tourney']==0, 'N/A', merged_df2['Team2Seed'])

# merged_df2 = merged_df2.fillna('N/A')

# add first/last D1 year
merged_df3 = pd.merge(merged_df2, teams_df, how='left', left_on='Team1', right_on='TeamID')
merged_df3 = merged_df3.drop(columns=['TeamID','TeamName'])
merged_df3 = merged_df3.rename(columns={'FirstD1Season':'Team1FirstYear','LastD1Season':'Team1LastYear'})

merged_df3 = pd.merge(merged_df3, teams_df, how='left', left_on='Team2', right_on='TeamID')
merged_df3 = merged_df3.drop(columns=['TeamID','TeamName'])
merged_df3 = merged_df3.rename(columns={'FirstD1Season':'Team2FirstYear','LastD1Season':'Team2LastYear'})

# winning team boolean
merged_df3['WTeam'] = np.where(merged_df2['WTeamID']==merged_df2['Team1'],1,0)

# change seed type
merged_df3['Team1Seed'] = merged_df3['Team1Seed'].str.strip().str[1:3]
merged_df3['Team1Seed'] = np.where(len(merged_df3['Team1Seed'])>2, 
                                   merged_df3['Team1Seed'].str[:2], 
                                   merged_df3['Team1Seed'])
merged_df3['Team2Seed'] = merged_df3['Team2Seed'].str.strip().str[1:3]
merged_df3['Team2Seed'] = np.where(len(merged_df3['Team2Seed'])>2, 
                                   merged_df3['Team2Seed'].str[:2], 
                                   merged_df3['Team2Seed'])

merged_df3 = merged_df3.fillna('30')

merged_df3 = merged_df3.astype({'Team1Seed':'int','Team2Seed':'int'})

# merged_df2['Team1Seed'] = np.where(merged_df2['Tourney']==1, merged_df2['Team1Seed'], '0')
# merged_df2['Team2Seed'] = np.where(merged_df2['Tourney']==1, merged_df2['Team2Seed'], '0')

# add team rank averages
massey_season_avg = massey_df.groupby(['Season','TeamID'], as_index=False)['OrdinalRank'].mean()

merged_df4 = pd.merge(merged_df3, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team1'], 
                       right_on=['Season','TeamID'])

merged_df5 = pd.merge(merged_df4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team2'], 
                       right_on=['Season','TeamID'])

merged_df5 = merged_df5.rename(columns={'OrdinalRank_x':'Team1RankMean','OrdinalRank_y':'Team2RankMean'})\
    .drop(columns=['TeamID_x','TeamID_y'])

merged_df5 = merged_df5.fillna(500)

merged_df5

Unnamed: 0,Season,DayNum,Team1,Team2,Tourney,WLoc,WTeamID,ScoreDiff,Team1Seed,Team2Seed,Team1FirstYear,Team1LastYear,Team2FirstYear,Team2LastYear,WTeam,Team1RankMean,Team2RankMean
0,1985,20,1228,1328,0,N,1228,17,3,1,1985,2021,1985,2021,1,500.000000,500.000000
1,1985,25,1106,1354,0,H,1106,7,30,30,1985,2021,1985,2021,1,500.000000,500.000000
2,1985,25,1112,1223,0,H,1112,7,10,30,1985,2021,1985,2021,1,500.000000,500.000000
3,1985,25,1165,1432,0,H,1165,16,30,30,1985,2020,1985,1987,1,500.000000,500.000000
4,1985,25,1192,1447,0,H,1192,12,16,30,1985,2021,1985,2021,1,500.000000,500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169126,2019,146,1120,1246,1,N,1120,6,5,2,1985,2021,1985,2021,1,17.748713,15.207700
169127,2019,146,1181,1277,1,N,1277,1,1,2,1985,2021,1985,2021,0,1.980769,6.603239
169128,2019,152,1277,1403,1,N,1403,10,2,3,1985,2021,1985,2021,0,6.603239,13.330275
169129,2019,152,1120,1438,1,N,1438,1,5,1,1985,2021,1985,2021,0,17.748713,3.682186


## Feature selection and engineering

In [10]:
# remove Tourney 2015-19 records
merged_df5 = merged_df5[['Season','Team1','Team2','Tourney','WLoc',
                         'Team1Seed','Team2Seed','Team1RankMean',
                         'Team2RankMean','Team1FirstYear','Team2FirstYear','WTeam']]
merged_df5 = merged_df5.loc[(merged_df5.Season < 2014) & (merged_df5.Tourney==0)]
# merged_df5 = merged_df5.dropna(how='any')
merged_df5.shape

(129204, 12)

In [11]:
# merged_df4 = merged_df4.astype({'Season':'str','DayNum':'str','Team1':'str','Team2':'str'})
# dummies_df = pd.get_dummies(merged_df4[['Season','Team1','Team2','Tourney','WLoc','Team1Seed','Team2Seed','WTeam']])
# dummies_df

In [12]:
# select features and target
target = merged_df5.pop('WTeam')
selected_features = merged_df5

print(selected_features.shape)
print(target.shape)

(129204, 11)
(129204,)


In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(), make_column_selector(dtype_include=object))
)

#hmmm. maybe 'handle_unknown = "ignore"'?'

# ct.fit_transform(selected_features)

## MLP Classifier testing

In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn import metrics

# # X, y = make_classification(n_samples=100, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(selected_features, 
                                                    target, stratify=target, 
                                                    random_state=69)

clf = make_pipeline(ct, MLPClassifier(random_state=1, max_iter=300, verbose=True))

clf.fit(X_train, y_train)

clf.predict_proba(X_test)
preds = clf.predict(X_test)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
loss_score = metrics.log_loss(y_test, preds)

print(f'Train score: {test_score}')
print(f'Test score: {test_score}')
print(f'Log loss: {loss_score}')

Iteration 1, loss = 0.58192777
Iteration 2, loss = 0.55745433
Iteration 3, loss = 0.55593220
Iteration 4, loss = 0.55497414
Iteration 5, loss = 0.55450037
Iteration 6, loss = 0.55443715
Iteration 7, loss = 0.55406040
Iteration 8, loss = 0.55377612
Iteration 9, loss = 0.55379749
Iteration 10, loss = 0.55347778
Iteration 11, loss = 0.55339706
Iteration 12, loss = 0.55336337
Iteration 13, loss = 0.55344847
Iteration 14, loss = 0.55336690
Iteration 15, loss = 0.55337734
Iteration 16, loss = 0.55313997
Iteration 17, loss = 0.55291114
Iteration 18, loss = 0.55308291
Iteration 19, loss = 0.55287725
Iteration 20, loss = 0.55285116
Iteration 21, loss = 0.55270424
Iteration 22, loss = 0.55287867
Iteration 23, loss = 0.55278303
Iteration 24, loss = 0.55268716
Iteration 25, loss = 0.55265379
Iteration 26, loss = 0.55259084
Iteration 27, loss = 0.55252799
Iteration 28, loss = 0.55258846
Iteration 29, loss = 0.55248287
Iteration 30, loss = 0.55244004
Iteration 31, loss = 0.55236336
Iteration 32, los

## Format and submission

In [22]:
def format_submit(model, df):
    '''Creates and formats submission '''
    preds = model.predict(df)
    prob = [x[1] for x in clf.predict_proba(df)]
    
    predict_df = pd.DataFrame({'Season':df['Season'],
                  'Team1':df['Team1'],
                  'Team2':df['Team2'],
                  'Pred':preds,
                  'Probability':prob}).round(1)

    predict_df['ID'] = predict_df['Season'].astype(str) + '_' +\
                       predict_df['Team1'].astype(str) + '_' +\
                       predict_df['Team2'].astype(str)
    
    return predict_df[['ID','Probability']].reset_index(drop=True)

In [23]:
submit_data = pd.DataFrame(submission_example_df['ID'].str.split('_',expand=True))
submit_data.columns = ['Season','Team1','Team2']
submit_data = submit_data.astype('int')

# make boolean columns for Tourney and WLoc
submit_data['Tourney'] = 1
submit_data['WLoc'] = 'N'

# add team 1 and 2 tourney seed column
submit_data2 = pd.merge(submit_data, seed_df, 
                     how='left', 
                     left_on=['Season','Team1'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

submit_data2 = submit_data2.rename(columns={'Seed':'Team1Seed'})

submit_data3 = pd.merge(submit_data2, seed_df, 
                     how='left', 
                     left_on=['Season','Team2'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

submit_data4 = submit_data3.rename(columns={'Seed':'Team2Seed'})

# change seed data type and remove alpha characters
submit_data4['Team1Seed'] = submit_data4['Team1Seed'].str.strip().str[1:3]
submit_data4['Team1Seed'] = np.where(len(submit_data4['Team1Seed'])>2, 
                                   submit_data4['Team1Seed'].str[:2], 
                                   submit_data4['Team1Seed'])
submit_data4['Team2Seed'] = submit_data4['Team2Seed'].str.strip().str[1:3]
submit_data4['Team2Seed'] = np.where(len(submit_data4['Team2Seed'])>2, 
                                   submit_data4['Team2Seed'].str[:2], 
                                   submit_data4['Team2Seed'])

# test_data3 = test_data3.fillna('30')

submit_data4 = submit_data4.astype({'Team1Seed':'int','Team2Seed':'int'})

# add rank data
massey_season_avg = massey_df.groupby(['Season','TeamID'], as_index=False)['OrdinalRank'].mean()

submit_data4 = pd.merge(submit_data4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team1'], 
                       right_on=['Season','TeamID'])

submit_data5 = pd.merge(submit_data4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team2'], 
                       right_on=['Season','TeamID'])

submit_data5 = submit_data5.rename(columns={'OrdinalRank_x':'Team1RankMean','OrdinalRank_y':'Team2RankMean'})\
    .drop(columns=['TeamID_x','TeamID_y'])

# test_data5 = test_data5.fillna(500)

# add first/last D1 year
submit_data5 = pd.merge(submit_data5, teams_df, how='left', left_on='Team1', right_on='TeamID')
submit_data5 = submit_data5.drop(columns=['TeamID','TeamName'])
submit_data5 = submit_data5.rename(columns={'FirstD1Season':'Team1FirstYear','LastD1Season':'Team1LastYear'})

submit_data5 = pd.merge(submit_data5, teams_df, how='left', left_on='Team2', right_on='TeamID')
submit_data5 = submit_data5.drop(columns=['TeamID','TeamName'])
submit_data5 = submit_data5.rename(columns={'FirstD1Season':'Team2FirstYear','LastD1Season':'Team2LastYear'})

# set DF shape
submit_data5 = submit_data5[['Season','Team1','Team2','Tourney','WLoc','Team1Seed','Team2Seed','Team1RankMean','Team2RankMean','Team1FirstYear','Team2FirstYear']]
submit_data5.shape

(11390, 11)

In [24]:
# # create new DF of all potential matchups
# # for 2020 NCAA Tourney only

# import itertools

# all_combos = list(itertools.combinations(teams_dict[i], 2)) 
# pd.DataFrame(all_combos)

In [25]:
format_submit(clf, submit_data5)

Unnamed: 0,ID,Probability
0,2015_1107_1112,0.1
1,2015_1107_1116,0.1
2,2015_1107_1124,0.1
3,2015_1107_1125,0.4
4,2015_1107_1129,0.3
...,...,...
11385,2019_1449_1459,0.6
11386,2019_1449_1463,0.8
11387,2019_1458_1459,0.7
11388,2019_1458_1463,0.9


# TODO:

## KNN testing

In [None]:
# # Scale data
# from sklearn.preprocessing import MinMaxScaler

# # define min max scaler
# scaler = MinMaxScaler(feature_range=(0, 1))

# # transform data
# X_scaled = scaler.fit_transform(selected_features)

# pd.DataFrame(X_scaled)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, random_state=42)

In [27]:
from sklearn.neighbors import KNeighborsClassifier
# Loop through different k values to see which has the highest accuracy
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    print(k)
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print('----')
    
plt.plot(range(1, 20, 2), train_scores, marker='o')
plt.plot(range(1, 20, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

1


ValueError: could not convert string to float: 'A'

In [None]:
from sklearn.metrics import log_loss
print(1)
knn_model = KNeighborsClassifier(n_neighbors=9)
print(2)
knn_model.fit(X_train, y_train)
print(3)
preds = knn_model.predict(X_test)
print(4)
train_score = knn_model.score(X_train, y_train)
print(5)
test_score = knn_model.score(X_test, y_test)
print(6)
loss_score = log_loss(y_test, preds)
print(7)
print(f'Train score: {train_score}')
print(f'Test score: {test_score}')
print(f'Loss score: {loss_score}')

In [None]:
format_submit(knn_model)

In [None]:
knn_model.predict_proba(X_test)[0:10,1]

In [None]:
# def format_submit(model):
#     '''Creates and formats submission '''
# #     pipeline = make_pipeline(ct, model)
    
# #     pipeline.fit(X_train, y_train)
    
#     train_score = clf.score(X_train, y_train)
#     test_score = clf.score(X_test, y_test)
#     loss_score = metrics.log_loss(y_test, prediction)
    
#     preds = model.predict(test_data5)
#     prob = [x[0] for x in clf.predict_proba(test_data5)]
    
#     predict_df = pd.DataFrame({'Season':test_data5['Season'],
#                   'Team1':test_data5['Team1'],
#                   'Team2':test_data5['Team2'],
#                   'Pred':preds,
#                   'Probability':prob}).round(1)

#     predict_df['ID'] = predict_df['Season'].astype(str) + '_' + predict_df['Team1'].astype(str) + '_' + predict_df['Team2'].astype(str)
    
# #     predict_df['Correct?'] = np.where(predict_df.Pred==predict_df.Answer, 'YES', 'NO')

#     print(f'Train score: {test_score}')
#     print(f'Test score: {test_score}')
#     print(f'Log loss: {loss_score}')
# #     print(f"{(predict_df.loc[predict_df['Correct?']=='YES']['Correct?'].count()/predict_df['Correct?'].count())*100}%")
    
#     return predict_df[['ID','Probability']].reset_index(drop=True)