# 2021 March Madess ML contest

In [1]:
# libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sqlalchemy 

## Import data

Data source: https://www.kaggle.com/c/ncaam-march-mania-2021/data

In [2]:
# read in all CSV files
datasets = os.listdir('../resources/kaggle_data/')
dfs = {}
for file in datasets:
    dfs[file[:-4]] = pd.DataFrame(pd.read_csv(f'../resources/kaggle_data/{file}', encoding='cp1252'))

In [3]:
# # preview all DFs
# df_list = list(dfs.keys())
# for x in df_list:
#     print(x)
#     print(dfs[x])
#     print('\n-----\n')

In [4]:
# create relevant dataframes
seed_df = dfs['MNCAATourneySeeds']
reg_short_df = dfs['MRegularSeasonCompactResults']
tourney_short_df = dfs['MNCAATourneyCompactResults']
reg_long_df = dfs['MRegularSeasonDetailedResults']
tourney_long_df = dfs['MNCAATourneyDetailedResults']
massey_df = dfs['MMasseyOrdinals']
teams_df = dfs['MTeams']
game_cities_df = dfs['MGameCities']
secondary_short_df = dfs['MSecondaryTourneyCompactResults']
coaches_df = dfs['MTeamCoaches']
submission_example_df = dfs['MSampleSubmissionStage1']

## Transform and clean

In [5]:
def get_stat_avg(stat, main_df):
    '''Return DF with specified stat for team and season'''
    grouped_wscore = reg_long_df.groupby(['WTeamID','Season'],as_index=False)[f'W{stat}'].sum()\
                                .rename(columns={'WTeamID':'TeamID'})
    grouped_lscore = reg_long_df.groupby(['LTeamID','Season'],as_index=False)[f'L{stat}'].sum()\
                                .rename(columns={'LTeamID':'TeamID'})
    merge_grouped = pd.merge(grouped_wscore, grouped_lscore, how='left', on=['TeamID','Season'])
    grouped_wcount = reg_long_df.groupby(['WTeamID','Season'],as_index=False)['WScore'].count()\
                                .rename(columns={'WTeamID':'TeamID','WScore':'WinCount'})
    grouped_lcount = reg_long_df.groupby(['LTeamID','Season'],as_index=False)['LScore'].count()\
                                .rename(columns={'LTeamID':'TeamID','LScore':'LoseCount'})
    merge_grouped = pd.merge(merge_grouped, grouped_wcount, how='left', on=['TeamID','Season']).fillna(0)
    merge_grouped = pd.merge(merge_grouped, grouped_lcount, how='left', on=['TeamID','Season']).fillna(0)
    merge_grouped['GameCount'] = merge_grouped['WinCount'] + merge_grouped['LoseCount']
    merge_grouped[f'Total{stat}'] = merge_grouped[f'W{stat}'] + merge_grouped[f'L{stat}']
#     merge_grouped = merge_grouped.fillna(0)
    merge_grouped[f'Avg{stat}'] = merge_grouped[f'Total{stat}']/merge_grouped['GameCount']
    
#     cols_to_use = merge_grouped.columns.difference(main_df.columns)

    merged_main = pd.merge(main_df, merge_grouped,
                           how='left',
                           left_on=['Team1','Season'],
                           right_on=['TeamID','Season'],
                           suffixes=('', ''))
    merged_main = pd.merge(merged_main, merge_grouped,
                           how='left',
                           left_on=['Team2','Season'],
                           right_on=['TeamID','Season'],
                           suffixes=('_Team1', '_Team2'))
    merged_main = merged_main.drop(columns=['TeamID_Team1','TeamID_Team2',
                                   f'W{stat}_Team1',f'W{stat}_Team2',
                                   f'L{stat}_Team1',f'L{stat}_Team2',
                                   f'Total{stat}_Team1',f'Total{stat}_Team2',])

    merged_main = merged_main.loc[:,~merged_main.columns.duplicated()]
    return merged_main

In [6]:
# make boolean columns for Tourney
reg_short_df['Tourney'] = 0
tourney_short_df['Tourney'] = 1

# append reg season and tourney DFs
combined_df = reg_short_df.append(tourney_short_df)
combined_df['ScoreDiff'] = combined_df['WScore'] - combined_df['LScore']

# create team 1 and 2 columns based on ID
combined_df['Team1'] = np.where(combined_df['WTeamID']<combined_df['LTeamID'],
                                combined_df['WTeamID'],
                                combined_df['LTeamID'])
combined_df['Team2'] = np.where(combined_df['WTeamID']>combined_df['LTeamID'],
                                combined_df['WTeamID'],
                                combined_df['LTeamID'])

combined_df = combined_df[['Season','DayNum','Team1','Team2','Tourney','WLoc','WTeamID','ScoreDiff']]

# add team 1 tourney seed column
merged_df = pd.merge(combined_df, seed_df, 
                     how='left', 
                     left_on=['Season','Team1'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

merged_df = merged_df.rename(columns={'Seed':'Team1Seed'})

# add team 2 tourney seed column
merged_df2 = pd.merge(merged_df, seed_df, 
                     how='left', 
                     left_on=['Season','Team2'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

merged_df2 = merged_df2.rename(columns={'Seed':'Team2Seed'})

# merged_df2['Team1Seed'] = np.where(merged_df2['Tourney']==0, 'N/A', merged_df2['Team1Seed'])
# merged_df2['Team2Seed'] = np.where(merged_df2['Tourney']==0, 'N/A', merged_df2['Team2Seed'])

# merged_df2 = merged_df2.fillna('N/A')

# add first/last D1 year
merged_df3 = pd.merge(merged_df2, teams_df, how='left', left_on='Team1', right_on='TeamID')
merged_df3 = merged_df3.drop(columns=['TeamID','TeamName'])
merged_df3 = merged_df3.rename(columns={'FirstD1Season':'Team1FirstYear','LastD1Season':'Team1LastYear'})

merged_df3 = pd.merge(merged_df3, teams_df, how='left', left_on='Team2', right_on='TeamID')
merged_df3 = merged_df3.drop(columns=['TeamID','TeamName'])
merged_df3 = merged_df3.rename(columns={'FirstD1Season':'Team2FirstYear','LastD1Season':'Team2LastYear'})

######

# winning team boolean
merged_df3['WTeam'] = np.where(merged_df2['WTeamID']==merged_df2['Team1'],1,0)

# change seed type
merged_df3['Team1Seed'] = merged_df3['Team1Seed'].str.strip().str[1:3]
merged_df3['Team1Seed'] = np.where(len(merged_df3['Team1Seed'])>2, 
                                   merged_df3['Team1Seed'].str[:2], 
                                   merged_df3['Team1Seed'])
merged_df3['Team2Seed'] = merged_df3['Team2Seed'].str.strip().str[1:3]
merged_df3['Team2Seed'] = np.where(len(merged_df3['Team2Seed'])>2, 
                                   merged_df3['Team2Seed'].str[:2], 
                                   merged_df3['Team2Seed'])

merged_df3 = merged_df3.fillna('30')

merged_df3 = merged_df3.astype({'Team1Seed':'int','Team2Seed':'int'})

#######

# add team rank averages
massey_season_avg = massey_df.groupby(['Season','TeamID'], as_index=False)['OrdinalRank'].mean()

merged_df4 = pd.merge(merged_df3, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team1'], 
                       right_on=['Season','TeamID'])

merged_df5 = pd.merge(merged_df4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team2'], 
                       right_on=['Season','TeamID'])

merged_df5 = merged_df5.rename(columns={'OrdinalRank_x':'Team1RankMean','OrdinalRank_y':'Team2RankMean'})\
    .drop(columns=['TeamID_x','TeamID_y'])

merged_df5 = merged_df5.fillna(500)

##########

grouped_wscore = reg_short_df.groupby(['WTeamID','Season'],as_index=False)['WScore'].sum()\
                            .rename(columns={'WTeamID':'TeamID'})
grouped_lscore = reg_short_df.groupby(['LTeamID','Season'],as_index=False)[f'LScore'].sum()\
                            .rename(columns={'LTeamID':'TeamID'})
merge_grouped = pd.merge(grouped_wscore, grouped_lscore, how='left', on=['TeamID','Season'])
grouped_wcount = reg_short_df.groupby(['WTeamID','Season'],as_index=False)['WScore'].count()\
                            .rename(columns={'WTeamID':'TeamID','WScore':'WinCount'})
grouped_lcount = reg_short_df.groupby(['LTeamID','Season'],as_index=False)['LScore'].count()\
                            .rename(columns={'LTeamID':'TeamID','LScore':'LoseCount'})
merge_grouped = pd.merge(merge_grouped, grouped_wcount, how='left', on=['TeamID','Season']).fillna(0)
merge_grouped = pd.merge(merge_grouped, grouped_lcount, how='left', on=['TeamID','Season']).fillna(0)
merge_grouped['GameCount'] = merge_grouped['WinCount'] + merge_grouped['LoseCount']
merge_grouped = merge_grouped.fillna(0)
merge_grouped[f'TotalScore'] = merge_grouped[f'WScore'] + merge_grouped[f'LScore']
#     merge_grouped = merge_grouped.fillna(0)
merge_grouped[f'AvgScore'] = merge_grouped[f'TotalScore']/merge_grouped['GameCount']

# # add win% columns
merge_grouped['Win%'] = merge_grouped['WinCount']/merge_grouped['GameCount']


# Merge with main DF
merged_df6 = pd.merge(merged_df5, merge_grouped, 
                      how='left', 
                      left_on=['Team1','Season'], 
                      right_on=['TeamID','Season'], 
                      suffixes=['_Team1','_Team2'])
merged_df6 = pd.merge(merged_df6, merge_grouped, 
                      how='left', 
                      left_on=['Team2','Season'], 
                      right_on=['TeamID','Season'],
                      suffixes=['_Team1','_Team2'])

merged_df6 = merged_df6.drop(columns=['LoseCount_Team1','LoseCount_Team2',
                                      'TotalScore_Team1','TotalScore_Team2',
                                      'TeamID_Team1','WScore_Team1','LScore_Team1',
                                      'TeamID_Team2','WScore_Team2','LScore_Team2'])

merged_df6

Unnamed: 0,Season,DayNum,Team1,Team2,Tourney,WLoc,WTeamID,ScoreDiff,Team1Seed,Team2Seed,...,Team1RankMean,Team2RankMean,WinCount_Team1,GameCount_Team1,AvgScore_Team1,Win%_Team1,WinCount_Team2,GameCount_Team2,AvgScore_Team2,Win%_Team2
0,1985,20,1228,1328,0,N,1228,17,3,1,...,500.000000,500.000000,23.0,31.0,68.225806,0.741935,25.0,30.0,89.833333,0.833333
1,1985,25,1106,1354,0,H,1106,7,30,30,...,500.000000,500.000000,10.0,24.0,71.625000,0.416667,9.0,24.0,68.208333,0.375000
2,1985,25,1112,1223,0,H,1112,7,10,30,...,500.000000,500.000000,18.0,27.0,66.518519,0.666667,17.0,25.0,68.320000,0.680000
3,1985,25,1165,1432,0,H,1165,16,30,30,...,500.000000,500.000000,12.0,24.0,61.375000,0.500000,11.0,23.0,63.478261,0.478261
4,1985,25,1192,1447,0,H,1192,12,16,30,...,500.000000,500.000000,19.0,28.0,67.892857,0.678571,8.0,24.0,73.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169126,2019,146,1120,1246,1,N,1120,6,5,2,...,17.748713,15.207700,25.0,34.0,78.882353,0.735294,27.0,33.0,76.727273,0.818182
169127,2019,146,1181,1277,1,N,1277,1,1,2,...,1.980769,6.603239,29.0,34.0,83.500000,0.852941,28.0,34.0,78.823529,0.823529
169128,2019,152,1277,1403,1,N,1403,10,2,3,...,6.603239,13.330275,28.0,34.0,78.823529,0.823529,26.0,32.0,73.093750,0.812500
169129,2019,152,1120,1438,1,N,1438,1,5,1,...,17.748713,3.682186,25.0,34.0,78.882353,0.735294,29.0,32.0,71.843750,0.906250


In [7]:
stat_cols = reg_long_df.columns[8:]
stat_cols

# stat_list = [x[1:] for x in stat_cols]
# for stat in stat_list:
#     merged_df5 = add_stat_avg(stat, merged_df5)
    
# merged_df5.columns

Index(['WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst',
       'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM',
       'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

In [8]:
# merged_df7 = get_stat_avg("FGM", merged_df6)
# merged_df7 = get_stat_avg("FGA", merged_df7)
# merged_df7 = get_stat_avg("FGM3", merged_df7)
# merged_df7 = get_stat_avg("FGA3", merged_df7)
# merged_df7 = get_stat_avg("OR", merged_df7)
# merged_df7 = get_stat_avg("DR", merged_df7)
# merged_df = get_stat_avg("Ast", merged_df7)


# merged_df7 = merged_df7.dropna(how='any')

# merged_df7

In [9]:
# # add shooting %
# merged_df6['FG%_Team1'] = merged_df6['AvgFGM_Team1']/merged_df6['AvgFGA_Team1']
# merged_df6['FG%_Team2'] = merged_df6['AvgFGM_Team2']/merged_df6['AvgFGA_Team2']
# merged_df6['FG3%_Team1'] = merged_df6['AvgFGM3_Team1']/merged_df6['AvgFGA3_Team1']
# merged_df6['FG3%_Team2'] = merged_df6['AvgFGM3_Team2']/merged_df6['AvgFGA3_Team2']

## Feature selection and engineering

In [10]:
# remove Tourney 2015-19 records
features_df = merged_df6.drop(columns=['WTeamID','DayNum','ScoreDiff',
                                       'Team1LastYear','Team2LastYear', 
                                       'WinCount_Team1', 'WinCount_Team2',
                                       'GameCount_Team1','GameCount_Team2'])                                       

features_df = pd.get_dummies(features_df.loc[(features_df.Season < 2014) & (merged_df5.Tourney==0)])

features_df = features_df.dropna(how='any')

# features_df = features_df.astype({'Season':'str',
#                                   'Team1':'str',
#                                   'Team2':'str',
#                                   'Team1FirstYear':'str',
#                                   'Team2FirstYear':'str'})

features_df.shape

(128901, 18)

In [11]:
# select features and target
target = features_df.pop('WTeam')
selected_features = features_df

print(selected_features.shape)
print(target.shape)

(128901, 17)
(128901,)


In [12]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object))
)

ct.fit_transform(selected_features)

array([[-1.77456301,  0.03015702, -0.19053283, ..., -0.65984179,
        -1.21689006,  3.00389845],
       [-1.77456301, -1.38010876,  0.11124568, ..., -0.65984179,
         0.82176692, -0.33290073],
       [-1.77456301, -1.31075142, -1.40925373, ..., -0.65984179,
         0.82176692, -0.33290073],
       ...,
       [ 1.55860856, -0.33974876, -0.75926925, ..., -0.65984179,
        -1.21689006,  3.00389845],
       [ 1.55860856,  1.16299347,  1.31835971, ..., -0.65984179,
        -1.21689006,  3.00389845],
       [ 1.55860856,  1.86812636,  1.02818806, ..., -0.65984179,
        -1.21689006,  3.00389845]])

## MLP Classifier testing

In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(selected_features, 
                                                    target, stratify=target, 
                                                    random_state=42)

clf = make_pipeline(ct, MLPClassifier(random_state=69, max_iter=300, verbose=True))

clf.fit(X_train, y_train)

clf.predict_proba(X_test)
preds = clf.predict(X_test)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
loss_score = metrics.log_loss(y_test, preds)

print(f'Train score: {test_score}')
print(f'Test score: {test_score}')
print(f'Log loss: {loss_score}')

Iteration 1, loss = 0.52465542
Iteration 2, loss = 0.50201587
Iteration 3, loss = 0.49914558
Iteration 4, loss = 0.49721301
Iteration 5, loss = 0.49602544
Iteration 6, loss = 0.49482792
Iteration 7, loss = 0.49410918
Iteration 8, loss = 0.49339960
Iteration 9, loss = 0.49319535
Iteration 10, loss = 0.49292050
Iteration 11, loss = 0.49247418
Iteration 12, loss = 0.49231891
Iteration 13, loss = 0.49214797
Iteration 14, loss = 0.49193641
Iteration 15, loss = 0.49170981
Iteration 16, loss = 0.49136439
Iteration 17, loss = 0.49127573
Iteration 18, loss = 0.49135200
Iteration 19, loss = 0.49110579
Iteration 20, loss = 0.49112412
Iteration 21, loss = 0.49080678
Iteration 22, loss = 0.49058769
Iteration 23, loss = 0.49055986
Iteration 24, loss = 0.49035913
Iteration 25, loss = 0.49039009
Iteration 26, loss = 0.49034198
Iteration 27, loss = 0.49017308
Iteration 28, loss = 0.49054245
Iteration 29, loss = 0.49003414
Iteration 30, loss = 0.48990697
Iteration 31, loss = 0.49028153
Iteration 32, los

Best log loss: 8.338768513668983

## Format and submission

In [25]:
def format_submit(model, df):
    '''Creates and formats submission '''
    preds = model.predict(df)
    prob = [x[1] for x in clf.predict_proba(df)]
    
    predict_df = pd.DataFrame({'Season':df['Season'],
                  'Team1':df['Team1'],
                  'Team2':df['Team2'],
                  'Guess':preds,
                  'Pred':prob}).round(1)

    predict_df['ID'] = predict_df['Season'].astype(str) + '_' +\
                       predict_df['Team1'].astype(str) + '_' +\
                       predict_df['Team2'].astype(str)
    
    submit_df = predict_df[['ID','Pred']].reset_index(drop=True)
    
    submit_df.to_csv('../output/ml_march_madness_submission.csv')
    
    return submit_df

In [26]:
submit_data = pd.DataFrame(submission_example_df['ID'].str.split('_',expand=True))
submit_data.columns = ['Season','Team1','Team2']
submit_data = submit_data.astype('int')

# make boolean columns for Tourney and WLoc
submit_data['Tourney'] = 1
# submit_data['WLoc'] = 'N'
submit_data['WLoc_A'] = 0
submit_data['WLoc_H'] = 0
submit_data['WLoc_N'] = 1

# submit_data = submit_data.astype({'WLoc_A':'int','WLoc_H':'int','WLoc_N':'int'})

# add team 1 and 2 tourney seed column
submit_data2 = pd.merge(submit_data, seed_df, 
                     how='left', 
                     left_on=['Season','Team1'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

submit_data2 = submit_data2.rename(columns={'Seed':'Team1Seed'})

submit_data3 = pd.merge(submit_data2, seed_df, 
                     how='left', 
                     left_on=['Season','Team2'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

submit_data4 = submit_data3.rename(columns={'Seed':'Team2Seed'})

# change seed data type and remove alpha characters
submit_data4['Team1Seed'] = submit_data4['Team1Seed'].str.strip().str[1:3]
submit_data4['Team1Seed'] = np.where(len(submit_data4['Team1Seed'])>2, 
                                   submit_data4['Team1Seed'].str[:2], 
                                   submit_data4['Team1Seed'])
submit_data4['Team2Seed'] = submit_data4['Team2Seed'].str.strip().str[1:3]
submit_data4['Team2Seed'] = np.where(len(submit_data4['Team2Seed'])>2, 
                                   submit_data4['Team2Seed'].str[:2], 
                                   submit_data4['Team2Seed'])

# test_data3 = test_data3.fillna('30')

submit_data4 = submit_data4.astype({'Team1Seed':'int','Team2Seed':'int'})

# add rank data
massey_season_avg = massey_df.groupby(['Season','TeamID'], as_index=False)['OrdinalRank'].mean()

submit_data4 = pd.merge(submit_data4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team1'], 
                       right_on=['Season','TeamID'])

submit_data5 = pd.merge(submit_data4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team2'], 
                       right_on=['Season','TeamID'])

submit_data5 = submit_data5.rename(columns={'OrdinalRank_x':'Team1RankMean',
                                            'OrdinalRank_y':'Team2RankMean'})\
                            .drop(columns=['TeamID_x','TeamID_y'])

# test_data5 = test_data5.fillna(500)

# add first/last D1 year
submit_data5 = pd.merge(submit_data5, teams_df, how='left', left_on='Team1', right_on='TeamID')
submit_data5 = submit_data5.drop(columns=['TeamID','TeamName'])
submit_data5 = submit_data5.rename(columns={'FirstD1Season':'Team1FirstYear',
                                            'LastD1Season':'Team1LastYear'})

submit_data5 = pd.merge(submit_data5, teams_df, how='left', left_on='Team2', right_on='TeamID')
submit_data5 = submit_data5.drop(columns=['TeamID','TeamName'])
submit_data5 = submit_data5.rename(columns={'FirstD1Season':'Team2FirstYear',
                                            'LastD1Season':'Team2LastYear'})

##########

grouped_wscore = reg_short_df.groupby(['WTeamID','Season'],as_index=False)['WScore'].sum()\
                            .rename(columns={'WTeamID':'TeamID'})
grouped_lscore = reg_short_df.groupby(['LTeamID','Season'],as_index=False)[f'LScore'].sum()\
                            .rename(columns={'LTeamID':'TeamID'})
merge_grouped = pd.merge(grouped_wscore, grouped_lscore, how='left', on=['TeamID','Season'])
grouped_wcount = reg_short_df.groupby(['WTeamID','Season'],as_index=False)['WScore'].count()\
                            .rename(columns={'WTeamID':'TeamID','WScore':'WinCount'})
grouped_lcount = reg_short_df.groupby(['LTeamID','Season'],as_index=False)['LScore'].count()\
                            .rename(columns={'LTeamID':'TeamID','LScore':'LoseCount'})
merge_grouped = pd.merge(merge_grouped, grouped_wcount, how='left', on=['TeamID','Season']).fillna(0)
merge_grouped = pd.merge(merge_grouped, grouped_lcount, how='left', on=['TeamID','Season']).fillna(0)
merge_grouped['GameCount'] = merge_grouped['WinCount'] + merge_grouped['LoseCount']
merge_grouped = merge_grouped.fillna(0)
merge_grouped[f'TotalScore'] = merge_grouped[f'WScore'] + merge_grouped[f'LScore']
#     merge_grouped = merge_grouped.fillna(0)
merge_grouped[f'AvgScore'] = merge_grouped[f'TotalScore']/merge_grouped['GameCount']

# # add win% columns
merge_grouped['Win%'] = merge_grouped['WinCount']/merge_grouped['GameCount']


# Merge with main DF
submit_data6 = pd.merge(submit_data5, merge_grouped, 
                      how='left', 
                      left_on=['Team1','Season'], 
                      right_on=['TeamID','Season'], 
                      suffixes=['_Team1','_Team2'])
submit_data6 = pd.merge(submit_data6, merge_grouped, 
                      how='left', 
                      left_on=['Team2','Season'], 
                      right_on=['TeamID','Season'],
                      suffixes=['_Team1','_Team2'])

submit_data6 = submit_data6.drop(columns=['LoseCount_Team1','LoseCount_Team2',
                                      'TotalScore_Team1','TotalScore_Team2',
                                      'TeamID_Team1','WScore_Team1','LScore_Team1',
                                      'TeamID_Team2','WScore_Team2','LScore_Team2'])


# set DF shape
submit_data6 = submit_data6[selected_features.columns.tolist()]
submit_data6

Unnamed: 0,Season,Team1,Team2,Tourney,Team1Seed,Team2Seed,Team1FirstYear,Team2FirstYear,Team1RankMean,Team2RankMean,AvgScore_Team1,Win%_Team1,AvgScore_Team2,Win%_Team2,WLoc_A,WLoc_H,WLoc_N
0,2015,1107,1112,1,14,2,2000,1985,154.890566,7.155889,65.500000,0.750000,76.441176,0.911765,0,0,1
1,2015,1107,1116,1,14,5,2000,1985,154.890566,26.339623,65.500000,0.750000,78.029412,0.764706,0,0,1
2,2015,1107,1124,1,14,3,2000,1985,154.890566,17.802834,65.500000,0.750000,69.125000,0.718750,0,0,1
3,2015,1107,1125,1,14,15,2000,2000,154.890566,132.294340,65.500000,0.750000,73.774194,0.677419,0,0,1
4,2015,1107,1129,1,14,11,2000,1985,154.890566,63.752179,65.500000,0.750000,69.870968,0.741935,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2019,1449,1459,1,9,7,1985,1996,43.404278,48.756124,69.823529,0.764706,81.200000,0.866667,0,0,1
11386,2019,1449,1463,1,9,14,1985,1985,43.404278,91.541037,69.823529,0.764706,80.892857,0.750000,0,0,1
11387,2019,1458,1459,1,5,7,1985,1996,19.631308,48.756124,69.060606,0.696970,81.200000,0.866667,0,0,1
11388,2019,1458,1463,1,5,14,1985,1985,19.631308,91.541037,69.060606,0.696970,80.892857,0.750000,0,0,1


In [27]:
# submit_data7 = get_stat_avg("FGA", submit_data6)
# submit_data7

In [28]:
# # add shooting %
# merged_df6['FG%_Team1'] = merged_df6['AvgFGM_Team1']/merged_df6['AvgFGA_Team1']
# merged_df6['FG%_Team2'] = merged_df6['AvgFGM_Team2']/merged_df6['AvgFGA_Team2']
# merged_df6['FG3%_Team1'] = merged_df6['AvgFGM3_Team1']/merged_df6['AvgFGA3_Team1']
# merged_df6['FG3%_Team2'] = merged_df6['AvgFGM3_Team2']/merged_df6['AvgFGA3_Team2']

# submit_data6

In [29]:
format_submit(clf, submit_data6)

Unnamed: 0,ID,Pred
0,2015_1107_1112,0.1
1,2015_1107_1116,0.1
2,2015_1107_1124,0.1
3,2015_1107_1125,0.5
4,2015_1107_1129,0.3
...,...,...
11385,2019_1449_1459,0.5
11386,2019_1449_1463,0.8
11387,2019_1458_1459,0.7
11388,2019_1458_1463,0.9


In [30]:
# # create new DF of all potential matchups
# # for 2021 NCAA Tourney only
# import itertools
# all_combos = list(itertools.combinations(teams_dict[i], 2)) 
# pd.DataFrame(all_combos)