# 2021 March Madess ML contest

In [1]:
# libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sqlalchemy 

## Import data

Data source: https://www.kaggle.com/c/ncaam-march-mania-2021/data

In [2]:
# read in all CSV files
datasets = os.listdir('../resources/kaggle_data/')
dfs = {}
for file in datasets:
    dfs[file[:-4]] = pd.DataFrame(pd.read_csv(f'../resources/kaggle_data/{file}', encoding='cp1252'))

In [3]:
# # preview all DFs
# df_list = list(dfs.keys())
# for x in df_list:
#     print(x)
#     print(dfs[x])
#     print('\n-----\n')

In [4]:
# create relevant dataframes
seed_df = dfs['MNCAATourneySeeds']
reg_short_df = dfs['MRegularSeasonCompactResults']
tourney_short_df = dfs['MNCAATourneyCompactResults']
reg_long_df = dfs['MRegularSeasonDetailedResults']
tourney_long_df = dfs['MNCAATourneyDetailedResults']
massey_df = dfs['MMasseyOrdinals']
teams_df = dfs['MTeams']
game_cities_df = dfs['MGameCities']
secondary_short_df = dfs['MSecondaryTourneyCompactResults']
coaches_df = dfs['MTeamCoaches']
submission_example_df = dfs['MSampleSubmissionStage1']

In [5]:
teams_df.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2021
1,1102,Air Force,1985,2021
2,1103,Akron,1985,2021
3,1104,Alabama,1985,2021
4,1105,Alabama A&M,2000,2021


In [6]:
reg_short_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [7]:
coaches_df.head()

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName
0,1985,1102,0,154,reggie_minton
1,1985,1103,0,154,bob_huggins
2,1985,1104,0,154,wimp_sanderson
3,1985,1106,0,154,james_oliver
4,1985,1108,0,154,davey_whitney


## Transform and clean

In [8]:
# make boolean columns for Tourney
reg_short_df['Tourney'] = 0
tourney_short_df['Tourney'] = 1

# append reg season and tourney DFs
combined_df = reg_short_df.append(tourney_short_df)
combined_df['ScoreDiff'] = combined_df['WScore'] - combined_df['LScore']

# create team 1 and 2 columns based on ID
combined_df['Team1'] = np.where(combined_df['WTeamID']<combined_df['LTeamID'],
                                combined_df['WTeamID'],
                                combined_df['LTeamID'])
combined_df['Team2'] = np.where(combined_df['WTeamID']>combined_df['LTeamID'],
                                combined_df['WTeamID'],
                                combined_df['LTeamID'])

combined_df = combined_df[['Season','DayNum','Team1','Team2','Tourney','WLoc','WTeamID','ScoreDiff']]

# add team 1 tourney seed column
merged_df = pd.merge(combined_df, seed_df, 
                     how='left', 
                     left_on=['Season','Team1'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

merged_df = merged_df.rename(columns={'Seed':'Team1Seed'})

# add team 2 tourney seed column
merged_df2 = pd.merge(merged_df, seed_df, 
                     how='left', 
                     left_on=['Season','Team2'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

merged_df2 = merged_df2.rename(columns={'Seed':'Team2Seed'})

# merged_df2['Team1Seed'] = np.where(merged_df2['Tourney']==0, 'N/A', merged_df2['Team1Seed'])
# merged_df2['Team2Seed'] = np.where(merged_df2['Tourney']==0, 'N/A', merged_df2['Team2Seed'])

# merged_df2 = merged_df2.fillna('N/A')

# add first/last D1 year
merged_df3 = pd.merge(merged_df2, teams_df, how='left', left_on='Team1', right_on='TeamID')
merged_df3 = merged_df3.drop(columns=['TeamID','TeamName'])
merged_df3 = merged_df3.rename(columns={'FirstD1Season':'Team1FirstYear','LastD1Season':'Team1LastYear'})

merged_df3 = pd.merge(merged_df3, teams_df, how='left', left_on='Team2', right_on='TeamID')
merged_df3 = merged_df3.drop(columns=['TeamID','TeamName'])
merged_df3 = merged_df3.rename(columns={'FirstD1Season':'Team2FirstYear','LastD1Season':'Team2LastYear'})

# winning team boolean
merged_df3['WTeam'] = np.where(merged_df2['WTeamID']==merged_df2['Team1'],1,0)

# change seed type
merged_df3['Team1Seed'] = merged_df3['Team1Seed'].str.strip().str[1:3]
merged_df3['Team1Seed'] = np.where(len(merged_df3['Team1Seed'])>2, 
                                   merged_df3['Team1Seed'].str[:2], 
                                   merged_df3['Team1Seed'])
merged_df3['Team2Seed'] = merged_df3['Team2Seed'].str.strip().str[1:3]
merged_df3['Team2Seed'] = np.where(len(merged_df3['Team2Seed'])>2, 
                                   merged_df3['Team2Seed'].str[:2], 
                                   merged_df3['Team2Seed'])

merged_df3 = merged_df3.fillna('30')

merged_df3 = merged_df3.astype({'Team1Seed':'int','Team2Seed':'int'})

# merged_df2['Team1Seed'] = np.where(merged_df2['Tourney']==1, merged_df2['Team1Seed'], '0')
# merged_df2['Team2Seed'] = np.where(merged_df2['Tourney']==1, merged_df2['Team2Seed'], '0')

# add team rank averages
massey_season_avg = massey_df.groupby(['Season','TeamID'], as_index=False)['OrdinalRank'].mean()

merged_df4 = pd.merge(merged_df3, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team1'], 
                       right_on=['Season','TeamID'])

merged_df5 = pd.merge(merged_df4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team2'], 
                       right_on=['Season','TeamID'])

merged_df5 = merged_df5.rename(columns={'OrdinalRank_x':'Team1RankMean','OrdinalRank_y':'Team2RankMean'})\
    .drop(columns=['TeamID_x','TeamID_y'])

merged_df5 = merged_df5.fillna(500)

merged_df5

Unnamed: 0,Season,DayNum,Team1,Team2,Tourney,WLoc,WTeamID,ScoreDiff,Team1Seed,Team2Seed,Team1FirstYear,Team1LastYear,Team2FirstYear,Team2LastYear,WTeam,Team1RankMean,Team2RankMean
0,1985,20,1228,1328,0,N,1228,17,3,1,1985,2021,1985,2021,1,500.000000,500.000000
1,1985,25,1106,1354,0,H,1106,7,30,30,1985,2021,1985,2021,1,500.000000,500.000000
2,1985,25,1112,1223,0,H,1112,7,10,30,1985,2021,1985,2021,1,500.000000,500.000000
3,1985,25,1165,1432,0,H,1165,16,30,30,1985,2020,1985,1987,1,500.000000,500.000000
4,1985,25,1192,1447,0,H,1192,12,16,30,1985,2021,1985,2021,1,500.000000,500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169126,2019,146,1120,1246,1,N,1120,6,5,2,1985,2021,1985,2021,1,17.748713,15.207700
169127,2019,146,1181,1277,1,N,1277,1,1,2,1985,2021,1985,2021,0,1.980769,6.603239
169128,2019,152,1277,1403,1,N,1403,10,2,3,1985,2021,1985,2021,0,6.603239,13.330275
169129,2019,152,1120,1438,1,N,1438,1,5,1,1985,2021,1985,2021,0,17.748713,3.682186


In [9]:
# add avg team scores
detailed_all = reg_long_df.append(tourney_long_df)

win_score_avg = detailed_all.groupby(['WTeamID','Season'], 
                                    as_index=False)['WScore'].mean()
lose_score_avg = detailed_all.groupby(['LTeamID','Season'], 
                                     as_index=False)['LScore'].mean()
score_merge = pd.merge(win_score_avg, lose_score_avg, 
                       left_on=['WTeamID','Season'], 
                       right_on=['LTeamID','Season'])
score_merge = score_merge.drop(columns=['LTeamID'])\
                         .rename(columns={'WTeamID':'TeamID',
                                          'WScore':'WScoreAvg',
                                          'LScore':'LScoreAvg'})

score_merge['SeasonTeam1'] = score_merge['Season'].astype(str) + score_merge['TeamID'].astype(str)

score_merge = score_merge[['SeasonTeam1','WScoreAvg','LScoreAvg']]

###########

merged_df5['SeasonTeam1'] = merged_df5['Season'].astype(str) + merged_df5['Team1'].astype(str)

merged_df6 = pd.merge(merged_df5, score_merge, 
                      how='left', 
                      on='SeasonTeam1')\
               .rename(columns={'WScoreAvg':'Team1WScoreAvg','LScoreAvg':'Team1LScoreAvg'})
score_merge = score_merge.rename(columns={'SeasonTeam1':'SeasonTeam2'})

merged_df6['SeasonTeam2'] = merged_df6['Season'].astype(str) + merged_df6['Team2'].astype(str)

merged_df7 = pd.merge(merged_df6, score_merge, 
                      how='left', 
                      on='SeasonTeam2')\
               .rename(columns={'WScoreAvg':'Team2WScoreAvg','LScoreAvg':'Team2LScoreAvg'})

merged_df7 = merged_df7.dropna(how='any').drop(columns=['SeasonTeam1','SeasonTeam2'])

merged_df7

Unnamed: 0,Season,DayNum,Team1,Team2,Tourney,WLoc,WTeamID,ScoreDiff,Team1Seed,Team2Seed,...,Team1LastYear,Team2FirstYear,Team2LastYear,WTeam,Team1RankMean,Team2RankMean,Team1WScoreAvg,Team1LScoreAvg,Team2WScoreAvg,Team2LScoreAvg
74048,2003,10,1104,1328,0,N,1104,6,10,1,...,2021,1985,2021,1,27.655502,15.730233,74.705882,61.000000,74.370370,54.857143
74049,2003,10,1272,1393,0,N,1272,7,7,3,...,2021,1985,2021,1,42.000000,25.596154,74.826087,73.000000,82.300000,63.200000
74050,2003,11,1266,1437,0,N,1266,12,3,30,...,2021,1985,2021,1,18.967442,60.385000,79.851852,72.166667,80.066667,64.333333
74051,2003,11,1296,1457,0,N,1296,6,30,30,...,2021,1987,2021,1,147.512500,209.452500,72.764706,65.785714,74.944444,59.500000
74052,2003,11,1208,1400,0,N,1400,6,30,1,...,2021,1985,2021,0,19.261283,9.416279,82.210526,72.000000,78.730769,81.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169126,2019,146,1120,1246,1,N,1120,6,5,2,...,2021,1985,2021,1,17.748713,15.207700,82.275862,69.900000,76.400000,73.428571
169127,2019,146,1181,1277,1,N,1277,1,1,2,...,2021,1985,2021,0,1.980769,6.603239,83.875000,76.500000,79.218750,70.000000
169128,2019,152,1277,1403,1,N,1403,10,2,3,...,2021,1985,2021,0,6.603239,13.330275,79.218750,70.000000,74.903226,63.285714
169129,2019,152,1120,1438,1,N,1438,1,5,1,...,2021,1985,2021,0,17.748713,3.682186,82.275862,69.900000,71.828571,66.666667


## Feature selection and engineering

In [22]:
# remove Tourney 2015-19 records

# merged_df7 = merged_df7.astype({'Team1':'str','Team2':'str'})
features_df = pd.get_dummies(merged_df7[['Season','Team1','Team2','Tourney','WLoc',
                         'Team1Seed','Team2Seed','Team1RankMean',
                         'Team2RankMean','Team1FirstYear','Team2FirstYear',
                         'Team1WScoreAvg','Team1LScoreAvg','Team2WScoreAvg',
                         'Team2LScoreAvg','WTeam']])

features_df = features_df.loc[(features_df.Season < 2014) & (merged_df5.Tourney==0)]

features_df

Unnamed: 0,Season,Team1,Team2,Tourney,WLoc,Team1Seed,Team2Seed,Team1RankMean,Team2RankMean,Team1FirstYear,Team2FirstYear,Team1WScoreAvg,Team1LScoreAvg,Team2WScoreAvg,Team2LScoreAvg,WTeam
74048,2003,1104,1328,0,N,10,1,27.655502,15.730233,1985,1985,74.705882,61.000000,74.370370,54.857143,1
74049,2003,1272,1393,0,N,7,3,42.000000,25.596154,1985,1985,74.826087,73.000000,82.300000,63.200000,1
74050,2003,1266,1437,0,N,3,30,18.967442,60.385000,1985,1985,79.851852,72.166667,80.066667,64.333333,1
74051,2003,1296,1457,0,N,30,30,147.512500,209.452500,1985,1987,72.764706,65.785714,74.944444,59.500000,1
74052,2003,1208,1400,0,N,30,1,19.261283,9.416279,1985,1985,82.210526,72.000000,78.730769,81.571429,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129199,2013,1231,1458,0,N,1,5,4.748869,30.787158,1985,1985,81.965517,64.857143,71.173913,53.083333,0
129200,2013,1274,1314,0,N,2,8,20.012926,36.366114,1986,1985,71.793103,61.000000,80.916667,64.363636,1
129201,2013,1196,1279,0,N,3,12,5.980769,36.901561,1985,1985,74.172414,61.375000,79.444444,70.444444,0
129202,2013,1326,1458,0,N,2,5,13.718326,30.787158,1985,1985,72.310345,62.750000,71.173913,53.083333,1


In [23]:
# select features and target
target = features_df.pop('WTeam')
selected_features = features_df

print(selected_features.shape)
print(target.shape)

(55071, 15)
(55071,)


In [24]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(), make_column_selector(dtype_include=object))
)

ct.fit_transform(selected_features)

array([[-1.6454118 , -1.40338717, -0.18663373, ...,  0.        ,
         0.        ,  1.        ],
       [-1.6454118 ,  0.54453122,  0.56825713, ...,  0.        ,
         0.        ,  1.        ],
       [-1.6454118 ,  0.47496271,  1.07926018, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.53987751, -0.33666995, -0.7557053 , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.53987751,  1.17064785,  1.32314799, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.53987751,  1.87792775,  1.03280536, ...,  0.        ,
         0.        ,  1.        ]])

## MLP Classifier testing

In [25]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(selected_features, 
                                                    target, stratify=target, 
                                                    random_state=42)

clf = make_pipeline(ct, MLPClassifier(random_state=69, max_iter=300, verbose=True))

clf.fit(X_train, y_train)

clf.predict_proba(X_test)
preds = clf.predict(X_test)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
loss_score = metrics.log_loss(y_test, preds)

print(f'Train score: {test_score}')
print(f'Test score: {test_score}')
print(f'Log loss: {loss_score}')

Iteration 1, loss = 0.53737792
Iteration 2, loss = 0.48981758
Iteration 3, loss = 0.48708932
Iteration 4, loss = 0.48549988
Iteration 5, loss = 0.48461788
Iteration 6, loss = 0.48355293
Iteration 7, loss = 0.48285424
Iteration 8, loss = 0.48229768
Iteration 9, loss = 0.48205195
Iteration 10, loss = 0.48158911
Iteration 11, loss = 0.48090898
Iteration 12, loss = 0.48059393
Iteration 13, loss = 0.48045013
Iteration 14, loss = 0.48018724
Iteration 15, loss = 0.47985314
Iteration 16, loss = 0.47961785
Iteration 17, loss = 0.47932781
Iteration 18, loss = 0.47886142
Iteration 19, loss = 0.47899852
Iteration 20, loss = 0.47858519
Iteration 21, loss = 0.47840994
Iteration 22, loss = 0.47827208
Iteration 23, loss = 0.47801837
Iteration 24, loss = 0.47759829
Iteration 25, loss = 0.47758757
Iteration 26, loss = 0.47751935
Iteration 27, loss = 0.47713183
Iteration 28, loss = 0.47694483
Iteration 29, loss = 0.47656950
Iteration 30, loss = 0.47679721
Iteration 31, loss = 0.47643684
Iteration 32, los

Best log loss: 8.602190376710555

## Format and submission

In [14]:
def format_submit(model, df):
    '''Creates and formats submission '''
    preds = model.predict(df)
    prob = [x[1] for x in clf.predict_proba(df)]
    
    predict_df = pd.DataFrame({'Season':df['Season'],
                  'Team1':df['Team1'],
                  'Team2':df['Team2'],
                  'Pred':preds,
                  'Probability':prob}).round(1)

    predict_df['ID'] = predict_df['Season'].astype(str) + '_' +\
                       predict_df['Team1'].astype(str) + '_' +\
                       predict_df['Team2'].astype(str)
    
    return predict_df[['ID','Probability']].reset_index(drop=True)

In [15]:
submit_data = pd.DataFrame(submission_example_df['ID'].str.split('_',expand=True))
submit_data.columns = ['Season','Team1','Team2']
submit_data = submit_data.astype('int')

# make boolean columns for Tourney and WLoc
submit_data['Tourney'] = 1
submit_data['WLoc_A'] = 0
submit_data['WLoc_H'] = 0
submit_data['WLoc_N'] = 1

# submit_data = submit_data.astype({'WLoc_A':'int','WLoc_H':'int','WLoc_N':'int'})

# add team 1 and 2 tourney seed column
submit_data2 = pd.merge(submit_data, seed_df, 
                     how='left', 
                     left_on=['Season','Team1'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

submit_data2 = submit_data2.rename(columns={'Seed':'Team1Seed'})

submit_data3 = pd.merge(submit_data2, seed_df, 
                     how='left', 
                     left_on=['Season','Team2'], 
                     right_on=['Season','TeamID'])\
            .drop(columns=['TeamID'])

submit_data4 = submit_data3.rename(columns={'Seed':'Team2Seed'})

# change seed data type and remove alpha characters
submit_data4['Team1Seed'] = submit_data4['Team1Seed'].str.strip().str[1:3]
submit_data4['Team1Seed'] = np.where(len(submit_data4['Team1Seed'])>2, 
                                   submit_data4['Team1Seed'].str[:2], 
                                   submit_data4['Team1Seed'])
submit_data4['Team2Seed'] = submit_data4['Team2Seed'].str.strip().str[1:3]
submit_data4['Team2Seed'] = np.where(len(submit_data4['Team2Seed'])>2, 
                                   submit_data4['Team2Seed'].str[:2], 
                                   submit_data4['Team2Seed'])

# test_data3 = test_data3.fillna('30')

submit_data4 = submit_data4.astype({'Team1Seed':'int','Team2Seed':'int'})

# add rank data
massey_season_avg = massey_df.groupby(['Season','TeamID'], as_index=False)['OrdinalRank'].mean()

submit_data4 = pd.merge(submit_data4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team1'], 
                       right_on=['Season','TeamID'])

submit_data5 = pd.merge(submit_data4, massey_season_avg, 
                       how='left', 
                       left_on=['Season','Team2'], 
                       right_on=['Season','TeamID'])

submit_data5 = submit_data5.rename(columns={'OrdinalRank_x':'Team1RankMean',
                                            'OrdinalRank_y':'Team2RankMean'})\
    .drop(columns=['TeamID_x','TeamID_y'])

# test_data5 = test_data5.fillna(500)

# add first/last D1 year
submit_data5 = pd.merge(submit_data5, teams_df, how='left', left_on='Team1', right_on='TeamID')
submit_data5 = submit_data5.drop(columns=['TeamID','TeamName'])
submit_data5 = submit_data5.rename(columns={'FirstD1Season':'Team1FirstYear',
                                            'LastD1Season':'Team1LastYear'})

submit_data5 = pd.merge(submit_data5, teams_df, how='left', left_on='Team2', right_on='TeamID')
submit_data5 = submit_data5.drop(columns=['TeamID','TeamName'])
submit_data5 = submit_data5.rename(columns={'FirstD1Season':'Team2FirstYear',
                                            'LastD1Season':'Team2LastYear'})

# set DF shape
submit_data5 = submit_data5[['Season','Team1','Team2',
                             'Tourney','Team1Seed','Team2Seed',
                             'Team1RankMean','Team2RankMean',
                             'Team1FirstYear','Team2FirstYear',
                             'WLoc_A','WLoc_H','WLoc_N']]
submit_data5.shape

(11390, 13)

In [16]:
# add avg team scores
detailed_all = reg_long_df.append(tourney_long_df)

win_score_avg = detailed_all.groupby(['WTeamID','Season'], 
                                    as_index=False)['WScore'].mean()
lose_score_avg = detailed_all.groupby(['LTeamID','Season'], 
                                     as_index=False)['LScore'].mean()
score_merge = pd.merge(win_score_avg, lose_score_avg, 
                       left_on=['WTeamID','Season'], 
                       right_on=['LTeamID','Season'])
score_merge = score_merge.drop(columns=['LTeamID'])\
                         .rename(columns={'WTeamID':'TeamID',
                                          'WScore':'WScoreAvg',
                                          'LScore':'LScoreAvg'})

score_merge['SeasonTeam1'] = score_merge['Season'].astype(str) + score_merge['TeamID'].astype(str)

score_merge = score_merge[['SeasonTeam1','WScoreAvg','LScoreAvg']]

###########

submit_data5['SeasonTeam1'] = submit_data5['Season'].astype(str) + merged_df5['Team1'].astype(str)

submit_data6 = pd.merge(submit_data5, score_merge, 
                      how='left', 
                      on='SeasonTeam1')\
               .rename(columns={'WScoreAvg':'Team1WScoreAvg','LScoreAvg':'Team1LScoreAvg'})
score_merge = score_merge.rename(columns={'SeasonTeam1':'SeasonTeam2'})

submit_data6['SeasonTeam2'] = submit_data6['Season'].astype(str) + merged_df6['Team2'].astype(str)

submit_data7 = pd.merge(submit_data6, score_merge, 
                      how='left', 
                      on='SeasonTeam2')\
               .rename(columns={'WScoreAvg':'Team2WScoreAvg','LScoreAvg':'Team2LScoreAvg'})

submit_data7 = submit_data7.dropna(how='any').drop(columns=['SeasonTeam1','SeasonTeam2'])

submit_data7 = submit_data7[['Season','Team1','Team2','Tourney',
                         'Team1Seed','Team2Seed','Team1RankMean',
                         'Team2RankMean','Team1FirstYear','Team2FirstYear',
                         'Team1WScoreAvg','Team1LScoreAvg','Team2WScoreAvg',
                         'Team2LScoreAvg','WLoc_A','WLoc_H','WLoc_N']]
submit_data7.shape

(10844, 17)

In [19]:
# # create new DF of all potential matchups
# # for 2020 NCAA Tourney only
# import itertools
# all_combos = list(itertools.combinations(teams_dict[i], 2)) 
# pd.DataFrame(all_combos)

In [20]:
format_submit(clf, submit_data7)

Unnamed: 0,ID,Probability
0,2015_1107_1112,0.0
1,2015_1107_1116,0.1
2,2015_1107_1124,0.0
3,2015_1107_1129,0.2
4,2015_1107_1138,0.1
...,...,...
10839,2019_1449_1459,0.6
10840,2019_1449_1463,0.9
10841,2019_1458_1459,0.8
10842,2019_1458_1463,0.9


In [37]:
def add_avg(stat, main_df):
    '''Calculates average of one stat per team per season and adds to main dataset'''
    detailed_all = reg_long_df.append(tourney_long_df)

    win_stat_avg = detailed_all.groupby(['WTeamID','Season'], 
                                        as_index=False)[f'W{stat}'].mean()
    lose_stat_avg = detailed_all.groupby(['LTeamID','Season'], 
                                         as_index=False)[f'L{stat}'].mean()
    stat_merge = pd.merge(win_stat_avg, lose_stat_avg, 
                           left_on=['WTeamID','Season'], 
                           right_on=['LTeamID','Season'])
    stat_merge = stat_merge.drop(columns=['LTeamID'])\
                           .rename(columns={'WTeamID':'TeamID',
                                             f'W{stat}':f'W{stat}Avg',
                                             f'L{stat}':f'L{stat}Avg'})

    stat_merge['SeasonTeam1'] = stat_merge['Season'].astype(str) + stat_merge['TeamID'].astype(str)

    stat_merge = stat_merge[['SeasonTeam1',f'W{stat}Avg',f'L{stat}Avg']]

    ###########

    merged_df['SeasonTeam1'] = main_df['Season'].astype(str) + main_df['Team1'].astype(str)

    merged_df2 = pd.merge(merged_df, stat_merge, 
                          how='left', 
                          on='SeasonTeam1')\
                   .rename(columns={f'W{stat}Avg':f'Team1W{stat}Avg',
                                    f'L{stat}Avg':f'Team1L{stat}Avg'})
    stat_merge = stat_merge.rename(columns={'SeasonTeam1':'SeasonTeam2'})

    merged_df2['SeasonTeam2'] = merged_df2['Season'].astype(str) + merged_df6['Team2'].astype(str)

    merged_df3 = pd.merge(merged_df2, stat_merge, 
                          how='left', 
                          on='SeasonTeam2')\
                   .rename(columns={f'W{stat}Avg':f'Team2W{stat}Avg',
                                    f'L{stat}Avg':f'Team2L{stat}Avg'})
    merged_df3 = merged_df3.dropna(how='any').drop(columns=['SeasonTeam1','SeasonTeam2'])

    return merged_df3

In [38]:
add_avg('FGA',merged_df7)

Unnamed: 0,Season,DayNum,Team1,Team2,Tourney,WLoc,WTeamID,ScoreDiff,Team1Seed,Team1WFGAAvg,Team1LFGAAvg,Team2WFGAAvg,Team2LFGAAvg
74048,2003,10,1104,1328,0,N,1104,6,Y10,58.352941,55.083333,56.481481,55.285714
74049,2003,10,1272,1393,0,N,1272,7,Z07,59.434783,63.142857,60.900000,63.800000
74050,2003,11,1266,1437,0,N,1266,12,Y03,55.000000,64.333333,59.000000,59.133333
74054,2003,12,1161,1236,0,H,1161,18,Z14,52.294118,52.000000,57.777778,53.600000
74058,2003,13,1166,1426,0,H,1166,56,Z06,57.448276,58.000000,48.714286,57.692308
...,...,...,...,...,...,...,...,...,...,...,...,...,...
169126,2019,146,1120,1246,1,N,1120,6,Y05,61.000000,59.000000,54.766667,58.142857
169127,2019,146,1181,1277,1,N,1277,1,W01,62.937500,67.333333,57.937500,53.714286
169128,2019,152,1277,1403,1,N,1403,10,W02,57.937500,53.714286,55.225806,56.857143
169129,2019,152,1120,1438,1,N,1438,1,Y05,61.000000,59.000000,54.000000,55.333333
