In [2]:
# Import Libraries
import pandas as pd

In [1]:
# Specify where the data files are located locally
KAGGLE = 'data/march-machine-learning-mania-2023/'
SPORTS = 'data/sports-reference/'
SAVED = 'data/saved/'

# Alias so that any cell building its paths from DATA_DIR resolves to the
# Kaggle folder instead of relying on a name left over from an earlier kernel
# session (a fresh "Restart & Run All" would otherwise raise NameError).
# NOTE(review): assumes the files loaded via DATA_DIR live in KAGGLE — confirm.
DATA_DIR = KAGGLE

##### **Data Section 1 - CONFERENCES**

In [3]:
# Creates Team Conferences dataframes (one for the 'M' file, one for the 'W' file).
# Paths are built from KAGGLE, defined in the configuration cell.
# NOTE(review): assumes both CSVs live in the Kaggle folder — confirm.
MTeamConfs = pd.read_csv(KAGGLE + 'MTeamConferences.csv')
WTeamConfs = pd.read_csv(KAGGLE + 'WTeamConferences.csv')

# Show sample output
print('MTeamConfs:', MTeamConfs.shape, 'WTeamConfs:', WTeamConfs.shape)
print('MTeamConfs:', MTeamConfs.columns)

MTeamConfs: (12662, 3) WTeamConfs: (8768, 3)
MTeamConfs: Index(['Season', 'TeamID', 'ConfAbbrev'], dtype='object')


In [4]:
# Reduce size of 'Team Conferences' dataframes (for the year 2010 and later).
# .copy() makes the filtered frames independent of the full ones.
df1m = MTeamConfs[MTeamConfs['Season'] >= 2010].copy()
df1w = WTeamConfs[WTeamConfs['Season'] >= 2010].copy()

# Add Conferences dataframe, read from the Kaggle folder (KAGGLE is defined in
# the configuration cell).
# NOTE(review): assumes Conferences.csv lives in the Kaggle folder — confirm.
df2b = pd.read_csv(KAGGLE + 'Conferences.csv')

# Merge 'Team Conferences' with 'Conferences' dataframes on 'ConfAbbrev' column.
# A left join keeps every filtered team-season row.
# NOTE: MTeamConfs / WTeamConfs are rebound to the merged result, so re-running
# this cell without first re-running the load cell applies the merge a second time.
MTeamConfs = pd.merge(df1m, df2b, on='ConfAbbrev', how='left')
WTeamConfs = pd.merge(df1w, df2b, on='ConfAbbrev', how='left')

# Show sample output
print('MTeamConfs (after merge):', MTeamConfs.shape, 'WTeamConfs (after merge):', WTeamConfs.shape)
print('MTeamConfs (after merge):', MTeamConfs.columns)

MTeamConfs (after merge): (4913, 4) WTeamConfs (after merge): (4883, 4)
MTeamConfs (after merge): Index(['Season', 'TeamID', 'ConfAbbrev', 'Description'], dtype='object')


In [5]:
# Snapshot the current 'Team Conferences' frames to use as the left side of the joins
df3m, df3w = MTeamConfs.copy(), WTeamConfs.copy()

# Team Conference Rankings (from sports-reference.com), one file per dataset.
# NOTE(review): these are read relative to the working directory; SPORTS from
# the configuration cell may be the intended folder — confirm before changing.
df4m = pd.read_csv('MConferenceRankingSR.csv')
df4w = pd.read_csv('WConferenceRankingSR.csv')

# Left-join the rankings onto the team rows: 'Season' pairs with 'Season',
# and the left 'Description' pairs with the right 'Conference'.
MConfRanks = df3m.merge(
    df4m,
    how='left',
    left_on=['Season', 'Description'],
    right_on=['Season', 'Conference'],
)
WConfRanks = df3w.merge(
    df4w,
    how='left',
    left_on=['Season', 'Description'],
    right_on=['Season', 'Conference'],
)

# Show sample output
print('MConfRanks:', MConfRanks.shape, 'WConfRanks:', WConfRanks.shape)
print('MConfRanks:', MConfRanks.columns)

MConfRanks: (4913, 12) WConfRanks: (4883, 12)
MConfRanks: Index(['Season', 'TeamID', 'ConfAbbrev', 'Description', 'ConfRank',
       'Conference', 'Schools', 'Wins', 'Losses', 'WinPCT', 'ConfSRS',
       'ConfSOS'],
      dtype='object')


In [6]:
# Preview the first five rows of the merged conference-ranking frame
MConfRanks.head(n=5)

Unnamed: 0,Season,TeamID,ConfAbbrev,Description,ConfRank,Conference,Schools,Wins,Losses,WinPCT,ConfSRS,ConfSOS
0,2010,1102,mwc,Mountain West Conference,7.0,Mountain West Conference,9.0,173.0,123.0,0.584459459,5.68,3.84
1,2010,1103,mac,Mid-American Conference,15.0,Mid-American Conference,12.0,195.0,190.0,0.506493506,-2.25,-1.27
2,2010,1104,sec,Southeastern Conference,5.0,Southeastern Conference,12.0,242.0,160.0,0.60199005,10.71,6.47
3,2010,1105,swac,Southwest Athletic Conference,32.0,Southwest Athletic Conference,10.0,120.0,188.0,0.38961039,-18.01,-11.01
4,2010,1106,swac,Southwest Athletic Conference,32.0,Southwest Athletic Conference,10.0,120.0,188.0,0.38961039,-18.01,-11.01


In [None]:
# Creates Seasons dataframes, read from the Kaggle folder (KAGGLE is defined in
# the configuration cell).
# NOTE(review): assumes both CSVs live in the Kaggle folder — confirm.
MSeasons = pd.read_csv(KAGGLE + 'MSeasons.csv')
WSeasons = pd.read_csv(KAGGLE + 'WSeasons.csv')

# Create a new field called 'StartDate' as DayZero converted to datetime
# (the date format is left for pandas to infer).
MSeasons['StartDate'] = pd.to_datetime(MSeasons['DayZero'])
WSeasons['StartDate'] = pd.to_datetime(WSeasons['DayZero'])

# Show sample output
print('Mens:', MSeasons.shape, 'Womens:', WSeasons.shape)
print('Mens:', MSeasons.columns)

In [None]:
# Work from copies of the current 'Team Conferences' frames
df3m, df3w = MTeamConfs.copy(), WTeamConfs.copy()

# Restrict the 'Seasons' frames to 2010 and later
df4m = MSeasons.loc[MSeasons['Season'] >= 2010].copy()
df4w = WSeasons.loc[WSeasons['Season'] >= 2010].copy()

# Left-join the season-level columns onto every team row via 'Season'.
# The results replace MTeamConfs / WTeamConfs; df3m / df3w keep the pre-merge
# state, which the shape printout below compares against.
MTeamConfs = df3m.merge(df4m, on='Season', how='left')
WTeamConfs = df3w.merge(df4w, on='Season', how='left')

# Show sample output
print('MTeamConfs (before):', df3m.shape, 'MSeasons:', df4m.shape, 'MTeamConfs (after):', MTeamConfs.shape)
print('WTeamConfs (before):', df3w.shape, 'WSeasons:', df4w.shape, 'WTeamConfs (after):', WTeamConfs.shape)
print('MTeamConfs (after merge):', MTeamConfs.columns)

In [None]:
# Creates Team Coaches dataframe, read from the Kaggle folder (KAGGLE is defined
# in the configuration cell).
# NOTE(review): assumes MTeamCoaches.csv lives in the Kaggle folder — confirm.
MCoaches = pd.read_csv(KAGGLE + 'MTeamCoaches.csv')

# Show sample output
print('MCoaches:', MCoaches.shape)
print('MCoaches:', MCoaches.columns)

In [None]:
# Reduce size of 'Coaches' dataframe (for the year 2010 and later)
df5m = MCoaches[MCoaches['Season'] >= 2010].copy()

# Create a new field called 'CoachChg': the span between 'FirstDayNum' and
# 'LastDayNum' divided by 154 (intended as the fraction of the season coached).
# Computed with column arithmetic instead of a row-by-row apply.
# NOTE(review): 154 is presumably the season length in day numbers — confirm.
df5m['CoachChg'] = (df5m['LastDayNum'] - df5m['FirstDayNum']) / 154

# Drop the two day-number columns by name rather than by position, so the
# result does not depend on the column order of the CSV.
# NOTE(review): assumes these were the columns at positions 2 and 3 — confirm.
df5m = df5m.drop(columns=['FirstDayNum', 'LastDayNum'])

# Rename dataframe.
# NOTE: MCoaches is rebound to the reduced frame, so this cell cannot be re-run
# without first re-running the cell that loads MTeamCoaches.csv.
MCoaches = df5m.copy()

# Show sample output
print('MCoaches:', MCoaches.shape)
print('MCoaches:', MCoaches.columns)

In [None]:
# Left-join the coach columns onto the men's frame; both sides use the same
# key names, 'Season' and 'TeamID'.
# NOTE(review): if MCoaches holds several rows for one Season/TeamID pair, the
# join repeats that team row once per coach — confirm this is intended.
MTeamConfs = MTeamConfs.merge(MCoaches, how='left', on=['Season', 'TeamID'])

# New column order, expressed as positions in the current frames
reo_mconfs = [0, 9, 4, 1, 2, 3, 5, 6, 7, 8, 11, 10]
reo_wconfs = [0, 9, 4, 1, 2, 3, 5, 6, 7, 8]

# Reorder first, then remove 'DayZero' from each frame
MTeamConfs = MTeamConfs.iloc[:, reo_mconfs]
MTeamConfs = MTeamConfs.drop(columns=['DayZero'])
WTeamConfs = WTeamConfs.iloc[:, reo_wconfs]
WTeamConfs = WTeamConfs.drop(columns=['DayZero'])

# Show sample output
print('MTeamConfs (after merge):', MTeamConfs.shape, 'Missing:', MTeamConfs.isna().sum().sum())
print('WTeamConfs (after merge):', WTeamConfs.shape, 'Missing:', WTeamConfs.isna().sum().sum())
print('MTeamConfs (after merge):', MTeamConfs.columns)

In [None]:
# Write the final frames to CSV without the index column.
# to_csv returns None when it is given a file path, so the calls are not
# assigned to variables (the old names only ever held None).
# NOTE(review): files are written to the working directory; SAVED from the
# configuration cell may be the intended destination — confirm before changing.
MTeamConfs.to_csv('seasons-mens.csv', index=False)
WTeamConfs.to_csv('seasons-womens.csv', index=False)