In [1]:
DATA_DIR = 'march-machine-learning-mania-2023/' # input('Enter file path for data files: ')

In [2]:
# For manipulating data
import pandas as pd

##### **Data Section 1 - SEASONS**

In [3]:
# Creates Team Conferences dataframes:
MTeamConfs = pd.read_csv(DATA_DIR + 'MTeamConferences.csv')
WTeamConfs = pd.read_csv(DATA_DIR + 'WTeamConferences.csv')

# Show sample output
print('MTeamConfs:', MTeamConfs.shape, 'WTeamConfs:', WTeamConfs.shape)
print('MTeamConfs:', MTeamConfs.columns)

MTeamConfs: (12662, 3) WTeamConfs: (8768, 3)
MTeamConfs: Index(['Season', 'TeamID', 'ConfAbbrev'], dtype='object')


In [4]:
# Reduce size of 'Team Conferences' dataframes (for the year 2000 and later)
df1m = MTeamConfs[MTeamConfs['Season'] >= 2000].copy()
df1w = WTeamConfs[WTeamConfs['Season'] >= 2000].copy()

df2b = pd.read_csv(DATA_DIR + 'Conferences.csv') # add Conferences dataframe:

# Merge 'Team Conferences' with 'Conferences' dataframes on 'ConfAbbrev' column
MTeamConfs = pd.merge(df1m, df2b, on='ConfAbbrev', how='left')
WTeamConfs = pd.merge(df1m, df2b, on='ConfAbbrev', how='left')

# Show sample output
print('MTeamConfs (after merge):', MTeamConfs.shape, 'WTeamConfs (after merge):', WTeamConfs.shape)
print('MTeamConfs (after merge):', MTeamConfs.columns)

MTeamConfs (after merge): (8212, 4) WTeamConfs (after merge): (8212, 4)
MTeamConfs (after merge): Index(['Season', 'TeamID', 'ConfAbbrev', 'Description'], dtype='object')


In [5]:
# Creates Seasons dataframes:
MSeasons = pd.read_csv(DATA_DIR + 'MSeasons.csv')
WSeasons = pd.read_csv(DATA_DIR + 'WSeasons.csv')

# Create a new field called 'StartDate' as DayZero converted to datetime 
MSeasons['StartDate'] = pd.to_datetime(MSeasons['DayZero'])
WSeasons['StartDate'] = pd.to_datetime(WSeasons['DayZero'])

# Show sample output
print('Mens:', MSeasons.shape, 'Womens:', WSeasons.shape)
print('Mens:', MSeasons.columns)

Mens: (39, 7) Womens: (26, 7)
Mens: Index(['Season', 'DayZero', 'RegionW', 'RegionX', 'RegionY', 'RegionZ',
       'StartDate'],
      dtype='object')


In [6]:
# Copy 'Team Conferences' dataframes for merge
df3m = MTeamConfs.copy()
df3w = WTeamConfs.copy()

# Reduce size of 'Seasons' dataframes (for the year 2000 and later)
df4m = MSeasons[MSeasons['Season'] >= 2000].copy()
df4w = WSeasons[WSeasons['Season'] >= 2000].copy()

# Merge 'Team Conferences' with 'Seasons' dataframes on 'Season' column
MTeamConfs = pd.merge(df3m, df4m, on='Season', how='left')
WTeamConfs = pd.merge(df3w, df4w, on='Season', how='left')

# Show sample output
print('MTeamConfs (before):', df3m.shape, 'MSeasons:', df4m.shape, 'MTeamConfs (after):', MTeamConfs.shape)
print('WTeamConfs (before):', df3w.shape, 'WSeasons:', df4w.shape, 'WTeamConfs (after):', WTeamConfs.shape)
print('MTeamConfs (after merge):', MTeamConfs.columns)

MTeamConfs (before): (8212, 4) MSeasons: (24, 7) MTeamConfs (after): (8212, 10)
WTeamConfs (before): (8212, 4) WSeasons: (24, 7) WTeamConfs (after): (8212, 10)
MTeamConfs (after merge): Index(['Season', 'TeamID', 'ConfAbbrev', 'Description', 'DayZero', 'RegionW',
       'RegionX', 'RegionY', 'RegionZ', 'StartDate'],
      dtype='object')


In [7]:
# Creates Team Coaches dataframe:
MCoaches = pd.read_csv(DATA_DIR + 'MTeamCoaches.csv')

# Show sample output
print('MCoaches:', MCoaches.shape)
print('MCoaches:', MCoaches.columns)

MCoaches: (12794, 5)
MCoaches: Index(['Season', 'TeamID', 'FirstDayNum', 'LastDayNum', 'CoachName'], dtype='object')


In [8]:
# Reduce size of 'Coaches' dataframes (for the year 2000 and later)
df5m = MCoaches[MCoaches['Season'] >= 2000].copy()

# Create a new field called 'CoachChg' (fraction of season coached, between 0 and 1) 
df5m['CoachChg'] = df5m.apply(lambda x: (x['LastDayNum'] - x['FirstDayNum']) / 154, axis=1)

# Drop unwanted columns
df5m.drop(df5m.columns[[2, 3]], axis=1, inplace=True)

# Rename dataframe
MCoaches = df5m.copy()

# Show sample output
print('MCoaches:', MCoaches.shape)
print('MCoaches:', MCoaches.columns)

MCoaches: (8303, 4)
MCoaches: Index(['Season', 'TeamID', 'CoachName', 'CoachChg'], dtype='object')


In [9]:
# Merge 'Team Conferences' with 'Coaches' dataframes on 'Season' and 'TeamID' columns
MTeamConfs = pd.merge(MTeamConfs, MCoaches,  how='left', left_on=['Season', 'TeamID'], right_on = ['Season', 'TeamID'])

# Reorder and drop unwanted columns from dataframe
reo_mconfs = [0, 9, 4, 1, 2, 3, 5, 6, 7, 8, 11, 10]
reo_wconfs = [0, 9, 4, 1, 2, 3, 5, 6, 7, 8]
MTeamConfs = MTeamConfs.iloc[:,reo_mconfs].drop(columns=['DayZero'])
WTeamConfs = WTeamConfs.iloc[:,reo_wconfs].drop(columns=['DayZero'])

# Show sample output
print('MTeamConfs (after merge):', MTeamConfs.shape, 'Missing:', MTeamConfs.isna().sum().sum())
print('WTeamConfs (after merge):', WTeamConfs.shape, 'Missing:', WTeamConfs.isna().sum().sum())
print('MTeamConfs (after merge):', MTeamConfs.columns)

MTeamConfs (after merge): (8293, 11) Missing: 0
WTeamConfs (after merge): (8212, 9) Missing: 0
MTeamConfs (after merge): Index(['Season', 'StartDate', 'TeamID', 'ConfAbbrev', 'Description', 'RegionW',
       'RegionX', 'RegionY', 'RegionZ', 'CoachChg', 'CoachName'],
      dtype='object')


In [10]:
seasons_mens = MTeamConfs.to_csv('seasons-mens.csv',index=False)
seasons_womens = WTeamConfs.to_csv('seasons-womens.csv',index=False)