In [3]:
DATA_DIR = 'march-machine-learning-mania-2023/'

In [4]:
# For working with files
import os
import re
import csv

# For manipulating data
import pandas as pd

# For manipulating dates
import datetime

# For getting geographic locations and distances
from geopy.distance import geodesic
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

#### **Load Data**

##### **Data Section 1 - SEASONS**

In [None]:
# Load MENS Seasons dataframes:
MSeasons = pd.read_csv('seasons-mens.csv')
print(MSeasons.shape)
MSeasons.head(3)

##### **Data Section 2 - TEAMS**

In [5]:
# Creates a Teams dataframes:
MTeams = pd.read_csv(DATA_DIR + 'MTeams.csv')
WTeams = pd.read_csv(DATA_DIR + 'WTeams.csv')

# Show sample output
print('MTeams:', MTeams.shape, 'WTeams:', WTeams.shape)
print('MTeams:', MTeams.columns)

MTeams: (377, 4) WTeams: (375, 2)
MTeams: Index(['TeamID', 'TeamName', 'FirstD1Season', 'LastD1Season'], dtype='object')


In [None]:
reorder = [0, 1, 3, 4, 5, 2, 6]
renames = {'Season':'Season', 'TeamID':'TeamID', 'ConfAbbrev':'ConfID', 'TeamName':'TeamName', 'FirstD1Season':'FirstD1', 'LastD1Season':'LastD1', 'Description':'ConfName'}

df6 = df6.iloc[:,new_order].rename(columns=new_names)


In [8]:
# Creates Team Coaches dataframe:
MCoaches = pd.read_csv(DATA_DIR + 'MTeamCoaches.csv')

# Show sample output
print('MCoaches:', MCoaches.shape)
print('MCoaches:', MCoaches.columns)

MCoaches: (12794, 5)
MCoaches: Index(['Season', 'TeamID', 'FirstDayNum', 'LastDayNum', 'CoachName'], dtype='object')


In [9]:
# Creates Conferences dataframe:
Conferences = pd.read_csv(DATA_DIR + 'Conferences.csv')

# Show sample output
print('Conferences:', Conferences.shape)
print('Conferences:', Conferences.columns)

Conferences: (51, 2)
Conferences: Index(['ConfAbbrev', 'Description'], dtype='object')


In [10]:
# Creates Team Conference dataframes:
MTeamConfs = pd.read_csv(DATA_DIR + 'MTeamConferences.csv')
WTeamConfs = pd.read_csv(DATA_DIR + 'WTeamConferences.csv')

# Show sample output
print('MTeamConfs:', MTeamConfs.shape, 'WTeamConfs:', WTeamConfs.shape)
print('MTeamConfs:', MTeamConfs.columns)

MTeamConfs: (12662, 3) WTeamConfs: (8768, 3)
MTeamConfs: Index(['Season', 'TeamID', 'ConfAbbrev'], dtype='object')


In [11]:
# Creates Alternate Team Name Spellings dataframes:
MSpellings = pd.read_csv(DATA_DIR + 'MTeamSpellings.csv', encoding='latin-1')
WSpellings = pd.read_csv(DATA_DIR + 'WTeamSpellings.csv', encoding='latin-1')

## Pivot table  -------------------------------------------------

import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

# Group the dataframe by TeamID and apply a function to create a list of unique TeamNameSpellings for each group
grouped_df = MSpellings.groupby('TeamID').apply(lambda x: list(x['TeamNameSpelling'].unique()))

# Determine the maximum number of alternate spellings for a TeamID
max_alt_spellings = grouped_df.apply(len).max()

# Create a new dataframe with the appropriate number of columns
new_columns = ['Name{}'.format(i) for i in range(1, max_alt_spellings + 1)] + ['TeamID']
new_df = pd.DataFrame(columns=new_columns)

# Iterate over the groups in the grouped dataframe and add rows to the new dataframe
for group_name, group_values in grouped_df.items():
    new_row = {key: value for key, value in zip(new_columns, group_values)}
    new_row['TeamID'] = group_name
    new_df = new_df.append(new_row, ignore_index=True)

# Output the resulting dataframe
MSpellings = new_df.copy()

## Add new columns  -------------------------------------------------

name_extended = ['Abilene Christian','Air Force','Akron','Alabama','Alabama AM','Alabama State','Albany','Alcorn','Alliant International','American','Appalachian State','Arizona','Arizona State','Arkansas Little Rock','Arkansas Pine Bluff','Arkansas','Arkansas State','Armstrong','Army','Auburn','Augusta','Austin Peay','Ball State','Baylor','Belmont','Bethune-Cookman','Binghamton','Birmingham Southern','Boise State','Boston College','Boston','Bowling Green','Bradley','Brooklyn','Brown','Bryant','Bucknell','Buffalo','Butler','Brigham Young','Central Michigan','California Polytech','California','Campbell','Canisius','Central Arkansas','Centenary','Central Connecticut','Charleston Southern','Charlotte','Chattanooga','Chicago State','Cincinnati','Citadel','Clemson','Cleveland State','Coastal Carolina','Charleston','Colgate','Colorado','Colorado State','Columbia','Connecticut','Coppin State','Cornell','Creighton','CSU Bakersfield','CSU Fullerton','CSU Northridge','Sacramento State','Dartmouth','Davidson','Dayton','Delaware','Delaware State','Denver','Depaul','Detroit','Drake','Drexel','Duke','Duquesne','Eastern Illinois','Eastern Kentucky','Eastern Michigan','Eastern Washington','East Carolina','SIU Edwardsville','Elon','East Tennessee State','Evansville','Fairleigh Dickinson','Fairfield','Florida Atlantic','Florida Gulf Coast','Florida','Florida AM','Florida International','Florida State','Fordham','Fresno State','Furman','George Washington','Georgia Southern','Gardner Webb','George Mason','Georgetown','Georgia','Georgia State','Georgia Tech','Gonzaga','Grambling','Grand Canyon','Hampton','Hardin-Simmons','Hartford','Harvard','Hawaii','High Point','Hofstra','Holy Cross','Houston','Houston Christian','Howard','Idaho','Idaho State','Illinois Chicago','Illinois','Illinois State','Incarnate Word','Indiana','Indiana State','Iona','Iowa','Iowa State','Fort Wayne','Indiana Purdue','Jackson State','Jacksonville','Jacksonville State','James Madison','Kansas','Kansas State','Kennesaw','Kent','Kentucky','La Salle','Lafayette','Lamar','Lehigh','Liberty','Lipscomb','Long Beach State','LIU Brooklyn','Longwood','Louisiana Tech','Louisville','Loyola Marymount','Loyola Maryland','Loyola Chicago','Louisiana State','Massachusetts Lowell','Maine','Manhattan','Marist','Marquette','Marshall','Maryland','Massachusetts','McNeese','Maryland East Shore','Memphis','Mercer','Miami FL','Miami OH','Michigan','Michigan State','Minnesota','Mississippi','Mississippi State','Missouri','UM Kansas City','Missouri State','Monmouth','Montana','Montana State','Morehead State','Morgan State','Morris Brown','Miss Valley State','Mount Saint Marys','Middle Tennessee State','Murray State','Northern Colorado','North Dakota State','Northern Illinois','Northern Kentucky','Navy','North Carolina AT','North Carolina Central','North Carolina State','Northeastern Illinois','Nebraska Omaha','Nebraska','Nevada','New Hampshire','New Mexico','New Mexico State','New Orleans','Niagara','Nicholls','NJIT','Norfolk State','North Carolina','North Dakota','North Florida','North Texas','Northeastern','Northern Arizona','Northern Iowa','Northwestern','Northwestern Louisiana','Notre Dame','Oakland','Ohio','Ohio State','Oklahoma City','Oklahoma','Oklahoma State','Old Dominion','Oral Roberts','Oregon','Oregon State','Pacific','Penn','Penn State','Pepperdine','Pittsburgh','Portland','Portland State','Prairie View','Presbyterian','Princeton','Providence','Purdue','Quinnipiac','Radford','Rhode Island','Rice','Richmond','Rider','Robert Morris','Rutgers','South Carolina State','South Dakota State','Southern Illinois','Sacred Heart','Sam Houston','Samford','San Diego','San Diego State','San Francisco','San Jose State','UC Santa Barbara','UC Santa Clara','Savannah State','South Carolina Upstate','Southeastern Louisiana','Southeast Missouri State','Seattle','Seton Hall','Stephen F Austin','Siena','Southern Methodist','South Alabama','South Carolina','South Dakota','South Florida','Southern Mississippi','Southern','Southern Utah','St Bonaventure','St Francis NY','St Francis PA','St Johns','St Josephs','Saint Louis','St Marys','St Peters','Stanford','Stetson','Stony Brook','Syracuse','Texas AM Corpus Chris','Texas Christian','Temple','Tennessee','Tennessee State','Tennessee Tech','Texas','Texas AM','Texas State','Texas Tech','UT Martin','Toledo','Towson','Troy','Tulane','Tulsa','Texas Pan American','Texas Southern','Alabama Birmingham','UC Davis','UC Irvine','UC Riverside','Central Florida','UC Los Angeles','UL Lafayette','UL Monroe','UM Baltimore County','NC Asheville','NC Greensboro','NC Wilmington','Nevada Las Vegas','Southern California','UT Arlington','UT San Antonio','Utah','Utah State','Utah Valley','Texas El Paso','Utica','Virginia Commonwealth','Valparaiso','Vanderbilt','Vermont','Villanova','Virginia','Virginia Tech','Virginia Military','Western Carolina','Western Illinois','Western Kentucky','Western Michigan','Winston-Salem State','Western Texas AM','Wagner','Wake Forest','Washington','Washington State','Weber State','West Virginia','UW Green Bay','UW Milwaukee','Wichita State','William Mary','Winthrop','Wisconsin','Wofford','Wright State','Wyoming','Xavier','Yale','Youngstown State','California Baptist','North Alabama','Merrimack','Bellarmine','Dixie State','Tarleton State','UC San Diego','St Thomas MN','Lindenwood','Queens NC','Southern Indiana','Stonehill','Texas AM Commerce']
betexplorer = ["Abilene Christian", "Air Force", "Akron", "Alabama", "Alabama A&M", "Alabama State", "Albany", "Alcorn State", "", "American University", "Appalachian State", "Arizona", "Arizona State", "UALR", "Arkansas-Pine Bluff", "Arkansas", "Arkansas State", "", "Army", "Auburn", "", "Austin Peay", "Ball State", "Baylor", "Belmont", "Bethune-Cookman", "Binghamton", "", "Boise State", "Boston College", "Boston University", "Bowling Green", "Bradley", "", "Brown", "Bryant University", "Bucknell", "Buffalo", "Butler", "Brigham Young", "Central Michigan", "Cal Poly", "California", "Campbell", "Canisius", "Central Arkansas", "", "Central Connecticut State", "Charleston Southern", "Charlotte", "Chattanooga Mocs", "Chicago State", "Cincinnati", "Citadel", "Clemson", "Cleveland State", "Coastal Carolina", "Charleston", "Colgate", "Colorado", "Colorado State", "Columbia", "UConn", "Coppin State", "Cornell", "Creighton", "CSU Bakersfield", "CS Fullerton", "CS Northridge", "Sacramento State", "Dartmouth", "Davidson", "Dayton", "Delaware", "Delaware State", "Denver", "DePaul", "Detroit", "Drake", "Drexel", "Duke", "Duquesne", "Eastern Illinois", "Eastern Kentucky", "Eastern Michigan", "East. Washington", "East Carolina", "Siu Edwardsville", "Elon", "East Tennessee St", "Evansville", "Fairleigh Dickinson", "Fairfield", "Florida Atlantic", "Florida Gulf Coast", "Florida", "Florida A&M", "Florida International", "Florida State", "Fordham", "Fresno State", "Furman", "George Washington", "Georgia Southern", "Gardner Webb", "George Mason", "Georgetown", "Georgia", "Georgia State", "Georgia Tech", "Gonzaga", "Grambling St.", "Grand Canyon", "Hampton", "", "Hartford", "Harvard", "Hawaii", "High Point", "Hofstra", "Holy Cross", "Houston", "Houston Christian", "Howard", "Idaho", "Idaho State", "Illinois (Chi.)", "Illinois", "Illinois State", "Incarnate Word", "Indiana", "Indiana State", "Iona", "Iowa", "Iowa State", "IPFW", "IUPUI", "Jackson State", "Jacksonville", "Jacksonville State", "James Madison", "Kansas", "Kansas State", "Kennesaw State", "Kent State", "Kentucky", "La Salle", "Lafayette", "Lamar", "Lehigh", "Liberty", "Lipscomb", "Long Beach State", "LIU Sharks", "Longwood", "Louisiana Tech", "Louisville", "Loyola Marymount", "Loyola Maryland", "Loyola Chicago", "LSU", "UMass Lowell", "Maine Black Bears", "Manhattan", "Marist", "Marquette", "Marshall", "Maryland", "UMass", "McNeese State", "Md.-East. Shore", "Memphis", "Mercer", "Miami (FL)", "Miami (Ohio)", "Michigan", "Michigan State", "Minnesota", "Ole Miss", "Mississippi St.", "Missouri", "UMKC", "Missouri State", "Monmouth", "Montana", "Montana State", "Morehead State", "Morgan State", "", "Miss. Valley St.", "Mount St. Mary's", "Middle Tenn. St.", "Murray State", "Northern Colorado", "North Dakota St", "Northern Illinois", "Northern Kentucky", "Navy", "N. Carolina A&T", "N. Carolina Central", "NC State", "", "Nebraska O.", "Nebraska", "Nevada", "New Hampshire", "New Mexico", "New Mexico State", "New Orleans", "Niagara", "Nicholls State", "NJIT", "Norfolk State", "North Carolina", "North Dakota", "North Florida", "North Texas", "Northeastern", "Northern Arizona", "Northern Iowa", "Northwestern", "Northwestern St.", "Notre Dame", "Oakland", "Ohio", "Ohio State", "", "Oklahoma", "Oklahoma State", "Old Dominion", "Oral Roberts", "Oregon", "Oregon State", "Pacific", "Penn", "Penn State", "Pepperdine", "Pittsburgh", "Portland", "Portland State", "Prairie View A&M", "Presbyterian", "Princeton", "Providence", "Purdue", "Quinnipiac", "Radford", "Rhode Island", "Rice", "Richmond", "Rider", "Robert Morris", "Rutgers", "South Carolina St", "South Dakota St.", "Southern Illinois", "Sacred Heart", "Sam Houston St.", "Samford", "San Diego Toreros", "San Diego State", "San Francisco", "San Jose State", "UC Santa Barbara", "Santa Clara", "", "USC Upstate", "SE Louisiana", "Southeast Missouri State", "Seattle", "Seton Hall", "Stephen F. Austin", "Siena", "SMU Mustangs", "South Alabama", "South Carolina", "South Dakota Coyotes", "South Florida", "Southern Miss", "Southern Univ.", "Southern Utah", "St. Bonaventure", "St. Francis Brooklyn", "St. Francis (PA)", "St. John's (N.Y.)", "Saint Josephs", "St. Louis", "St. Marys (CA)", "St. Peters", "Stanford", "Stetson", "Stony Brook", "Syracuse", "Texas A&M-CC", "TCU", "Temple", "Tennessee", "Tennessee State", "Tennessee Tech", "Texas", "Texas A&M", "Texas State", "Texas Tech", "UT Martin", "Toledo", "Towson", "Troy", "Tulane", "Tulsa", "UTRGV", "Texas Southern", "UAB", "UC Davis", "UC Irvine", "UC Riverside", "UCF Knights", "UCLA", "Louisiana Lafayette", "Louisiana Monroe", "UMBC Retrievers", "UNC Asheville", "NC Greensboro", "NC Wilmington", "UNLV", "USC", "UT Arlington", "UTSA Roadrunners", "Utah Utes", "Utah State", "Utah Valley State", "UTEP", "", "VCU Rams", "Valparaiso", "Vanderbilt", "Vermont", "Villanova", "Virginia", "Virginia Tech", "VMI", "Western Carolina", "Western Illinois", "Western Kentucky", "Western Michigan", "", "", "Wagner", "Wake Forest", "Washington", "Washington State", "Weber State", "West Virginia", "Wisc. Green Bay", "Wisc. Milwaukee", "Wichita State", "William & Mary", "Winthrop", "Wisconsin", "Wofford", "Wright State", "Wyoming", "Xavier", "Yale", "Youngstown State", "California Baptist", "North Alabama", "Merrimack Warriors", "Bellarmine", "Utah Tech", "Tarleton", "UC San Diego", "St. Thomas (Minn.)", "", "", "", "", ""]

MSpellings['Name0'] = name_extended
MSpellings['Name11'] = betexplorer

reo_spell = [10, 11, 0, 1, 3, 4, 5, 2, 6, 7, 8, 9, 12]
MSpellings = MSpellings.iloc[:,reo_spell]

print(MSpellings.shape)
MSpellings.columns

(377, 13)


Index(['TeamID', 'Name0', 'Name1', 'Name2', 'Name4', 'Name5', 'Name6', 'Name3',
       'Name7', 'Name8', 'Name9', 'Name10', 'Name11'],
      dtype='object')

In [12]:
# Creates Seasons dataframes:
MSeasons = pd.read_csv(DATA_DIR + 'MSeasons.csv')
WSeasons = pd.read_csv(DATA_DIR + 'WSeasons.csv')

# Convert DayZero to datetime field called 'StartDate'
MSeasons['StartDate'] = pd.to_datetime(MSeasons['DayZero'])
WSeasons['StartDate'] = pd.to_datetime(WSeasons['DayZero'])

# Show sample output
print('Mens:', MSeasons.shape, 'Womens:', WSeasons.shape)
print('Mens:', MSeasons.columns)

Mens: (39, 7) Womens: (26, 7)
Mens: Index(['Season', 'DayZero', 'RegionW', 'RegionX', 'RegionY', 'RegionZ',
       'StartDate'],
      dtype='object')


In [13]:
reo_seas = [0, 6, 1, 2, 3, 4, 5]
ren_seas = {'Season':'Season', 'DayZero':'DayZero', 'RegionW':'RegionW', 
            'RegionX':'RegionX', 'RegionY':'RegionY', 'RegionZ':'RegionZ', 
            'StartDate':'StartDate'}

SEASONS = MSeasons[MSeasons['Season'] >= 2000].copy()
SEASONS = SEASONS.iloc[:,reo_seas].drop(columns=['DayZero'])
print(SEASONS.shape)
SEASONS.head(2)

(24, 6)


Unnamed: 0,Season,StartDate,RegionW,RegionX,RegionY,RegionZ
15,2000,1999-11-01,East,South,Midwest,West
16,2001,2000-10-30,East,West,Midwest,South


In [14]:
# Creates Massey Oridnals dataframe:
Ordinals = pd.read_csv(DATA_DIR + 'MMasseyOrdinals.csv')

# Show sample output
print('Ordinals:', Ordinals.shape)
print('Ordinals:', Ordinals.columns)

Ordinals: (4877976, 5)
Ordinals: Index(['Season', 'RankingDayNum', 'SystemName', 'TeamID', 'OrdinalRank'], dtype='object')


##### **Data Section 3 - GAMES**

In [15]:
# Creates Detailed Regular Season dataframes:
MSeasonDetail = pd.read_csv(DATA_DIR + 'MRegularSeasonDetailedResults.csv')
WSeasonDetail = pd.read_csv(DATA_DIR + 'WRegularSeasonDetailedResults.csv')

# Show sample output
print('Mens:', MSeasonDetail.shape, 'Womens:', WSeasonDetail.shape)
print('Mens:', MSeasonDetail.columns)

Mens: (106834, 34) Womens: (70007, 34)
Mens: Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')


In [16]:
# Creates Compact Regular Season dataframes (Optional):
MSeasonCompact = pd.read_csv(DATA_DIR + 'MRegularSeasonCompactResults.csv')
WSeasonCompact = pd.read_csv(DATA_DIR + 'WRegularSeasonCompactResults.csv')

# Show sample output
print('Mens:', MSeasonCompact.shape, 'Womens:', WSeasonCompact.shape)
print('Mens:', MSeasonCompact.columns)

Mens: (180882, 8) Womens: (125397, 8)
Mens: Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT'],
      dtype='object')


In [17]:
# Creates Conference Tourney Games dataframe:
MCTourneyGames = pd.read_csv(DATA_DIR + 'MConferenceTourneyGames.csv')

# Show sample output
print('C_Tourney:', MCTourneyGames.shape)
print('C_Tourney:', MCTourneyGames.columns)

C_Tourney: (5884, 5)
C_Tourney: Index(['Season', 'ConfAbbrev', 'DayNum', 'WTeamID', 'LTeamID'], dtype='object')


In [18]:
# Creates Game Cities dataframes:
MGameCities = pd.read_csv(DATA_DIR + 'MGameCities.csv')
WGameCities = pd.read_csv(DATA_DIR + 'WGameCities.csv')

# Show sample output
print('Mens:', MGameCities.shape, 'Womens:', WGameCities.shape)
print('Mens:', MGameCities.columns)

Mens: (74387, 6) Womens: (70660, 6)
Mens: Index(['Season', 'DayNum', 'WTeamID', 'LTeamID', 'CRType', 'CityID'], dtype='object')


In [19]:
# Creates Cities dataframe:
Cities = pd.read_csv(DATA_DIR + 'Cities.csv')

# Show sample output
print('Cities:', Cities.shape)
print('Cities:', Cities.columns)

Cities: (472, 3)
Cities: Index(['CityID', 'City', 'State'], dtype='object')


##### **Data Section 4 - TOURNAMENTS**

In [20]:
# Creates Detailed Tournament Results dataframes:
MNCAADetail = pd.read_csv(DATA_DIR + 'MNCAATourneyDetailedResults.csv')
WNCAADetail = pd.read_csv(DATA_DIR + 'WNCAATourneyDetailedResults.csv')

# Show sample output
print('Mens:', MNCAADetail.shape, 'Womens:', WNCAADetail.shape)
print('Mens:', MNCAADetail.columns)

Mens: (1248, 34) Womens: (760, 34)
Mens: Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')


In [21]:
# Creates Compact Tournament Results dataframes (Optional):
MNCAACompact = pd.read_csv(DATA_DIR + 'MNCAATourneyCompactResults.csv')
WNCAACompact = pd.read_csv(DATA_DIR + 'WNCAATourneyCompactResults.csv')

# Show sample output
print('Mens:', MNCAACompact.shape, 'Womens:', WNCAACompact.shape)
print('Mens:', MNCAACompact.columns)

Mens: (2384, 8) Womens: (1516, 8)
Mens: Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT'],
      dtype='object')


In [22]:
# Creates Tournament Seed Matchup dataframes:
MNCAASlots = pd.read_csv(DATA_DIR + 'MNCAATourneySlots.csv')
WNCAASlots = pd.read_csv(DATA_DIR + 'WNCAATourneySlots.csv')

# Show sample output
print('Mens:', MNCAASlots.shape, 'Womens:', WNCAASlots.shape)
print('Mens:', MNCAASlots.columns)

Mens: (2385, 4) Womens: (1579, 4)
Mens: Index(['Season', 'Slot', 'StrongSeed', 'WeakSeed'], dtype='object')


In [23]:
# Creates a Tourney Seeding by Round dataframe: 
MNCAARounds = pd.read_csv(DATA_DIR + 'MNCAATourneySeedRoundSlots.csv')

# Show sample output
print('NCAA:', MNCAARounds.shape)
print('NCAA:', MNCAARounds.columns)

NCAA: (720, 5)
NCAA: Index(['Seed', 'GameRound', 'GameSlot', 'EarlyDayNum', 'LateDayNum'], dtype='object')


In [24]:
# Creates Tournament Seeding dataframes:
MNCAASeeds = pd.read_csv(DATA_DIR + 'MNCAATourneySeeds.csv')
WNCAASeeds = pd.read_csv(DATA_DIR + 'WNCAATourneySeeds.csv')

# Show sample output
print('Mens:', MNCAASeeds.shape, 'Womens:', WNCAASeeds.shape)
print('Mens:', MNCAASeeds.columns)

Mens: (2422, 3) Womens: (1540, 3)
Mens: Index(['Season', 'Seed', 'TeamID'], dtype='object')


In [25]:
# Creates Compact Secondary Tournament Results dataframe:
MSTourneyCompact = pd.read_csv(DATA_DIR + 'MSecondaryTourneyCompactResults.csv')

# Show sample output
print('S_Tourney:', MSTourneyCompact.shape)
print('S_Tourney:', MSTourneyCompact.columns)

S_Tourney: (1710, 9)
S_Tourney: Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'SecondaryTourney'],
      dtype='object')


In [26]:
# Creates Secondary Tournament Games dataframe:
MCTourneyGames = pd.read_csv(DATA_DIR + 'MSecondaryTourneyTeams.csv')

# Show sample output
print('S_Tourney:', MCTourneyGames.shape)
print('S_Tourney:', MCTourneyGames.columns)

S_Tourney: (1732, 3)
S_Tourney: Index(['Season', 'SecondaryTourney', 'TeamID'], dtype='object')


##### **Data Section 5 - FORECASTS**

In [27]:
# Creates Compact Tournament Results dataframes (Optional):
Submission2023 = pd.read_csv(DATA_DIR + 'SampleSubmission2023.csv')
SubmissionWarmup = pd.read_csv(DATA_DIR + 'SampleSubmissionWarmup.csv')

# Show sample output
print('2023:', Submission2023.shape, 'Warmup:', SubmissionWarmup.shape)
Submission2023.head()

2023: (130683, 2) Warmup: (614319, 2)


Unnamed: 0,ID,Pred
0,2023_1101_1102,0.5
1,2023_1101_1103,0.5
2,2023_1101_1104,0.5
3,2023_1101_1105,0.5
4,2023_1101_1106,0.5


#### **Build Tables**

In [None]:
GAMES = MRegular.copy()

# Show sample output
print(GAMES.shape)
GAMES.head(2)

In [None]:
# Merge dataframes on 'Season' column
merged_df = pd.merge(GAMES, MSeason, on='Season', how='inner')

# Calculate date using vectorized operations
merged_df['Date'] = merged_df['StartDate'] + pd.to_timedelta(merged_df['DayNum'], unit='D')

# Update Games with the calculated dates
GAMES['Date'] = merged_df['Date']

# Show sample output
print(GAMES.shape)
GAMES.tail(2)

In [30]:
# Merge MTeamConferences with MTeams dataframes on 'TeamID' column
df1 = pd.read_csv(DATA_DIR + 'MTeamConferences.csv')
df2 = Mens.copy()
df3 = pd.merge(df1, df2, on='TeamID', how='left')
print('df1:', df1.shape, 'df2:', df2.shape, 'df3:', df3.shape)

# Merge MTeamConferences with MTeams dataframes on 'TeamID' column
df4 = df3.copy()
df5 = pd.read_csv(DATA_DIR + 'Conferences.csv')
df6 = pd.merge(df4, df5, on='ConfAbbrev', how='left')
print('df4:', df4.shape, 'df5:', df5.shape, 'df6:', df6.shape)

new_order = [0, 1, 3, 4, 5, 2, 6]
new_names = {'Season':'Season', 'TeamID':'TeamID', 'ConfAbbrev':'ConfID', 'TeamName':'TeamName', 'FirstD1Season':'FirstD1', 'LastD1Season':'LastD1', 'Description':'ConfName'}

df6 = df6.iloc[:,new_order].rename(columns=new_names)

df6.head()

NameError: name 'Mens' is not defined

In [None]:
df1[(df1['Season'] <= 1999) - (df1['Season'] > 2000)]['ConfAbbrev']

#### **Custom Functions**

In [None]:
def store_day_zero(file):
    """
    Converts a CSV file with six columns (Season, DayZero, RegionW, RegionX, RegionY, RegionZ) to TWO dictionaries: 
    (1) with Season as the key and DayZero as the value; and 
    (2) with Season as the key and the four regions names as the value (in tuple)

    Parameters:
    file (str): The path and name of the CSV file to convert.

    Returns:
    Returns two dictionaries for DayZero and (RegionW, RegionX, RegionY, RegionZ) using Season as the key.

    Use:
    (1) Use to get the date for any game using Season as key eg. dayz(2022) outputs 2021-11-01
    (2) Use to get the four regions for any season using Season as key eg. dayz(2022) outputs (East, West, Midwest, South)
    """

    # Create five dictionaries with Season as the key and the other columns as the value
    dict_zero = {}
    dict_w = {}
    dict_x = {}
    dict_y = {}
    dict_z = {}

    with open(file, 'r', encoding='latin-1') as csvfile:
        reader = csv.reader(csvfile)
        #header = next(reader) # skip header row
        for row in reader:
            # Extract the values from the row
            season, day_zero, region_w, region_x, region_y, region_z = row
            
            # Populate the dictionaries
            dict_zero.setdefault(season, []).append(day_zero)
            dict_w.setdefault(season, []).append(region_w)
            dict_x.setdefault(season, []).append(region_x)
            dict_y.setdefault(season, []).append(region_y)
            dict_z.setdefault(season, []).append(region_z)

    # Return the dictionaries as a tuple
    return dict_zero, dict_w, dict_x, dict_y, dict_z