In [1]:
# For working with files
import os
import re
import csv

# For manipulating data
import pandas as pd

# For manipulating dates
import datetime

# For getting geographic locations and distances
from geopy.distance import geodesic
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

#### **Load Data**

##### **Data Section 1 - SEASONS**

In [2]:
# Load the MENS seasons CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
seasonm = pd.read_csv('seasons-mens.csv')

# Load the WOMENS seasons CSV from the same location.
seasonw = pd.read_csv('seasons-womens.csv')

In [3]:
# Show the Seasons frames: both shapes on one line, then the column labels
# of the mens frame and of the womens frame on separate lines.
print('shape (mens):', seasonm.shape, 'shape (womens):', seasonw.shape)
print('SEASONS (mens):', seasonm.columns)
print('SEASONS (womens):', seasonw.columns)

shape (mens): (8293, 11) shape (womens): (8160, 9)
SEASONS (mens): Index(['Season', 'StartDate', 'TeamID', 'ConfAbbrev', 'Description', 'RegionW',
       'RegionX', 'RegionY', 'RegionZ', 'CoachChg', 'CoachName'],
      dtype='object')
SEASONS (womens): Index(['Season', 'StartDate', 'TeamID', 'ConfAbbrev', 'Description', 'RegionW',
       'RegionX', 'RegionY', 'RegionZ'],
      dtype='object')


In [4]:
# Distinct values used as lookup lists further down:
#   seasons -> unique Season values, taken from the mens frame only
#   men_ids -> unique TeamID values in the mens frame
#   wom_ids -> unique TeamID values in the womens frame
seasons = seasonm['Season'].unique()
men_ids = seasonm['TeamID'].unique()
wom_ids = seasonw['TeamID'].unique()

##### **Data Section 2 - TEAMS**

In [5]:
# Load the MENS teams CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
teamsm = pd.read_csv('teams-mens.csv')

# Load the WOMENS teams CSV from the same location.
teamsw = pd.read_csv('teams-womens.csv')

In [7]:
# Show the Teams frames: both shapes, then the column labels.
# Only the mens columns are printed; the 'TEAMS (both)' label assumes the
# womens frame has the same columns, which this cell does not check.
print('shape (mens):', teamsm.shape, 'shape (womens):', teamsw.shape)
print('TEAMS (both):', teamsm.columns)

shape (mens): (368, 14) shape (womens): (366, 14)
TEAMS (both): Index(['TeamID', 'TeamName', 'Name0', 'Name1', 'Name2', 'Name3', 'Name4',
       'Name5', 'Name6', 'Name7', 'Name8', 'Name9', 'Name10', 'Name11'],
      dtype='object')


In [None]:
# Creates the Massey Ordinals dataframe.
# NOTE(review): DATA_DIR is not defined in any cell shown above this one;
# it must be a str (it is joined with `+`) that already ends with a path
# separator. Confirm where it is set before running on a fresh kernel.
Ordinals = pd.read_csv(DATA_DIR + 'MMasseyOrdinals.csv')

# Show the frame's shape and column labels
print('Ordinals:', Ordinals.shape)
print('Ordinals:', Ordinals.columns)

##### **Data Section 3 - GAMES**

In [None]:
# Creates the Detailed Regular Season dataframes, one per competition
# (M* file -> mens, W* file -> womens, matching the labels printed below).
MSeasonDetail = pd.read_csv(DATA_DIR + 'MRegularSeasonDetailedResults.csv')
WSeasonDetail = pd.read_csv(DATA_DIR + 'WRegularSeasonDetailedResults.csv')

# Show both shapes, then the column labels of the mens frame only
print('Mens:', MSeasonDetail.shape, 'Womens:', WSeasonDetail.shape)
print('Mens:', MSeasonDetail.columns)

In [None]:
# Creates the Compact Regular Season dataframes (optional), one per
# competition (M* file -> mens, W* file -> womens).
MSeasonCompact = pd.read_csv(DATA_DIR + 'MRegularSeasonCompactResults.csv')
WSeasonCompact = pd.read_csv(DATA_DIR + 'WRegularSeasonCompactResults.csv')

# Show both shapes, then the column labels of the mens frame only
print('Mens:', MSeasonCompact.shape, 'Womens:', WSeasonCompact.shape)
print('Mens:', MSeasonCompact.columns)

In [None]:
# Creates the Conference Tourney Games dataframe (mens file only).
MCTourneyGames = pd.read_csv(DATA_DIR + 'MConferenceTourneyGames.csv')

# Show the frame's shape and column labels
print('C_Tourney:', MCTourneyGames.shape)
print('C_Tourney:', MCTourneyGames.columns)

In [None]:
# Creates the Game Cities dataframes, one per competition
# (M* file -> mens, W* file -> womens).
MGameCities = pd.read_csv(DATA_DIR + 'MGameCities.csv')
WGameCities = pd.read_csv(DATA_DIR + 'WGameCities.csv')

# Show both shapes, then the column labels of the mens frame only
print('Mens:', MGameCities.shape, 'Womens:', WGameCities.shape)
print('Mens:', MGameCities.columns)

In [None]:
# Creates the Cities dataframe (a single file, not split by competition).
Cities = pd.read_csv(DATA_DIR + 'Cities.csv')

# Show the frame's shape and column labels
print('Cities:', Cities.shape)
print('Cities:', Cities.columns)

##### **Data Section 4 - TOURNAMENTS**

In [None]:
# Creates the Detailed Tournament Results dataframes, one per competition
# (M* file -> mens, W* file -> womens).
MNCAADetail = pd.read_csv(DATA_DIR + 'MNCAATourneyDetailedResults.csv')
WNCAADetail = pd.read_csv(DATA_DIR + 'WNCAATourneyDetailedResults.csv')

# Show both shapes, then the column labels of the mens frame only
print('Mens:', MNCAADetail.shape, 'Womens:', WNCAADetail.shape)
print('Mens:', MNCAADetail.columns)

In [None]:
# Creates the Compact Tournament Results dataframes (optional), one per
# competition (M* file -> mens, W* file -> womens).
MNCAACompact = pd.read_csv(DATA_DIR + 'MNCAATourneyCompactResults.csv')
WNCAACompact = pd.read_csv(DATA_DIR + 'WNCAATourneyCompactResults.csv')

# Show both shapes, then the column labels of the mens frame only
print('Mens:', MNCAACompact.shape, 'Womens:', WNCAACompact.shape)
print('Mens:', MNCAACompact.columns)

In [None]:
# Creates the Tournament Seed Matchup (slots) dataframes, one per
# competition (M* file -> mens, W* file -> womens).
MNCAASlots = pd.read_csv(DATA_DIR + 'MNCAATourneySlots.csv')
WNCAASlots = pd.read_csv(DATA_DIR + 'WNCAATourneySlots.csv')

# Show both shapes, then the column labels of the mens frame only
print('Mens:', MNCAASlots.shape, 'Womens:', WNCAASlots.shape)
print('Mens:', MNCAASlots.columns)

In [None]:
# Creates the Tourney Seeding by Round dataframe (mens file only).
MNCAARounds = pd.read_csv(DATA_DIR + 'MNCAATourneySeedRoundSlots.csv')

# Show the frame's shape and column labels
print('NCAA:', MNCAARounds.shape)
print('NCAA:', MNCAARounds.columns)

In [None]:
# Creates the Tournament Seeding dataframes, one per competition
# (M* file -> mens, W* file -> womens).
MNCAASeeds = pd.read_csv(DATA_DIR + 'MNCAATourneySeeds.csv')
WNCAASeeds = pd.read_csv(DATA_DIR + 'WNCAATourneySeeds.csv')

# Show both shapes, then the column labels of the mens frame only
print('Mens:', MNCAASeeds.shape, 'Womens:', WNCAASeeds.shape)
print('Mens:', MNCAASeeds.columns)

In [None]:
# Creates the Compact Secondary Tournament Results dataframe (mens file only).
MSTourneyCompact = pd.read_csv(DATA_DIR + 'MSecondaryTourneyCompactResults.csv')

# Show the frame's shape and column labels
print('S_Tourney:', MSTourneyCompact.shape)
print('S_Tourney:', MSTourneyCompact.columns)

In [None]:
# Creates the Secondary Tournament Teams dataframe (mens file only).
# It is bound to its own name so that it does not overwrite MCTourneyGames,
# the Conference Tourney Games frame loaded in an earlier cell.
MSTourneyTeams = pd.read_csv(DATA_DIR + 'MSecondaryTourneyTeams.csv')

# Show the frame's shape and column labels
print('S_Tourney:', MSTourneyTeams.shape)
print('S_Tourney:', MSTourneyTeams.columns)

##### **Data Section 5 - FORECASTS**

In [None]:
# Loads the two sample submission files: the 2023 one and the warmup one.
Submission2023 = pd.read_csv(DATA_DIR + 'SampleSubmission2023.csv')
SubmissionWarmup = pd.read_csv(DATA_DIR + 'SampleSubmissionWarmup.csv')

# Show both shapes, then preview the first rows of the 2023 frame
print('2023:', Submission2023.shape, 'Warmup:', SubmissionWarmup.shape)
Submission2023.head()

#### **Build Tables**

In [None]:
# Start the GAMES table as an independent copy, so later column additions
# do not modify the source frame.
# NOTE(review): MRegular is not defined in any cell shown above; presumably
# one of the regular-season frames (MSeasonDetail / MSeasonCompact) was
# meant. Confirm before running on a fresh kernel.
GAMES = MRegular.copy()

# Show the shape and preview the first two rows
print(GAMES.shape)
GAMES.head(2)

In [None]:
# Attach each game's season start date, keeping exactly one row per game.
# - drop_duplicates: a season table with several rows per Season would
#   otherwise multiply the game rows.
# - how='left': games whose Season has no match are kept (with a missing
#   date) instead of being dropped, so row order and count match GAMES.
# - validate: fails loudly if the season side is still not unique per Season.
# NOTE(review): MSeason is not defined in any cell shown above; confirm which
# frame it is and that it carries a 'StartDate' column.
merged_df = pd.merge(
    GAMES,
    MSeason.drop_duplicates(subset='Season'),
    on='Season',
    how='left',
    validate='many_to_one',
)

# Calculate the game date with vectorized operations. StartDate is parsed
# first because a column read straight from CSV is text, and text cannot be
# added to a timedelta.
merged_df['Date'] = (
    pd.to_datetime(merged_df['StartDate'])
    + pd.to_timedelta(merged_df['DayNum'], unit='D')
)

# Update GAMES with the calculated dates. The values are assigned by position
# (the merge renumbers rows from 0), so the result does not depend on the
# index labels of GAMES lining up with those of merged_df.
GAMES['Date'] = merged_df['Date'].to_numpy()

# Show the shape and preview the last two rows
print(GAMES.shape)
GAMES.tail(2)

In [None]:
# Step 1: attach team attributes to every (Season, TeamID, conference) row.
# how='left' keeps every row of df1 even when the team has no match in df2.
# NOTE(review): Mens is not defined in any cell shown above; presumably it
# is the mens teams frame. Confirm before running on a fresh kernel.
df1 = pd.read_csv(DATA_DIR + 'MTeamConferences.csv')
df2 = Mens.copy()
df3 = pd.merge(df1, df2, on='TeamID', how='left')
print('df1:', df1.shape, 'df2:', df2.shape, 'df3:', df3.shape)

# Step 2: attach the conference table to the result of step 1, matching on
# 'ConfAbbrev'. Again a left merge, so no row from step 1 is dropped.
df4 = df3.copy()
df5 = pd.read_csv(DATA_DIR + 'Conferences.csv')
df6 = pd.merge(df4, df5, on='ConfAbbrev', how='left')
print('df4:', df4.shape, 'df5:', df5.shape, 'df6:', df6.shape)

# Column selection is POSITIONAL: it keeps seven columns of df6 and moves the
# one at position 2 behind the one at position 5.
# NOTE(review): this is only correct if df6 has exactly the column order the
# rename map below implies (Season, TeamID, ConfAbbrev, TeamName,
# FirstD1Season, LastD1Season, Description). Verify against the real frames.
new_order = [0, 1, 3, 4, 5, 2, 6]
# Old label -> new label; labels not present in the frame are ignored by rename.
new_names = {'Season':'Season', 'TeamID':'TeamID', 'ConfAbbrev':'ConfID', 'TeamName':'TeamName', 'FirstD1Season':'FirstD1', 'LastD1Season':'LastD1', 'Description':'ConfName'}

# Reorder by position first, then rename by label.
df6 = df6.iloc[:,new_order].rename(columns=new_names)

# Preview the first rows of the combined table
df6.head()

In [None]:
# Conference abbreviations for rows whose Season is at most 1999 or later
# than 2000. The two conditions are combined with `|`: subtracting one boolean
# Series from another (`-`) raises TypeError in NumPy/pandas. Because the two
# conditions can never both be true, "or" selects the same rows an
# exclusive-or would.
# NOTE(review): confirm that excluding only Season 2000 is the intended filter.
# A single .loc call replaces the chained df[mask][col] lookup.
df1.loc[(df1['Season'] <= 1999) | (df1['Season'] > 2000), 'ConfAbbrev']

#### **Custom Functions**

In [None]:
def store_day_zero(file, skip_header=False):
    """
    Reads a CSV file with six columns (Season, DayZero, RegionW, RegionX,
    RegionY, RegionZ) into FIVE dictionaries keyed by Season, one dictionary
    per non-Season column.

    Parameters:
    file (str): The path and name of the CSV file to convert.
    skip_header (bool): When True, the first row is treated as a header and is
        left out of the dictionaries. Defaults to False, which stores every
        row, including a header row if the file has one.

    Returns:
    tuple: (dict_zero, dict_w, dict_x, dict_y, dict_z). Each dictionary maps
        the Season text exactly as it appears in the file (a str, not an int)
        to a list holding that column's value from every row of that season,
        in file order.

    Raises:
    ValueError: If a non-blank row does not have exactly six fields.

    Use:
    dict_zero, dict_w, dict_x, dict_y, dict_z = store_day_zero(path)
    dict_zero['2022'][0]   # DayZero text of the first row for season 2022
    dict_w['2022'][0]      # RegionW text of the first row for season 2022
    """

    # One dictionary per value column, all keyed by the Season text
    dict_zero = {}
    dict_w = {}
    dict_x = {}
    dict_y = {}
    dict_z = {}

    # newline='' is what the csv module asks for, so that line endings inside
    # quoted fields are handled by the reader itself.
    with open(file, 'r', encoding='latin-1', newline='') as csvfile:
        reader = csv.reader(csvfile)
        if skip_header:
            # The default of None keeps an empty file from raising StopIteration
            next(reader, None)
        for row in reader:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing in the six-way unpack below.
            if not row:
                continue

            # Extract the values from the row (exactly six fields expected)
            season, day_zero, region_w, region_x, region_y, region_z = row

            # Append to the per-season lists, creating a list on first sight
            dict_zero.setdefault(season, []).append(day_zero)
            dict_w.setdefault(season, []).append(region_w)
            dict_x.setdefault(season, []).append(region_x)
            dict_y.setdefault(season, []).append(region_y)
            dict_z.setdefault(season, []).append(region_z)

    # Return the dictionaries as a tuple
    return dict_zero, dict_w, dict_x, dict_y, dict_z