In [1]:
import pandas as pd
import re
import numpy as np

### Teams

In [None]:
# Table of alternate team-name spellings; the file is read as Latin-1 (ISO-8859-1)
spellings = pd.read_csv(
    'WDataFiles_Stage1/WTeamSpellings.csv',
    encoding="ISO-8859-1",
)
spellings.head()

In [None]:
# Put the team names in the same format (lowercase no punctuation) for joins later.
# regex=True is passed explicitly: from pandas 2.0 onward str.replace treats the
# pattern as a literal string by default, which would silently leave the names
# un-normalised and break the join.
# NOTE(review): nothing here lowercases the names, and the first pattern turns
# uppercase letters into spaces -- presumably the spellings file is already
# lowercase; confirm.
# Step 1: every run of characters other than a-z, '&', '.' and space becomes one space
spellings['TeamNameSpelling'] = spellings['TeamNameSpelling'].str.replace('[^a-z&. ]+', ' ', regex=True)
# Step 2: whatever is still not a-z, '&' or space (i.e. the periods) is removed outright
spellings['TeamNameSpelling'] = spellings['TeamNameSpelling'].str.replace('[^a-z& ]+', '', regex=True)

### Helper Functions

In [None]:
def nans(df):
    """Return the rows of ``df`` that contain at least one null value."""
    has_null = df.isnull().any(axis=1)
    return df[has_null]

### Sonny Moore Ratings

These ratings aren't pre-tournament, but they were the best I could find

Scraping is not allowed on the website, so I copy-pasted the rankings into .txt files.

In [None]:
# Each ratings file is free-form text with one team per line, so every file is
# loaded as a single-column DataFrame (column 0 = the raw line) and parsed into
# fields in a later cell.
# The lines are read directly instead of through pd.read_csv(sep="\n"): recent
# pandas versions refuse a line terminator as the separator (ValueError).
# Blank lines are skipped, as read_csv did by default.
# NOTE: unlike read_csv, no quote or NA-value processing is applied; every line
# is kept as a plain string.
moore_txts = {}
for season in range(2005, 2020):
    # files are named by two-digit year, e.g. moore05.txt for 2005
    path = 'mydata/womens/moore{:02d}.txt'.format(season % 100)
    with open(path, encoding='utf-8') as fh:
        lines = [line.rstrip('\r\n') for line in fh if line.strip()]
    # explicit column label 0 so the frame always has the column the parser indexes
    moore_txts[season] = pd.DataFrame({0: lines})

In [None]:
# seasons of data I have ratings for
seasons = list(range(2005, 2020))

# One record per team per season. The frame is built once at the end rather than
# by concatenating onto an empty seed frame inside the loop: that pattern
# re-copies all accumulated rows every season and lets the empty frame's dtypes
# leak into the result, so Season would not stay a plain integer column.
moore_records = []

for season in seasons:

    # a new stat was added in 2009, so from then on one more trailing field
    # has to be skipped to isolate the team name
    n_trailing = 4 if season < 2009 else 5

    # column 0 of the season's frame holds one raw text line per team
    for line in moore_txts[season].iloc[:, 0]:
        # split on single spaces and drop the empty strings produced by runs of spaces
        fields = [field for field in line.split(' ') if field != '']

        moore_records.append({
            # the first field is skipped (presumably the rank -- confirm); the team
            # name is every word between it and the trailing stat fields,
            # re-joined with single spaces
            'Team': " ".join(fields[1:len(fields) - n_trailing]),
            # the rating is always the last field (still a string at this point)
            'MooreRating': fields[-1],
            'Season': season,
        })

# explicit column list keeps the column order and also covers the no-records case
moore_data = pd.DataFrame(moore_records, columns=['Team', 'MooreRating', 'Season'])
moore_data.head()

In [None]:
# Put the team names in the same format (lowercase no punctuation) for joins later.
# regex=True is passed explicitly: from pandas 2.0 onward str.replace treats the
# pattern as a literal string by default, which would silently leave the names
# un-normalised and break the join.
# Step 1: runs of characters other than letters, '&', '.', parentheses and space
# become one space, then everything is lowercased
moore_data['Team'] = moore_data.Team.str.replace('[^a-zA-Z&.() ]+', ' ', regex=True).str.lower()
# Step 2: remaining characters other than a-z, '&' and space (periods, parentheses)
# are removed, and trailing whitespace is stripped
moore_data['Team'] = moore_data.Team.str.replace('[^a-z& ]+', '', regex=True).str.rstrip()

In [None]:
# Moore-ratings team name -> spelling used in the spelling csv
_TEAM_NAME_FIXES = {
    'purdue ft wayne': 'pfw',
    'mass lowell': 'massachusetts lowell',
    'nj tech': 'new jersey tech',
    'presbyterian college': 'presbyterian',
    'loyola illinois': 'loyola chicago',
    'central connecticut st': 'central conn',
    'mt st mary s md': 'mt st mary s',
    'iupu ft wayne': 'pfw',
    'mississippi valley st': 'ms valley st',
    'oakland mi': 'oakland',
    'towson st': 'towson',
    'ohio university': 'ohio',
    's f austin': 'stephen f austin',
    'southern cal': 'usc',
}


def fix_name(row):
    """Return the spelling-csv version of ``row['Team']``.

    Names listed in the fix table are replaced by their mapped spelling;
    any other name is returned unchanged.
    """
    current = row['Team']
    return _TEAM_NAME_FIXES.get(current, current)

In [None]:
# Rewrite each team name row by row so it matches the spelling csv before joining
fixed_names = moore_data.apply(fix_name, axis='columns')
moore_data['Team'] = fixed_names

In [None]:
# check to see if the join worked: a left join keeps every ratings row,
# so a name with no match in the spellings table shows up with nulls
moore_teams = moore_data.merge(
    spellings,
    how='left',
    left_on='Team',
    right_on='TeamNameSpelling',
)
# rows listed here contain at least one null value
nans(moore_teams)

In [None]:
# The name columns were only needed for the join; the rating was parsed as text
# and is converted to float here
moore_teams = (
    moore_teams
    .drop(columns=['Team', 'TeamNameSpelling'])
    .astype({'MooreRating': float})
)

### Tournament Seeds

In [None]:
# historical NCAA tournament seeds
seeds_path = 'WDataFiles_Stage1/WNCAATourneySeeds.csv'
seeds = pd.read_csv(seeds_path)
seeds.head()

In [None]:
# merge seeds and moore ratings: the inner join keeps only (Season, TeamID) pairs
# present in both tables, and exact duplicate rows are then removed
teams = (
    moore_teams
    .merge(seeds, how='inner', on=['Season', 'TeamID'])
    .drop_duplicates()
)
teams.head()

In [None]:
# return just the seed number, no need for region for this use case
def clean_seeds(row):
    """Return the numeric part of ``row['Seed']`` as an int.

    The number is taken from the two characters at positions 1-2 of the seed
    string; the leading character and anything after position 2 are ignored.
    """
    seed_code = row['Seed']
    seed_digits = seed_code[1:3]
    return int(seed_digits)

In [None]:
# get seed number for each team (replaces the raw seed string with its integer part)
seed_numbers = teams.apply(clean_seeds, axis='columns')
teams['Seed'] = seed_numbers
teams.head()

### Tournament Matchup Data

This will be the data used to predict win probabilities

In [None]:
# Tournament results restricted to seasons after 2004, with the columns
# not used for the matchup data (DayNum, WLoc, NumOT) removed
tournament_data = (
    pd.read_csv('WDataFiles_Stage1/WNCAATourneyCompactResults.csv')
    .query('Season > 2004')
    .drop(columns=['DayNum', 'WLoc', 'NumOT'])
)
tournament_data.head()

In [None]:
# Attach Moore ratings to every tournament game: first the winner's, then the loser's
team_ratings = teams[['Season', 'TeamID', 'MooreRating']]
with_winner = tournament_data.merge(
    team_ratings,
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID'],
)
with_both = with_winner.merge(
    team_ratings,
    left_on=['Season', 'LTeamID'],
    right_on=['Season', 'TeamID'],
)
# After the second merge the clashing TeamID / MooreRating columns carry pandas'
# default _x (winner side) and _y (loser side) suffixes; the score columns are
# renamed to follow the same convention
game_data = (
    with_both
    .drop(columns=['WTeamID', 'LTeamID'])
    .rename(columns={'WScore': 'Score_x', 'LScore': 'Score_y'})
)
game_data.head()

In [None]:
# Save the matchup table; the DataFrame index is written out as well (default behaviour)
game_data.to_csv(path_or_buf='mydata/womens/game_data.csv')