In [1]:
import pandas as pd
import re
import numpy as np
from scipy.stats import norm

### Teams

In [2]:
spellings = pd.read_csv('ncaaw-march-mania-2021/WTeamSpellings.csv', encoding = "ISO-8859-1")
spellings.head()

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,3394
1,a&m-corpus christi,3394
2,abilene chr,3101
3,abilene christian,3101
4,abilene-christian,3101


In [3]:
# Put the team names in the same format (lowercase no punctuation) for joins later
spellings['TeamNameSpelling'] = spellings['TeamNameSpelling'].str.replace('[^a-z&. ]+', ' ')
spellings['TeamNameSpelling'] = spellings['TeamNameSpelling'].str.replace('[^a-z& ]+', '')

### Helper Functions

In [4]:
nans = lambda df: df[df.isnull().any(axis=1)]  # Function to print out rows with null values

### Sonny Moore Ratings

These ratings aren't pre-tournament, but they were the best I could find

No scraping is allowed on the website so I copy pasted the rankings into .txt files

In [5]:
# reading the txt files with pd.read_txt didn't really work, so have to read them as a csv and then format them correctly
moore_txts = {}
moore_txts[2005] = pd.read_csv('mydata/womens/moore05.txt', sep = "\n", header = None)
moore_txts[2006] = pd.read_csv('mydata/womens/moore06.txt', sep = "\n", header = None)
moore_txts[2007] = pd.read_csv('mydata/womens/moore07.txt', sep = "\n", header = None)
moore_txts[2008] = pd.read_csv('mydata/womens/moore08.txt', sep = "\n", header = None)
moore_txts[2009] = pd.read_csv('mydata/womens/moore09.txt', sep = "\n", header = None)
moore_txts[2010] = pd.read_csv('mydata/womens/moore10.txt', sep = "\n", header = None)
moore_txts[2011] = pd.read_csv('mydata/womens/moore11.txt', sep = "\n", header = None)
moore_txts[2012] = pd.read_csv('mydata/womens/moore12.txt', sep = "\n", header = None)
moore_txts[2013] = pd.read_csv('mydata/womens/moore13.txt', sep = "\n", header = None)
moore_txts[2014] = pd.read_csv('mydata/womens/moore14.txt', sep = "\n", header = None)
moore_txts[2015] = pd.read_csv('mydata/womens/moore15.txt', sep = "\n", header = None)
moore_txts[2016] = pd.read_csv('mydata/womens/moore16.txt', sep = "\n", header = None)
moore_txts[2017] = pd.read_csv('mydata/womens/moore17.txt', sep = "\n", header = None)
moore_txts[2018] = pd.read_csv('mydata/womens/moore18.txt', sep = "\n", header = None)
moore_txts[2019] = pd.read_csv('mydata/womens/moore19.txt', sep = "\n", header = None)

In [6]:
# seasons of data I have ratings for
seasons = list(range(2005, 2020))

moore_data = pd.DataFrame({'Team': [], 'MooreRating': [], 'Season': []})

# for each season
for season in seasons:
    
    # lists to store team ratings
    teams = []
    ratings = []
    
    # for each row in the text file
    for i in range(len(moore_txts[season])):
        row = moore_txts[season].iloc[i, 0].split(' ')
        
        # list of actual elements in row
        actual = []
        
        # for each element in the row, add it to the actuals if it's not an empty string
        for element in row:
            if element != '':
                actual.append(element)
                
        # new stat was added in 2009, need to update magic number to get the team name
        # need to read in each word of the team and join by a space
        if season < 2009:
            teams.append(" ".join(actual[1:len(actual) - 4]))
        else:
            teams.append(" ".join(actual[1:len(actual) - 5]))
            
        # ratings is always the last element
        ratings.append(actual[-1])
        
    # append the seasons data
    moore_data = pd.concat([moore_data, pd.DataFrame({'Team': teams,
                                                    'MooreRating': ratings,
                                                    'Season': [season for i in range(len(teams))]})])

moore_data = moore_data.reset_index(drop = True)
moore_data.head()

Unnamed: 0,Team,MooreRating,Season
0,STANFORD,98.4,2005.0
1,LSU,96.75,2005.0
2,BAYLOR,95.16,2005.0
3,TENNESSEE,94.61,2005.0
4,DUKE,94.31,2005.0


In [7]:
# Put the team names in the same format (lowercase no punctuation) for joins later
moore_data['Team'] = moore_data.Team.str.replace('[^a-zA-Z&.() ]+',' ').str.lower()
moore_data['Team'] = moore_data.Team.str.replace('[^a-z& ]+','').str.rstrip()

In [8]:
# function to change team names to match what's in the spelling csv
def fix_name(row):
    if row['Team'] == 'purdue ft wayne':
        return 'pfw'
    elif row['Team'] == 'mass lowell':
        return 'massachusetts lowell'
    elif row['Team'] == 'nj tech':
        return 'new jersey tech'
    elif row['Team'] == 'presbyterian college':
        return 'presbyterian'
    elif row['Team'] == 'loyola illinois':
        return 'loyola chicago'
    elif row['Team'] == 'central connecticut st':
        return 'central conn'
    elif row['Team'] == 'mt st mary s md':
        return 'mt st mary s'
    elif row['Team'] == 'iupu ft wayne':
        return 'pfw'
    elif row['Team'] == 'mississippi valley st':
        return 'ms valley st'
    elif row['Team'] == 'oakland mi':
        return 'oakland'
    elif row['Team'] == 'towson st':
        return 'towson'
    elif row['Team'] == 'ohio university':
        return 'ohio'
    elif row['Team'] == 's f austin':
        return 'stephen f austin'
    elif row['Team'] == 'southern cal':
        return 'usc'
    else:
        return row['Team']

In [9]:
# fix the names for the join
moore_data['Team'] = moore_data.apply(fix_name, axis = 1)

In [10]:
# check to see if join workd
moore_teams = pd.merge(moore_data, spellings, how = 'left', left_on = 'Team', right_on = 'TeamNameSpelling')
nans(moore_teams)

Unnamed: 0,Team,MooreRating,Season,TeamNameSpelling,TeamID


In [11]:
moore_teams = moore_teams.drop(columns = ['Team', 'TeamNameSpelling']).drop_duplicates()
moore_teams['MooreRating'] = moore_teams['MooreRating'].astype(float)

### Tournament Seeds

In [12]:
# historical NCAA tournament seeds
seeds = pd.read_csv('ncaaw-march-mania-2021/WNCAATourneySeeds.csv')
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1998,W01,3330
1,1998,W02,3163
2,1998,W03,3112
3,1998,W04,3301
4,1998,W05,3272


In [13]:
# merge seeds and moore ratings
teams = pd.merge(moore_teams, seeds, on = ['Season', 'TeamID'], how = 'inner').drop_duplicates()
teams.head()

Unnamed: 0,MooreRating,Season,TeamID,Seed
0,98.4,2005.0,3390,Y02
1,96.75,2005.0,3261,W01
2,95.16,2005.0,3124,X02
3,94.61,2005.0,3397,Z01
4,94.31,2005.0,3181,W02


In [14]:
# return just the seed number, no need for region for this use case
def clean_seeds(row):
    return int(row['Seed'][1:3])

In [15]:
# get seed number for each team
teams['Seed'] = teams.apply(clean_seeds, axis = 1)
teams.head()

Unnamed: 0,MooreRating,Season,TeamID,Seed
0,98.4,2005.0,3390,2
1,96.75,2005.0,3261,1
2,95.16,2005.0,3124,2
3,94.61,2005.0,3397,1
4,94.31,2005.0,3181,2


### Tournament Matchup Data

This will be the data used to predict win probabilities

In [16]:
tournament_data = pd.read_csv('ncaaw-march-mania-2021/WNCAATourneyCompactResults.csv')
tournament_data = tournament_data.query('Season > 2004').drop(columns = ['DayNum', 'WLoc', 'NumOT'])
tournament_data.head()

Unnamed: 0,Season,WTeamID,WScore,LTeamID,LScore
441,2005,3113,87,3184,65
442,2005,3124,91,3229,70
443,2005,3208,75,3349,49
444,2005,3243,70,3132,60
445,2005,3277,73,3108,41


In [17]:
game_data = pd.merge(tournament_data, teams[['Season', 'TeamID', 'MooreRating']], left_on = ['Season', 'WTeamID'], right_on = ['Season', 'TeamID'])
game_data = pd.merge(game_data, teams[['Season', 'TeamID', 'MooreRating']], left_on = ['Season', 'LTeamID'], right_on = ['Season', 'TeamID'])
game_data = game_data.drop(columns = ['WTeamID', 'LTeamID']).rename(columns = {'WScore': 'Score_x', 'LScore': 'Score_y'})
game_data.head()

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,MooreRating_x,TeamID_y,MooreRating_y
0,2005,87,65,3113,86.08,3184,66.75
1,2005,70,61,3113,86.08,3323,84.71
2,2005,91,70,3124,95.16,3229,66.2
3,2005,69,46,3124,95.16,3332,79.34
4,2005,64,57,3124,95.16,3278,87.91


In [18]:
# To make team x be the team with the higher rating and team y be the team with the lower rating
def switch_teams(row):
    # if rating x is less than rating y
    if row['MooreRating_x'] < row['MooreRating_y']:
        underdog = row['Score_x']  # "Worse" team's score
        favorite = row['Score_y'] # "Better" team's score
        row['Score_x'] = favorite
        row['Score_y'] = underdog
        underdog = row['TeamID_x']  # "Worse" team's ID
        favorite = row['TeamID_y']  # "Better" team's ID
        row['TeamID_x'] = favorite
        row['TeamID_y'] = underdog
        underdog = row['MooreRating_x']  # "Worse" team's rating
        favorite = row['MooreRating_y']  # "Better" team's rating
        row['MooreRating_x'] = favorite
        row['MooreRating_y'] = underdog
    return row
                       
game_data = game_data.apply(switch_teams, axis = 1)
game_data.head()

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,MooreRating_x,TeamID_y,MooreRating_y
0,2005.0,87.0,65.0,3113.0,86.08,3184.0,66.75
1,2005.0,70.0,61.0,3113.0,86.08,3323.0,84.71
2,2005.0,91.0,70.0,3124.0,95.16,3229.0,66.2
3,2005.0,69.0,46.0,3124.0,95.16,3332.0,79.34
4,2005.0,64.0,57.0,3124.0,95.16,3278.0,87.91


In [19]:
game_data = pd.merge(game_data, teams[['Season', 'TeamID', 'Seed']], left_on = ['Season', 'TeamID_x'], right_on = ['Season', 'TeamID'])
game_data = pd.merge(game_data, teams[['Season', 'TeamID', 'Seed']], left_on = ['Season', 'TeamID_y'], right_on = ['Season', 'TeamID'])
game_data.head()

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,MooreRating_x,TeamID_y,MooreRating_y,TeamID_x.1,Seed_x,TeamID_y.1,Seed_y
0,2005.0,87.0,65.0,3113.0,86.08,3184.0,66.75,3113,5,3184,12
1,2005.0,70.0,61.0,3113.0,86.08,3323.0,84.71,3113,5,3323,4
2,2005.0,91.0,70.0,3124.0,95.16,3229.0,66.2,3124,2,3229,15
3,2005.0,69.0,46.0,3124.0,95.16,3332.0,79.34,3124,2,3332,10
4,2005.0,55.0,58.0,3395.0,82.85,3332.0,79.34,3395,7,3332,10


In [20]:
# Start with the stats for each team
matchups = game_data.drop(columns = ['Score_x', 'Score_y', 'TeamID_x', 'TeamID_y'])

# Response variable for upset probabilities
matchups['Upset'] = game_data['Score_x'] < game_data['Score_y']  
matchups['Upset'] = matchups['Upset'].astype('int64')

# Response variable for predicting spread
matchups['ScoreDiff'] = game_data['Score_x'] - game_data['Score_y']

# Predictors

# Difference in NCAA tournament Seeds
matchups['SeedDiff'] = matchups['Seed_x'] - matchups['Seed_y']

# Difference in Moore Rating
matchups['MoorePredictedSpread'] = matchups['MooreRating_x'] - matchups['MooreRating_y']

matchups.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pan

Unnamed: 0,Season,MooreRating_x,MooreRating_y,Seed_x,Seed_y,Upset,ScoreDiff,SeedDiff,MoorePredictedSpread
0,2005.0,86.08,66.75,5,12,0,22.0,-7,19.33
1,2005.0,86.08,84.71,5,4,0,9.0,1,1.37
2,2005.0,95.16,66.2,2,15,0,21.0,-13,28.96
3,2005.0,95.16,79.34,2,10,0,23.0,-8,15.82
4,2005.0,82.85,79.34,7,10,1,-3.0,-3,3.51


In [21]:
matchups.to_csv('mydata/womens/matchups_no_stats.csv', index = False)

### Regular Season Stats

The regular season stats that Kaggle provides only go back to 2010, so I will have 2 different datasets (as the seeds and efficiency ratings go back to 2005). And then I can test which dataset/model is more effective to decide which one to use (or possibly use an average of the two).

In [22]:
reg_season = pd.read_csv('ncaaw-march-mania-2021/WRegularSeasonDetailedResults.csv')
reg_season.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,11,3103,63,3237,49,H,0,23,54,...,13,6,10,11,27,11,23,7,6,19
1,2010,11,3104,73,3399,68,N,0,26,62,...,21,14,27,14,26,7,20,4,2,27
2,2010,11,3110,71,3224,59,A,0,29,62,...,14,19,23,17,23,8,15,6,0,15
3,2010,11,3111,63,3267,58,A,0,27,52,...,26,16,25,22,22,15,11,14,5,14
4,2010,11,3119,74,3447,70,H,1,30,74,...,17,11,21,21,32,12,14,4,2,14


In [23]:
reg_season['ScoreDiff'] = reg_season['WScore'] - reg_season['LScore']

# Adjust score differential for home court
def adj_score_for_location(row):
    if row['WLoc'] == 'H':
        return row['ScoreDiff'] - 4
    elif row['WLoc'] == 'A':
        return row['ScoreDiff'] + 4
    else:
        return row['ScoreDiff']
    
reg_season['AdjScoreDiff'] = reg_season.apply(adj_score_for_location, axis = 1)

In [24]:
# Get moore offensive efficiency and defensive efficiency
moore_ratings = moore_teams[['Season', 'TeamID', 'MooreRating']]

# Get stats I need from regrular season stats
my_data = reg_season[['Season', 'WTeamID', 'LTeamID', 'WFGM', 'LFGM', 'WFGA', 'LFGA', 'WFGM3', 'LFGM3', 'WFGA3', 'LFGA3', 'WFTM', 'LFTM', 'WFTA', 'LFTA', 'WAst', 'LAst', 'WTO', 'LTO', 'WOR', 'LOR', 'WDR', 'LDR', 'AdjScoreDiff']]

# join stats and moore ratings for winning team
my_data = pd.merge(my_data, moore_ratings, left_on = ['Season', 'WTeamID'], right_on = ['Season', 'TeamID']).rename(columns = {'MooreRating': 'WMooreRating'})

# join stats and moore ratings for losing team
my_data = pd.merge(my_data, moore_ratings, how = 'outer', left_on = ['Season', 'LTeamID'], right_on = ['Season', 'TeamID']).rename(columns = {'MooreRating': 'LMooreRating'})

my_data = my_data.drop(columns = ['TeamID_x', 'TeamID_y'])

In [25]:
# What Moore predicted the adjusted score differential  would be for each game
my_data['PredictedAdjScoreDiff'] = my_data['WMooreRating'] - my_data['LMooreRating']
my_data.head()

Unnamed: 0,Season,WTeamID,LTeamID,WFGM,LFGM,WFGA,LFGA,WFGM3,LFGM3,WFGA3,...,WTO,LTO,WOR,LOR,WDR,LDR,AdjScoreDiff,WMooreRating,LMooreRating,PredictedAdjScoreDiff
0,2010,3103.0,3237.0,23.0,20.0,54.0,54.0,5.0,3.0,9.0,...,18.0,23.0,10.0,11.0,26.0,27.0,10.0,64.45,43.12,21.33
1,2010,3231.0,3237.0,26.0,18.0,59.0,51.0,7.0,5.0,20.0,...,19.0,24.0,16.0,10.0,27.0,21.0,19.0,74.67,43.12,31.55
2,2010,3282.0,3237.0,26.0,11.0,65.0,43.0,4.0,0.0,22.0,...,15.0,22.0,18.0,8.0,29.0,24.0,25.0,57.62,43.12,14.5
3,2010,3282.0,3237.0,23.0,17.0,54.0,46.0,5.0,5.0,18.0,...,22.0,23.0,17.0,12.0,19.0,17.0,14.0,57.62,43.12,14.5
4,2010,3293.0,3237.0,30.0,22.0,60.0,54.0,14.0,0.0,23.0,...,16.0,19.0,13.0,15.0,20.0,22.0,24.0,55.99,43.12,12.87


In [26]:
# w_data is data for games in which the team won, and l_data is data for the games in which the team lost
w_data = my_data.groupby(['Season', 'WTeamID']).sum().drop(columns = ['LTeamID', 'AdjScoreDiff', 'PredictedAdjScoreDiff', 'WMooreRating', 'LMooreRating']).reset_index()
l_data = my_data.groupby(['Season', 'LTeamID']).sum().drop(columns = ['WTeamID', 'AdjScoreDiff', 'PredictedAdjScoreDiff', 'WMooreRating', 'LMooreRating']).reset_index()
w_data.head()

Unnamed: 0,Season,WTeamID,WFGM,LFGM,WFGA,LFGA,WFGM3,LFGM3,WFGA3,LFGA3,...,WFTA,LFTA,WAst,LAst,WTO,LTO,WOR,LOR,WDR,LDR
0,2010,3102.0,23.0,18.0,51.0,57.0,6.0,10.0,14.0,27.0,...,21.0,12.0,13.0,13.0,14.0,18.0,11.0,14.0,29.0,19.0
1,2010,3103.0,407.0,323.0,952.0,968.0,74.0,68.0,208.0,263.0,...,337.0,280.0,242.0,166.0,287.0,324.0,241.0,252.0,438.0,353.0
2,2010,3104.0,308.0,234.0,655.0,650.0,53.0,45.0,150.0,190.0,...,216.0,202.0,189.0,119.0,226.0,244.0,158.0,163.0,308.0,236.0
3,2010,3105.0,305.0,287.0,666.0,808.0,44.0,50.0,117.0,199.0,...,371.0,313.0,158.0,176.0,360.0,310.0,158.0,215.0,358.0,258.0
4,2010,3106.0,235.0,186.0,598.0,608.0,27.0,31.0,84.0,123.0,...,338.0,318.0,146.0,89.0,231.0,214.0,179.0,170.0,307.0,237.0


In [27]:
wl_data = pd.merge(w_data, l_data, left_on = ['Season', 'WTeamID'], right_on = ['Season', 'LTeamID'], how = 'outer')
wl_data = wl_data.fillna(0)
# must do a outer join and fill NaNs with zeros due to undefeated teams

In [28]:
# Caculate season stats
stats = pd.DataFrame()
stats['Season'] = wl_data['Season']
stats['TeamID'] = wl_data['WTeamID']
stats['3ptRate'] = (wl_data['WFGA3_x'] + wl_data['LFGA3_y']) / (wl_data['WFGA_x'] + wl_data['LFGA_y'])
stats['Ast%'] = (wl_data['WAst_x'] + wl_data['LAst_y']) / (wl_data['WFGM_x'] + wl_data['LFGM_y'])
stats['FT%'] = (wl_data['WFTM_x'] + wl_data['LFTM_y']) / (wl_data['WFTA_x'] + wl_data['LFTA_y'])
stats['OppFT%'] = (wl_data['WFTM_y'] + wl_data['LFTM_x']) / (wl_data['WFTA_y'] + wl_data['LFTA_x'])
stats['Opp3ptRate'] = (wl_data['WFGA3_y'] + wl_data['LFGA3_x']) / (wl_data['WFGA_y'] + wl_data['LFGA_x'])
stats['OppAst%'] = (wl_data['WAst_y'] + wl_data['LAst_x']) / (wl_data['WFGM_y'] + wl_data['LFGM_x'])
stats['EFG%'] = (wl_data['WFGM_x'] + wl_data['LFGM_y'] + .5 * wl_data['WFGM3_x'] + .5 * wl_data['LFGM3_y']) / (wl_data['WFGA_x'] + wl_data['LFGA_y'])
stats['EFGD%'] = (wl_data['WFGM_y'] + wl_data['LFGM_x'] + .5 * wl_data['WFGM3_y'] + .5 * wl_data['LFGM3_x']) / (wl_data['WFGA_y'] + wl_data['LFGA_x'])
stats['TOR%'] = (wl_data['WTO_x'] + wl_data['LTO_y']) / (wl_data['WFGA_x'] + wl_data['LFGA_y'] - wl_data['WOR_x'] - wl_data['LOR_y'] + wl_data['WTO_x'] + wl_data['LTO_y'] + .44 * (wl_data['WFTA_x'] + wl_data['LFTA_y']))
stats['TORD%'] = (wl_data['WTO_y'] + wl_data['LTO_x']) / (wl_data['WFGA_y'] + wl_data['LFGA_x'] - wl_data['WOR_y'] - wl_data['LOR_x'] + wl_data['WTO_y'] + wl_data['LTO_x'] + .44 * (wl_data['WFTA_y'] + wl_data['LFTA_x']))
stats['ORB%'] = (wl_data['WOR_x'] + wl_data['LOR_y']) / (wl_data['WOR_x'] + wl_data['LOR_y'] + wl_data['WDR_y'] + wl_data['LDR_x'])
stats['OppORB%'] = 1 - (wl_data['WDR_x'] + wl_data['LDR_y']) / (wl_data['WOR_y'] + wl_data['LOR_x'] + wl_data['WDR_x'] + wl_data['LDR_y'])
stats['FTR'] = (wl_data['WFTA_x'] + wl_data['LFTA_y']) / (wl_data['WFGA_x'] + wl_data['LFGA_y'])
stats['FTRD'] = (wl_data['WFTA_y'] + wl_data['LFTA_x']) / (wl_data['WFGA_y'] + wl_data['LFGA_x'])
stats['2P%'] = (wl_data['WFGM_x'] + wl_data['LFGM_y'] - (wl_data['WFGM3_x'] + wl_data['LFGM3_y'])) / (wl_data['WFGA_x'] + wl_data['LFGA_y'] - (wl_data['WFGA3_x'] + wl_data['LFGA3_y']))
stats['2P%D'] = (wl_data['WFGM_y'] + wl_data['LFGM_x'] - (wl_data['WFGM3_y'] + wl_data['LFGM3_x'])) / (wl_data['WFGA_y'] + wl_data['LFGA_x'] - (wl_data['WFGA3_y'] + wl_data['LFGA3_x']))
stats['3P%'] = (wl_data['WFGM3_x'] + wl_data['LFGM3_y']) / (wl_data['WFGA3_x'] + wl_data['LFGA3_y'])
stats['3P%D'] = (wl_data['WFGM3_y'] + wl_data['LFGM3_x']) / (wl_data['WFGA3_y'] + wl_data['LFGA3_x'])
stats.head()

Unnamed: 0,Season,TeamID,3ptRate,Ast%,FT%,OppFT%,Opp3ptRate,OppAst%,EFG%,EFGD%,TOR%,TORD%,ORB%,OppORB%,FTR,FTRD,2P%,2P%D,3P%,3P%D
0,2010,3102.0,0.302419,0.535448,0.678378,0.676259,0.363521,0.723545,0.403226,0.547182,0.275893,0.223086,0.343166,0.396247,0.248656,0.264091,0.393064,0.537313,0.284444,0.376307
1,2010,3103.0,0.225573,0.604198,0.705782,0.683186,0.275925,0.558029,0.437877,0.425409,0.268796,0.273712,0.396648,0.366487,0.354644,0.342632,0.42757,0.40536,0.315508,0.318681
2,2010,3104.0,0.237091,0.529986,0.616016,0.642072,0.274656,0.52381,0.433093,0.439794,0.258451,0.25938,0.353535,0.364328,0.270405,0.365252,0.430131,0.43083,0.295082,0.308977
3,2010,3105.0,0.194465,0.478182,0.69863,0.656693,0.222151,0.59187,0.430444,0.428525,0.343308,0.282486,0.39089,0.393852,0.478514,0.408886,0.423146,0.425497,0.307116,0.292754
4,2010,3106.0,0.180872,0.62243,0.647989,0.67141,0.233333,0.531792,0.373455,0.389796,0.292782,0.267895,0.390485,0.333333,0.45283,0.52585,0.362986,0.364685,0.280576,0.314869


In [29]:
# compute season variance stats for each team
stats2 = pd.DataFrame()
for season in teams['Season'].unique():
    if season >= 2010:
        for team in teams['TeamID'].unique():
            if len(teams[(teams['Season'] == season) & (teams['TeamID'] == team) & (teams['Seed'] > 0)]) > 0:
                season_data = my_data[my_data['Season'] == season]
                w_data = season_data[season_data['WTeamID'] == team]
                w_data.columns = ['Season', 'TeamID', 'OppTeamID', 'FGM', 'OppFGM', 'FGA', 'OppFGA', 'FGM3', 'OppFGM3', 'FGA3', 'OppFGA3', 'FTM', 'OppFTM', 'FTA', 'OppFTA', 'Ast', 'OppAst', 'TO', 'OppTO', 'OR', 'OppOR', 'DR', 'OppDR', 'AdjScoreDiff', 'MooreRating', 'OppMooreRating', 'PredictedAdjScoreDiff']
                l_data = season_data[season_data['LTeamID'] == team]
                l_data = l_data[['Season', 'LTeamID', 'WTeamID', 'LFGM', 'WFGM', 'LFGA', 'WFGA', 'LFGM3', 'WFGM3', 'LFGA3', 'WFGA3', 'LFTM', 'WFTM', 'LFTA', 'WFTA', 'LAst', 'WAst', 'LTO', 'WTO', 'LOR', 'WOR', 'LDR', 'WDR', 'AdjScoreDiff', 'LMooreRating', 'WMooreRating', 'PredictedAdjScoreDiff']]
                l_data['AdjScoreDiff'] = -1 * l_data['AdjScoreDiff']
                l_data['PredictedAdjScoreDiff'] = -1 * l_data['PredictedAdjScoreDiff']
                l_data.columns = w_data.columns
                team_data = pd.concat([w_data, l_data])
                team_data['3ptRate'] = team_data['FGA3'] / team_data['FGA']
                team_data['Opp3ptRate'] = team_data['OppFGA3'] / team_data['OppFGA']
                team_data['Ast%'] = team_data['Ast'] / team_data['FGM']
                team_data['OppAst%'] = team_data['OppAst'] / team_data['OppFGM']
                team_data['eFG%'] = (team_data['FGM'] + .5 * team_data['FGM3']) / team_data['FGA']
                team_data['OppeFG%'] = (team_data['OppFGM'] + .5 * team_data['OppFGM3']) / team_data['OppFGA']
                team_data['3pt%'] = team_data['FGM3'] / team_data['FGA3']
                team_data['Opp3pt%'] = team_data['OppFGM3'] / team_data['OppFGA3']
                team_data['FT%'] = team_data['FTM'] / team_data['FTA']
                team_data['FTR'] = team_data['FTA'] / team_data['FGA']
                team_data['OppFTR'] = team_data['OppFTA'] / team_data['OppFGA']
                team_data['OR%'] = team_data['OR'] / (team_data['OR'] + team_data['OppDR'])
                team_data['OppOR%'] = team_data['OppOR'] / (team_data['OppOR'] + team_data['DR'])
                team_data['TO%'] = team_data['TO'] / (team_data['TO'] + team_data['FGA'] - team_data['OR'] + .44 * team_data['FTA'])
                team_data['OppTO%'] = team_data['OppTO'] / (team_data['OppTO'] + team_data['OppFGA'] - team_data['OppOR'] + .44 * team_data['OppFTA'])
                team_data['TotalPoss'] = team_data['TO'] + team_data['OppTO'] + team_data['FGA'] + team_data['OppFGA'] - team_data['OR'] - team_data['OR'] + .44 * (team_data['FTA'] + team_data['OppFTA'])
                team_data['GameScore'] = team_data['AdjScoreDiff'] - team_data['PredictedAdjScoreDiff']
                stats2 = pd.concat([stats2, pd.DataFrame({'Season': [season],
                                                         'TeamID': [team],
                                                         '3ptRateVar': [np.var(team_data['3ptRate'])],
                                                         'Opp3ptRateVar': [np.var(team_data['Opp3ptRate'])],
                                                         'eFG%Var': [np.var(team_data['eFG%'])],
                                                         'OppeFG%Var': [np.var(team_data['OppeFG%'])],
                                                         '3pt%Var': [np.var(team_data['3pt%'])],
                                                         'Opp3pt%Var': [np.var(team_data['Opp3pt%'])],
                                                         'Ast%Var': [np.var(team_data['Ast%'])],
                                                         'OppAst%Var': [np.var(team_data['OppAst%'])],
                                                         'FT%Var': [np.var(team_data['FT%'])],
                                                         'FTRVar': [np.var(team_data['FTR'])],
                                                         'OppFTRVar': [np.var(team_data['OppFTR'])],
                                                         'OR%Var': [np.var(team_data['OR%'])],
                                                         'OppOR%Var': [np.var(team_data['OppOR%'])],
                                                         'TO%Var': [np.var(team_data['TO%'])],
                                                         'OppTO%Var': [np.var(team_data['OppTO%'])],
                                                         'TotalPossVar': [np.var(team_data['TotalPoss'])],
                                                         'AvgTotalPoss': [np.mean(team_data['TotalPoss'])],
                                                         'GameScoreVar': [np.var(team_data['GameScore'])]})])
stats2.head()

Unnamed: 0,Season,TeamID,3ptRateVar,Opp3ptRateVar,eFG%Var,OppeFG%Var,3pt%Var,Opp3pt%Var,Ast%Var,OppAst%Var,FT%Var,FTRVar,OppFTRVar,OR%Var,OppOR%Var,TO%Var,OppTO%Var,TotalPossVar,AvgTotalPoss,GameScoreVar
0,2010.0,3390,0.008415,0.005157,0.006079,0.005445,0.007826,0.008772,0.009252,0.017744,0.012917,0.012582,0.005698,0.009542,0.007838,0.002165,0.002997,268.232575,135.415,58.491617
0,2010.0,3261,0.004111,0.007336,0.006041,0.010749,0.02711,0.026899,0.019657,0.025484,0.013066,0.01849,0.020243,0.007318,0.007551,0.00398,0.008162,113.234473,132.464828,163.683383
0,2010.0,3124,0.005612,0.014637,0.009683,0.007028,0.043473,0.009283,0.018689,0.019067,0.012956,0.031475,0.011766,0.007507,0.008312,0.002809,0.004487,231.3102,143.55,163.244206
0,2010.0,3397,0.006081,0.010545,0.005376,0.00395,0.01279,0.015435,0.013426,0.017646,0.020417,0.018426,0.01833,0.006989,0.003753,0.00457,0.003195,125.220436,135.19625,96.628153
0,2010.0,3181,0.00337,0.011399,0.006336,0.010653,0.01726,0.016776,0.012013,0.018947,0.016619,0.014693,0.0192,0.00878,0.009094,0.003486,0.005427,128.586875,139.875,143.638373


In [30]:
stats_merge = pd.merge(stats, stats2, on = ['Season', 'TeamID'])
teams = pd.merge(teams, stats_merge, on = ['Season', 'TeamID'])
teams.head()

Unnamed: 0,MooreRating,Season,TeamID,Seed,3ptRate,Ast%,FT%,OppFT%,Opp3ptRate,OppAst%,...,FT%Var,FTRVar,OppFTRVar,OR%Var,OppOR%Var,TO%Var,OppTO%Var,TotalPossVar,AvgTotalPoss,GameScoreVar
0,114.8,2010.0,3163,1,0.276465,0.623,0.723077,0.630556,0.288594,0.486111,...,0.015902,0.01236,0.007544,0.006966,0.007483,0.001416,0.005599,141.409973,140.36875,70.878509
1,102.7,2010.0,3390,1,0.309338,0.62967,0.735974,0.630542,0.251128,0.449405,...,0.012917,0.012582,0.005698,0.009542,0.007838,0.002165,0.002997,268.232575,135.415,58.491617
2,95.52,2010.0,3401,2,0.187346,0.553996,0.693944,0.718663,0.256336,0.569207,...,0.021499,0.015539,0.021758,0.007529,0.006896,0.002672,0.004322,201.841711,144.08625,124.641742
3,95.45,2010.0,3397,1,0.230377,0.550165,0.678383,0.679487,0.302521,0.512938,...,0.020417,0.018426,0.01833,0.006989,0.003753,0.00457,0.003195,125.220436,135.19625,96.628153
4,94.47,2010.0,3304,1,0.337707,0.512438,0.709413,0.698734,0.249857,0.460857,...,0.0088,0.019215,0.012685,0.008906,0.006045,0.003859,0.002995,138.626206,141.577333,135.902214


In [31]:
for season in range(2010, 2020):
    print(str(season) + ": " + str(len(teams[teams['Season'] == season])))

2010: 64
2011: 64
2012: 64
2013: 64
2014: 64
2015: 64
2016: 64
2017: 64
2018: 64
2019: 64


### Tournament Matchups

In [32]:
tournament_data = pd.read_csv('ncaaw-march-mania-2021/WNCAATourneyCompactResults.csv')
tournament_data = tournament_data.query('Season > 2004').drop(columns = ['DayNum', 'WLoc', 'NumOT'])
tournament_data.head()

Unnamed: 0,Season,WTeamID,WScore,LTeamID,LScore
441,2005,3113,87,3184,65
442,2005,3124,91,3229,70
443,2005,3208,75,3349,49
444,2005,3243,70,3132,60
445,2005,3277,73,3108,41


In [33]:
game_data = pd.merge(tournament_data, teams[['Season', 'TeamID', 'MooreRating']], left_on = ['Season', 'WTeamID'], right_on = ['Season', 'TeamID'])
game_data = pd.merge(game_data, teams[['Season', 'TeamID', 'MooreRating']], left_on = ['Season', 'LTeamID'], right_on = ['Season', 'TeamID'])
game_data = game_data.drop(columns = ['WTeamID', 'LTeamID']).rename(columns = {'WScore': 'Score_x', 'LScore': 'Score_y'})
game_data.head()

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,MooreRating_x,TeamID_y,MooreRating_y
0,2010,69,55,3124,91.81,3201,78.17
1,2010,49,33,3124,91.81,3207,82.29
2,2010,77,62,3124,91.81,3397,95.45
3,2010,51,48,3124,91.81,3181,92.39
4,2010,67,66,3173,78.46,3395,78.4


In [34]:
# To make team x be the team with the higher rating and team y be the team with the lower rating
def switch_teams(row):
    # if rating x is less than rating y
    if row['MooreRating_x'] < row['MooreRating_y']:
        underdog = row['Score_x']  # "Worse" team's score
        favorite = row['Score_y'] # "Better" team's score
        row['Score_x'] = favorite
        row['Score_y'] = underdog
        underdog = row['TeamID_x']  # "Worse" team's ID
        favorite = row['TeamID_y']  # "Better" team's ID
        row['TeamID_x'] = favorite
        row['TeamID_y'] = underdog
        underdog = row['MooreRating_x']  # "Worse" team's rating
        favorite = row['MooreRating_y']  # "Better" team's rating
        row['MooreRating_x'] = favorite
        row['MooreRating_y'] = underdog
    return row
                       
game_data = game_data.apply(switch_teams, axis = 1)
game_data.head()

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,MooreRating_x,TeamID_y,MooreRating_y
0,2010.0,69.0,55.0,3124.0,91.81,3201.0,78.17
1,2010.0,49.0,33.0,3124.0,91.81,3207.0,82.29
2,2010.0,62.0,77.0,3397.0,95.45,3124.0,91.81
3,2010.0,48.0,51.0,3181.0,92.39,3124.0,91.81
4,2010.0,67.0,66.0,3173.0,78.46,3395.0,78.4


In [35]:
game_data = pd.merge(game_data, teams.drop(columns = ['MooreRating']), left_on = ['Season', 'TeamID_x'], right_on = ['Season', 'TeamID'])
game_data = pd.merge(game_data, teams.drop(columns = ['MooreRating']), left_on = ['Season', 'TeamID_y'], right_on = ['Season', 'TeamID'])
game_data.head()

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,MooreRating_x,TeamID_y,MooreRating_y,TeamID_x.1,Seed_x,3ptRate_x,...,FT%Var_y,FTRVar_y,OppFTRVar_y,OR%Var_y,OppOR%Var_y,TO%Var_y,OppTO%Var_y,TotalPossVar_y,AvgTotalPoss_y,GameScoreVar_y
0,2010.0,69.0,55.0,3124.0,91.81,3201.0,78.17,3124,4,0.153633,...,0.011738,0.017101,0.015985,0.006837,0.004105,0.003004,0.005762,92.019009,141.230303,86.551053
1,2010.0,49.0,33.0,3124.0,91.81,3207.0,82.29,3124,4,0.153633,...,0.017657,0.020303,0.023385,0.004521,0.008373,0.003912,0.006692,267.448576,135.352,77.245596
2,2010.0,62.0,77.0,3397.0,95.45,3124.0,91.81,3397,1,0.230377,...,0.012956,0.031475,0.011766,0.007507,0.008312,0.002809,0.004487,231.3102,143.55,163.244206
3,2010.0,48.0,51.0,3181.0,92.39,3124.0,91.81,3181,2,0.226961,...,0.012956,0.031475,0.011766,0.007507,0.008312,0.002809,0.004487,231.3102,143.55,163.244206
4,2010.0,70.0,50.0,3163.0,114.8,3124.0,91.81,3163,1,0.276465,...,0.012956,0.031475,0.011766,0.007507,0.008312,0.002809,0.004487,231.3102,143.55,163.244206


In [36]:
# Start with the stats for each team
matchups = game_data.drop(columns = ['Score_x', 'Score_y', 'TeamID_x', 'TeamID_y'])

# Response variable for upset probabilities
matchups['Upset'] = game_data['Score_x'] < game_data['Score_y']  
matchups['Upset'] = matchups['Upset'].astype('int64')

# Response variable for predicting spread
matchups['ScoreDiff'] = game_data['Score_x'] - game_data['Score_y']

# Predictors

# Difference in NCAA tournament Seeds
matchups['SeedDiff'] = matchups['Seed_x'] - matchups['Seed_y']

# Difference in Moore Rating
matchups['MoorePredictedSpread'] = matchups['MooreRating_x'] - matchups['MooreRating_y']

# Offensive vs defensive EFG% averages and differences
matchups['xOffyDefEFGAvg'] = (game_data['EFG%_x'] + game_data['EFGD%_y']) / 2
matchups['yOffxDefEFGAvg'] = (game_data['EFG%_y'] + game_data['EFGD%_x']) / 2
matchups['xOffyOffEFGDiff'] = matchups['xOffyDefEFGAvg'] - matchups['yOffxDefEFGAvg']

# Offensive vs defensive turnover rate averages and differences
matchups['xOffyDefTOAvg'] = (game_data['TOR%_x'] + game_data['TORD%_y']) / 2
matchups['yOffxDefTOAvg'] = (game_data['TOR%_y'] + game_data['TORD%_x']) / 2
matchups['xOffyOffTODiff'] = matchups['xOffyDefTOAvg'] - matchups['yOffxDefTOAvg']

# Offensive vs defensive rebound rate averages and differences
matchups['xOffRebAvg'] = (game_data['ORB%_x'] + game_data['OppORB%_y']) / 2
matchups['yOffRebAvg'] = (game_data['ORB%_y'] + game_data['OppORB%_x']) / 2
matchups['xOffyOffRebDiff'] = matchups['xOffRebAvg'] - matchups['yOffRebAvg']

# Offensive vs defensive FT rate averages and differences
matchups['xOffyDefFTRateAvg'] = (game_data['FTR_x'] + game_data['FTRD_y']) / 2
matchups['yOffxDefFTRateAvg'] = (game_data['FTR_y'] + game_data['FTRD_x']) / 2
matchups['xOffyOffFTRateDiff'] = matchups['xOffyDefFTRateAvg'] - matchups['yOffxDefFTRateAvg']

# Offensive vs defensive assist rate averages and differences
matchups['AbsxOffyDefAstDiff'] = abs(game_data['Ast%_x'] - game_data['OppAst%_y'])
matchups['AbsyOffxDefAstDiff'] = abs(game_data['Ast%_y'] - game_data['OppAst%_x'])
matchups['xOffyDefAstAvg'] = (game_data['Ast%_x'] + game_data['OppAst%_y']) / 2
matchups['yOffxDefAstAvg'] = (game_data['Ast%_y'] + game_data['OppAst%_x']) / 2
matchups['xOffyOffAstDiff'] = matchups['xOffyDefAstAvg'] - matchups['yOffxDefAstAvg']


# Sum of the variance in game possession of both teams
matchups['TotalPossVarSum'] = game_data['TotalPossVar_x'] + game_data['TotalPossVar_y']

# Sum of the variance in game performance of both teams
matchups['GameScoreVarSum'] = game_data['GameScoreVar_x'] + game_data['GameScoreVar_y']

# Average of the average number of possessions in a game for both teams
matchups['AvgTotalPoss'] = 0.5 * (game_data['AvgTotalPoss_x'] + game_data['AvgTotalPoss_y'])

# Naive upset probability using predicted spread and sum of variance
matchups['MooreNaiveUpsetProbability'] = norm.cdf(0, loc = matchups['MoorePredictedSpread'], scale = (0.5 * matchups['GameScoreVarSum']) ** 0.5)

matchups.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pan

Unnamed: 0,Season,MooreRating_x,MooreRating_y,Seed_x,3ptRate_x,Ast%_x,FT%_x,OppFT%_x,Opp3ptRate_x,OppAst%_x,...,xOffyOffFTRateDiff,AbsxOffyDefAstDiff,AbsyOffxDefAstDiff,xOffyDefAstAvg,yOffxDefAstAvg,xOffyOffAstDiff,TotalPossVarSum,GameScoreVarSum,AvgTotalPoss,MooreNaiveUpsetProbability
0,2010.0,91.81,78.17,4,0.153633,0.562874,0.726562,0.687204,0.318363,0.449704,...,0.103332,0.046362,0.116533,0.539693,0.50797,0.031723,323.329209,249.79526,142.390152,0.111138
1,2010.0,91.81,82.29,4,0.153633,0.562874,0.726562,0.687204,0.318363,0.449704,...,0.137651,0.109423,0.21603,0.617586,0.557719,0.059867,498.758776,240.489802,139.451,0.192651
2,2010.0,95.45,91.81,1,0.230377,0.550165,0.678383,0.679487,0.302521,0.512938,...,-0.085583,0.100461,0.049937,0.499935,0.537906,-0.037971,356.530636,259.872359,139.373125,0.374739
3,2010.0,92.39,91.81,2,0.226961,0.53085,0.662016,0.665517,0.309621,0.603648,...,-0.118697,0.081146,0.040774,0.490277,0.583261,-0.092984,359.897075,306.88258,141.7125,0.481327
4,2010.0,114.8,91.81,1,0.276465,0.623,0.723077,0.630556,0.288594,0.486111,...,-0.052871,0.173296,0.076763,0.536352,0.524493,0.011859,372.720173,234.122715,141.959375,0.016799


In [37]:
matchups.to_csv('mydata/womens/matchups.csv', index = False)