In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import re
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
from scipy.stats import zscore, norm

### Teams

In [2]:
# File from Kaggle, all possible team spellings (to get to the TeamID).
# Each row pairs one TeamNameSpelling with a TeamID; the file is read as ISO-8859-1.
# Alternative path kept for reference:
#spellings = pd.read_csv('MDataFiles_Stage1/MTeamSpellings.csv', encoding = 'ISO-8859-1')
spellings = pd.read_csv('TeamSpellings.csv', encoding = 'ISO-8859-1')
spellings.head()

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,1394
1,a&m-corpus christi,1394
2,abilene chr,1101
3,abilene christian,1101
4,abilene-christian,1101


In [19]:
# Put the team names in a common format for joins: every run of characters other than
# letters, '&', '.', '(', ')', apostrophe or space becomes a single space, then lowercase.
# regex = True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the names untouched.
spellings['TeamNameSpelling'] = spellings['TeamNameSpelling'].str.replace('[^a-zA-Z&.()\' ]+', ' ', regex = True).str.lower()
# Remove anything still outside the lowercase whitelist
spellings['TeamNameSpelling'] = spellings['TeamNameSpelling'].str.replace('[^a-z&.()\' ]+', '', regex = True)

### Helper Functions and Variables

In [4]:
# Date scraped for each season: the last day before that season's tournament starts,
# so that no tournament results leak into the ratings.
last_days = {
    2008: '2008-03-19',
    2009: '2009-03-18',
    2010: '2010-03-17',
    2011: '2011-03-16',
    2012: '2012-03-14',
    2013: '2013-03-20',
    2014: '2014-03-19',
    2015: '2015-03-18',
    2016: '2016-03-16',
    2017: '2017-03-15',
    2018: '2018-03-14',
    2019: '2019-03-20',
}

# seasons to scrape, 2008-2019 inclusive
seasons = list(range(2008, 2020))

In [5]:
def nans(df):
    """Return the rows of ``df`` that contain a null value in at least one column."""
    has_null = df.isnull().any(axis=1)
    return df[has_null]

In [60]:
# Scraped (already normalised) team name -> the spelling used in the spellings csv.
# Names that are not listed here are passed through unchanged by fix_name.
# NOTE(review): 'siu edward' looks like an abbreviation of SIU Edwardsville but is mapped
# to 'southern illinois' -- confirm this is the intended team.
_TEAM_NAME_FIXES = {
    'st marys': 'st marys ca',
    'wins salem': 'winston salem',
    'winston salem st.': 'winston salem',
    'w virginia': 'west virginia',
    'n carolina': 'north carolina',
    'tx christian': 'tcu',
    'va tech': 'virginia tech',
    'miss state': 'mississippi st',
    'st bonavent': 'st bonaventure',
    'loyola chi': 'loyola chicago',
    's methodist': 'smu',
    'n mex state': 'new mexico st',
    's carolina': 'south carolina',
    'boston col': 'boston college',
    'e tenn st': 'etsu',
    'nc grnsboro': 'unc greensboro',
    'central fl': 'ucf',
    'utah val st': 'utah valley state',
    'northeastrn': 'northeastern',
    'ga tech': 'georgia tech',
    'col charlestn': 'college of charleston',
    'st josephs': 'st josephs pa',
    'u penn': 'penn',
    'ste f austin': 'stephen f austin',
    'fla gulf cst': 'florida gulf coast',
    'grd canyon': 'grand canyon',
    'tx arlington': 'ut arlington',
    'n iowa': 'northern iowa',
    'la tech': 'louisiana tech',
    'wm & mary': 'william & mary',
    'jksnville st': 'jacksonville st',
    'app state': 'appalachian st',
    'san fransco': 'san francisco',
    'e washingtn': 'eastern washington',
    'geo wshgtn': 'george washington',
    'u mass': 'umass',
    'maryland bc': 'umbc',
    'wash state': 'washington st',
    'tx san ant': 'utsa',
    'st fran (pa)': 'st francis pa',
    'st. francis pa': 'st francis pa',
    'miami oh': 'miami ohio',
    'geo mason': 'george mason',
    'wi milwkee': 'milwaukee',
    'tn state': 'tennessee st',
    'tn tech': 'tennessee tech',
    'nc wilmgton': 'unc wilmington',
    's alabama': 'south alabama',
    'lg beach st': 'long beach st',
    'james mad': 'james madison',
    'sam hous st': 'sam houston st',
    'cs bakersfld': 'cal state bakersfield',
    'cal st. bakersfield': 'cal state bakersfield',
    'loyola mymt': 'loyola marymount',
    's mississippi': 'southern miss',
    'bowling grn': 'bowling green',
    'tx el paso': 'utep',
    'n hampshire': 'new hampshire',
    'rob morris': 'robert morris',
    'wi grn bay': 'green bay',
    'charl south': 'charleston southern',
    'abl christian': 'abilene christian',
    'gard webb': 'gardner webb',
    'tx pan am': 'texas pan american',
    'se missouri': 'se missouri st',
    'southeast missouri st.': 'se missouri st',
    'neb omaha': 'omaha',
    's florida': 'south florida',
    'mass lowell': 'umass lowell',
    'e carolina': 'east carolina',
    'tx a&m cc': 'a&m corpus chris',
    'texas a&m corpus chris': 'a&m corpus chris',
    's utah': 'southern utah',
    'n florida': 'north florida',
    'sacred hrt': 'sacred heart',
    'st fran (ny)': 'st francis ny',
    'ar lit rock': 'arkansas little rock',
    'beth cook': 'bethune cookman',
    'sac state': 'sacramento st',
    'siu edward': 'southern illinois',
    'youngs st': 'youngstown st',
    'nw state': 'northwestern st',
    'cal st nrdge': 'cal state northridge',
    'ark pine bl': 'arkansas pine bluff',
    'va military': 'vmi',
    'incar word': 'incarnate word',
    'n arizona': 'northern arizona',
    's car state': 'south carolina state',
    'nw st': 'northwestern st',
    'miss val st': 'mississippi valley state',
    'mississippi valley st.': 'mississippi valley state',
    'maryland es': 'umes',
    'alab a&m': 'alabama a&m',
    'n alabama': 'north alabama',
    'la lafayette': 'louisiana lafayette',
    'grambling st': 'grambling state',
    'ut rio grande valley': 'texas rio grande valley',
}


def fix_name(row):
    """Return the spellings-csv name for the team in ``row['Team']``.

    ``row`` is anything indexable by ``'Team'`` (e.g. a DataFrame row passed by
    ``apply(..., axis = 1)``). Names with a known alias are translated through
    ``_TEAM_NAME_FIXES``; every other name is returned unchanged.
    """
    scraped_name = row['Team']
    return _TEAM_NAME_FIXES.get(scraped_name, scraped_name)

### Teamrank Ratings

Scraping teamrankings.com for their CBB Ratings

In [24]:
# Scrape each season's ratings table as it stood on the last pre-tournament day.
TEAMRANK_URL = 'https://www.teamrankings.com/ncaa-basketball/ranking/predictive-by-other?date='
RATING_CELL = 2  # zero-based position of the <td> that is read as the rating

season_list = []
team_list = []
rating_list = []
for season in seasons:
    time.sleep(5)  # pause between requests
    teamrank_url = TEAMRANK_URL + last_days[season]
    teamrank_page = requests.get(teamrank_url, timeout = 30)
    teamrank_page.raise_for_status()  # fail loudly rather than parse an error page
    teamrank_soup = BeautifulSoup(teamrank_page.content, 'lxml')
    teamrank_rows = teamrank_soup.select('tbody tr')
    for row in teamrank_rows:
        name_cells = row.select('.nowrap')
        anchor = name_cells[0].select('a') if name_cells else []
        if not anchor:  # rows without a team link carry no team name; skip them
            continue
        season_list.append(season)
        team_list.append(anchor[0].get_text())
        rating_list.append(row.find_all('td')[RATING_CELL].get_text())
teamrank_temp = pd.DataFrame({'Season': season_list, 'Team': team_list, 'TeamrankRating': rating_list})
# get_text() yields strings; the rating is used arithmetically later (z-scores, spreads),
# so convert it here. Cells that are not numbers become NaN.
teamrank_temp['TeamrankRating'] = pd.to_numeric(teamrank_temp['TeamrankRating'], errors = 'coerce')

In [31]:
# Work on a copy so the scraped frame (teamrank_temp) stays untouched and the
# cleaning cells below can be re-run without scraping again.
teamrank = teamrank_temp.copy()
teamrank.head()

Unnamed: 0,Season,Team,TeamrankRating
0,2008,Kansas,32.4
1,2008,N Carolina,29.4
2,2008,Memphis,28.7
3,2008,Duke,28.5
4,2008,UCLA,28.2


In [32]:
teamrank.tail()

Unnamed: 0,Season,Team,TeamrankRating
4158,2019,Alcorn State,-17.7
4159,2019,Miss Val St,-18.3
4160,2019,Maryland ES,-19.7
4161,2019,Delaware St,-21.6
4162,2019,Chicago St,-21.7


In [33]:
# Put the team names in a common format for joins: every run of characters other than
# letters, '&', '.', '(', ')', apostrophe or space becomes a single space, then lowercase.
# regex = True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the names untouched.
teamrank['Team'] = teamrank['Team'].str.replace('[^a-zA-Z&.()\' ]+', ' ', regex = True).str.lower()
# Remove anything still outside the lowercase whitelist
teamrank['Team'] = teamrank['Team'].str.replace('[^a-z&.()\' ]+', '', regex = True)

In [34]:
# fix the names in order to join to get the team id:
# fix_name is called once per row and its return value replaces the Team column
teamrank['Team'] = teamrank.apply(fix_name, axis = 1)

In [35]:
# Left join keeps every scraped row; names without a match in the spellings table
# come back with null TeamNameSpelling / TeamID and are listed by nans().
teamrank_teams = teamrank.merge(
    spellings,
    how = 'left',
    left_on = 'Team',
    right_on = 'TeamNameSpelling',
)
nans(teamrank_teams)

Unnamed: 0,Season,Team,TeamrankRating,TeamNameSpelling,TeamID
335,2008,california san diego,1.6,,
404,2008,dixie state,-3.3,,
801,2009,california san diego,2.7,,
1328,2010,california san diego,-0.7,,


The above teams were not in division 1

In [67]:
# Keep only the join key columns and the rating, then drop rows that are exact repeats
# (presumably created when several spellings normalise to the same name -- TODO confirm)
teamrank_teams = teamrank_teams[['TeamID', 'Season', 'TeamrankRating']].drop_duplicates()
teamrank_teams.head()

Unnamed: 0,TeamID,Season,TeamrankRating
0,1242.0,2008,32.4
1,1314.0,2008,29.4
3,1272.0,2008,28.7
4,1181.0,2008,28.5
5,1417.0,2008,28.2


### Trank Ratings

Getting Trank ratings from barttorvik.com

In [54]:
# Position in the headerless barttorvik csv -> column name used in this notebook
TRANK_COLUMNS = {0: 'Team', 1: 'OE', 2: 'DE', 3: 'TrankRating', 26: 'Tempo'}


def load_trank_season(season):
    """Download one season from barttorvik.com and return the columns used here.

    The result has the columns named in TRANK_COLUMNS plus a 'Season' column
    holding ``season``.
    """
    trank_url = 'http://barttorvik.com/teamslicejson.php?year=' + str(season) + '&csv=1&type=R'
    season_frame = pd.read_csv(trank_url, header = None)
    season_frame = season_frame[list(TRANK_COLUMNS)].rename(columns = TRANK_COLUMNS)
    season_frame['Season'] = season
    return season_frame


# One request per season, pausing between requests; concatenate once at the end
trank_frames = []
for position, season in enumerate(seasons):
    if position > 0:
        time.sleep(3)
    trank_frames.append(load_trank_season(season))
trank = pd.concat(trank_frames).reset_index(drop = True)
trank.head()

Unnamed: 0,Team,OE,DE,TrankRating,Tempo,Season
0,Jackson St.,91.259939,107.681226,0.129789,72.4,2008
1,Mississippi,112.21789,98.272341,0.821427,71.1,2008
2,TCU,98.968705,99.941178,0.471918,69.0,2008
3,Albany,101.159577,103.145314,0.444343,65.3,2008
4,Wyoming,99.388001,102.575619,0.410223,70.5,2008


In [55]:
trank.tail()

Unnamed: 0,Team,OE,DE,TrankRating,Tempo,Season
4172,Fordham,96.800091,102.817876,0.333237,66.2,2019
4173,Alabama St.,93.767566,110.787449,0.12807,68.4,2019
4174,Wichita St.,105.897571,96.538612,0.743471,69.2,2019
4175,Oakland,107.71395,108.870026,0.469346,69.7,2019
4176,Boise St.,107.634044,102.400989,0.639494,66.9,2019


In [None]:
# Mean Tempo over all teams of a season: a one-column frame (AvgTempo) indexed by Season
avg_tempo = trank.groupby('Season')['Tempo'].mean().to_frame('AvgTempo')

# Attach the season average to every team row of trank
trank = trank.merge(avg_tempo, on = 'Season')

trank.head()

In [56]:
# Put the team names in a common format for joins: every run of characters other than
# letters, '&', '.', '(', ')', apostrophe or space becomes a single space, then lowercase.
# regex = True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the names untouched.
trank['Team'] = trank['Team'].str.replace('[^a-zA-Z&.()\' ]+', ' ', regex = True).str.lower()
# Remove anything still outside the lowercase whitelist
trank['Team'] = trank['Team'].str.replace('[^a-z&.()\' ]+', '', regex = True)

In [62]:
# fix the names in order to join to get the team id:
# fix_name is called once per row and its return value replaces the Team column
trank['Team'] = trank.apply(fix_name, axis = 1)

In [63]:
# Join on the normalised name to pick up TeamID; nans() lists any team left unmatched
trank_copy = trank.copy()
trank_teams = trank_copy.merge(
    spellings,
    how = 'left',
    left_on = 'Team',
    right_on = 'TeamNameSpelling',
)
nans(trank_teams)

Unnamed: 0,Team,OE,DE,TrankRating,Tempo,Season,TeamNameSpelling,TeamID


In [69]:
# Keep the join keys plus rating, efficiencies and tempo, then drop rows that are exact
# repeats (presumably from several spellings normalising to the same name -- TODO confirm).
# Any other column of trank (e.g. AvgTempo) is not carried forward from here.
trank_teams = trank_teams[['TeamID', 'Season', 'TrankRating', 'OE', 'DE', 'Tempo']].drop_duplicates()
trank_teams.head()

Unnamed: 0,TeamID,Season,TrankRating,OE,DE,Tempo
0,1238,2008,0.129789,91.259939,107.681226,72.4
1,1279,2008,0.821427,112.21789,98.272341,71.1
2,1395,2008,0.471918,98.968705,99.941178,69.0
3,1107,2008,0.444343,101.159577,103.145314,65.3
4,1461,2008,0.410223,99.388001,102.575619,70.5


In [73]:
# Inner join: only team-seasons present in both rating sources are kept
teams = teamrank_teams.merge(trank_teams, how = 'inner', on = ['Season', 'TeamID'])
teams.head()

Unnamed: 0,TeamID,Season,TeamrankRating,TrankRating,OE,DE,Tempo
0,1242.0,2008,32.4,0.981585,120.970641,85.610492,69.5
1,1314.0,2008,29.4,0.957196,120.240748,91.770372,75.1
2,1272.0,2008,28.7,0.969683,113.254181,83.789458,70.7
3,1181.0,2008,28.5,0.960742,117.213494,88.761128,73.7
4,1417.0,2008,28.2,0.966422,116.350781,86.87395,66.2


In [None]:
# Standardise each rating within its season so the two sources share a scale
ratings_by_season = teams.groupby('Season')
teamrank_z = ratings_by_season['TeamrankRating'].transform(zscore)
trank_z = ratings_by_season['TrankRating'].transform(zscore)
teams['TeamrankZScore'] = teamrank_z
teams['TrankZScore'] = trank_z
teams.head()

In [None]:
# Blend the two standardised ratings into one number, then discard the intermediates
TRANK_WEIGHT = 0.55
TEAMRANK_WEIGHT = 0.45
teams = teams.assign(
    WeightedRating = TRANK_WEIGHT * teams['TrankZScore'] + TEAMRANK_WEIGHT * teams['TeamrankZScore']
).drop(columns = ['TrankZScore', 'TeamrankZScore'])
teams.head()

### Tournament Seeds

In [None]:
# Tournament seeds file; it is joined to teams on Season and TeamID, and its Seed column
# is parsed by clean_seeds further down
seeds = pd.read_csv('MDataFiles_Stage1/MNCAATourneySeeds.csv')
seeds.head()

In [None]:
# Attach seeds; the inner join keeps only team-seasons that have a seed row
teams = teams.merge(seeds, on = ['Season', 'TeamID'], how = 'inner')
teams.head()

In [None]:
def clean_seeds(row):
    """Return the numeric seed from ``row['Seed']``.

    Characters 1-2 of the seed string are parsed as an integer; the first
    character (the region) and anything after position 2 are ignored.
    """
    seed_code = row['Seed']
    seed_digits = seed_code[1:3]
    return int(seed_digits)

# Replace the seed string with its integer seed number, row by row.
# Not re-runnable as-is: clean_seeds slices its input, so Seed must still be a string.
teams['Seed'] = teams.apply(clean_seeds, axis = 1)
teams.head()

### Regular Season Stats

Kaggle provides a csv file of detailed regular season results for every season since 2003. I will use this dataset to compute the remaining offensive and defensive stats I need for my analysis.

In [None]:
# Regular season results with box-score columns; the W*/L* columns read in the
# cells below (WScore, WFGA, LTO, ...) come from this file
reg_season = pd.read_csv('MDataFiles_Stage1/MRegularSeasonDetailedResults.csv')
reg_season.head()

In [None]:
# Keep only the years we have efficiency data for. The explicit copy makes the result an
# independent frame, so adding columns to it later does not trigger SettingWithCopyWarning.
reg_season = reg_season.query('Season > 2007').copy()

In [None]:
# Score differential per possession
winning_margin = reg_season['WScore'] - reg_season['LScore']
# FGA + TO + 0.44 * FTA - OR for the winner plus the same expression for the loser;
# the factor 2 below turns this combined count into an average per team
combined_poss = (reg_season['WFGA'] + reg_season['WTO'] + 0.44 * reg_season['WFTA'] - reg_season['WOR']
                 + reg_season['LFGA'] + reg_season['LTO'] + 0.44 * reg_season['LFTA'] - reg_season['LOR'])
reg_season['ScoreDiffPerPoss'] = 2 * winning_margin / combined_poss

# Shift applied to the winner's per-possession margin, keyed by the winner's location
_LOCATION_SHIFT = {'H': -0.05, 'A': 0.05}


def adj_score_for_location(row):
    """Return ``row['ScoreDiffPerPoss']`` adjusted for home court.

    0.05 is subtracted when ``row['WLoc']`` is 'H' and added when it is 'A';
    for any other value the margin is returned unchanged.
    """
    shift = _LOCATION_SHIFT.get(row['WLoc'])
    if shift is None:
        return row['ScoreDiffPerPoss']
    return row['ScoreDiffPerPoss'] + shift
    
# Row-wise application: one location-adjusted margin per game
reg_season['AdjScoreDiffPerPoss'] = reg_season.apply(adj_score_for_location, axis = 1)

In [None]:
# Get trank offensive efficiency and defensive efficiency.
# TeamID must be selected too: both merges below use it as the right-hand join key.
trank_ratings = trank_teams[['Season', 'TeamID', 'OE', 'DE']]

# Get stats I need from regular season stats
my_data = reg_season[['Season', 'WTeamID', 'LTeamID', 'WFGM', 'LFGM', 'WFGA', 'LFGA', 'WFGM3', 'LFGM3', 'WFGA3', 'LFGA3', 'WFTM', 'LFTM', 'WFTA', 'LFTA', 'WAst', 'LAst', 'WTO', 'LTO', 'WOR', 'LOR', 'WDR', 'LDR', 'AdjScoreDiffPerPoss']]

# join stats and trank ratings for winning team (default inner join)
my_data = pd.merge(my_data, trank_ratings, left_on = ['Season', 'WTeamID'], right_on = ['Season', 'TeamID']).rename(columns = {'OE': 'WOE', 'DE': 'WDE'})

# join stats and trank ratings for losing team
# NOTE(review): this is an outer join, so trank rows with no matching losing game are
# kept as rows with empty game columns -- confirm that is intended.
my_data = pd.merge(my_data, trank_ratings, how = 'outer', left_on = ['Season', 'LTeamID'], right_on = ['Season', 'TeamID']).rename(columns = {'OE': 'LOE', 'DE': 'LDE'})

# The two right-hand TeamID keys (suffixed _x / _y by the second merge) duplicate
# WTeamID / LTeamID and are no longer needed
my_data = my_data.drop(columns = ['TeamID_x', 'TeamID_y'])

In [None]:
# What Trank predicted the adjusted score differential per possession would be for each game:
# (winner OE + loser DE) minus (winner DE + loser OE), divided by 100
my_data['PredictedAdjScoreDiffPerPoss'] = (my_data['WOE'] + my_data['LDE'] - (my_data['WDE'] + my_data['LOE'])) / 100
my_data.head()

In [None]:
# Season totals per team: w_data sums the games a team won, l_data the games it lost.
# Ratings and score differentials are not meaningful as sums, so they are dropped,
# as is the summed ID of the opposing side.
not_summable = ['AdjScoreDiffPerPoss', 'PredictedAdjScoreDiffPerPoss', 'WOE', 'WDE', 'LOE', 'LDE']
totals_in_wins = my_data.groupby(['Season', 'WTeamID']).sum()
totals_in_losses = my_data.groupby(['Season', 'LTeamID']).sum()
w_data = totals_in_wins.drop(columns = ['LTeamID'] + not_summable).reset_index()
l_data = totals_in_losses.drop(columns = ['WTeamID'] + not_summable).reset_index()
w_data.head()

In [None]:
l_data.head()

In [None]:
# Outer join so that teams present on only one side (no wins, or no losses) are kept
wl_data = pd.merge(w_data, l_data, left_on = ['Season', 'WTeamID'], right_on = ['Season', 'LTeamID'], how = 'outer')
# A team missing from one side has a null ID on that side. Copy the ID across first,
# otherwise the blanket fillna(0) below would give a team without wins TeamID 0.
wl_data['WTeamID'] = wl_data['WTeamID'].fillna(wl_data['LTeamID'])
wl_data['LTeamID'] = wl_data['LTeamID'].fillna(wl_data['WTeamID'])
# Remaining nulls are the missing side's totals, which are genuinely zero
wl_data = wl_data.fillna(0)

In [None]:
# Calculate season-level rate stats for every team from the win/loss totals in wl_data.
# Columns suffixed _x are totals over games the team won, _y over games it lost.
def _own_total(stat):
    """Team's own season total: W<stat> from its wins plus L<stat> from its losses."""
    return wl_data['W' + stat + '_x'] + wl_data['L' + stat + '_y']


def _opp_total(stat):
    """Opponents' season total: W<stat> from the team's losses plus L<stat> from its wins."""
    return wl_data['W' + stat + '_y'] + wl_data['L' + stat + '_x']


stats = pd.DataFrame({
    'Season': wl_data['Season'],
    'TeamID': wl_data['WTeamID'],
    '3ptRate': _own_total('FGA3') / _own_total('FGA'),
    'Ast%': _own_total('Ast') / _own_total('FGM'),
    'FT%': _own_total('FTM') / _own_total('FTA'),
    'OppFT%': _opp_total('FTM') / _opp_total('FTA'),
    'Opp3ptRate': _opp_total('FGA3') / _opp_total('FGA'),
    'OppAst%': _opp_total('Ast') / _opp_total('FGM'),
    'EFG%': (_own_total('FGM') + .5 * _own_total('FGM3')) / _own_total('FGA'),
    'EFGD%': (_opp_total('FGM') + .5 * _opp_total('FGM3')) / _opp_total('FGA'),
    'TOR%': _own_total('TO') / (_own_total('FGA') - _own_total('OR') + _own_total('TO') + .44 * _own_total('FTA')),
    'TORD%': _opp_total('TO') / (_opp_total('FGA') - _opp_total('OR') + _opp_total('TO') + .44 * _opp_total('FTA')),
    'ORB%': _own_total('OR') / (_own_total('OR') + _opp_total('DR')),
    'OppORB%': 1 - _own_total('DR') / (_opp_total('OR') + _own_total('DR')),
    'FTR': _own_total('FTA') / _own_total('FGA'),
    'FTRD': _opp_total('FTA') / _opp_total('FGA'),
    '2P%': (_own_total('FGM') - _own_total('FGM3')) / (_own_total('FGA') - _own_total('FGA3')),
    '2P%D': (_opp_total('FGM') - _opp_total('FGM3')) / (_opp_total('FGA') - _opp_total('FGA3')),
    '3P%': _own_total('FGM3') / _own_total('FGA3'),
    '3P%D': _opp_total('FGM3') / _opp_total('FGA3'),
})
stats.head()

In [None]:
# Names of the per-game columns once a game is viewed from one team's side
TEAM_GAME_COLUMNS = ['Season', 'TeamID', 'OppTeamID', 'FGM', 'OppFGM', 'FGA', 'OppFGA', 'FGM3', 'OppFGM3', 'FGA3', 'OppFGA3', 'FTM', 'OppFTM', 'FTA', 'OppFTA', 'Ast', 'OppAst', 'TO', 'OppTO', 'OR', 'OppOR', 'DR', 'OppDR', 'AdjScoreDiffPerPoss', 'OE', 'DE', 'OppOE', 'OppDE', 'PredictedAdjScoreDiffPerPoss']
# my_data columns that line up with TEAM_GAME_COLUMNS when the team is the winner ...
WINNER_VIEW = ['Season', 'WTeamID', 'LTeamID', 'WFGM', 'LFGM', 'WFGA', 'LFGA', 'WFGM3', 'LFGM3', 'WFGA3', 'LFGA3', 'WFTM', 'LFTM', 'WFTA', 'LFTA', 'WAst', 'LAst', 'WTO', 'LTO', 'WOR', 'LOR', 'WDR', 'LDR', 'AdjScoreDiffPerPoss', 'WOE', 'WDE', 'LOE', 'LDE', 'PredictedAdjScoreDiffPerPoss']
# ... and when the team is the loser
LOSER_VIEW = ['Season', 'LTeamID', 'WTeamID', 'LFGM', 'WFGM', 'LFGA', 'WFGA', 'LFGM3', 'WFGM3', 'LFGA3', 'WFGA3', 'LFTM', 'WFTM', 'LFTA', 'WFTA', 'LAst', 'WAst', 'LTO', 'WTO', 'LOR', 'WOR', 'LDR', 'WDR', 'AdjScoreDiffPerPoss', 'LOE', 'LDE', 'WOE', 'WDE', 'PredictedAdjScoreDiffPerPoss']
# Per-game metrics whose game-to-game variance is reported as '<metric>Var'.
# The original list also asked for 'FTR2' and 'OppFTR2', which are never computed
# (KeyError); they are left out until their definition is known.
VARIANCE_METRICS = ['3ptRate', 'Opp3ptRate', 'eFG%', 'OppeFG%', '3pt%', 'Opp3pt%', 'Ast%', 'OppAst%', 'FT%', 'FTR', 'OppFTR', 'OR%', 'OppOR%', 'TO%', 'OppTO%', 'TotalPoss', 'GameScore']


def _team_game_log(season_games, team):
    """Return one row per game of ``team``, with columns renamed to TEAM_GAME_COLUMNS.

    Both score differentials are stored from the winner's side in ``season_games``,
    so they are negated for the games ``team`` lost.
    """
    won = season_games.loc[season_games['WTeamID'] == team, WINNER_VIEW].copy()
    won.columns = TEAM_GAME_COLUMNS
    lost = season_games.loc[season_games['LTeamID'] == team, LOSER_VIEW].copy()
    lost.columns = TEAM_GAME_COLUMNS
    lost['AdjScoreDiffPerPoss'] = -1 * lost['AdjScoreDiffPerPoss']
    lost['PredictedAdjScoreDiffPerPoss'] = -1 * lost['PredictedAdjScoreDiffPerPoss']
    return pd.concat([won, lost])


def _game_metric_variances(team_data):
    """Add the per-game metrics to ``team_data`` and return their variances as a dict."""
    team_data['3ptRate'] = team_data['FGA3'] / team_data['FGA']
    team_data['Opp3ptRate'] = team_data['OppFGA3'] / team_data['OppFGA']
    team_data['Ast%'] = team_data['Ast'] / team_data['FGM']
    team_data['OppAst%'] = team_data['OppAst'] / team_data['OppFGM']
    team_data['eFG%'] = (team_data['FGM'] + .5 * team_data['FGM3']) / team_data['FGA']
    team_data['OppeFG%'] = (team_data['OppFGM'] + .5 * team_data['OppFGM3']) / team_data['OppFGA']
    team_data['3pt%'] = team_data['FGM3'] / team_data['FGA3']
    team_data['Opp3pt%'] = team_data['OppFGM3'] / team_data['OppFGA3']
    team_data['FT%'] = team_data['FTM'] / team_data['FTA']
    team_data['FTR'] = team_data['FTA'] / team_data['FGA']
    team_data['OppFTR'] = team_data['OppFTA'] / team_data['OppFGA']
    team_data['OR%'] = team_data['OR'] / (team_data['OR'] + team_data['OppDR'])
    team_data['OppOR%'] = team_data['OppOR'] / (team_data['OppOR'] + team_data['DR'])
    team_data['TO%'] = team_data['TO'] / (team_data['TO'] + team_data['FGA'] - team_data['OR'] + .44 * team_data['FTA'])
    team_data['OppTO%'] = team_data['OppTO'] / (team_data['OppTO'] + team_data['OppFGA'] - team_data['OppOR'] + .44 * team_data['OppFTA'])
    # Both teams' possessions: each side's own offensive rebounds are subtracted once
    # (the original subtracted the team's OR twice and never the opponent's)
    team_data['TotalPoss'] = team_data['TO'] + team_data['OppTO'] + team_data['FGA'] + team_data['OppFGA'] - team_data['OR'] - team_data['OppOR'] + .44 * (team_data['FTA'] + team_data['OppFTA'])
    # Actual minus predicted per-possession margin
    team_data['GameScore'] = team_data['AdjScoreDiffPerPoss'] - team_data['PredictedAdjScoreDiffPerPoss']
    return {metric + 'Var': np.var(team_data[metric]) for metric in VARIANCE_METRICS}


# One record per seeded team-season; the frame is built once from the list of records
seeded_teams = teams.loc[teams['Seed'] > 0, ['Season', 'TeamID']].drop_duplicates()
variance_records = []
for season, season_teams in seeded_teams.groupby('Season', sort = False):
    season_data = my_data[my_data['Season'] == season]
    for team in season_teams['TeamID']:
        record = {'Season': season, 'TeamID': team}
        record.update(_game_metric_variances(_team_game_log(season_data, team)))
        variance_records.append(record)
stats2 = pd.DataFrame(variance_records)
stats2.head()

In [None]:
# Combine the season-level rates with the game-to-game variances, then attach both to teams
stats_merge = stats.merge(stats2, on = ['Season', 'TeamID'])
teams = teams.merge(stats_merge, on = ['Season', 'TeamID'])
teams.head()

In [None]:
# Sanity check: number of team rows left for each season
for season in range(2008, 2020):
    n_rows = len(teams[teams['Season'] == season])
    print(f"{season}: {n_rows}")

### Tournament Matchups

In [None]:
# The blended rating is stored on teams as 'WeightedRating'; the matchup cells that
# follow expect it under the name 'Weighted_Rating', so rename it on this slice.
ratings = teams[['Season', 'TeamID', 'WeightedRating']].rename(columns = {'WeightedRating': 'Weighted_Rating'})
ratings.head()

In [None]:
# Tournament results in winner/loser form (WTeamID, LTeamID, WScore, LScore are used below)
tournament_data = pd.read_csv('MDataFiles_Stage1/MNCAATourneyCompactResults.csv')
tournament_data.head()

In [None]:
# Ratings were collected for the 2008-2019 seasons, so only results after 2007 are kept;
# the day number, location and overtime columns are not used
tournament_data = tournament_data.query('Season > 2007').drop(columns = ['DayNum', 'WLoc', 'NumOT'])

In [None]:
# Attach the winner's rating, then the loser's. The second merge suffixes the clashing
# TeamID / Weighted_Rating columns: _x is the winner's side, _y the loser's side.
game_data = pd.merge(tournament_data, ratings, left_on = ['Season', 'WTeamID'], right_on = ['Season', 'TeamID'])
game_data = pd.merge(game_data, ratings, left_on = ['Season', 'LTeamID'], right_on = ['Season', 'TeamID'])
# WTeamID / LTeamID duplicate TeamID_x / TeamID_y; scores are renamed to the same _x / _y scheme
game_data = game_data.drop(columns = ['WTeamID', 'LTeamID']).rename(columns = {'WScore': 'Score_x', 'LScore': 'Score_y'})
game_data.head()

We want to set up the data to have the "better" team as team x and the "worse" team as team y rather than x being the winner and y being the loser because when we make predictions we'll want to predict upsets instead of just picking whether an arbitrary team wins or loses. The code below rearranges team x and team y so team x always has the higher weighted rating. In the data above, the 9th row has team x with a lower weighted rating than team y.

In [None]:
# To make team x be the team with the higher rating and team y be the team with the lower rating
def switch_teams(row):
    """Return ``row`` with the _x / _y sides ordered so that x has the higher rating.

    When ``row['Weighted_Rating_x']`` is lower than ``row['Weighted_Rating_y']`` the
    Score, TeamID and Weighted_Rating values of the two sides are exchanged on a copy
    of the row. Otherwise (including equal ratings) the row is returned unchanged.
    """
    if row['Weighted_Rating_x'] < row['Weighted_Rating_y']:
        row = row.copy()  # leave the caller's row untouched
        for field in ('Score', 'TeamID', 'Weighted_Rating'):
            x_col, y_col = field + '_x', field + '_y'
            # "better" team's value goes to x, "worse" team's value goes to y
            row[x_col], row[y_col] = row[y_col], row[x_col]
    return row
                       
# Apply the swap to every game, then show the first 60 rows to check the ordering
game_data = game_data.apply(switch_teams, axis = 1)
game_data.head(60)

In [None]:
# Attach the full team rows for side x, then for side y. The right-hand key 'TeamID'
# repeats TeamID_x / TeamID_y, so it is dropped after each merge. If it were kept, the
# second merge would suffix it into TeamID_x / TeamID_y, which already exist: older
# pandas produced duplicate columns, current pandas raises a MergeError.
game_data = pd.merge(game_data, teams, left_on = ['Season', 'TeamID_x'], right_on = ['Season', 'TeamID']).drop(columns = 'TeamID')
# Every team column now clashes between the two sides and gets the _x / _y suffix
game_data = pd.merge(game_data, teams, left_on = ['Season', 'TeamID_y'], right_on = ['Season', 'TeamID']).drop(columns = 'TeamID')
game_data.head()

In [None]:
# Keep only the first occurrence of every column name (removes duplicate columns)
game_data = game_data.loc[:,~game_data.columns.duplicated()]  # to remove duplicate columns
game_data.head()

In [None]:
# Remove play-in games between 16 seeds: a game is dropped only when both sides are seeded 16
both_sixteen_seeds = (game_data['Seed_x'] == game_data['Seed_y']) & (game_data['Seed_x'] == 16)
game_data = game_data[~both_sixteen_seeds].reset_index(drop = True)

In [None]:
# Start with the stats for each team (x = higher weighted rating, y = lower)
matchups = game_data.drop(columns = ['Score_x', 'Score_y', 'TeamID_x', 'TeamID_y'])

# Response variable: 1 when the lower-rated team (y) outscored the higher-rated team (x)
matchups['Upset'] = game_data['Score_x'] < game_data['Score_y']
matchups['Upset'] = matchups['Upset'].astype('int64')

# Predictors

# Difference in NCAA tournament Seeds
matchups['SeedDiff'] = matchups['Seed_x'] - matchups['Seed_y']

# Trank Predicted Spread.
# The efficiency columns are called OE / DE on teams, so after the two merges they are
# OE_x, DE_x, OE_y, DE_y (there are no 'Trank_OE_*' / 'Trank_DE_*' columns).
matchups['TrankPredictedSpreadPerPoss'] = (game_data['OE_x'] + game_data['DE_y'] - game_data['OE_y'] - game_data['DE_x']) / 100
# The season-average tempo is not carried on game_data; look it up per game from
# avg_tempo, the frame indexed by Season that holds the AvgTempo column.
season_avg_tempo = game_data['Season'].astype('int64').map(avg_tempo['AvgTempo'])
matchups['TrankPredictedPoss'] = game_data['Tempo_x'] * game_data['Tempo_y'] / season_avg_tempo
matchups['TrankPredictedSpread'] = matchups['TrankPredictedPoss'] * matchups['TrankPredictedSpreadPerPoss']

# Teamrank Predicted Spread
matchups['TeamrankPredictedSpread'] = game_data['TeamrankRating_x'] - game_data['TeamrankRating_y']
matchups['TeamrankPredictedSpreadPerPoss'] = matchups['TeamrankPredictedSpread'] / matchups['TrankPredictedPoss']

# tempo difference and absolute value of tempo difference for Trank tempo
matchups['TrankTempoDiff'] = game_data['Tempo_x'] - game_data['Tempo_y']
matchups['AbsTrankTempoDiff'] = abs(matchups['TrankTempoDiff'])

# Offensive vs defensive EFG% averages and differences
matchups['xOffyDefEFGDiff'] = game_data['EFG%_x'] - game_data['EFGD%_y']
matchups['yOffxDefEFGDiff'] = game_data['EFG%_y'] - game_data['EFGD%_x']
matchups['xOffyDefEFGAvg'] = (game_data['EFG%_x'] + game_data['EFGD%_y']) / 2
matchups['yOffxDefEFGAvg'] = (game_data['EFG%_y'] + game_data['EFGD%_x']) / 2

# Offensive vs defensive turnover rate averages and differences
matchups['xOffyDefTODiff'] = game_data['TOR%_x'] - game_data['TORD%_y']
matchups['yOffxDefTODiff'] = game_data['TOR%_y'] - game_data['TORD%_x']
matchups['xOffyDefTOAvg'] = (game_data['TOR%_x'] + game_data['TORD%_y']) / 2
matchups['yOffxDefTOAvg'] = (game_data['TOR%_y'] + game_data['TORD%_x']) / 2

# Offensive vs defensive rebound rate averages and differences
matchups['xOffyDefRebDiff'] = game_data['ORB%_x'] - game_data['OppORB%_y']
matchups['yOffxDefRebDiff'] = game_data['ORB%_y'] - game_data['OppORB%_x']
matchups['xOffRebAvg'] = (game_data['ORB%_x'] + game_data['OppORB%_y']) / 2
matchups['yOffRebAvg'] = (game_data['ORB%_y'] + game_data['OppORB%_x']) / 2
matchups['xOffyOffRebDiff'] = matchups['xOffRebAvg'] - matchups['yOffRebAvg']

# Offensive vs defensive FT rate averages and differences
matchups['xOffyDefFTRateDiff'] = game_data['FTR_x'] - game_data['FTRD_y']
matchups['yOffxDefFTRateDiff'] = game_data['FTR_y'] - game_data['FTRD_x']
matchups['xOffyDefFTRateAvg'] = (game_data['FTR_x'] + game_data['FTRD_y']) / 2
matchups['yOffxDefFTRateAvg'] = (game_data['FTR_y'] + game_data['FTRD_x']) / 2

# Offensive vs defensive assist rate averages and differences
matchups['xOffyDefAstDiff'] = game_data['Ast%_x'] - game_data['OppAst%_y']
matchups['yOffxDefAstDiff'] = game_data['Ast%_y'] - game_data['OppAst%_x']
matchups['AbsxOffyDefAstDiff'] = abs(matchups['xOffyDefAstDiff'])
matchups['AbsyOffxDefAstDiff'] = abs(matchups['yOffxDefAstDiff'])

# Sum of the variance in game possession of both teams
matchups['TotalPossVarSum'] = game_data['TotalPossVar_x'] + game_data['TotalPossVar_y']

# Sum of the variance in game performance of both teams (weighted by ratio of tempo and predicted tempo, ie sample sizes)
matchups['GameScoreVarSum'] = (game_data['Tempo_x'] / matchups['TrankPredictedPoss']) * game_data['GameScoreVar_x'] + (game_data['Tempo_y'] / matchups['TrankPredictedPoss']) * game_data['GameScoreVar_y']

# Naive upset probability: P(margin < 0) for a normal distribution centred on the predicted
# per-possession spread with standard deviation sqrt(0.5 * GameScoreVarSum)
matchups['TrankNaiveUpsetProbability'] = norm.cdf(0, loc = matchups['TrankPredictedSpreadPerPoss'], scale = (0.5 * matchups['GameScoreVarSum']) ** 0.5)
matchups['TeamrankNaiveUpsetProbability'] = norm.cdf(0, loc = matchups['TeamrankPredictedSpreadPerPoss'], scale = (0.5 * matchups['GameScoreVarSum']) ** 0.5)

In [None]:
# Remove 1 v 16 matchups and 2 v 15 matchups, they were less predictive due to 15 and 16 seeds being much worse.
# Only rows whose seed difference is above -13 are kept.
within_seed_range = matchups['SeedDiff'] > -13
matchups = matchups[within_seed_range].reset_index(drop = True)
matchups.head()

In [None]:
# Write the final matchup table without the index column.
# assumes the directory 'mydata/mens' already exists -- TODO confirm
matchups.to_csv('mydata/mens/matchups.csv', index = False)