In [1]:
import os
# Move the working directory up one level before the project import below.
# NOTE(review): presumably this notebook lives in a subfolder and `src` sits in
# the parent directory - confirm. The path is relative, so re-running this cell
# moves up one more level each time; restart the kernel before re-running.
os.chdir("..")

import pandas as pd
import statsmodels.formula.api as smf

from src.classes.constants import DATA_PATH

### Set up data

In [2]:
# Tournament outcomes table; see get_trank_data.py for how the file is produced.
tourney_csv = f'{DATA_PATH}/tourney_outcomes.csv'
df = pd.read_csv(tourney_csv)
df.head()

Unnamed: 0,rk,team,pake,pase,wins,loss,w%,r64,r32,s16,...,f4,f2,champ,top2,f4%,champ%,season,conf,seed,region
0,1,Kansas,2.1,2.7,6,0,1.0,1,1,1,...,1,1,1,1,57.3%,29.6%,2008,B12,1.0,0
1,2,Davidson,1.6,2.4,3,1,0.75,1,1,1,...,0,0,0,0,7.8%,2.2%,2008,SC,10.0,0
2,3,Memphis,1.5,1.7,5,1,0.833,1,1,1,...,1,1,0,1,50.6%,20.2%,2008,CUSA,1.0,2
3,4,Villanova,1.4,1.5,2,1,0.667,1,1,1,...,0,0,0,0,0.9%,0.1%,2008,BE,12.0,0
4,5,Xavier,1.4,1.2,3,1,0.75,1,1,1,...,0,0,0,0,8.3%,0.8%,2008,A10,3.0,3


In [3]:
# Per (season, conf): number of teams and the summed pake / pase of those teams.
conf_df = (
    df.groupby(['season', 'conf'])
      .agg(
          teams_conf=('team', 'count'),
          pake_conf=('pake', 'sum'),
          pase_conf=('pase', 'sum'),
      )
      .reset_index()
)

# Round the conference totals to one decimal place.
for total_col in ('pase_conf', 'pake_conf'):
    conf_df[total_col] = conf_df[total_col].apply(lambda total: round(total, 1))

In [4]:
# Attach each team's conference totals, then remove the team's own contribution
# so that pake_conf / pase_conf describe only the *other* teams in the conference.
analysis_cols = [
    'season', 'conf', 'team', 'seed',
    'pake', 'pase',
    'teams_conf', 'pake_conf', 'pase_conf',
]
df = (
    df.merge(conf_df, how='left', on=['season', 'conf'])
      .assign(
          pake_conf=lambda merged: merged['pake_conf'] - merged['pake'],
          pase_conf=lambda merged: merged['pase_conf'] - merged['pase'],
      )[analysis_cols]
)

### Analysis

In [5]:
# Minimum number of teams a conference must have in a season (`teams_conf`)
# for its teams to be included in the regressions below.
min_teams = 3

#### 1. Basic regression

In [6]:
# Regress each team's pase on the summed pase of its conference peers,
# keeping only conferences with at least `min_teams` teams that season.
eligible = df.loc[df['teams_conf'] >= min_teams]
results = smf.ols('pase ~ pase_conf', data=eligible).fit()
results.params

Intercept   -0.006007
pase_conf   -0.001583
dtype: float64

The coefficient on pase_conf is basically 0, meaning that - taken at face value - conference peers' performance has no apparent relationship with your own.

but wait! this is biased because any time one team advances, it comes at the expense of another - so the bracket structure causes a negative correlation. Is that hiding some positive effect on underlying team performance? 

To find out, we'll need a simulation!

#### 2. Simulated tournaments

In [12]:
# stop here and run scripts/create_bracket.py if you haven't already!
# Build the path with os.path.join so the separator between DATA_PATH and the
# file name is always present (the previous f-string concatenated them directly,
# which only worked if DATA_PATH happened to end with a slash).
sim_pase = pd.read_csv(os.path.join(DATA_PATH, 'sim_pase.csv'))

In [13]:
# Per (season, conf) in the simulated data: team count and summed pase,
# with the total rounded to one decimal place.
conf_df = (
    sim_pase.groupby(['season', 'conf'])
            .agg(teams_conf=('team', 'count'), pase_conf=('pase', 'sum'))
            .reset_index()
)
conf_df['pase_conf'] = conf_df['pase_conf'].apply(lambda total: round(total, 1))

# Attach the conference totals and remove each team's own pase, so that
# pase_conf reflects only the other teams in the same conference.
sim_cols = ['season', 'conf', 'team', 'seed', 'pase', 'teams_conf', 'pase_conf']
sim_pase = (
    sim_pase.merge(conf_df, how='left', on=['season', 'conf'])
            .assign(pase_conf=lambda merged: merged['pase_conf'] - merged['pase'])[sim_cols]
)

In [19]:
# Same regression as on the real data, run on the simulated tournaments and
# restricted to conferences with at least `min_teams` teams that season.
sim_eligible = sim_pase.loc[sim_pase['teams_conf'] >= min_teams]
results = smf.ols('pase ~ pase_conf', data=sim_eligible).fit()
results.params

Intercept    0.000149
pase_conf   -0.042135
dtype: float64

In [20]:
# p-values for the coefficients of the model currently held in `results`
# (the regression on the simulated tournaments when cells are run in order).
results.pvalues

Intercept    0.924973
pase_conf    0.026301
dtype: float64

in our simulation, there was a significant negative correlation between performance among teams in the same conference - due entirely to the bracket structure (one team's advancement must come at the expense of another)

So the gap between our actual coefficient (about -0.002) and the simulated one (about -0.042), roughly 0.04, is an estimate of how much conference peers rise or fall together - not huge, but not trivial either.

#### 3. Causes

In [21]:
# Regress each team's pake on the summed pake of its conference peers,
# keeping only conferences with at least `min_teams` teams that season.
pake_eligible = df.loc[df['teams_conf'] >= min_teams]
results = smf.ols('pake ~ pake_conf', data=pake_eligible).fit()
results.params

Intercept    0.006239
pake_conf   -0.035511
dtype: float64

This is equivalent to #1, except using PAKE (computer ratings instead of seeds) 

The PAKE coefficient (about -0.036) is much closer to the one from our bracket-structure simulation (about -0.042) than the PASE coefficient was, suggesting that most of the correlation between teams' outcomes is due to committee seeding, not to computer ratings over- or underrating conferences.