In [None]:
import pandas as pd
import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
import theano.tensor as tt
import numpy as np
from kedro.runner import SequentialRunner

# Kedro's IPython extension magic: (re)loads the Kedro project for this session.
# NOTE(review): later cells use a global `io` that is never assigned in this
# notebook -- presumably this magic injects it; confirm against the installed
# Kedro version.
%reload_kedro
# Enable the autoreload extension in mode 2, so edited project modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
from analyze_naqt.pipeline import scrape_data_pipeline

In [None]:
# Build the full scraping pipeline, then keep only the named node and
# everything that depends on it (Kedro's `Pipeline.from_nodes` semantics).
pipeline = scrape_data_pipeline()
start_node = "process game data"
process_pipeline = pipeline.from_nodes(start_node)

In [None]:
# Execute the processing sub-pipeline node by node against the catalog `io`.
runner = SequentialRunner()
runner.run(process_pipeline, io)

# Analyze game data

In [None]:
# Load the "2018_all_games" dataset from the catalog `io`.
# Later cells read the columns team_1, team_2, team_1_index, team_2_index and
# point_diff from it, so it is presumably a pandas DataFrame with one row per
# game -- TODO confirm against the catalog definition.
all_games = io.load("2018_all_games")

### Model

Model and code are based on [this article](http://danielweitzenfeld.github.io/passtheroc/blog/2014/10/28/bayes-premier-league/).

Our goal is to estimate an underlying team strength parameter, along with the uncertainty around it. Let $y_{gj}$ be the observed score for team $j$ in game $g$. We model the score using a Poisson distribution, i.e. $y_{gj} \mid \theta_{gj} \sim \text{Poisson}(\theta_{gj})$. Note that there is one $\theta_{gj}$ for each team in each game.

At the next level of the model, we model each $\theta$ as a log-linear function:
$$
\log \theta_{g1} = attack_1 - defense_2 \\
\log \theta_{g2} = attack_2 - defense_1
$$
i.e. we assume there is an attack and defense strength for each team. These parameters are modeled as a hierarchical model where, for each team $t$,
$$
attack_t \sim N(\mu_{attack},\tau_{attack}) \\
defense_t \sim N(\mu_{defense},\tau_{defense})
$$
And in turn we have hyperpriors where $\mu_{attack}, \mu_{defense} \sim N(.,.)$ and $\tau_{attack}, \tau_{defense} \sim Gamma(.,.)$

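To ensure identifiability, we constrain the attack and defense parameters to sum to 0:
$$
\sum_{t \in Teams} attack_t = 0 \\
\sum_{t \in Teams} defense_t = 0
$$

**Note:** the code below fits a simplified version of this model: a single strength parameter per team (`atts_star`), a Normal likelihood on the observed point difference of each game, and no sum-to-zero constraint.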


In [None]:
# Per-game integer team indices; used below to index the per-team parameters.
team_1 = all_games["team_1_index"].values
team_2 = all_games["team_2_index"].values

# Number of distinct teams appearing on either side of any game.
team_names = set(all_games["team_1"].unique()).union(all_games["team_2"].unique())
num_teams = len(team_names)

In [None]:
# Point-difference model: every team gets one latent strength, and the observed
# point difference of a game is modelled as Normal around the difference of the
# two teams' strengths.
RANDOM_SEED = 42  # fixed so that Restart & Run All reproduces the same draws

with pm.Model() as model:

    # hyperprior: spread of the team strengths across all teams
    tau_attack = pm.HalfNormal('tau_attack', sigma=10)

    # priors: one latent strength per team, centred on 0 with shared spread
    # (`sigma=` is used throughout; `sd=` is the older alias for the same argument)
    atts_star = pm.Normal("atts_star", mu=0, sigma=tau_attack, shape=num_teams)

    # likelihood of observed data
    # expected point difference = strength of team 1 minus strength of team 2
    diff_theta = atts_star[team_1] - atts_star[team_2]
    # NOTE(review): the variable is registered as 'team_1_points' although it
    # observes the `point_diff` column; the name is kept so existing traces and
    # downstream lookups keep working. Presumably point_diff is team 1's score
    # minus team 2's -- confirm against the processing pipeline.
    # The observation noise is hard-coded at sigma=100.
    point_diff = pm.Normal('team_1_points', mu=diff_theta, sigma=100, observed=all_games["point_diff"])

    # Seeded sampling: the analysis is stochastic, so pin the seed.
    trace = pm.sample(random_seed=RANDOM_SEED)

In [None]:
# Convert the PyMC3 trace from the sampling cell into an ArviZ object for
# diagnostics, summaries and plotting.
diff_summary = az.from_pymc3(trace)

In [None]:
# Build the team-name -> model-index lookup from the index columns that were
# actually used to index `atts_star` in the model (team_1_index / team_2_index).
# Enumerating a freshly built set of names is not guaranteed to reproduce that
# numbering, because set iteration order is arbitrary; a mismatch would attach
# the wrong team names to the posterior strengths.
team_indices = {
    team: int(idx)
    for name_col, index_col in (("team_1", "team_1_index"), ("team_2", "team_2_index"))
    for team, idx in zip(all_games[name_col], all_games[index_col])
}
# Inverse lookup: model index -> team name.
team_indices_reverse = {val: key for key, val in team_indices.items()}

# Posterior mean strength per team (averaged over all draws), strongest first.
team_ranks = (trace.get_values("atts_star")).mean(axis=0).argsort()[::-1]

# Team names ordered from strongest to weakest.
[team_indices_reverse[idx] for idx in team_ranks]