In [None]:
import pandas as pd
import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
import theano.tensor as tt
import numpy as np
from sklearn.preprocessing import StandardScaler

# Notebook session setup.
# NOTE(review): `%reload_kedro` presumably (re)loads the Kedro project and
# injects the `catalog` object used by later cells — confirm.
%reload_kedro
# Enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Analyze game data

In [None]:
# Load the dataset registered as "primary_HSNCT_all_games_2019" from the catalog.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the `%reload_kedro` magic — confirm.
all_games = catalog.load("primary_HSNCT_all_games_2019")

### Model

In [None]:
# Rescale each game's point differential to a per-20-`tuh` basis
# (point_diff / tuh * 20) so that games of different length are comparable.
# NOTE(review): `tuh` is presumably "tossups heard" — confirm.
all_games['point_diff_per_20tuh'] = all_games['point_diff'].div(all_games['tuh']).mul(20)

In [None]:
# Mean of `team_1_negs` for each `team_1`, largest first.
all_games.groupby("team_1")["team_1_negs"].mean().sort_values(ascending=False)

In [None]:
# --- Team and round bookkeeping ------------------------------------------------
# Every team name that appears on either side of a game.
teams = set(all_games["team_1"].unique()).union(all_games["team_2"].unique())
num_teams = len(teams)
num_rounds = 1 + all_games["round"].nunique()

# Per-game integer indices used to pick each side's strength inside the model.
# NOTE(review): assumes the *_index columns run over 0..num_teams-1 — confirm.
team_1 = all_games["team_1_index"].values
team_2 = all_games["team_2_index"].values
round_idx = all_games['round'].values

# --- Standardized outcome ------------------------------------------------------
# z-score the per-20-tuh point differential; the mean and std are kept so that
# model output can be mapped back to the original scale.
point_diff_20 = all_games["point_diff_per_20tuh"]
mu_point_diff = point_diff_20.mean()
sigma_point_diff = point_diff_20.std()
point_diff_norm = point_diff_20.sub(mu_point_diff).div(sigma_point_diff).values

In [None]:
# Team-strength model for the standardized point differential.
#
# Each team gets a non-negative latent strength; the expected (standardized)
# point differential of a game is strength[team_1] - strength[team_2].
RANDOM_SEED = 2019  # fixed seed so Restart & Run All reproduces the same draws

with pm.Model() as model:

    # Hyperprior: shared scale of the per-team strengths.
    sigma_strength = pm.HalfCauchy('sigma_strength', beta=0.05)

    # Prior: one strength per team (`sigma=` used consistently with the
    # likelihood below instead of the older `sd=` alias).
    strength = pm.HalfNormal("strength", sigma=sigma_strength, shape=num_teams)

    # Likelihood of the observed, standardized point differentials.
    mu_diff = strength[team_1] - strength[team_2]
    # Variable label fixed from the misspelled "sigma_dfif".
    sigma_diff = pm.HalfCauchy("sigma_diff", beta=0.1)
    point_diff = pm.Normal('point_diff', mu=mu_diff, sigma=sigma_diff, observed=point_diff_norm)
    # Inverse of the z-scoring applied to the outcome: back to original units.
    point_diff_unstandardized = pm.Deterministic(
        "point_diff_unstandardized", (point_diff * sigma_point_diff) + mu_point_diff
    )

    # Prior predictive draws, posterior sampling, posterior predictive draws.
    prior_pred = pm.sample_prior_predictive(random_seed=RANDOM_SEED)
    trace = pm.sample(random_seed=RANDOM_SEED)
    ppc = pm.sample_posterior_predictive(trace, random_seed=RANDOM_SEED)


In [None]:
# Sanity check on the priors: 1st and 99th percentiles of the prior-predictive
# point differential, expressed in the original (unstandardized) units.
vals = prior_pred["point_diff_unstandardized"]
tuple(np.quantile(vals, q) for q in (0.01, 0.99))

In [None]:
# Name -> index lookups, built separately from each side of the game table.
team_1_index = all_games.set_index("team_1")["team_1_index"].to_dict()
team_2_index = all_games.set_index("team_2")["team_2_index"].to_dict()

# Merge both sides; on a shared name the team_2 entry overrides the team_1 one.
team_indices = dict(team_1_index)
team_indices.update(team_2_index)

# Index -> name, used to turn model indices back into team names.
team_indices_reverse = {idx: name for name, idx in team_indices.items()}

In [None]:
# Bundle the trace and posterior predictive draws into an ArviZ InferenceData.
#
# The "school" labels must be ordered by model index so that label i names the
# team whose strength sits at position i. `team_indices` is keyed in order of
# first appearance in the game table, which does not guarantee that ordering,
# so the names are sorted by their index here.
# NOTE(review): assumes the team indices are exactly 0..num_teams-1 — confirm.
school_labels = sorted(team_indices, key=team_indices.get)
diff_summary = az.from_pymc3(
    trace,
    posterior_predictive=ppc,
    coords={"school": school_labels},
    dims={"strength": ["school"]},
)

In [None]:
# Posterior predictive check plot for the data held in `diff_summary`.
az.plot_ppc(diff_summary)

In [None]:
# Summary table for all variables in `diff_summary`.
# NOTE(review): `tmp` is not read again in the visible cells — consider a
# descriptive name or removing this cell.
tmp = az.summary(diff_summary)

In [None]:
# Summary restricted to `strength`; reset_index() moves the row labels into an
# "index" column so they can be parsed into team names later.
summary_df = az.summary(diff_summary, var_names=['strength']).reset_index()

In [None]:
# Display the strength summary table (shows the whole frame).
summary_df

In [None]:
# Distribution of the sampled `strength` values at index 74.
# NOTE(review): 74 is a hard-coded index; which team it refers to is not shown
# here — consider looking the index up by team name.
sns.distplot(trace["strength"][:,74])

In [None]:
import re


def get_team_name(x):
    """Translate a summary row label such as ``"strength[12]"`` into a team name.

    The label must contain exactly one ``strength[<digits>]`` occurrence; the
    digits are used as the key into the module-level ``team_indices_reverse``
    mapping.

    Raises:
        AssertionError: if the label holds zero or several ``strength[...]``
            occurrences.
        KeyError: if the extracted index is not in ``team_indices_reverse``.
    """
    team_numbers = [m.group(1) for m in re.finditer(r"strength\[([0-9]+)]", x)]
    assert len(team_numbers) == 1
    (team_number,) = team_numbers
    return team_indices_reverse[int(team_number)]

In [None]:
# Attach the team name that corresponds to each "strength[i]" row label.
summary_df['team'] = summary_df['index'].map(get_team_name)

In [None]:
# Ten rows with the highest posterior mean strength.
summary_df.sort_values("mean", ascending=False).head(10)

# Player model

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd

In [None]:
# Download the tournament's individual-statistics page and locate the table
# that lists the players.
url = "https://www.naqt.com/stats/tournament/individuals.jsp?tournament_id=9500&playoffs=true"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
tb = soup.find("table", class_="data-freeze-2")
# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
if tb is None:
    raise ValueError("No <table class='data-freeze-2'> found at " + url)

In [None]:
pattern_player_number = "team_member_id=([0-9]+)"
pattern_player_name = ">(.+)<"

# Markup of every <a> tag in the table except those whose markup contains
# "tournament/team".
all_players = [
    markup
    for markup in map(str, tb.find_all("a"))
    if "tournament/team" not in markup
]


def _parse_player_link(line):
    """Return ``(team_member_id, link_text)`` extracted from one anchor's markup."""
    return (
        re.findall(pattern_player_number, line)[0],
        re.findall(pattern_player_name, line)[0],
    )


# One (id, name) pair per remaining anchor; both items are strings.
player_number_tuple = [_parse_player_link(line) for line in all_players]

In [None]:
# Download every player's page and stack the per-player tables into `result`.
# The tables are collected in a list and concatenated once at the end, which
# avoids re-copying the growing frame on every iteration.
player_frames = []
for i, (number, player) in enumerate(player_number_tuple):
    if i % 10 == 0:
        print(i)  # coarse progress indicator
    # The second table on the player page is the one kept.
    player_games = pd.read_html(
        "https://www.naqt.com/stats/tournament/player.jsp?team_member_id=" + number
    )[1]
    # The column is called "team" but holds the player's name; the name is
    # kept unchanged for code that expects this column.
    player_games["team"] = player
    player_frames.append(player_games)
    time.sleep(0.05)  # short pause between requests

# pd.concat raises on an empty list, so fall back to an empty frame.
result = pd.concat(player_frames) if player_frames else pd.DataFrame()

In [None]:
# The scraped "team" column is relabelled "player".
result = result.rename(columns={'team': 'player'})

In [None]:
# Column names, dtypes and non-null counts of the scraped table.
result.info()

In [None]:
# Clean the scraped table: drop the "Total" rows and the "P%" column, then
# cast the count columns to int8 and the label columns to category.
# The cleaned frame is stored in `result_clean` so later cells can use it
# (previously it was computed and discarded), and only a preview is displayed.
result_clean = (
    result
    .query("Round != 'Total'")
    .drop("P%", axis=1)
    .astype({
        "Round": np.int8,
        "Opponent": 'category',
        'Result': 'category',
        'TUH': np.int8,
        "P": np.int8,
        "TU": np.int8,
        "I": np.int8,
        "player": "category",
    })
)
result_clean.head()