# Gym class 2

In [1]:
import arviz as az
import pymc as pm, pytensor as pt
import matplotlib.pyplot as plt

import pandas as pd, numpy as np
# Display settings for printed DataFrames: a 500-character width and no cap
# (None) on the number of columns shown.
for _option, _value in (('display.width', 500), ('display.max_columns', None)):
    pd.set_option(_option, _value)

In [None]:
from graphviz import Digraph

# Causal diagram for the notebook: Height and Sprint each point into Basketball.
dot = Digraph()
for node_id, node_label in [('H', 'Height'), ('S', 'Sprint'), ('B', 'Basketball')]:
    dot.node(node_id, node_label)
dot.edges([('H', 'B'), ('S', 'B')])
dot

Generate a fake dataset

In [None]:
# Simulate height and sprint time from independent normal distributions.
# A seeded generator keeps the fake dataset identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

n_samples = 100
HEIGHT_MEAN, HEIGHT_SD = 170, 10    # Height in cm
SPRINT_MEAN, SPRINT_SD = 14, 1.5    # 100m time

height = rng.normal(loc=HEIGHT_MEAN, scale=HEIGHT_SD, size=n_samples).round(0)
sprint = rng.normal(loc=SPRINT_MEAN, scale=SPRINT_SD, size=n_samples).round(1)

# z-scores, reusing the simulation parameters; sprint is negated so that a
# smaller (faster) time gives a larger score
hz = (height - HEIGHT_MEAN) / HEIGHT_SD
sz = -(sprint - SPRINT_MEAN) / SPRINT_SD

# Likes basketball: logistic of the summed z-scores, scaled by 5 and rounded,
# so every value is a whole number between 0 and 5
bp = np.round(5 * (1 / (1 + np.exp(-1.5 * (hz + sz)))))

# Combine into a dataframe
df = pd.DataFrame({'height': height, 'sprint': sprint, 'basketball': bp})

#df.to_csv('gym_class_big.csv')
#df.basketball.value_counts()

from plotnine import *

# Scatter of sprint against height, with points coloured by the basketball column.
(
    ggplot(df, aes(x='height', y='sprint', color='basketball'))
    + geom_point()
)

# Basic model

In [None]:
# Gaussian regression of the basketball column on the height and sprint columns of `df`.
# NOTE(review): both predictors are used exactly as stored in `df` (no
# standardisation) and the mean has no intercept term, while the coefficients
# use pm.Normal's default prior — confirm this is intended.
with pm.Model() as mdl:
    # Plain NumPy arrays go into the tensor expression for the mean.
    h = df['height'].to_numpy()
    s = df['sprint'].to_numpy()
    hc = pm.Normal('hcoef')
    sc = pm.Normal('scoef')
    pm.Normal('bb', mu=hc*h + sc*s, sigma=1, observed=df['basketball'])
    # Fixed seed so the posterior draws (and the summary below) repeat on re-run.
    idata = pm.sample(random_seed=42)

print(az.summary(idata, var_names=['hcoef', 'scoef']))
az.plot_trace(idata)
plt.show()

# Latent sprint

In [None]:
import arviz as az

# Same regression as the basic model, but sprint is no longer read from `df`:
# each row gets its own unobserved sprint value with a Normal(14, 1.5) prior.
# Height is still taken from the data.
with pm.Model() as mdl:
    # Plain NumPy array for the observed predictor.
    h = df['height'].to_numpy()
    s = pm.Normal('sprint', 14, 1.5, size=len(df))

    hc = pm.Normal('hcoef')
    sc = pm.Normal('scoef')
    pm.Normal('bb', mu=hc*h + sc*s, sigma=1, observed=df['basketball'])
    # Fixed seed so the posterior draws (and the summary below) repeat on re-run.
    idata = pm.sample(random_seed=42)

print(az.summary(idata, var_names=['hcoef', 'scoef']))
az.plot_trace(idata)
plt.show()

In [None]:
# Reduce the posterior of `sprint` over chains and draws to one median per row,
# store it beside the simulated ("real") sprint, and plot one against the other
# with a geom_abline() reference line.
sprint_posterior = idata.posterior.sprint
df['sprint_inf'] = sprint_posterior.median(dim=['chain', 'draw'])
(
    ggplot(df, aes(x='sprint', y='sprint_inf'))
    + geom_point()
    + geom_abline()
)

# Latent height and sprint

In [None]:
import arviz as az

# Neither predictor is read from `df` here: every row gets an unobserved height
# with a Normal(170, 10) prior and an unobserved sprint with a Normal(14, 1.5)
# prior. Only the basketball column is observed.
with pm.Model() as mdl:
    n_rows = len(df)
    h = pm.Normal('height', 170, 10, size=n_rows)
    s = pm.Normal('sprint', 14, 1.5, size=n_rows)

    hc = pm.Normal('hcoef')
    sc = pm.Normal('scoef')
    pm.Normal('bb', mu=hc*h + sc*s, sigma=1, observed=df['basketball'])
    # Fixed seed so the posterior draws (and the summary below) repeat on re-run.
    idata = pm.sample(random_seed=42)

print(az.summary(idata, var_names=['hcoef', 'scoef']))
az.plot_trace(idata)
plt.show()