# Dataset description

Data was collected from gym teachers of high school boys.
For each boy, we know his height (in cm) and his 100 m sprint time (in s).
Additionally, they were all asked if they liked playing basketball, dodgeball and soccer in the class.

# Analysis

We are interested in the effect of height on sprint times.

In [2]:
import pandas as pd
# Load the gym-class observations from CSV into a DataFrame.
df = pd.read_csv('./data/gym_class.csv')

Set some basic semi-informative priors and run the linear model

In [None]:
import pymc as pm
import pytensor.tensor as pt

with pm.Model() as model:

    # Priors on the regression parameters.
    icept = pm.Normal('icept', mu=10, sigma=10)        # intercept
    hcoef = pm.Normal('hcoef', mu=0, sigma=0.2)        # coefficient on height
    gcoef = pm.Normal('gcoef', mu=0, sigma=1, size=3)  # one coefficient per sport column

    # Linear predictor: intercept + height term + sport-indicator terms.
    # The three sport columns are transposed to (3, n_rows) so that the dot
    # product with the length-3 coefficient vector yields one value per row.
    res = (
        icept
        + hcoef * df['height']
        + pt.dot(gcoef, df[['basketball', 'dodgeball', 'soccer']].to_numpy().T)
    )

    # Observation noise scale (HalfNormal prior with its default parameters),
    # followed by the likelihood of the observed sprint times.
    err = pm.HalfNormal('err')
    pm.Normal('obs', mu=res, sigma=err, observed=df['sprint'])

    # Draw posterior samples using the default sampler settings.
    trace = pm.sample()

In [None]:
# Plot the posterior distributions stored in `trace`
import arviz as az
import matplotlib.pyplot as plt

az.plot_posterior(trace)
plt.show()

In [None]:
# Numerical summary table of the posterior samples in `trace`.
# As the last expression of the cell, the returned table is displayed.
pm.summary(trace)

In [None]:
# Look at the raw data: scatter plot of sprint time against height,
# with the points coloured by the `basketball` column.
from plotnine import *

(
    ggplot(df, aes(x='height', y='sprint', color='basketball'))
    + geom_point()
)

# Synthetic data generation

In [None]:
import numpy as np
import pandas as pd

# Simulate height and sprint time from independent normal distributions.
# NOTE: no seed is set in this cell, so each run draws a different dataset.
n_samples = 100
height = np.random.normal(loc=170, scale=10, size=n_samples).round(0)  # Height in cm
sprint = np.random.normal(loc=14, scale=1.5, size=n_samples).round(1)   # 100m time in s

# z-scores, using the same location/scale as the simulation above.
# The sprint z-score is negated, so a larger `sz` means a faster (lower) time.
hz, sz = (height-170)/10, -(sprint-14)/1.5

# Each "likes <sport>" flag below is drawn by comparing a uniform sample with
# a logistic function of the z-scores, i.e. a Bernoulli draw whose probability
# depends on height and/or sprint speed.

# Likes basketball: depends equally on height and speed
bp =  np.random.uniform(size=n_samples)<(1/(1+np.exp(-3*(hz+sz+0.3))))

# Likes dodgeball: depends on speed and, with half the weight, on height
dp =  np.random.uniform(size=n_samples)<(1/(1+np.exp(-1*sz-0.5*hz)))

# Likes soccer: depends on speed only
sp =  np.random.uniform(size=n_samples)<(1/(1+np.exp(-2*(sz-0.5))))

# Combine into a dataframe
df = pd.DataFrame({'height':height,'sprint':sprint,'basketball':bp,'dodgeball':dp,'soccer':sp})

# index=False: do not write the row index, which would otherwise come back as
# an extra 'Unnamed: 0' column when the file is read with pd.read_csv.
df.to_csv('gym_class_big.csv', index=False)