In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Switch on seaborn's default styling for the matplotlib figures drawn below.
sns.set()

In [None]:


# Column layout: one (country, measure) pair per column -> A pop, A dead, B pop, B dead.
cols = [(country, measure) for country in ('A', 'B') for measure in ('pop', 'dead')]

# Example populations and death counts of two countries, one row per age group.
df2 = pd.DataFrame(
    data=[
        [100000, 100, 900000, 900],      # child: A pop, A dead, B pop, B dead
        [900000, 9000, 100000, 1000],    # adult: A pop, A dead, B pop, B dead
    ],
    index=['child', 'adult'],
    columns=pd.MultiIndex.from_tuples(cols),
)

# Standard (reference) population, by age group: a single 'pop' column
# indexed by the same age-group labels as df2.
standard_pop = pd.DataFrame({'pop': [500000, 500000]}, index=['child', 'adult'])

df2

In [None]:
# Reshape the wide df2 table into long form: the transposed frame is unstacked twice
# and reset_index() turns the resulting index levels into columns, which are renamed
# on the next line to age_grp / state / country / value.
df2_trans = df2.T.unstack().unstack().reset_index()
df2_trans.columns = ['age_grp','state','country','value']
# Bare expression in the middle of the cell: it has no effect on the variables below.
df2_trans
# Split the long table by 'state' into the 'dead' rows and the 'pop' rows,
# renumbering each group from 0 so they can be paired row by row.
grp = df2_trans.groupby('state')
dead = grp.get_group('dead').reset_index(drop=True)
pop = grp.get_group('pop').reset_index(drop=True)

# Side-by-side concat pairs rows on the fresh 0..n-1 index, i.e. by position.
# NOTE(review): this assumes both groups list (age_grp, country) in the same order - confirm.
all_data = pd.concat([pop,dead],axis=1)
# Positions in the 8-column frame: 2 = pop.country, 0 = pop.age_grp, 3 = pop.value, 7 = dead.value.
all_data = all_data.iloc[:,[2,0,3,7]]

all_data.columns = ['country','age_grp','pop','dead']

def create_country_idx(c):
    """Return the 1-based index of country label ``c``: 'A' -> 1, 'B' -> 2.

    Any other value yields ``None``.
    """
    for position, label in enumerate(('A', 'B'), start=1):
        if c == label:
            return position
    return None
    
def create_age_idx(a):
    """Return the 1-based index of age-group label ``a``: 'child' -> 1, 'adult' -> 2.

    Any other value yields ``None``.
    """
    for position, label in enumerate(('child', 'adult'), start=1):
        if a == label:
            return position
    return None
    

# Translate the country and age-group labels into their 1-based integer codes
# and store them as two extra columns of all_data.
country_codes = all_data['country'].map(create_country_idx)
age_codes = all_data['age_grp'].map(create_age_idx)

all_data['country_idx'] = country_codes
all_data['age_idx'] = age_codes

all_data

In [None]:
#PYMC
import pymc as pm
from pymc.Matplot import plot as pmplot


### condition on country by assigning a unique alpha to each country
### condition on age_grp by assigning a unique beta to each age group

# model:
# dead ~ Binomial(population,p)
# p = inverse_logit(alpha[country_idx] + beta[age_idx]),  i.e.  logit(p) = alpha[country_idx] + beta[age_idx]
# alpha[country_idx] ~ Normal(0,10)
# beta[age_idx] ~ Normal(0,10)


# 1-based integer codes per data row; logit_age shifts them to 0-based positions.
country_idx = all_data['country_idx']
age_idx = all_data['age_idx']

# Both priors are Normal(0, sd=10). PyMC 2's Normal is parameterised by the
# precision tau = 1 / sd**2, so the value passed is 0.01. The arithmetic is
# done in floats on purpose: with integer literals, `1 / 10 ** 2` evaluates to
# 0 under Python 2 integer division, which would silently change the prior.
PRIOR_SD = 10.0
PRIOR_TAU = 1.0 / PRIOR_SD ** 2

alpha = pm.Normal('alpha', 0, PRIOR_TAU, size=2) # 2 countries

beta = pm.Normal('beta', 0, PRIOR_TAU, size=2) # 2 age groups


@pm.deterministic
def logit_age(age_idx=age_idx-1,country_idx=country_idx-1,alpha=alpha,beta=beta):
    """Per-row death probability: inverse logit of alpha[country] + beta[age].

    The index defaults are shifted by one so that the 1-based codes become
    0-based positions into ``alpha`` and ``beta``.
    """
    linear_predictor = alpha[country_idx] + beta[age_idx]
    odds = np.exp(linear_predictor)
    return odds / (1 + odds)


# Observed likelihood: deaths out of each row's population, with the
# per-row probability supplied by the logit_age deterministic.
lkh_age = pm.Binomial(
    'lkh_age',
    n=all_data['pop'],
    p=logit_age,
    value=all_data['dead'],
    observed=True,
)

model = pm.Model([alpha, beta, logit_age, lkh_age])

# Fit the maximum a posteriori estimate before sampling.
# NOTE(review): presumably this gives the sampler a good starting point - confirm.
_map = pm.MAP(model)
_map.fit()

# 50000 iterations, the first 20000 discarded as burn-in, keeping every 3rd draw.
mcmc = pm.MCMC(model)
sample = mcmc.sample(iter=50000, burn=20000, thin=3)



In [None]:
def logit_pure(x):
    """Map log-odds ``x`` to a probability, element-wise.

    Despite its name, this is the logistic (inverse-logit) function
    ``exp(x) / (1 + exp(x))``. It is evaluated as
    ``exp(min(x, 0)) / (1 + exp(-|x|))``, which is algebraically identical but
    never exponentiates a positive number, so a large ``x`` can no longer
    overflow to ``inf / inf`` and return ``nan``.

    Parameters
    ----------
    x : scalar, numpy array or pandas Series of log-odds.

    Returns
    -------
    Probabilities in [0, 1], with the same shape/container as ``x``.
    """
    return np.exp(np.minimum(x, 0)) / (1 + np.exp(-np.abs(x)))

def logistic_pure(p):
    """Return the log-odds ``log(p / (1 - p))`` of probability ``p``, element-wise.

    Despite its name, this is the logit function, the inverse of ``logit_pure``.
    """
    odds = p / (1 - p)
    return np.log(odds)

In [None]:
# Pull the sampled traces once, then split them by column:
# column 0 / 1 of 'alpha' are named A / B, column 0 / 1 of 'beta' child / adult.
_alpha_trace = mcmc.trace('alpha')
_beta_trace = mcmc.trace('beta')

alpha_A, alpha_B = _alpha_trace[:,0], _alpha_trace[:,1]
beta_child, beta_adult = _beta_trace[:,0], _beta_trace[:,1]

# One diagnostic plot per parameter, in the same order as before.
for _samples, _label in ((alpha_A, 'alpha_A'),
                         (alpha_B, 'alpha_B'),
                         (beta_child, 'beta_child'),
                         (beta_adult, 'beta_adult')):
    pmplot(_samples, _label)

# Collect the four traces as columns of one table for summary statistics.
result = pd.DataFrame({'alpha_A' : alpha_A,
                       'alpha_B' : alpha_B,
                       'beta_child' : beta_child,
                       'beta_adult' : beta_adult})

result.describe()

In [None]:
# Convert every (country, age group) pair of sampled coefficients into a
# probability: column '<country>_<age>' = logit_pure(alpha_<country> + beta_<age>).
result_p = pd.DataFrame()

for _country in ('A', 'B'):
    for _age in ('child', 'adult'):
        _log_odds = result['alpha_' + _country] + result['beta_' + _age]
        result_p[_country + '_' + _age] = logit_pure(_log_odds)

result_p.describe()

In [None]:
# Posterior mean death probability for each country / age-group combination.
pymc_means = result_p.describe().loc['mean']

# Rows of the standard population the probabilities are applied to.
_std_child = standard_pop.loc['child']
_std_adult = standard_pop.loc['adult']

pymc_deaths_per_M_A_child = pymc_means['A_child'] * _std_child
pymc_deaths_per_M_A_adult = pymc_means['A_adult'] * _std_adult

pymc_deaths_per_M_B_child = pymc_means['B_child'] * _std_child
pymc_deaths_per_M_B_adult = pymc_means['B_adult'] * _std_adult

for _expected in (pymc_deaths_per_M_A_child,
                  pymc_deaths_per_M_A_adult,
                  pymc_deaths_per_M_B_child,
                  pymc_deaths_per_M_B_adult):
    print(_expected)



In [None]:
# Modelled adult deaths in country A, expressed per million of A's adult population.
_adult_pop_A = df2.loc['adult', ('A','pop')]
(pymc_means.loc['A_adult'] * _adult_pop_A) / (_adult_pop_A / 1e6)

In [None]:
# Append (or refresh) an 'All' row holding the column totals over the age groups.
# An already existing 'All' row is left out of the sum, so running this cell
# more than once no longer adds the previous total into itself.
df2.loc['All',:] = df2.drop('All', errors='ignore').sum()


In [None]:
# Death ratio (deaths / population) per country, for every row of df2.
for _country in ('A', 'B'):
    df2[(_country, 'ratio')] = df2[(_country, 'dead')] / df2[(_country, 'pop')]

df2

In [None]:
# Regroup the columns by country: pop, dead, ratio for A, then the same for B.
_ratio_column_order = [(_c, _m) for _c in ('A', 'B') for _m in ('pop', 'dead', 'ratio')]
df2 = df2.reindex(columns=_ratio_column_order)

In [None]:
# Deaths per million inhabitants, per country.
for _country in ('A', 'B'):
    df2[(_country, 'dead_per_M')] = df2[(_country, 'dead')] / (df2[(_country, 'pop')] / 1e6)

# Keep each country's columns together: pop, dead, ratio, dead_per_M.
_full_column_order = [(_c, _m)
                      for _c in ('A', 'B')
                      for _m in ('pop', 'dead', 'ratio', 'dead_per_M')]
df2 = df2.reindex(columns=_full_column_order)

df2

In [None]:
# Display the standard population table that the next cell extends with expected deaths.
standard_pop

In [None]:
# Direct standardisation: apply each country's age-specific death ratio to the
# standard population and total the expected deaths.
#
# Start again from the base age-group rows and the 'pop' column only, so that
# re-running this cell never feeds a previously appended 'All' row (or stale
# derived columns) back into the products and sums below.
_age_groups = [_label for _label in standard_pop.index if _label != 'All']
standard_pop = standard_pop.loc[_age_groups, ['pop']].copy()

standard_pop['expected_A'] = df2.loc[_age_groups, ('A','ratio')] * standard_pop['pop']
standard_pop['expected_B'] = df2.loc[_age_groups, ('B','ratio')] * standard_pop['pop']
# Totals over the age groups (population and expected deaths).
standard_pop.loc['All',:] = standard_pop.sum()
standard_pop['rate_A'] = standard_pop['expected_A'] / standard_pop['pop']
standard_pop['rate_B'] = standard_pop['expected_B'] / standard_pop['pop']
# Expected deaths scaled to one million of the total standard population.
standard_pop['expected_A_per_M'] = standard_pop['expected_A'] / (standard_pop.loc['All','pop'] / 1e6)
standard_pop['expected_B_per_M'] = standard_pop['expected_B'] / (standard_pop.loc['All','pop'] / 1e6)


standard_pop

In [None]:
# Total expected deaths in the standard population, country A vs. country B.
# (Title spelling fixed: 'standarized' -> 'standardized'.)
standard_pop.loc['All',['expected_A','expected_B']].plot(kind='bar',title='standardized deaths per million')

In [None]:
##### Example of Simpson's Paradox, from the link below:

# https://www.healthknowledge.org.uk/e-learning/epidemiology/specialists/standardisation
# (fake data taken from the above link)


# One record per (age group, country): population and number of deaths.
_records = [
    ('inf',   'A', 1000000,   1000),
    ('inf',   'B', 1000000,   1000),
    ('child', 'A', 6000000,   7000),
    ('child', 'B', 1500000,   6300),
    ('adult', 'A', 5500000,  20000),
    ('adult', 'B',  550000,   3000),
    ('old',   'A', 2500000, 120000),
    ('old',   'B',  120000,   6000),
]
df = pd.DataFrame(_records, columns=['age_grp', 'country', 'pop', 'dead'])

# Fix the category order so age groups appear youngest to oldest
# instead of alphabetically.
_age_order = ['inf', 'child', 'adult', 'old']
df['age_grp'] = pd.Categorical(df['age_grp'], categories=_age_order)

df

In [None]:
# Totals of 'pop' and 'dead' per age group (rows) and country (columns);
# margins=True adds the 'All' totals.
# The aggregation is named by the string 'sum' instead of passing Python's
# builtin sum: pandas documents the string alias, and recent releases emit a
# deprecation warning when the builtin callable is passed.
pivot = pd.pivot_table(df,index='age_grp',columns='country',values=['pop','dead'],aggfunc='sum',margins=True)
pivot

In [None]:
# Share of each country's total population that falls in each age group.
# (Both age-ratio columns are created before the dead-ratio columns so the
# column order, which later label slices rely on, stays the same.)
for _country in ('A', 'B'):
    pivot['age_ratio_' + _country] = pivot[('pop', _country)] / pivot.loc['All', ('pop', _country)]

# Death ratio (deaths / population) per age group and country.
for _country in ('A', 'B'):
    pivot['dead_ratio_' + _country] = pivot[('dead', _country)] / pivot[('pop', _country)]

# Country B relative to country A: absolute difference and multiplicative factor.
pivot['delta_dead_ratio'] = pivot['dead_ratio_B'] - pivot['dead_ratio_A']
pivot['factor'] = pivot['dead_ratio_B'] / pivot['dead_ratio_A']

pivot

In [None]:
# For the fake data, Simpson's Paradox strikes again: overall, country A has the higher death ratio,
# even though its death ratio is lower than or equal to country B's in every age group (the infant ratios are equal).

# Overall ('All' row) death ratio of each country, as a two-bar chart.
_overall_dead_ratio = pivot.loc['All','dead_ratio_A' : 'dead_ratio_B']
_overall_dead_ratio.plot.bar(title='Death Ratio per country', color=['b','orange'])

In [None]:
# Age structure of the two countries: share of each country's population per age group.
# The plotted values are fractions of 1 (population / total population), so the
# y axis is labelled 'proportion' rather than 'percentage'.
ax = pivot.loc[:'old','age_ratio_A':'age_ratio_B'].plot(kind='bar',
                                                        color=['b','orange'],title='age proportions per country')
ax.legend(['A age ratio','B age ratio'])
ax.set_ylabel('proportion')

In [None]:
# Death ratio per age group, country A next to country B.
# The plotted values are fractions of 1 (deaths / population), so the
# y axis is labelled 'proportion' rather than 'percentage'.
ax = pivot.loc[:'old','dead_ratio_A': 'dead_ratio_B'].plot(kind='bar',
                                                          color=['b','orange'],title='death proportions by age and country')
ax.legend(['A dead ratio','B dead ratio'])
ax.set_ylabel('proportion')

In [None]:
# Difference in death ratio (country B minus country A) per age group and overall.
_delta_dead_ratio = pivot['delta_dead_ratio']
_delta_dead_ratio.plot.bar(title='diff death ratio B - A')

In [None]:
# Death ratio of country B as a multiple of country A's, per age group and overall.
# A title is added so the figure stands on its own, like the other plots in this notebook.
pivot['factor'].plot(kind='bar',title='death ratio factor B / A')