In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sns.set()

# Cut-off date: 2020 rows after this day are dropped further down.
last_complete_month = '2020-06-30'

# Week number of the cut-off date, minus one.
# The cut-off may be a few days away from a week boundary, which is accepted
# here as good enough. One week is subtracted for the 2015-2019 baseline so
# that 2020 does not get a positive bias.
cutoff = pd.Timestamp(last_complete_month)
last_complete_week = cutoff.week - 1

last_complete_week

In [None]:
# Column labels: a 2019 block followed by a 2020 block, each holding a total
# and four age bands for M and for W.
# NOTE(review): the meaning of each file column is taken from these labels only;
# confirm the order against the SCB source file.
groups = ['tot', 'M_64', 'M_79', 'M_89', 'M_90+', 'W_64', 'W_79', 'W_89', 'W_90+']
cols = [year + '_' + group for year in ('2019', '2020') for group in groups]

# One label per calendar day of 2020 (366 days); the file must have as many rows.
index = pd.date_range(start='2020-01-01', end='2020-12-31')

df = pd.read_csv('scb_dead_per_age.csv', sep=';', header=None,
                 usecols=range(19), index_col=0)
df.index = index
df.columns = cols

# Discard everything after the cut-off date.
df = df.loc[:last_complete_month]
df

In [None]:
# Keep only the two `*_tot` columns, one per year.
total_columns = ['2019_tot', '2020_tot']
totals = df[total_columns]
totals

In [None]:
# Sum the daily totals within each calendar month (1 = January, ...).
month_number = totals.index.month
monthly_totals = totals.groupby(month_number).sum()
monthly_totals

In [None]:
# Reference-period deaths, one row per week number.
# NOTE(review): assumes file columns 2-5 and 6-9 hold the same four age bands
# for the two sexes, in the same order -- confirm against the source file.
baseline_raw = pd.read_csv('scb_dead_per_age_2015-2019.csv', sep=';',
                           header=None, index_col=0, usecols=range(10))
baseline_raw.index.name = 'week'

# Add the paired columns together so each age band has a single column.
column_pairs = [(2, 6), (3, 7), (4, 8), (5, 9)]
for band, (first, second) in zip([64, 79, 89, 90], column_pairs):
    baseline_raw[band] = baseline_raw[first] + baseline_raw[second]

# Keep the weeks up to and including last_complete_week and total them per band.
deaths_2015_2019 = baseline_raw[[64, 79, 89, 90]].loc[:last_complete_week, :].sum()
deaths_2015_2019

In [None]:
# One stacked bar per year, with one segment per month.
totals_by_year = monthly_totals.T
foo = totals_by_year.plot.bar(stacked=True, figsize=(18, 12))
foo

In [None]:
# Month-end sums of the per-group columns (the two totals are dropped).
monthly = df.resample('M').sum().drop(['2019_tot', '2020_tot'], axis=1)

# Order the labels so each group's 2019 value sits next to its 2020 value.
cols = [year + '_' + sex + '_' + band
        for sex in ('M', 'W')
        for band in ('64', '79', '89', '90+')
        for year in ('2019', '2020')]

# Collapse the months into one total per label.
monthly_sums_per_age = monthly.sum()[cols]
monthly_sums_per_age

In [None]:
# One bar per year/sex/age-band label.
monthly_sums_per_age.plot.bar()

In [None]:
# Boolean mask: True for labels containing 'W'.
row_labels = monthly_sums_per_age.index
foo = row_labels.str.contains('W')
foo

In [None]:
# Turn the (unnamed) Series into a one-column frame and attach the 'W' mask.
monthly_sums_per_age = monthly_sums_per_age.to_frame(name='dead')
monthly_sums_per_age['female'] = foo
monthly_sums_per_age

In [None]:
# Move the labels out of the index into a regular column named 'index'.
monthly_sums_per_age = monthly_sums_per_age.reset_index()
monthly_sums_per_age


In [None]:
# Numeric age-band bound taken from each label. The last three characters are
# '_64', '_79', '_89' or '90+', so stripping '_' and '+' leaves the number.
# regex=False forces literal replacement: '+' is a regex quantifier, and
# str.replace treated patterns as regular expressions by default before pandas 2.0.
label_tail = monthly_sums_per_age['index'].str[-3:]
bar = (label_tail
       .str.replace('_', '', regex=False)
       .str.replace('+', '', regex=False)
       .astype(int))
bar

In [None]:
monthly_sums_per_age['age'] = bar

# The first four characters of each label are the year.
monthly_sums_per_age['year'] = monthly_sums_per_age['index'].str[:4].astype(int)
monthly_sums_per_age['gender'] = monthly_sums_per_age['female'].apply(
    lambda is_female: 'F' if is_female else 'M')

# Index the rows by (year, gender, age). A DataFrame cannot be assigned as an
# index directly (current pandas raises "Index data must be 1-dimensional"), so
# build a MultiIndex from the row tuples. The levels are deliberately left
# unnamed: levels called 'year'/'age' would clash with the columns of the same
# name and make a later groupby on those names ambiguous.
monthly_sums_per_age.index = pd.MultiIndex.from_tuples(
    list(zip(monthly_sums_per_age['year'],
             monthly_sums_per_age['gender'],
             monthly_sums_per_age['age'])))
monthly_sums_per_age

In [None]:
# Total deaths per (year, age band); the two sexes are added together.
rows_by_year_and_age = monthly_sums_per_age.groupby(['year', 'age'])
monthly_sums_per_age = rows_by_year_and_age['dead'].sum()


In [None]:
# Age bands as rows, years as columns.
yearly_diff = monthly_sums_per_age.unstack().T

deaths_2019 = yearly_diff[2019]
deaths_2020 = yearly_diff[2020]

# Absolute and relative (percent) change from 2019 to 2020.
yearly_diff['delta'] = deaths_2020 - deaths_2019
yearly_diff['pct'] = 100 * deaths_2020 / deaths_2019 - 100

# Column totals (the sum of the 'pct' column has no useful meaning).
print (yearly_diff.sum())
yearly_diff

In [None]:
# population 2019 2020 - based on previous dec previous year
# NOTE(review): the row position (0, 1, 2, ...) is used as the age, and file
# columns 2/5 and 3/6 are taken to be the two sexes for 2019 and 2020 -- confirm
# against the source file.

pop = pd.read_csv('scb_population_2019_2020.csv', sep=';', header=None,
                  usecols=[1, 2, 3, 4, 5, 6])
pop.index.name = 'age'
pop['2019'] = pop[2] + pop[5]
pop['2020'] = pop[3] + pop[6]

pop = pop[['2019', '2020']]


def _band_populations(year):
    """Return the population of rows 0-64, 65-79, 80-89 and 90+ for column `year`."""
    by_age = pop[year]
    return (by_age.loc[:64].sum(),
            by_age.loc[65:79].sum(),
            by_age.loc[80:89].sum(),
            by_age.loc[90:].sum())


pop_2019_64, pop_2019_79, pop_2019_89, pop_2019_90 = _band_populations('2019')

# Each band, then the grand total as a sanity check.
for band_population in (pop_2019_64, pop_2019_79, pop_2019_89, pop_2019_90):
    print (band_population)

print (pop_2019_64 + pop_2019_79 + pop_2019_89 + pop_2019_90)
print ()

pop_2020_64, pop_2020_79, pop_2020_89, pop_2020_90 = _band_populations('2020')

for band_population in (pop_2020_64, pop_2020_79, pop_2020_89, pop_2020_90):
    print (band_population)

print (pop_2020_64 + pop_2020_79 + pop_2020_89 + pop_2020_90)

In [None]:
### fake data for categorical regression

# Two rows (2000 and 2010) with population and deaths for three age bands.
pop_df = pd.DataFrame({'pop_0-9' : [100000, 200000],
                       'pop_10-19' : [200000, 400000],
                       'pop_20-29' : [300000, 600000],
                       'dead_0-9' : [1000, 2000],
                       'dead_10-19' : [2000, 4000],
                       'dead_20-29' : [3000, 6000]},
                      index=[2000, 2010])

# Row-wise totals over the three bands, stored as floats.
pop_df['pop_all'] = pop_df.loc[:, 'pop_0-9':'pop_20-29'].sum(axis=1).astype(float)
pop_df['dead_all'] = pop_df.loc[:, 'dead_0-9':'dead_20-29'].sum(axis=1).astype(float)

# Overall death ratio first, then one ratio per age band.
pop_df['dead_all_ratio'] = pop_df['dead_all'] / pop_df['pop_all']

for band in ('0-9', '10-19', '20-29'):
    pop_df['dead_' + band + '_ratio'] = pop_df['dead_' + band] / pop_df['pop_' + band]

pop_df

In [None]:
# Show the per-age-band 2019 vs 2020 table again before it is extended below.
yearly_diff

In [None]:
# Add the 2015-2019 baseline as a new column, aligned on the age-band index.
yearly_diff = yearly_diff.assign(**{'2015-2019': deaths_2015_2019})
yearly_diff

In [None]:
# https://www.healthknowledge.org.uk/e-learning/epidemiology/specialists/standardisation
# fake data from the above link
'''
df = pd.DataFrame({'age_grp': ['inf','inf','child','child','adult','adult','old','old'],
                  'country' : ['A','B','A','B','A','B','A','B'],
                  'pop' : [1000000,1000000,6000000,1500000,5500000,550000,2500000,120000],
                  'dead' : [1000,1000,7000,6300,20000,3000,120000,6000]})
'''

# Real data poured into the same layout as the fake example above:
#   age_grp 'inf' / 'child' / 'adult' / 'old'  <- age bands 64 / 79 / 89 / 90
#   country 'A' <- 2019 population with the 2015-2019 baseline deaths
#   country 'B' <- 2020 population with the 2020 deaths
age_labels = ['inf', 'child', 'adult', 'old']
band_keys = [64, 79, 89, 90]
band_populations = [(pop_2019_64, pop_2020_64),
                    (pop_2019_79, pop_2020_79),
                    (pop_2019_89, pop_2020_89),
                    (pop_2019_90, pop_2020_90)]

df = pd.DataFrame({'age_grp': [label for label in age_labels for _ in range(2)],
                   'country' : ['A', 'B'] * len(age_labels),
                   'pop' : [count for pair in band_populations for count in pair],
                   'dead' : [yearly_diff.loc[band, column]
                             for band in band_keys
                             for column in ('2015-2019', 2020)]})

# Categorical with an explicit category order.
df['age_grp'] = pd.Categorical(df['age_grp'], categories=age_labels)

df

In [None]:
# Population and deaths per age group (rows) and country (columns), plus an
# 'All' margin row/column holding the totals.
# aggfunc is given as the string 'sum': passing the builtin `sum` is deprecated
# in recent pandas, which emits a FutureWarning for it.
pivot = pd.pivot_table(df, index='age_grp', columns='country',
                       values=['pop', 'dead'], aggfunc='sum', margins=True)
pivot

In [None]:
# Share of each age group in its country's total population (fractions, not percent).
for country in ('A', 'B'):
    pivot['age_ratio_' + country] = pivot[('pop', country)] / pivot.loc['All', ('pop', country)]

# Deaths divided by population, per age group and for the 'All' row.
for country in ('A', 'B'):
    pivot['dead_ratio_' + country] = pivot[('dead', country)] / pivot[('pop', country)]

# B compared with A: absolute difference and multiplicative factor.
pivot['delta_dead_ratio'] = pivot['dead_ratio_B'] - pivot['dead_ratio_A']
pivot['factor'] = pivot['dead_ratio_B'] / pivot['dead_ratio_A']

pivot

In [None]:
# Observation for the fake data: Simpson's Paradox strikes again -- overall,
# country A has the higher death ratio even though its death ratio is lower in
# every single age group.

overall_death_ratio = pivot.loc['All', 'dead_ratio_A':'dead_ratio_B']
overall_death_ratio.plot.bar(color=['b', 'orange'], title='Death Ratio per country')

In [None]:
# Age structure of the two populations, excluding the 'All' row.
# NOTE(review): the plotted values are fractions in [0, 1], although the y-axis
# label says 'percentage'.
age_share = pivot.loc[:'old', 'age_ratio_A':'age_ratio_B']
age_share.plot.bar(color=['b', 'orange'], title='age proportions per country')
plt.legend(['A age ratio', 'B age ratio'])
plt.ylabel('percentage')

In [None]:
# Death ratio per age group for both countries, excluding the 'All' row.
# NOTE(review): the plotted values are fractions in [0, 1], although the y-axis
# label says 'percentage'.
death_share = pivot.loc[:'old', 'dead_ratio_A':'dead_ratio_B']
death_share.plot.bar(color=['b', 'orange'], title='death proportions by age and country')
plt.legend(['A dead ratio', 'B dead ratio'])
plt.ylabel('percentage')

In [None]:
# Absolute difference in death ratio (B minus A) per age group, including 'All'.
delta_dead_ratio = pivot['delta_dead_ratio']
delta_dead_ratio.plot.bar(title='diff death ratio B - A')

In [None]:
# Death ratio of B divided by that of A, per age group, including 'All'.
ratio_factor = pivot['factor']
ratio_factor.plot.bar()

In [None]:
# Show the pivot table with all derived ratio columns.
pivot


In [None]:
### prep for pymc

def age_encode(x):
    """Return the 1-based code of an age-group label.

    'inf' -> 1, 'child' -> 2, 'adult' -> 3, 'old' -> 4; any other value
    gives None.
    """
    for code, label in enumerate(('inf', 'child', 'adult', 'old'), start=1):
        if x == label:
            return code
    return None
    
def country_encode(x):
    """Return 1 for country 'A', 2 for country 'B', and None for anything else."""
    if x == 'A':
        return 1
    return 2 if x == 'B' else None

    
# Integer codes for the model: age group 1..4, country 1 (A) or 2 (B).
age_codes = df['age_grp'].apply(age_encode)
df['age_idx'] = age_codes.astype(int)
df['country_index'] = df['country'].apply(country_encode)
df

In [None]:
def logit_pure(x):
    """Map log-odds `x` to a probability: exp(x) / (1 + exp(x)).

    Despite its name this is the inverse logit (logistic sigmoid), not the
    logit. Accepts scalars, NumPy arrays and pandas Series.

    The value is computed as exp(-log(1 + exp(-x))) so that large |x| saturate
    cleanly at 0 or 1; the textbook form overflows to inf / inf = NaN once
    exp(x) exceeds the float range.
    """
    return np.exp(-np.logaddexp(0, -x))

def logistic_pure(p):
    """Map a probability `p` to its log-odds, log(p / (1 - p)).

    Despite its name this is the logit function.
    """
    odds = p / (1 - p)
    return np.log(odds)

In [None]:
import pymc as pm
from pymc.Matplot import plot as pmplot


### control for age_grp by assigning a unique alpha to each age_grp,
# so that alpha alone gives the log-odds of death for country A, with a common
# offset beta added for country B

# model:
# dead ~ Binomial(population, p)
# p = inverse_logit(alpha[age_idx] + beta * x),  x = 0 for country A, 1 for country B
# alpha[age_idx] ~ Normal(0, sd=10)
# beta ~ Normal(0, sd=10)

# country_index is 1 for A and 2 for B; shift it to 0/1. Used as-is, the model
# would be alpha + beta for A and alpha + 2 * beta for B, and alpha alone would
# no longer be country A's log-odds as described above.
x = df['country_index'] - 1
age_idx = df['age_idx']

# The third positional argument of pm.Normal is the precision tau = 1 / sd**2.
# The float literal keeps it at 0.01 even under integer division.
alpha = pm.Normal('alpha', 0, 1.0 / 10 ** 2, size=4)  # 4 age groups

beta = pm.Normal('beta', 0, 1.0 / 10 ** 2)

@pm.deterministic
def logit_age(age_idx=age_idx-1,alpha=alpha,beta=beta,x=x):
    """Per-row death probability: inverse logit of alpha[age_idx] + beta * x.

    The default for `age_idx` is the module-level `age_idx` minus one, which
    turns the 1..4 age codes into zero-based positions in `alpha`.
    """
    linear_predictor = alpha[age_idx] + beta * x
    odds = np.exp(linear_predictor)
    return odds / (1 + odds)

# Observed node: deaths per row ~ Binomial(n = population, p = logit_age).
lkh_age = pm.Binomial('lkh_age', n=df['pop'], p=logit_age,
                      value=df['dead'], observed=True)

model = pm.Model([alpha, beta, logit_age, lkh_age])
mcmc = pm.MCMC(model)

# 500000 iterations, the first 100000 discarded as burn-in, every 2nd draw kept.
sample = mcmc.sample(iter=500000, burn=100000, thin=2)

In [None]:
# One posterior trace per age-group intercept (columns 0..3 of 'alpha').
alpha_trace = mcmc.trace('alpha')
alpha_inf = alpha_trace[:, 0]
alpha_child = alpha_trace[:, 1]
alpha_adult = alpha_trace[:, 2]
alpha_old = alpha_trace[:, 3]

# From here on `beta` is the array of sampled values, not the model node.
beta = mcmc.trace('beta')[:]

named_traces = [('alpha_inf', alpha_inf),
                ('alpha_child', alpha_child),
                ('alpha_adult', alpha_adult),
                ('alpha_old', alpha_old),
                ('beta', beta)]

# Diagnostic plot for every parameter.
for trace_name, draws in named_traces:
    pmplot(draws, trace_name)

# All draws side by side, one column per parameter.
result = pd.DataFrame(dict(named_traces),
                      columns=[trace_name for trace_name, _ in named_traces])

result.describe()

In [None]:
# Per-draw probabilities on the probability scale: A uses alpha alone, B uses
# alpha + beta.
# NOTE(review): these are the fitted probabilities only if the model's country
# covariate is 0 for A and 1 for B -- verify against `x` in the model cell.
p_result = pd.DataFrame()
age_groups = ['inf', 'child', 'adult', 'old']

for group in age_groups:
    p_result['A_' + group + '_p'] = logit_pure(result['alpha_' + group])

for group in age_groups:
    p_result['B_' + group + '_p'] = logit_pure(result['alpha_' + group] + result['beta'])

In [None]:
# Summary statistics of every column of p_result.
p_result.describe()

In [None]:
# Show the pivot table again, next to the summary of p_result above.
pivot


In [None]:
plt.figure(figsize=(18, 12))

# One panel per age band: (subplot position, label shown in the title,
# age-group key used in the p_result column names).
panels = [(411, '<=64', 'inf'),
          (412, '65-79', 'child'),
          (413, '80-89', 'adult'),
          (414, '90+', 'old')]

for position, age_label, group in panels:
    plt.subplot(position)
    plt.title('Probability Distribution for probability of dying, age group ' + age_label)

    # The 'A_*' columns are labelled as the 2015-2019 baseline, the 'B_*' columns as 2020.
    plt.hist(p_result['A_' + group + '_p'], density=True, color='b', alpha=0.7, label='2015-2019')
    plt.hist(p_result['B_' + group + '_p'], density=True, color='orange', alpha=0.7, label='2020')
    plt.xlabel('probability of dying')
    plt.ylabel('probability density')
    plt.legend(loc='upper left')

plt.tight_layout()

# NOTE(review): 18x12 inches at 1200 dpi is a 21600 x 14400 pixel image.
plt.savefig('probabilty2die.jpg', format='jpg', dpi=1200)