In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import requests

sns.set()

In [None]:
#### Two SCB endpoints: the first (commented out) uses age of death at end of year,
#### the second (active) uses age reached at death ####

#official_deaths_url = 'http://api.scb.se/OV0104/v1/doris/sv/ssd/START/BE/BE0101/BE0101I/DodaFodelsearK'
official_deaths_url = 'http://api.scb.se/OV0104/v1/doris/sv/ssd/START/BE/BE0101/BE0101I/DodaHandelseK'

# Single-year age codes "0" .. "99" followed by the open-ended "100+" class.
# Generated instead of typed out so a missing or duplicated age cannot slip in.
_age_codes = [str(age) for age in range(100)] + ["100+"]

# Table query posted to the endpoint above: one region code, every single-year age class,
# two gender codes and the years 2015-2019, answered as JSON.
# NOTE: the variable name is misspelled ("officeal") but is kept as-is because the
# request cell refers to it by this exact name.
officeal_deaths_query = {
  "query": [
    {
      "code": "Region",
      "selection": {
        "filter": "vs:RegionRiket99",
        "values": ["00"]
      }
    },
    {
      "code": "Alder",
      "selection": {
        "filter": "vs:Ålder1årA",
        "values": _age_codes
      }
    },
    {
      "code": "Kon",
      "selection": {
        "filter": "item",
        "values": ["1", "2"]
      }
    },
    {
      "code": "Tid",
      "selection": {
        "filter": "item",
        "values": ["2015", "2016", "2017", "2018", "2019"]
      }
    }
  ],
  "response": {
    "format": "json"
  }
}

In [None]:
# POST the table query to the SCB API. An explicit timeout is passed because requests
# otherwise waits without limit, which would stall the whole notebook run.
r = requests.post(official_deaths_url,json=officeal_deaths_query,timeout=30)
# Shown so that a failed request is noticed before the body is parsed in the next cell.
r.status_code

In [None]:
# Keep only the 'data' entry of the decoded response body.
# NOTE(review): the name `json` would shadow the standard-library module if that were ever
# imported here; it is left unchanged because the next cell reads it by this name.
json = r.json()['data']
# Show the first record to inspect its layout.
json[0]

In [None]:
# Long table built from the API records; each record's 'key' list is split into
# area, age, gender and year columns.
deaths_long = pd.DataFrame.from_dict(json)
deaths_long[['area','age','gender','year']] = deaths_long['key'].to_list()
# 'values' holds a list per record; its first element is the death count.
deaths_long['dead'] = deaths_long['values'].str[0].astype(int)
# The top age class is coded "100+"; drop the plus sign so the column can be cast to int.
deaths_long['age'] = deaths_long['age'].str.replace('+','',regex=False).astype(int)
# Right-closed bins: ages 0-64, 65-79, 80-89 and 90 upwards.
deaths_long['bin'] = pd.cut(deaths_long['age'],[-1,64,79,89,200],labels=['-64','65-79','80-89','90+'])

# Sum the numeric columns explicitly. A bare .sum() is applied to every remaining column,
# and depending on the pandas version that includes the string 'gender' column.
# 'age' is kept because its per-bin sum is compared with the expected age sums further down.
df = deaths_long.groupby(['year','bin'],observed=False)[['age','dead']].sum()
df

In [None]:
#### Sum of the ages inside each age band, doubled because there are two genders ####
# Each pair is (first age, one past the last age) of a band.
age_sum_64, age_sum_65_79, age_sum_80_89, age_sum_90 = (
    2 * np.arange(first, stop).sum()
    for first, stop in ((0, 65), (65, 80), (80, 90), (90, 101))
)

for band_total in (age_sum_64, age_sum_65_79, age_sum_80_89, age_sum_90):
    print (band_total)

In [None]:
### preliminary data ####

# Open the preliminary-deaths workbook straight from the SCB URL (fetched on every run).
scb_prel_file = pd.ExcelFile('https://www.scb.se/hitta-statistik/statistik-efter-amne/befolkning/befolkningens-sammansattning/befolkningsstatistik/pong/tabell-och-diagram/preliminar-statistik-over-doda/')

# List the sheet names so the sheet parsed in the next cell can be identified.
scb_prel_file.sheet_names

In [None]:
# Read sheet 'Tabell 2', skipping the first 7 rows and keeping ten columns by position.
# NOTE(review): the positional usecols presumably pick the year, the day label and the
# per-gender age-group columns — confirm against the current workbook layout.
scb_prel_tabell2 = scb_prel_file.parse(sheet_name='Tabell 2',skiprows=7,usecols=[0,1,4,5,6,7,9,10,11,12])
# Year is kept as text: later cells concatenate it into date strings and use it as a lookup key.
scb_prel_tabell2['År'] = scb_prel_tabell2['År'].astype(str)
scb_prel_tabell2

In [None]:
# Rows whose DagMånad label contains 'Okänd dödsdag', indexed by year.
is_unknown_day = scb_prel_tabell2['DagMånad'].str.contains('Okänd dödsdag')
unknowns = (
    scb_prel_tabell2.loc[is_unknown_day]
    .copy()
    .drop('DagMånad',axis=1)
    .set_index('År')
)
unknowns


In [None]:
# Swedish month names in calendar order, mapped to their month numbers 1..12.
swedish_months = [
    'januari','februari','mars','april','maj','juni',
    'juli','augusti','september','oktober','november','december',
]
month_name_map = {name: number for name, number in zip(swedish_months, np.arange(1,13))}

# Every row whose DagMånad label does NOT contain 'Okänd dödsdag'.
has_known_day = ~scb_prel_tabell2['DagMånad'].str.contains('Okänd dödsdag')
knowns = scb_prel_tabell2.loc[has_known_day].copy()
knowns

In [None]:
def _to_month_day(label):
    """Turn a '<day> <Swedish month name>' label into '<month number>-<day>'."""
    parts = label.split()
    return str(month_name_map[parts[1]]) + '-' + parts[0]

# Overwrites the column in place, so running this cell a second time fails:
# an already-converted value has no second token to look up.
knowns['DagMånad'] = knowns['DagMånad'].apply(_to_month_day)
knowns

In [None]:
# Join year and 'month-day' into 'year-month-day' text, parse it, and use it as the index
# in place of the two source columns.
year_month_day = knowns['År'].str.cat(knowns['DagMånad'],sep='-')
knowns = (
    knowns.assign(date=pd.to_datetime(year_month_day))
    .drop(['År','DagMånad'],axis=1)
    .set_index('date')
)
knowns

In [None]:
def group_genders(col):
    """Map a column label to the age-group label it belongs to.

    The label is searched for an age-range substring; whatever else it contains is ignored.
    Ranges are tried in the order listed and the first hit wins.
    Returns None when no range occurs in the label.
    """
    range_to_group = (
        ('0-64', '-64'),
        ('65-79', '65-79'),
        ('80-89', '80-89'),
        ('90+', '90+'),
    )
    for age_range, group in range_to_group:
        if age_range in col:
            return group
    return None

In [None]:
# Merge the columns that share an age group into one column per group.
# groupby(..., axis=1) is deprecated in recent pandas, so the frame is transposed, grouped
# along its rows (group_genders maps each original column label to its group) and
# transposed back.
common_knowns = knowns.T.groupby(group_genders).sum().T
common_knowns

In [None]:
# Merge the columns that share an age group into one column per group.
# groupby(..., axis=1) is deprecated in recent pandas, so the frame is transposed, grouped
# along its rows (group_genders maps each original column label to its group) and
# transposed back.
common_unknowns = unknowns.T.groupby(group_genders).sum().T
# Reverse the row order; the frame is rebuilt just above, so re-running the cell is safe.
common_unknowns = common_unknowns[::-1]
common_unknowns

In [None]:
# Yearly totals per age group of the deaths that have a known day.
yearly_common_knowns = common_knowns.groupby(common_knowns.index.year).sum()
yearly_common_knowns

In [None]:
common_unknowns

In [None]:
# Relabel the yearly totals with text years, the same kind of key common_unknowns is looked
# up with elsewhere in this notebook, so the two tables are added by matching year.
# Copying the other table's index over by position would silently mislabel rows if the
# two tables were ever ordered differently.
yearly_common_knowns.index = yearly_common_knowns.index.astype(str)

In [None]:
# Preliminary yearly totals per age group: known-day deaths plus unknown-day deaths.
prel_totals = yearly_common_knowns + common_unknowns
prel_totals

In [None]:
# Row total over the four age-group columns only. Summing the whole frame would also add
# an existing 'tot' column, inflating the total whenever this cell is run again.
age_group_cols = ['-64','65-79','80-89','90+']
prel_totals['tot']  = prel_totals[age_group_cols].sum(axis=1)
prel_totals

In [None]:
# Official deaths in wide form: the inner index level of df becomes the columns.
official_wide = df['dead'].copy().unstack()
# Replace the unstacked column labels with plain strings before appending the total column.
official_wide.columns = ['-64','65-79','80-89','90+']
official_totals = official_wide.assign(tot=official_wide.sum(axis=1))

In [None]:
official_totals

In [None]:
# Preliminary minus official totals. The subtraction aligns on row and column labels, so a
# year or group present in only one of the tables comes out as NaN.
prel_totals - official_totals

In [None]:
common_unknowns

In [None]:
common_knowns

In [None]:
#### transform example ####

# Each day's share of its own year's known-day deaths, per age group.
daily_proportions = common_knowns.groupby(common_knowns.index.year).transform(
    lambda col : col / col.sum())

#### one age group at a time; the all-groups-at-once variant did not work for the author ####

def add_unknowns(grp):
    """Return the daily series for age group `grp` with the unknown-day deaths spread in.

    Every day receives its share (daily_proportions) of that year's unknown-day count
    (looked up in common_unknowns by the year as text), added to the known daily count.
    """
    by_year = daily_proportions.groupby(daily_proportions.index.year)[grp]
    # Broadcast the year's unknown-day count to every day of that year.
    yearly_unknown = by_year.transform(
        lambda days : common_unknowns.loc[str(days.index[0].year),grp])
    allocated = daily_proportions[grp] * yearly_unknown
    return allocated + common_knowns[grp]

corrected_64, corrected_65_79, corrected_80_89, corrected_90_plus = (
    add_unknowns(group) for group in ('-64','65-79','80-89','90+')
)

corrected_age_grp_timeline = pd.concat(
    [corrected_64,corrected_65_79,corrected_80_89,corrected_90_plus],
    axis=1,
)

corrected_age_grp_timeline

In [None]:
### transform vs apply : same result ###
# NOTE(review): the "same result" claim may depend on the pandas version — newer releases
# can prepend the group key to the index of the apply() output. Confirm on the version in use.
print ('apply : ', common_knowns.groupby(common_knowns.index.year).apply(lambda x : x / x.sum()).head(10))
print ('transform : ', common_knowns.groupby(common_knowns.index.year).transform(lambda x : x / x.sum()).head(10))

In [None]:
corrected_age_grp_timeline.describe()

In [None]:
# One violin per age group for the corrected daily death counts; saved as a JPEG.
fig,axes = plt.subplots(figsize=(18,12))
age_group_columns = ['-64','65-79','80-89','90+']
sns.violinplot(data=corrected_age_grp_timeline[age_group_columns],ax=axes,scale='count')

axes.set_title('SWEDEN distribution of daily deaths per age group 2015-2021 YTD ')
axes.set_ylabel('number of daily deaths')
axes.set_xlabel('age group')
fig.savefig('scb_error_age_grp_daily_deaths_violin_2015_2021_YTD.jpg',format='jpg')

In [None]:
# Mean corrected deaths per calendar date, keyed by (month, day), across all years present.
avg_daily_deaths = corrected_age_grp_timeline.groupby(
    [corrected_age_grp_timeline.index.month,corrected_age_grp_timeline.index.day]).mean()

avg_daily_deaths

In [None]:
avg_daily_deaths.plot()

In [None]:
# Uniform baseline: each of the 365 days is equally likely.
p_base = np.full(365, 1/365)

# Relative weight per day, written as (weight, number of consecutive days) runs that add up
# to 365 days: heavy at both ends of the year, lightest in the middle.
boost_runs = [(4, 90), (3, 30), (2, 30), (1, 90), (2, 60), (3, 30), (4, 30), (4, 5)]
monthly_boost = np.repeat([weight for weight, _ in boost_runs],
                          [days for _, days in boost_runs])

# Weight the baseline, then renormalise so the probabilities sum to one.
p_death = p_base * monthly_boost
p_death = p_death / p_death.sum()


In [None]:
#### simulation ####

### probability distribution with peaks early and late in the year ###

N = 100000
# Fixed seed so the simulated sample, and every figure derived from it, is reproducible.
SIMULATION_SEED = 42
rng = np.random.default_rng(SIMULATION_SEED)

# Death day is drawn from p_death over days 1..365; birthday is uniform over days 1..365;
# completed years at death are uniform over 60..99 (upper bounds are exclusive).
death_day = rng.choice(np.arange(1,366),replace=True,p=p_death,size=N)
birth_day = rng.integers(1,366,N)
years_completed = rng.integers(60,100,N)

# NOTE: this rebinds `df`, which earlier in the notebook held the official deaths table.
df = pd.DataFrame({'death_day' : death_day,
                  'birth_day' : birth_day,
                  'years_completed_bef_death' : years_completed})

# Age at end of year is one higher than the completed years whenever the death day falls
# before the birthday. Computed column-wise instead of with a row-by-row apply.
died_before_birthday = df['death_day'] < df['birth_day']
df['age_eoy'] = df['years_completed_bef_death'] + died_before_birthday.astype(int)

# 1 where the end-of-year age exceeds the completed years, otherwise 0.
df['shifted'] = (df['years_completed_bef_death'] < df['age_eoy']).astype(int)

In [None]:
df

In [None]:
df.describe()

In [None]:
# Compare the simulated birth-day and death-day distributions over the year.
fig,ax = plt.subplots(figsize=(18,12))
# No ax= is passed, so seaborn draws on the current axes, i.e. the one created just above.
sns.violinplot(data=df[['birth_day','death_day']])
plt.ylabel('day of year')

In [None]:
(df['age_eoy'] > df['years_completed_bef_death']).sum()

In [None]:
daily_proportions

In [None]:
#### Alternative way to build the corrected timeline ####
# For every day, the year of its group as text, repeated in every column of daily_proportions.
years = daily_proportions.groupby(daily_proportions.index.year).transform(
        lambda col : str(col.index[0].year))

# The year label is identical across columns, so the first column alone serves as the
# row-by-row lookup key into common_unknowns (the full frame would not work as a key).
year_per_day = years.iloc[:,0]
unknowns_timeline = common_unknowns.loc[year_per_day]
# Relabel the looked-up rows with the daily dates so they line up with daily_proportions.
unknowns_timeline.index = daily_proportions.index
unknowns_timeline

In [None]:
# Spread each year's unknown-day deaths over its days in proportion to the known daily
# deaths, then add the known daily deaths.
corrected_age_grp_timeline_2 = unknowns_timeline * daily_proportions + common_knowns
corrected_age_grp_timeline_2

In [None]:
corrected_age_grp_timeline_2.groupby(corrected_age_grp_timeline_2.index.year).sum().sum(axis=1)

In [None]:
# Add the day-of-year as the last column and put the rows in chronological order.
# sort_index() is used instead of reversing the frame onto itself: a self-reversal flips the
# order again on every re-run of the cell, while sorting always gives the same result and
# keeps the date index monotonic for the date-label slice taken later.
corrected_age_grp_timeline_2 = (
    corrected_age_grp_timeline_2
    .assign(day_of_year=corrected_age_grp_timeline_2.index.dayofyear)
    .sort_index()
)
corrected_age_grp_timeline_2

In [None]:

# Mean deaths per day-of-year over the rows selected by the label slice ending at
# 2020-12-31, truncated to whole numbers.
# NOTE(review): in leap years day-of-year is shifted by one after 29 February, so the same
# calendar date is averaged under different keys — confirm this is acceptable.
day_of_death = corrected_age_grp_timeline_2.loc[:'2020-12-31'].groupby('day_of_year').mean().astype(int)
day_of_death

In [None]:
age_groups = ['-64','65-79','80-89','90+']

# For each age group, repeat every day-of-year value as many times as its whole-number
# average death count; the result is a sample whose shape the violin plot can draw.
day_dist_64, day_dist_65_79, day_dist_80_89, day_dist_90_plus = (
    pd.Series(np.repeat(day_of_death.index.values,day_of_death[group]))
    for group in age_groups
)

fig,ax = plt.subplots(figsize=(18,12))
ax.set_title('SWEDEN distribution of deaths over day of year, per age group, 2015 - 2020\nDataSource : scb.se')
ax.set_ylabel('day of year')
ax.set_xlabel('age group')

# Samples differ in length, so shorter columns are padded with NaN by the concat.
day_dist = pd.concat([day_dist_64,day_dist_65_79,day_dist_80_89,day_dist_90_plus],axis=1,ignore_index=True)
day_dist.columns = age_groups

sns.violinplot(data=day_dist,scale='count')
fig.savefig('scb_error_dist_of_deaths_days_of_year.jpg',format='jpg')

In [None]:
day_dist.describe()

In [None]:
#### truncating the data from float to int will change the total sums somewhat ####
# Monthly sums per age group. The helper column is dropped by name rather than by position:
# if 'day_of_year' were missing or not last, a positional drop would silently remove an
# age-group column instead, whereas this fails loudly.
corrected_age_grp_monthly_timeline_2 = (
    corrected_age_grp_timeline_2
    .drop(columns='day_of_year')
    .resample('M').sum()
    .astype(int)
)
# Yearly grand totals, to compare against the untruncated figures.
corrected_age_grp_monthly_timeline_2.groupby(corrected_age_grp_monthly_timeline_2.index.year).sum().sum(axis=1)

In [None]:
# Mean deaths per calendar month (1..12) across the monthly timeline, truncated to whole numbers.
month_of_death = corrected_age_grp_monthly_timeline_2.groupby(
    corrected_age_grp_monthly_timeline_2.index.month).mean().astype(int)

month_of_death

In [None]:
month_age_groups = ['-64','65-79','80-89','90+']

# For each age group, repeat every month number as many times as its whole-number average
# death count, producing one sample of month-of-death values per group.
month_dist_64, month_dist_65_79, month_dist_80_89, month_dist_90_plus = (
    pd.Series(np.repeat(month_of_death.index.values,month_of_death[group]))
    for group in month_age_groups
)

# Samples differ in length, so shorter columns are padded with NaN by the concat.
month_dist = pd.concat(
    [month_dist_64,month_dist_65_79,month_dist_80_89,month_dist_90_plus],
    axis=1,
    ignore_index=True,
)
month_dist.columns = month_age_groups

month_dist


In [None]:
# Violin plot of the month-of-death samples, one violin per age group.
fig,ax = plt.subplots(figsize=(18,12))
sns.violinplot(data=month_dist,ax=ax,scale='count')

In [None]:
# One KDE panel per age group, stacked vertically in a single figure.
fig,axes = plt.subplots(len(month_dist.columns),figsize=(18,12))

for i,c in enumerate(month_dist.columns):
    sns.kdeplot(data=month_dist[c],ls='dashdot',ax=axes[i])

In [None]:
### monthly pct OF YEARLY deaths per age grp ###
### good example of transform ###

# Each monthly value is divided by the sum of its own column within its own year.
# The [:-1] slices leave out the last monthly row before the shares are computed.
# NOTE(review): presumably that row is the incomplete current month — confirm.
pct_deaths_monthly = corrected_age_grp_monthly_timeline_2[:-1].groupby(
    corrected_age_grp_monthly_timeline_2.index[:-1].year).transform(lambda x : x / x.sum())

pct_deaths_monthly

In [None]:
pct_deaths_monthly.groupby(pct_deaths_monthly.index.year).sum()

In [None]:
#### another example of transform - yearly sum presented for each daily row ####

# Put the rows in chronological order. sort_index() replaces a self-reversal, which flipped
# the order again each time the cell was re-run.
# NOTE(review): if a different display order was intended, adjust the sort direction here.
corrected_age_grp_timeline = corrected_age_grp_timeline.sort_index()

# The string alias 'sum' is passed instead of np.sum, which recent pandas versions warn about.
corrected_age_grp_timeline.groupby(
    corrected_age_grp_timeline.index.year).transform('sum')



In [None]:
# One subplot per age group on a shared y-axis, then save the figure as a JPEG.
title='SWEDEN age grp monthly proportion of yearly deaths\nDataSource : scb.se'
pct_deaths_monthly.plot(subplots=True,figsize=(18,12),style='o--',title=title,sharey=True)

plt.savefig('scb_error_prop_monthly_deaths_age_grp_timeline.jpg',format='jpg')