<a href="https://colab.research.google.com/github/therealthaibinh/jupyter_notebooks/blob/master/Excess_mortality_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook variables
Hint: Use `Shift+Enter` to run a cell and automatically move on to the next cell

Documentation of csv: https://www.mortality.org/Public/STMF_DOC/STMFNote.pdf

Country codes: https://www.mortality.org/cgi-bin/hmd/DataAvailability.php

In [2]:
# You probably want to change just this number:
nWeekWindow = 26

# Possible age columns: 'D0_14', 'D15_64', 'D65_74','D75_84', 'D85p'
# lstKeepAges = ['D0_14', 'D15_64', 'D65_74','D75_84', 'D85p']
lstKeepAges = ['D65_74','D75_84', 'D85p']


#################################################################

# Probably stay the same

strURL_mortality = "https://www.mortality.org/Public/STMF/Outputs/stmf.csv"

lstCountriesEurope = ['AUT', 'BEL', 'BGR', 'CZE', 'DNK', 'GBRTENW', 'EST',
                      'FIN', 'FRATNP', 'DEUTNP', 'HUN', 'ISL', 'ITA', 
                      'LVA', 'LTU', 'LUX', 'NLD', 'NOR', 'POL', 'PRT',
                      'GBR_SCO', 'SVK', 'ESP', 'SWE', 'CHE']

nRefYearStart = 2015
nRefYearEnd = 2019 #inclusive

strSex = 'b'


# Don't touch
lstKeepCol = ['CountryCode', 'Year', 'Week', 'Sex'] + lstKeepAges

In [1]:
%pylab inline

import pandas as pd
from datetime import datetime


# pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

Populating the interactive namespace from numpy and matplotlib


# Load file

In [3]:
print("LOADING DATA ON " + str(datetime.now()))
# First loading is just to get its header
print(pd.read_csv(strURL_mortality, nrows=1, header=None).loc[0,0])

# Second loading is to load actual data
df_mortality = pd.read_csv(strURL_mortality, skiprows=2)
df_mortality.head()

LOADING DATA ON 2020-09-10 04:37:55.497462
#HMD STMF pooled file. Last modified: 2020-09-04 14:06:31 


Unnamed: 0,CountryCode,Year,Week,Sex,D0_14,D15_64,D65_74,D75_84,D85p,DTotal,R0_14,R15_64,R65_74,R75_84,R85p,RTotal,Split,SplitSex,Forecast
0,AUT,2000,1,m,7.0,183.0,212.0,249.0,163.0,814.0,0.00052,0.003513,0.037607,0.095138,0.231834,0.010925,0,0,0
1,AUT,2000,1,f,2.0,104.0,141.0,338.0,468.0,1053.0,0.000156,0.002002,0.019553,0.061442,0.224357,0.013238,0,0,0
2,AUT,2000,1,b,9.0,287.0,353.0,587.0,631.0,1867.0,0.000343,0.002759,0.027474,0.072305,0.226242,0.01212,0,0,0
3,AUT,2000,2,m,4.0,195.0,195.0,259.0,187.0,840.0,0.000297,0.003743,0.034591,0.098958,0.265969,0.011274,0,0,0
4,AUT,2000,2,f,6.0,109.0,126.0,312.0,509.0,1062.0,0.000469,0.002099,0.017473,0.056716,0.244012,0.013352,0,0,0


### We only care about keeping the total sexes column

In [4]:
df_mortality = df_mortality[df_mortality['Sex']==strSex]
df_mortality.head()

Unnamed: 0,CountryCode,Year,Week,Sex,D0_14,D15_64,D65_74,D75_84,D85p,DTotal,R0_14,R15_64,R65_74,R75_84,R85p,RTotal,Split,SplitSex,Forecast
2,AUT,2000,1,b,9.0,287.0,353.0,587.0,631.0,1867.0,0.000343,0.002759,0.027474,0.072305,0.226242,0.01212,0,0,0
5,AUT,2000,2,b,10.0,304.0,321.0,571.0,696.0,1902.0,0.000381,0.002922,0.024983,0.070334,0.249547,0.012347,0,0,0
8,AUT,2000,3,b,13.0,342.0,346.0,573.0,753.0,2027.0,0.000495,0.003287,0.026929,0.070581,0.269984,0.013158,0,0,0
11,AUT,2000,4,b,24.0,295.0,342.0,571.0,708.0,1940.0,0.000914,0.002836,0.026618,0.070334,0.25385,0.012593,0,0,0
14,AUT,2000,5,b,16.0,304.0,314.0,563.0,731.0,1928.0,0.000609,0.002922,0.024439,0.069349,0.262096,0.012516,0,0,0


# Explore

In [6]:
df_mortality['CountryCode'].unique()

array(['AUT', 'BEL', 'BGR', 'CHE', 'CZE', 'DEUTNP', 'DNK', 'ESP', 'EST',
       'FIN', 'FRATNP', 'GBRTENW', 'GBR_SCO', 'GRC', 'HRV', 'HUN', 'ISL',
       'ISR', 'ITA', 'LTU', 'LUX', 'LVA', 'NLD', 'NOR', 'POL', 'PRT',
       'RUS', 'SVK', 'SVN', 'SWE', 'USA'], dtype=object)

### Which countries do _not_ have all years of reference data?

In [7]:
lstFullCountryYearWeek = []
for strCountryCode in df_mortality['CountryCode'].unique():
    for nYear in range(nRefYearStart, nRefYearEnd+1):
        for nWeek in range(1,53):
            lstFullCountryYearWeek.append(strCountryCode + " " + str(nYear) + " " + str(nWeek))

print("No reference data for these country/year/week combos:")
set(lstFullCountryYearWeek)-set((df_mortality["CountryCode"] + " " + df_mortality["Year"].astype(str) + " " + df_mortality["Week"].astype(str)).values)

No reference data for these country/year/week combos:


{'DEUTNP 2015 1',
 'DEUTNP 2015 10',
 'DEUTNP 2015 11',
 'DEUTNP 2015 12',
 'DEUTNP 2015 13',
 'DEUTNP 2015 14',
 'DEUTNP 2015 15',
 'DEUTNP 2015 16',
 'DEUTNP 2015 17',
 'DEUTNP 2015 18',
 'DEUTNP 2015 19',
 'DEUTNP 2015 2',
 'DEUTNP 2015 20',
 'DEUTNP 2015 21',
 'DEUTNP 2015 22',
 'DEUTNP 2015 23',
 'DEUTNP 2015 24',
 'DEUTNP 2015 25',
 'DEUTNP 2015 26',
 'DEUTNP 2015 27',
 'DEUTNP 2015 28',
 'DEUTNP 2015 29',
 'DEUTNP 2015 3',
 'DEUTNP 2015 30',
 'DEUTNP 2015 31',
 'DEUTNP 2015 32',
 'DEUTNP 2015 33',
 'DEUTNP 2015 34',
 'DEUTNP 2015 35',
 'DEUTNP 2015 36',
 'DEUTNP 2015 37',
 'DEUTNP 2015 38',
 'DEUTNP 2015 39',
 'DEUTNP 2015 4',
 'DEUTNP 2015 40',
 'DEUTNP 2015 41',
 'DEUTNP 2015 42',
 'DEUTNP 2015 43',
 'DEUTNP 2015 44',
 'DEUTNP 2015 45',
 'DEUTNP 2015 46',
 'DEUTNP 2015 47',
 'DEUTNP 2015 48',
 'DEUTNP 2015 49',
 'DEUTNP 2015 5',
 'DEUTNP 2015 50',
 'DEUTNP 2015 51',
 'DEUTNP 2015 52',
 'DEUTNP 2015 6',
 'DEUTNP 2015 7',
 'DEUTNP 2015 8',
 'DEUTNP 2015 9',
 'GRC 2015 1',
 'GRC 

## What is the max number of weeks available for each country in 2020?

In [8]:
df_mortality[(df_mortality['Year']==2020)].sort_values(by=['CountryCode','Week']).\
        drop_duplicates(subset=['CountryCode'], keep='last')[['CountryCode','Week']].\
        sort_values(by='Week')

Unnamed: 0,CountryCode,Week
82664,SVN,13
70124,POL,26
56477,LUX,26
50066,ITA,26
36596,GRC,26
13739,CZE,28
73331,PRT,29
79505,SVK,30
31019,FRATNP,30
42773,HUN,31


# Doing stuff

## Get 2020 data (`df_mortality_2020`)

In [9]:
df_mortality_2020 = df_mortality[(df_mortality['Year']==2020)][lstKeepCol]
df_mortality_2020['SUM'] = df_mortality_2020[lstKeepAges].sum(axis='columns')
df_mortality_2020.head()

Unnamed: 0,CountryCode,Year,Week,Sex,D65_74,D75_84,D85p,SUM
3122,AUT,2020,1,b,221.0,481.0,687.0,1389.0
3125,AUT,2020,2,b,261.0,490.0,712.0,1463.0
3128,AUT,2020,3,b,272.0,537.0,753.0,1562.0
3131,AUT,2020,4,b,256.0,515.0,736.0,1507.0
3134,AUT,2020,5,b,277.0,591.0,844.0,1712.0


## Generate reference data (`df_mortality_reference`)
For each week, get the average value over the years that were defined above

In [10]:
df_mortality_reference = df_mortality[(df_mortality['Year']>=nRefYearStart) &\
                                       (df_mortality['Year']<(nRefYearEnd+1))]
df_mortality_reference['SUM'] = df_mortality_reference[lstKeepAges].sum(axis='columns')

df_mortality_reference = pd.DataFrame(df_mortality_reference[lstKeepCol+['SUM']].groupby(by=['CountryCode','Week'])['SUM'].mean()).\
                            reset_index().\
                            rename(columns={'SUM':'REF'})
df_mortality_reference.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,CountryCode,Week,REF
0,AUT,1,1570.2
1,AUT,2,1615.6
2,AUT,3,1539.8
3,AUT,4,1512.0
4,AUT,5,1568.0


## Jam the dataframes together

In [11]:
df_mortality_join = df_mortality_2020[lstKeepCol+['SUM']].merge(df_mortality_reference, on=['CountryCode','Week'])
df_mortality_join['DIFF'] = df_mortality_join['SUM']-df_mortality_join['REF']
df_mortality_join['EXCESS_ONLY'] = df_mortality_join['DIFF'].clip(0, None)

df_mortality_join.head()

Unnamed: 0,CountryCode,Year,Week,Sex,D65_74,D75_84,D85p,SUM,REF,DIFF,EXCESS_ONLY
0,AUT,2020,1,b,221.0,481.0,687.0,1389.0,1570.2,-181.2,0.0
1,AUT,2020,2,b,261.0,490.0,712.0,1463.0,1615.6,-152.6,0.0
2,AUT,2020,3,b,272.0,537.0,753.0,1562.0,1539.8,22.2,22.2
3,AUT,2020,4,b,256.0,515.0,736.0,1507.0,1512.0,-5.0,0.0
4,AUT,2020,5,b,277.0,591.0,844.0,1712.0,1568.0,144.0,144.0


## Create excess mortality up to week defined above

In [12]:
df_excess_mortality = pd.DataFrame(df_mortality_join[df_mortality_join['Week']<=nWeekWindow].groupby(by=['CountryCode'])[['REF','EXCESS_ONLY']].sum()).reset_index()
df_excess_mortality.head()

Unnamed: 0,CountryCode,REF,EXCESS_ONLY
0,AUT,35890.0,1511.8
1,BEL,48622.6,8913.2
2,BGR,44810.2,169.0
3,CHE,30119.2,1969.2
4,CZE,47066.0,1165.0


# Final comparisons

In [13]:
print("Europe:")
s_temp = df_excess_mortality[df_excess_mortality['CountryCode'].isin(lstCountriesEurope)][['REF','EXCESS_ONLY']].sum()
print("\tReference deaths:", int(s_temp['REF']))
print("\tExcess deaths:", int(s_temp['EXCESS_ONLY']))
print("\tPercent increase:", round(s_temp['EXCESS_ONLY']*100.0/s_temp['REF'], 2))

print()

print("United States:")
s_temp = df_excess_mortality[df_excess_mortality['CountryCode'].isin(['USA'])][['REF','EXCESS_ONLY']].sum()
print("\tReference deaths:", int(s_temp['REF']))
print("\tExcess deaths:", int(s_temp['EXCESS_ONLY']))
print("\tPercent increase:", round(s_temp['EXCESS_ONLY']*100.0/s_temp['REF'], 2))

Europe:
	Reference deaths: 2099756
	Excess deaths: 239605
	Percent increase: 11.41

United States:
	Reference deaths: 1055778
	Excess deaths: 151067
	Percent increase: 14.31


In [14]:
# Run this as a reminder of what your variables were
print(lstKeepAges)
print(nWeekWindow)

['D65_74', 'D75_84', 'D85p']
26


# Further exploration

### All individual countries

In [15]:
for strCountryCode in df_excess_mortality['CountryCode'].unique():
    print(strCountryCode)
    s_temp = df_excess_mortality[df_excess_mortality['CountryCode'].isin([strCountryCode])][['REF','EXCESS_ONLY']].sum()
    print("\tReference deaths:", int(s_temp['REF']))
    print("\tExcess deaths:", int(s_temp['EXCESS_ONLY']))
    print("\tPercent increase:", round(s_temp['EXCESS_ONLY']*100.0/s_temp['REF'], 2))
    print()

AUT
	Reference deaths: 35890
	Excess deaths: 1511
	Percent increase: 4.21

BEL
	Reference deaths: 48622
	Excess deaths: 8913
	Percent increase: 18.33

BGR
	Reference deaths: 44810
	Excess deaths: 169
	Percent increase: 0.38

CHE
	Reference deaths: 30119
	Excess deaths: 1969
	Percent increase: 6.54

CZE
	Reference deaths: 47065
	Excess deaths: 1164
	Percent increase: 2.48

DEUTNP
	Reference deaths: 412859
	Excess deaths: 12094
	Percent increase: 2.93

DNK
	Reference deaths: 23442
	Excess deaths: 737
	Percent increase: 3.15

ESP
	Reference deaths: 191100
	Excess deaths: 45724
	Percent increase: 23.93

EST
	Reference deaths: 6432
	Excess deaths: 189
	Percent increase: 2.94

FIN
	Reference deaths: 23413
	Excess deaths: 1126
	Percent increase: 4.81

FRATNP
	Reference deaths: 254874
	Excess deaths: 29894
	Percent increase: 11.73

GBRTENW
	Reference deaths: 239582
	Excess deaths: 53995
	Percent increase: 22.54

GBR_SCO
	Reference deaths: 24578
	Excess deaths: 4421
	Percent increase: 17.99

GR

In [16]:
df_mortality_join[df_mortality_join['CountryCode']=='USA']

Unnamed: 0,CountryCode,Year,Week,Sex,D65_74,D75_84,D85p,SUM,REF,DIFF,EXCESS_ONLY
863,USA,2020,1,b,11575.401018,14356.142857,18673.456124,44605.0,44672.4,-67.4,0.0
864,USA,2020,2,b,11767.178204,14593.990398,18982.831398,45344.0,45753.4,-409.4,0.0
865,USA,2020,3,b,11453.950785,14205.516809,18477.532406,44137.0,44975.0,-838.0,0.0
866,USA,2020,4,b,11388.035487,14123.766775,18371.197738,43883.0,44062.8,-179.8,0.0
867,USA,2020,5,b,11360.787037,14089.972469,18327.240494,43778.0,43444.0,334.0,334.0
868,USA,2020,6,b,11442.791896,14191.677237,18459.530868,44094.0,43440.4,653.6,653.6
869,USA,2020,7,b,11369.610345,14100.915387,18341.474268,43812.0,43155.4,656.6,656.6
870,USA,2020,8,b,11375.579053,14108.31795,18351.102998,43835.0,42628.8,1206.2,1206.2
871,USA,2020,9,b,11424.626262,14169.1477,18430.226038,44024.0,42137.0,1887.0,1887.0
872,USA,2020,10,b,11482.49678,14240.920368,18523.582853,44247.0,42300.6,1946.4,1946.4
