In [None]:
import os
import warnings
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

# Switch on seaborn's default theme for the figures drawn in the cells below.
sns.set()



In [None]:
# Source archive of the mortality data. The year is part of the file name,
# so this url must be MANUALLY changed for other year data.
url = 'http://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/DVS/mortality/mort2012us.zip'

# Probe the URL without downloading the body. With stream=True the connection
# is only released once the response is closed, hence the context manager.
# The timeout keeps an unreachable server from hanging the cell forever.
with requests.get(url, stream=True, timeout=30) as r:
    status_code = r.status_code

status_code

In [None]:
# Local file name used for the downloaded zip archive.
zip_file_name = 'us_mort_data.zip'

In [None]:

def download_url(url, save_path, chunk_size=128, timeout=60):
    """Stream the resource at ``url`` into the local file ``save_path``.

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET request.
    save_path : str or path-like
        Destination file; opened in binary write mode (created or overwritten).
    chunk_size : int, optional
        Number of bytes requested per iteration while streaming the body.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so an unresponsive server cannot block
        the call indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. The check happens before
        the destination file is opened, so an error page is never stored as
        if it were the requested data.
    """
    # The context manager closes the streamed response (releasing the
    # connection) even when writing fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()

        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # skip empty keep-alive chunks
                    fd.write(chunk)


In [None]:
# this downloads the zip-file to local directory. 

#download_url(url,zip_file_name)

In [None]:
# Name of the [only!] file in the zip archive. Reading it from the archive is
# currently disabled (commented out below), so the name is hard-coded and the
# data file is expected to already be present in the working directory.

#zf = zipfile.ZipFile('us_mort_data.zip', 'r')
#unzipped_file_name = zf.namelist()[0]
unzipped_file_name = 'VS12MORT.DUSMCPUB'
print (unzipped_file_name)

In [None]:
#zf.extract(unzipped_file_name)

In [None]:
#os.remove(zip_file_name)

In [None]:
# Lookup tables that translate the coded field values found in the raw
# records into readable labels. Keys are the codes exactly as they appear in
# the file (including a single blank where the field is left empty).

# Two-digit age-group code -> age range label.
age_code_map = {
    '01': '-1',
    '02': '1-4',
    '03': '5-14',
    '04': '15-24',
    '05': '25-34',
    '06': '35-44',
    '07': '45-54',
    '08': '55-64',
    '09': '65-74',
    '10': '75-84',
    '11': '85+',
    '12': 'unspec',
}

# One-character place-of-death code -> label.
place_of_death_map = {
    '1': 'Hosp.InPatient',
    '2': 'Hosp.OutPatient',
    '3': 'Hosp.DeadArr',
    '4': 'Home',
    '5': 'Hospice',
    '6': 'NursingHome',
    '7': 'Other',
    '9': 'Unknown',
}

# One-character manner-of-death code -> label (blank = not specified).
cause_map = {
    '1': 'Accident',
    '2': 'Suicide',
    '3': 'Homicide',
    '4': 'Pending',
    '5': 'Undecided',
    '6': 'SelfInflected',
    '7': 'Natural',
    ' ': 'NotSpecified',
}

# One-character activity code -> label (blank = not applicable).
activity_map = {
    '0': 'Sport',
    '1': 'Leasure',
    '2': 'Job',
    '3': 'Oth.Work',
    '4': 'Vital.Activities',
    '8': 'Oth.Activities',
    '9': 'Unspec.',
    ' ': 'N/A',
}

# Two-digit race code -> label. Codes '68' and '78' share the same label.
race_map = {
    '01': 'White',
    '02': 'Black',
    '03': 'Am.Indian',
    '04': 'Chinese',
    '05': 'Japanese',
    '06': 'Hawaiian',
    '07': 'Filipino',
    '18': 'Asian.Indian',
    '28': 'Korean',
    '38': 'Samoan',
    '48': 'Vietnamese',
    '58': 'Guamanian',
    '68': 'Other.Asian',
    '78': 'Other.Asian',
}

# Number of bytes read per record of the fixed-width data file.
# NOTE(review): presumably the record length including the line terminator --
# confirm against the file layout documentation when switching to another year.
buf_size = 492


def _parse_record(data):
    """Decode the fields of interest from one raw fixed-width record.

    ``data`` is a ``bytes`` object of ``buf_size`` bytes. Coded fields are
    translated through the lookup tables defined above; an unknown code
    raises ``KeyError``.
    """
    return {
        'month_of_death': data[64:66].decode('utf-8'),
        'sex': chr(data[68]),
        'age': age_code_map[data[78:80].decode('utf-8')],
        'place_of_death': place_of_death_map[chr(data[82])],
        'marital_status': chr(data[83]),
        'injury_at_work': chr(data[105]),
        'manner_of_death': cause_map[chr(data[106])],
        'activity_at_death': activity_map[chr(data[143])],
        'ICD-code': data[145:149].decode('utf-8'),
        'race': race_map[data[444:446].decode('utf-8')],
    }


dict_list = []

with open(unzipped_file_name, 'rb') as data_file:

    # iter() with a sentinel keeps calling read() until it returns b'' at EOF.
    for data in iter(lambda: data_file.read(buf_size), b''):

        if len(data) != buf_size:
            # A short read can only be the tail of the file. It is not a
            # complete record, so it is skipped -- but say so instead of
            # dropping it silently, since it may point at a wrong buf_size.
            warnings.warn(
                'Ignoring {} trailing byte(s) in {!r}: shorter than the '
                'record size of {} bytes.'.format(
                    len(data), unzipped_file_name, buf_size))
            break

        dict_list.append(_parse_record(data))


# One row per record, one column per decoded field.
df = pd.DataFrame(dict_list)


In [None]:
#os.remove(unzipped_file_name)

In [None]:
# Store the age group as a categorical column. The categories are listed from
# youngest to oldest so that grouped tables and plots follow that order
# instead of sorting the labels alphabetically.
df['age'] = pd.Categorical(
    df['age'],
    categories=[
        '-1', '1-4', '5-14', '15-24', '25-34', '35-44',
        '45-54', '55-64', '65-74', '75-84', '85+', 'unspec',
    ],
)

df.head()

In [None]:
# Number of records per age group.
# - The column is selected before counting so only that column is aggregated.
# - 'age' is categorical; observed=False states explicitly that age groups
#   without any record are kept (with a count of 0) rather than relying on
#   the pandas default for categorical groupers.
age_counts = df.groupby('age', observed=False)['month_of_death'].count()
age_counts

In [None]:
# Bar chart of the absolute number of deaths per age group. The year in the
# title matches the data file loaded above (mort2012us.zip).
age_counts.plot(kind='bar', title='US 2012 deaths per age grp')
plt.ylabel('nr of deaths')

In [None]:

# Share of all deaths that falls into each age group (relative frequency).
# The year in the title matches the data file loaded above (mort2012us.zip).
age_death_dist = age_counts / age_counts.sum()
age_death_dist.plot(kind='bar', title='US 2012 age grp deaths distribution')
plt.ylabel('rel.Freq')
age_death_dist

In [None]:
# Cumulative share of deaths up to and including each age group. The running
# sum is computed once and reused for the line, the tick positions and the
# tick labels. The year in the title matches the data file loaded above
# (mort2012us.zip).
cumulative_dist = age_death_dist.cumsum()
cumulative_dist.plot(title='US 2012 cumulative deaths distribution', style='o--')
plt.ylabel('cumulative rel.Freq')
# Label every point with its age group; assigning to _ hides the return value.
_ = plt.xticks(range(len(cumulative_dist)),
               cumulative_dist.index, rotation=90)

In [None]:
# Show the first rows again as a reminder of the available columns.
df.head()

In [None]:
# Fraction of all records per manner of death.
df.groupby('manner_of_death')['age'].count().div(df['age'].count())

In [None]:
# Number of records per place of death.
df.groupby('place_of_death')['age'].count()

In [None]:
# Number of records per activity at the time of death.
df.groupby('activity_at_death')['age'].count()

In [None]:
# Number of records per ICD code, most frequent code first.
(
    df.groupby('ICD-code')['age']
    .count()
    .sort_values(ascending=False)
)