In [1]:
# Exercise 2
# Pick a year and for each gender and age group find out the top 5 causes of death. 
# Your outcome should be a CSV or JSON file which is human readable.

In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
# Working assumption: every row of the file is exactly one death.
# The 39-cause recode is used because it is more descriptive than manner of death.
# Only the three columns needed for the ranking are read from the file.
deaths_2015 = pd.read_csv(
    'mortality/2015_data.csv',
    usecols=["sex", "age_recode_12", "39_cause_recode"],
)

In [4]:
# TODO: decide whether any data cleaning is needed (e.g. missing or unmapped codes) before the analysis below.

In [5]:
# Read the JSON file that holds the code names for the 2015 data
with open('mortality/2015_codes.json') as codes_file:
    codes_2015_json = json.load(codes_file)

# Turn the parsed JSON into a dataframe so individual code tables can be
# selected by column name (presumably one column per coded field, indexed by
# the code keys — verify against the JSON file)
codes_2015 = pd.DataFrame(codes_2015_json)

In [6]:
# Lookup table for causes of death: 'index' holds the code as an integer,
# '39_cause_recode' holds the human-readable name.
# dropna() discards index entries that have no value in this column.
causes = (
    codes_2015[['39_cause_recode']]
    .dropna()
    .reset_index()
    .astype({'index': int})
)

In [7]:
# Lookup table for age groups: 'index' holds the code as an integer,
# 'age_recode_12' holds the human-readable name.
# dropna() discards index entries that have no value in this column.
ages = (
    codes_2015[['age_recode_12']]
    .dropna()
    .reset_index()
    .astype({'index': int})
)

In [8]:
# Replace the coded cause and age-group columns of every death record with
# their readable names.
# Both sides of each merge carry a column with the same name, so pandas adds
# its default suffixes: '_x' is the column from the death records, '_y' is the
# readable name from the lookup table. The '_x' column and the lookup's
# 'index' key are redundant after the merge and are dropped.

# Step 1: cause of death
deaths_2015_merged = deaths_2015.merge(
    causes,
    how='left',
    left_on='39_cause_recode',
    right_on='index',
)
deaths_2015_merged = deaths_2015_merged.drop(columns=['39_cause_recode_x', 'index'])

# Step 2: age group
deaths_2015_merged = deaths_2015_merged.merge(
    ages,
    how='left',
    left_on='age_recode_12',
    right_on='index',
)
deaths_2015_merged = deaths_2015_merged.drop(columns=['age_recode_12_x', 'index'])


In [38]:
# Count deaths per (sex, cause) and per (age group, cause), then sort so that
# within each sex / age group the most frequent causes come first.
#
# size() counts rows. The previous count() only counted non-null values of the
# one remaining column, so a death whose other label was missing (NaN after the
# left merges) was silently left out of the ranking.
#
# The count columns deliberately keep the names that count() used to produce,
# because later cells refer to them:
#   deaths_by_sex: the number of deaths is in the column 'age_recode_12_y'
#   deaths_by_age: the number of deaths is in the column 'sex'

deaths_by_sex = (
    deaths_2015_merged
    .groupby(['sex', '39_cause_recode_y'])
    .size()
    .reset_index(name='age_recode_12_y')
    .sort_values(['sex', 'age_recode_12_y'], ascending=False)
)
deaths_by_age = (
    deaths_2015_merged
    .groupby(['age_recode_12_y', '39_cause_recode_y'])
    .size()
    .reset_index(name='sex')
    .sort_values(['age_recode_12_y', 'sex'], ascending=False)
)


In [50]:
def _top_causes(ranked, group_col, count_col, group_value, variable, n=5):
    """Return the first ``n`` causes of death for one group, formatted for output.

    Parameters
    ----------
    ranked : DataFrame
        Grouped death counts (deaths_by_sex or deaths_by_age). Must already be
        sorted so that, within each group, the most frequent causes come first;
        the top causes are taken with head(n).
    group_col : str
        Column of ``ranked`` that identifies the group (sex or age group).
    count_col : str
        Column of ``ranked`` that holds the death count; it is dropped from the
        result because only the rank is reported.
    group_value :
        The group to select, e.g. 'M' or an age-group name.
    variable : str
        Label written to the 'Variable' column ('Sex' or 'Age group').
    n : int, default 5
        Number of causes to keep.

    Returns
    -------
    DataFrame
        Columns 'Value', 'Top 5 Death Causes', 'Variable', 'Rank' (rank 1 is
        the most frequent cause). Has fewer than ``n`` rows, possibly none,
        when the group has fewer causes.
    """
    top = (
        ranked[ranked[group_col] == group_value]
        .head(n)
        .drop(columns=[count_col])
        # Rename by name rather than by position so column order cannot mix things up
        .rename(columns={group_col: 'Value', '39_cause_recode_y': 'Top 5 Death Causes'})
        .assign(Variable=variable)
    )
    top['Rank'] = range(1, len(top) + 1)
    return top[['Value', 'Top 5 Death Causes', 'Variable', 'Rank']]


# Get top 5 causes of death for men and for women
# (the death count lives in the 'age_recode_12_y' column of deaths_by_sex)
m_deaths = _top_causes(deaths_by_sex, 'sex', 'age_recode_12_y', 'M', 'Sex')
f_deaths = _top_causes(deaths_by_sex, 'sex', 'age_recode_12_y', 'F', 'Sex')

# Get top 5 causes of death for all age groups
# (the death count lives in the 'sex' column of deaths_by_age)
age_group_deaths = [
    _top_causes(deaths_by_age, 'age_recode_12_y', 'sex', age_name, 'Age group')
    for age_name in ages['age_recode_12']
]

# Concatenate once instead of growing the dataframe inside a loop
output = pd.concat([m_deaths, f_deaths] + age_group_deaths)

output = output.set_index(['Variable', 'Value', 'Top 5 Death Causes'])
output.to_csv('exercise_2_output.csv')