In [1]:
import pandas as pd
import numpy as np

### About the Dataset
This dataset presents the number of deaths and the age-adjusted death rates for the 10 leading causes of death, as well as for all causes of death combined, in the United States and by state for 1999-2017.

In [2]:
# Read the CDC cause-of-death workbook; the path is relative to the working directory.
cdc_original = pd.read_excel(io="Data/cdc_cause_of_death.xlsx")

In [3]:
# Preview the first five rows of the raw data.
cdc_original.head(n=5)

Unnamed: 0,Year,113 Cause Name,Cause Name,State,Deaths,Age-adjusted Death Rate
0,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,United States,169936,49.4
1,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Alabama,2703,53.8
2,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Alaska,436,63.7
3,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Arizona,4184,56.2
4,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Arkansas,1625,51.8


In [4]:
# Display the column labels available in the raw dataset.
cdc_original.columns

Index(['Year', '113 Cause Name', 'Cause Name', 'State', 'Deaths',
       'Age-adjusted Death Rate'],
      dtype='object')

In [5]:
# Keep only the columns required for the analysis; '113 Cause Name' and
# 'Age-adjusted Death Rate' are left out.
cdc_data = cdc_original.loc[:, ['Year', 'Cause Name', 'State', 'Deaths']]
cdc_data.head(n=5)

Unnamed: 0,Year,Cause Name,State,Deaths
0,2017,Unintentional injuries,United States,169936
1,2017,Unintentional injuries,Alabama,2703
2,2017,Unintentional injuries,Alaska,436
3,2017,Unintentional injuries,Arizona,4184
4,2017,Unintentional injuries,Arkansas,1625


In [6]:
# Give the cause column an underscore name so it can also be reached
# through attribute access (e.g. cdc_data.Cause_Name).
cdc_data = cdc_data.rename({'Cause Name': 'Cause_Name'}, axis='columns')

In [7]:
# Collect each distinct cause label that appears in the dataset and show it.
cause_name_ls = cdc_data.Cause_Name.unique()
cause_name_ls

array(['Unintentional injuries', 'All causes', "Alzheimer's disease",
       'Stroke', 'CLRD', 'Diabetes', 'Heart disease',
       'Influenza and pneumonia', 'Suicide', 'Cancer', 'Kidney disease'],
      dtype=object)

### Explore the Total Number of Deaths by Each Disease, Year over Year

In [8]:
# Build a frame with the national total for each cause in each year.
# Those totals are the rows whose State value is 'United States'.
# reset_index(drop=True) renumbers the rows from 0 and discards the old
# index in a single step, so no helper 'index' column has to be dropped
# in place afterwards.
disease_totals = (
    cdc_data.loc[cdc_data['State'] == 'United States']
    .reset_index(drop=True)
)

disease_totals.head()

Unnamed: 0,Year,Cause_Name,State,Deaths
0,2017,Unintentional injuries,United States,169936
1,2017,All causes,United States,2813503
2,2017,Alzheimer's disease,United States,121404
3,2017,Stroke,United States,146383
4,2017,CLRD,United States,160201


In [9]:
# The yearly total across every cause is stored under the Cause_Name value 'All causes'.
# Exclude those rows, since the goal is to compare the individual causes with each other.
# The existing row labels are kept (the index is not reset here).
disease_totals = disease_totals.loc[disease_totals['Cause_Name'].ne('All causes')]
disease_totals.head(n=5)

Unnamed: 0,Year,Cause_Name,State,Deaths
0,2017,Unintentional injuries,United States,169936
2,2017,Alzheimer's disease,United States,121404
3,2017,Stroke,United States,146383
4,2017,CLRD,United States,160201
5,2017,Diabetes,United States,83564


### Explore data for Chronic Diseases
In this section I reduce my dataset to include only chronic diseases whose outcomes may be affected by access to proper health care. The diseases considered are diabetes, heart disease, and stroke.

In [10]:
# Restrict the data to the chronic diseases under consideration.
chronics = ['Stroke', 'Diabetes', 'Heart disease']

chronic_data = cdc_data.loc[cdc_data['Cause_Name'].isin(chronics)]

In [11]:
# Preview the first five rows of the chronic-disease subset.
chronic_data.head(n=5)

Unnamed: 0,Year,Cause_Name,State,Deaths
156,2017,Stroke,United States,146383
157,2017,Stroke,Alabama,2931
158,2017,Stroke,Alaska,190
159,2017,Stroke,Arizona,2681
160,2017,Stroke,Arkansas,1612


In [12]:
# Remove the 'United States' rows, which hold the country-wide total for each disease.
chronic_data = chronic_data.loc[chronic_data['State'].ne('United States')]

chronic_data.head(n=5)

Unnamed: 0,Year,Cause_Name,State,Deaths
157,2017,Stroke,Alabama,2931
158,2017,Stroke,Alaska,190
159,2017,Stroke,Arizona,2681
160,2017,Stroke,Arkansas,1612
161,2017,Stroke,California,16355


In [19]:
# Build a separate frame holding only the 2013 and 2016 records.
# NOTE(review): the reason for choosing these two years is not stated here - confirm.
years = [2013, 2016]
chronic_pp = chronic_data.loc[chronic_data['Year'].isin(years)]
chronic_pp

Unnamed: 0,Year,Cause_Name,State,Deaths
728,2016,Stroke,Alabama,2967
729,2016,Stroke,Alaska,196
730,2016,Stroke,Arizona,2556
731,2016,Stroke,Arkansas,1643
732,2016,Stroke,California,15680
...,...,...,...,...
2647,2013,Heart disease,Virginia,13663
2648,2013,Heart disease,Washington,10524
2649,2013,Heart disease,West Virginia,4666
2650,2013,Heart disease,Wisconsin,11362
