In [85]:
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import seaborn as sns 
sns.set()

Query Criteria:\
Title:	US Cause of death 2008 and 2018 in medical facilities\
Census Regions:	US Northeast, Midwest, South, West\
Place of Death:
                
                Medical Facility - Inpatient
                Medical Facility - Outpatient or ER 
                Medical Facility - Dead on Arrival 
                Medical Facility - Status unknown

Year/Month:	2008 and 2018\
Group By:	Race; Cause of death; Ten-Year Age Groups; Gender\
Calculate Rates Per:	100,000\
Basic search criteria example:\
https://wonder.cdc.gov/controller/saved/D76/D82F748


Centers for Disease Control and Prevention, National Center for Health Statistics. Underlying Cause of Death 1999-2018 on CDC WONDER Online Database, released in 2020. Data are from the Multiple Cause of Death Files, 1999-2018, as compiled from data provided by the 57 vital statistics jurisdictions through the Vital Statistics Cooperative Program. Accessed at http://wonder.cdc.gov/ucd-icd10.html on May 1, 2020 5:00:25 PM


In [86]:
# Local uploads will be converted to one combined file located
# in a remote repository after adding year and region columns to each.

dfne2008 = pd.read_csv('https://raw.githubusercontent.com/robholmstrom/Sams_github/master/Capstone_1/COD_csv_files/2008_Cause_of_death_Northeast.csv')
dfne2018 = pd.read_csv('https://raw.githubusercontent.com/robholmstrom/Sams_github/master/Capstone_1/COD_csv_files/2018_Cause_of_death_Northeast.csv')

dfmw2008 = pd.read_csv('https://raw.githubusercontent.com/robholmstrom/Sams_github/master/Capstone_1/COD_csv_files/2008_Cause_of_death_midwest.csv')
dfmw2018 = pd.read_csv('/Users/robholmstrom/Gitrepos/Sams_github/Capstone_1/COD_csv_files/2018_Cause_of_death_Midwest.csv')

dfs2008 = pd.read_csv('/Users/robholmstrom/Gitrepos/Sams_github/Capstone_1/COD_csv_files/2008_Cause_of_death_South.csv')
dfs2018 = pd.read_csv('/Users/robholmstrom/Gitrepos/Sams_github/Capstone_1/COD_csv_files/2018_Cause_of_death_South.csv')

dfw2008 = pd.read_csv('/Users/robholmstrom/Gitrepos/Sams_github/Capstone_1/COD_csv_files/2008_Cause_of_death_West.csv')
dfw2018 = pd.read_csv('/Users/robholmstrom/Gitrepos/Sams_github/Capstone_1/COD_csv_files/2018_Cause_of_death_West.csv')




In [65]:
# Add US region column and year to all eight datasets before combining into one dataset.

dfne2008['year'] = '2008'; dfne2008['US_region'] = 'northeast'
dfne2018['year'] = '2018'; dfne2018['US_region'] = 'northeast'

dfmw2008['year'] = '2008'; dfmw2008['US_region'] = 'midwest'
dfmw2018['year'] = '2018'; dfmw2018['US_region'] = 'midwest'

dfs2008['year'] = '2008'; dfs2008['US_region'] = 'south'
dfs2018['year'] = '2018'; dfs2018['US_region'] = 'south'

dfw2008['year'] = '2008'; dfw2008['US_region'] = 'west'
dfw2018['year'] = '2018'; dfw2018['US_region'] = 'west'


In [87]:
# Stack tables
dfcod = pd.concat([dfne2008, dfne2018, dfmw2008, dfmw2018, dfs2008, dfs2018, dfw2008, dfw2018])

# Get rid of unnecessary columns that were adding null values or just noise

dfcod.drop(['Notes', 'Population', 'Race Code', 'Ten-Year Age Groups', 'Crude Rate'], axis = 1, inplace = True)
dfcod.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 3894 entries, 0 to 96
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Race                      3489 non-null   object 
 1   Cause of death            3489 non-null   object 
 2   Cause of death Code       3489 non-null   object 
 3   Ten-Year Age Groups Code  3489 non-null   object 
 4   Gender                    3489 non-null   object 
 5   Gender Code               3489 non-null   object 
 6   Deaths                    3489 non-null   float64
dtypes: float64(1), object(6)
memory usage: 243.4+ KB


In [81]:
dfcod.head()

Unnamed: 0,Race,Cause of death,Cause of death Code,Ten-Year Age Groups Code,Gender,Gender Code,Deaths,year,US_region
0,Asian or Pacific Islander,"Bronchus or lung, unspecified - Malignant neop...",C34.9,65-74,Male,M,10.0,2008,northeast
1,Asian or Pacific Islander,Atherosclerotic heart disease,I25.1,65-74,Male,M,10.0,2008,northeast
2,Asian or Pacific Islander,Atherosclerotic heart disease,I25.1,85+,Female,F,11.0,2008,northeast
3,Black or African American,Enterocolitis due to Clostridium difficile,A04.7,85+,Female,F,10.0,2008,northeast
4,Black or African American,"Septicaemia, unspecified",A41.9,45-54,Female,F,15.0,2008,northeast
