In [85]:
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import seaborn as sns 
sns.set()

This research proposal will investigate causes of death in US medical facilities and how they changed between 2008 and 2018. The study will also include race, age, and death statistics by US census region. With major advances in many areas of healthcare over the last decade, the purpose of the study is to evaluate whether this data can improve decisions related to region-specific future hospital center expansions.

Query Criteria:\
Title:	US Cause of death 2008 and 2018 in medical facilities\
Census Regions:	US Northeast, Midwest, South, West\
Place of Death:
                
                Medical Facility - Inpatient
                Medical Facility - Outpatient or ER 
                Medical Facility - Dead on Arrival 
                Medical Facility - Status unknown

Year/Month:	2008 and 2018\
Group By:	Race; Cause of death; Ten-Year Age Groups; Gender\
Calculate Rates Per:	100,000\
Basic search criteria example:\
https://wonder.cdc.gov/controller/saved/D76/D82F819

Also a good resource to understand death rate calculations:
https://www.stats.indiana.edu/vitals/CalculatingARate.pdf


![image.png](attachment:image.png)

Centers for Disease Control and Prevention, National Center for Health Statistics. Underlying Cause of Death 1999-2018 on CDC WONDER Online Database, released in 2020. Data are from the Multiple Cause of Death Files, 1999-2018, as compiled from data provided by the 57 vital statistics jurisdictions through the Vital Statistics Cooperative Program. Accessed at http://wonder.cdc.gov/ucd-icd10.html on May 1, 2020 5:00:25 PM


In [325]:
# The data had to be pulled with multiple queries because of limits; the
# resulting local downloads were then moved to a remote repository, which is
# where the CSV is read from here.
cod_csv_url = 'https://raw.githubusercontent.com/robholmstrom/Sams_github/master/Capstone_1/COD_csv_files/US_2008-2018_COD_chapters.csv'
dfcod = pd.read_csv(cod_csv_url)

# Column dtypes / non-null counts, followed by a preview of the first 10 rows.
dfcod.info()
dfcod.head(10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7126 entries, 0 to 7125
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Notes                       58 non-null     object 
 1   ICD-10 113 Cause List       7068 non-null   object 
 2   ICD-10 113 Cause List Code  7068 non-null   object 
 3   Ten-Year Age Groups         7068 non-null   object 
 4   Ten-Year Age Groups Code    7068 non-null   object 
 5   Gender                      7068 non-null   object 
 6   Gender Code                 7068 non-null   object 
 7   Census Region               7068 non-null   object 
 8   Census Region Code          7068 non-null   object 
 9   Deaths                      7068 non-null   float64
 10  Population                  7068 non-null   object 
 11  Crude Rate                  7068 non-null   object 
dtypes: float64(1), object(11)
memory usage: 668.2+ KB


Unnamed: 0,Notes,ICD-10 113 Cause List,ICD-10 113 Cause List Code,Ten-Year Age Groups,Ten-Year Age Groups Code,Gender,Gender Code,Census Region,Census Region Code,Deaths,Population,Crude Rate
0,,#Salmonella infections (A01-A02),GR113-001,55-64 years,55-64,Male,M,Census Region 3: South,CENS-R3,10.0,Not Applicable,Not Applicable
1,,#Salmonella infections (A01-A02),GR113-001,65-74 years,65-74,Female,F,Census Region 3: South,CENS-R3,10.0,Not Applicable,Not Applicable
2,,#Salmonella infections (A01-A02),GR113-001,65-74 years,65-74,Male,M,Census Region 3: South,CENS-R3,10.0,Not Applicable,Not Applicable
3,,#Salmonella infections (A01-A02),GR113-001,75-84 years,75-84,Female,F,Census Region 3: South,CENS-R3,12.0,Not Applicable,Not Applicable
4,,#Salmonella infections (A01-A02),GR113-001,75-84 years,75-84,Male,M,Census Region 3: South,CENS-R3,14.0,Not Applicable,Not Applicable
5,,"Certain other intestinal infections (A04,A07-A09)",GR113-003,< 1 year,1,Female,F,Census Region 1: Northeast,CENS-R1,82.0,Not Applicable,Not Applicable
6,,"Certain other intestinal infections (A04,A07-A09)",GR113-003,< 1 year,1,Female,F,Census Region 2: Midwest,CENS-R2,100.0,Not Applicable,Not Applicable
7,,"Certain other intestinal infections (A04,A07-A09)",GR113-003,< 1 year,1,Female,F,Census Region 3: South,CENS-R3,222.0,Not Applicable,Not Applicable
8,,"Certain other intestinal infections (A04,A07-A09)",GR113-003,< 1 year,1,Female,F,Census Region 4: West,CENS-R4,85.0,Not Applicable,Not Applicable
9,,"Certain other intestinal infections (A04,A07-A09)",GR113-003,< 1 year,1,Male,M,Census Region 1: Northeast,CENS-R1,93.0,Not Applicable,Not Applicable


In [326]:


# Clean up dfcod in three steps:
#   1. drop columns that were adding null values or just noise,
#   2. drop any rows that still contain a null value,
#   3. rename some columns for clarity.
#
# The steps are chained into a single reassignment and the drop uses
# errors='ignore', so re-running this cell on an already-cleaned frame is a
# no-op instead of raising KeyError for columns that are no longer present.
# (rename already ignores labels that are missing.)
#
# NOTE(review): 'Deaths' appears to be a raw count in the export (the rate
# column 'Crude Rate' is dropped here) -- confirm that the 'per 100,000'
# label is really what is intended. The name is kept unchanged because later
# cells may depend on it.

dfcod = (
    dfcod
    .drop(columns=['Notes', 'Population', 'Ten-Year Age Groups', 'Crude Rate'],
          errors='ignore')
    .dropna()
    .rename(columns={'Deaths': 'Deaths per 100,000',
                     'Ten-Year Age Groups Code': 'Age range',
                     'ICD-10 113 Cause List': 'Cause of death'})
)

dfcod.info() # Ensure that all columns do not contain null values



<class 'pandas.core.frame.DataFrame'>
Int64Index: 7068 entries, 0 to 7067
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Cause of death              7068 non-null   object 
 1   ICD-10 113 Cause List Code  7068 non-null   object 
 2   Age range                   7068 non-null   object 
 3   Gender                      7068 non-null   object 
 4   Gender Code                 7068 non-null   object 
 5   Census Region               7068 non-null   object 
 6   Census Region Code          7068 non-null   object 
 7   Deaths per 100,000          7068 non-null   float64
dtypes: float64(1), object(7)
memory usage: 497.0+ KB


In [327]:
# Preview of the table without the total population data -- more clean-up will happen later on
dfcod.head(n=3)

Unnamed: 0,Cause of death,ICD-10 113 Cause List Code,Age range,Gender,Gender Code,Census Region,Census Region Code,"Deaths per 100,000"
0,#Salmonella infections (A01-A02),GR113-001,55-64,Male,M,Census Region 3: South,CENS-R3,10.0
1,#Salmonella infections (A01-A02),GR113-001,65-74,Female,F,Census Region 3: South,CENS-R3,10.0
2,#Salmonella infections (A01-A02),GR113-001,65-74,Male,M,Census Region 3: South,CENS-R3,10.0


Below, I conduct an exploratory analysis of `dfcod`. I want to look at a summarized list of the unique causes of death in order to condense them into broader designations.

In [328]:
# Temporarily lift the row display limit so that every ICD-coded cause of
# death is visible when the frame is shown below.
pd.set_option('display.max_rows', None)

# Pull the 'Cause of death' column out as a plain Python list.
cod = dfcod['Cause of death'].tolist()

# Sorted, de-duplicated causes of death.
codunique = np.unique(cod)

# One-column frame of the unique causes; an adjacent column with a broader
# designation will be populated later.
coddf = pd.DataFrame(data={'Unique causes of death': codunique})

coddf

Unnamed: 0,Unique causes of death
0,"#Accidents (unintentional injuries) (V01-X59,Y..."
1,#Acute bronchitis and bronchiolitis (J20-J21)
2,#Alzheimer disease (G30)
3,#Anemias (D50-D64)
4,#Aortic aneurysm and dissection (I71)
5,"#Assault (homicide) (*U01-*U02,X85-Y09,Y87.1)"
6,#Atherosclerosis (I70)
7,#Cerebrovascular diseases (I60-I69)
8,#Certain conditions originating in the perinat...
9,#Cholelithiasis and other disorders of gallbla...


In [None]:
# Put the row display limit back to 10 after it was lifted to show the full
# list of causes of death. pandas is imported as `pd` in this notebook, so the
# option must be set through that alias (the bare name `pandas` is undefined).
pd.set_option('display.max_rows', 10)