## chronic diseases indicators (CDI) Analysis

Analysis on lifestyle factors on health in Nashville/Tennessee

#### Import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Formatting 

In [2]:
%matplotlib inline

In [3]:
#import csv file: U.S._Chronic_Disease_Indicators__CDI_.master
us_cdi_df = pd.read_csv('../data/U.S._Chronic_Disease_Indicators__CDI_.master.csv', low_memory=False)

In [4]:
#look at head
us_cdi_df.head(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2010,2014,WA,Washington,Death Certificate,Cancer,"Cancer of the colon and rectum (colorectal), m...",,"per 100,000",Average Annual Age-adjusted Rate,...,53,CAN,CAN7_2,AVGANNAGEADJRATE,GENDER,GENM,,,,
1,2009,2013,OR,Oregon,Statewide central cancer registries,Cancer,"Invasive cancer (all sites combined), incidence",,"per 100,000",Average Annual Age-adjusted Rate,...,41,CAN,CAN4_1,AVGANNAGEADJRATE,RACE,WHT,,,,
2,2009,2013,FL,Florida,Death Certificate,Cancer,"Cancer of the oral cavity and pharynx, mortality",,"per 100,000",Average Annual Crude Rate,...,12,CAN,CAN10_2,AVGANNCRDRATE,RACE,AIAO,,,,
3,2010,2014,MI,Michigan,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence",,,Average Annual Number,...,26,CAN,CAN8_1,AVGANNNMBR,RACE,WHT,,,,
4,2010,2014,KY,Kentucky,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence",,"per 100,000",Average Annual Crude Rate,...,21,CAN,CAN8_1,AVGANNCRDRATE,RACE,AIAO,,,,


In [5]:
#look at tail
us_cdi_df.tail(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
814932,2009,2013,NM,New Mexico,Death Certificate,Cancer,"Invasive cancer (all sites combined), mortality",,,Average Annual Number,...,35,CAN,CAN4_2,AVGANNNMBR,OVERALL,OVR,,,,
814933,2009,2013,AL,Alabama,Statewide central cancer registries,Cancer,"Invasive cancer of the oral cavity or pharynx,...",,"per 100,000",Average Annual Age-adjusted Rate,...,1,CAN,CAN10_1,AVGANNAGEADJRATE,RACE,WHT,,,,
814934,2010,2014,OH,Ohio,Death Certificate,Cancer,"Cancer of the lung and bronchus, mortality",,"per 100,000",Average Annual Age-adjusted Rate,...,39,CAN,CAN8_2,AVGANNAGEADJRATE,OVERALL,OVR,,,,
814935,2009,2013,SC,South Carolina,Death Certificate,Cancer,"Invasive cancer (all sites combined), mortality",,"per 100,000",Average Annual Age-adjusted Rate,...,45,CAN,CAN4_2,AVGANNAGEADJRATE,RACE,HIS,,,,
814936,2009,2013,DC,District of Columbia,Death Certificate,Cancer,"Cancer of the oral cavity and pharynx, mortality",,,Average Annual Number,...,11,CAN,CAN10_2,AVGANNNMBR,RACE,AIAO,,,,


In [6]:
# examine shape of us cdi df
#814937 rows, 34 columns
us_cdi_df.shape

(814937, 34)

In [7]:
#examine info of us cdi df
us_cdi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814937 entries, 0 to 814936
Data columns (total 34 columns):
YearStart                    814937 non-null int64
YearEnd                      814937 non-null int64
LocationAbbr                 814937 non-null object
LocationDesc                 814937 non-null object
DataSource                   814937 non-null object
Topic                        814937 non-null object
Question                     814937 non-null object
Response                     0 non-null float64
DataValueUnit                728402 non-null object
DataValueType                814937 non-null object
DataValue                    555277 non-null object
DataValueAlt                 554139 non-null object
DataValueFootnoteSymbol      270821 non-null object
DatavalueFootnote            270821 non-null object
LowConfidenceLimit           475125 non-null object
HighConfidenceLimit          475125 non-null object
StratificationCategory1      814937 non-null object
Stratificat

In [8]:
# drop unecessary columns in us cdi df 
us_cdi_df = us_cdi_df.drop(columns = ['LocationAbbr','Response', 'DataValueFootnoteSymbol','DatavalueFootnote','LowConfidenceLimit','HighConfidenceLimit','StratificationCategory2','Stratification2','StratificationCategory3','Stratification3','ResponseID','LocationID','TopicID','QuestionID','DataValueTypeID','StratificationCategoryID1','StratificationID1','StratificationCategoryID2','StratificationCategoryID3','StratificationID2','StratificationID3'])                                                                       

In [9]:
#look at column names now
us_cdi_df.columns

Index(['YearStart', 'YearEnd', 'LocationDesc', 'DataSource', 'Topic',
       'Question', 'DataValueUnit', 'DataValueType', 'DataValue',
       'DataValueAlt', 'StratificationCategory1', 'Stratification1',
       'GeoLocation'],
      dtype='object')

In [10]:
# change YearEnd to datetimeindex format
us_cdi_df['YearEnd'] = pd.DatetimeIndex(us_cdi_df['YearEnd'])

In [11]:
# want to subset to have topic of Nutrition, Physical Activity, and Weight Status on topic column

#need to subset multiple columns based on mulitple conditions

# us_cdi_df to only have DataSource col 'BRFSS' and DataValueType be 'Crude Prevalence'
## example of exact match subset dogs["breed"] == "Labrador"

# brfss and cp subset
brfss = us_cdi_df['DataSource'] == 'BRFSS'
cp    = us_cdi_df['DataValueType']== 'Crude Prevalence'
us_cdi_brfss_cp = us_cdi_df[brfss & cp]

# DataSource needs to be 'BRFSS', Topic be 'Nutrition, Physical Activity, and Weight Status', DataValueType be 'Crude Prevalence'
# brfss and cp and npw subset

npw = us_cdi_df['Topic']== 'Nutrition, Physical Activity, and Weight Status'
us_cdi_brfss_npw_cp = us_cdi_df[brfss & cp & npw]


In [12]:
us_cdi_brfss_cp.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation
813,2013,1970-01-01 00:00:00.000002013,Mississippi,BRFSS,Arthritis,Fair or poor health among adults aged >= 18 ye...,%,Crude Prevalence,46.9,46.9,Gender,Male,"(32.745510099000455, -89.53803082499968)"
1419,2015,1970-01-01 00:00:00.000002015,Arizona,BRFSS,Arthritis,Arthritis among adults aged >= 18 years,%,Crude Prevalence,23.6,23.6,Overall,Overall,"(34.865970280000454, -111.76381127699972)"
3469,2014,1970-01-01 00:00:00.000002014,Arkansas,BRFSS,Alcohol,Heavy drinking among adults aged >= 18 years,%,Crude Prevalence,6.0,6.0,Gender,Male,"(34.74865012400045, -92.27449074299966)"
4077,2018,1970-01-01 00:00:00.000002018,Virginia,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,%,Crude Prevalence,36.6,36.6,Gender,Male,"(37.54268067400045, -78.45789046299967)"
4148,2018,1970-01-01 00:00:00.000002018,Idaho,BRFSS,Asthma,Asthma prevalence among women aged 18-44 years,%,Crude Prevalence,12.4,12.4,Race/Ethnicity,"White, non-Hispanic","(43.682630005000476, -114.3637300419997)"


In [13]:
us_cdi_brfss_npw_cp.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation
29332,2018,1970-01-01 00:00:00.000002018,North Dakota,BRFSS,"Nutrition, Physical Activity, and Weight Status",No leisure-time physical activity among adults...,%,Crude Prevalence,22.3,22.3,Overall,Overall,"(47.47531977900047, -100.11842104899966)"
32562,2018,1970-01-01 00:00:00.000002018,Utah,BRFSS,"Nutrition, Physical Activity, and Weight Status",Obesity among adults aged >= 18 years,%,Crude Prevalence,30.0,30.0,Race/Ethnicity,Hispanic,"(39.360700171000474, -111.58713063499971)"
32687,2018,1970-01-01 00:00:00.000002018,Oklahoma,BRFSS,"Nutrition, Physical Activity, and Weight Status",Overweight or obesity among adults aged >= 18 ...,%,Crude Prevalence,68.9,68.9,Race/Ethnicity,"White, non-Hispanic","(35.47203135600046, -97.52107021399968)"
34092,2018,1970-01-01 00:00:00.000002018,Wisconsin,BRFSS,"Nutrition, Physical Activity, and Weight Status",Obesity among adults aged >= 18 years,%,Crude Prevalence,40.3,40.3,Race/Ethnicity,Hispanic,"(44.39319117400049, -89.81637074199966)"
34098,2018,1970-01-01 00:00:00.000002018,Montana,BRFSS,"Nutrition, Physical Activity, and Weight Status",Healthy weight among adults aged >= 18 years,%,Crude Prevalence,32.9,32.9,Race/Ethnicity,Hispanic,"(47.06652897200047, -109.42442064499971)"


In [14]:
# three conditions for questions subset see notes of capstone proposal

#Row contains string ‘consumption’ mean the person fruit and vegetable consumption frequency
## example of isin subset is_black_or_brown = dogs["color"].isin(["Black", "Brown"]) #this isin code has an error
## df[df['A'].str.contains("hello")]
nutrition_intake_consump = us_cdi_df[us_cdi_df['Question'].str.contains("consumption")]
nutrition_intake_consump.head()
#us_cdi_brfss_npw_cp_consump = us_cdi_df[brfss & cp & npw & nutrition_intake_consump]

#There are so many filters within the data set of us cdc dataframe and i did a general analysis with excel. See notes in capstone

Unnamed: 0,YearStart,YearEnd,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation
34319,2017,1970-01-01 00:00:00.000002017,New Hampshire,YRBSS,"Nutrition, Physical Activity, and Weight Status",Soda consumption among high school students,%,Crude Prevalence,11.7,11.7,Race/Ethnicity,"Asian, non-Hispanic","(43.65595011300047, -71.50036091999965)"
34331,2017,1970-01-01 00:00:00.000002017,Montana,YRBSS,"Nutrition, Physical Activity, and Weight Status",Soda consumption among high school students,%,Crude Prevalence,10.9,10.9,Gender,Female,"(47.06652897200047, -109.42442064499971)"
34372,2017,1970-01-01 00:00:00.000002017,California,YRBSS,"Nutrition, Physical Activity, and Weight Status",Soda consumption among high school students,%,Crude Prevalence,13.2,13.2,Race/Ethnicity,"Black, non-Hispanic","(37.63864012300047, -120.99999953799971)"
35530,2016,1970-01-01 00:00:00.000002016,Georgia,AEDS,Alcohol,Per capita alcohol consumption among persons a...,gallons,Per capita alcohol consumption,1.9,1.9,Overall,Overall,"(32.83968109300048, -83.62758034599966)"
35557,2016,1970-01-01 00:00:00.000002016,Delaware,AEDS,Alcohol,Per capita alcohol consumption among persons a...,gallons,Per capita alcohol consumption,3.7,3.7,Overall,Overall,"(39.008830667000495, -75.57774116799965)"


In [15]:
#import for only obesity dataset to explore relationship between obesity and nutrition intake and excercise
# inner joins obesity data,nutrition and exercise after apply filters formula

#import csv file: Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System
us_obesity_df = pd.read_csv('../data/Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv')

In [16]:
# examine us_obesity_df head

us_obesity_df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2012,2012,WY,Wyoming,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(43.235541343, -108.109830353)",OWS,OWS1,Q037,VALUE,56,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
1,2012,2012,DC,District of Columbia,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(38.890371385, -77.031961127)",OWS,OWS1,Q036,VALUE,11,Education,Less than high school,EDU,EDUHS
2,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.840571122, -86.631860762)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
3,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Gender,Female,GEN,FEMALE
4,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),18 - 24,AGEYR,AGEYR1824


In [17]:
#dataframe info

us_obesity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63028 entries, 0 to 63027
Data columns (total 33 columns):
YearStart                     63028 non-null int64
YearEnd                       63028 non-null int64
LocationAbbr                  63028 non-null object
LocationDesc                  63028 non-null object
Datasource                    63028 non-null object
Class                         63028 non-null object
Topic                         63028 non-null object
Question                      63028 non-null object
Data_Value_Unit               0 non-null float64
Data_Value_Type               63028 non-null object
Data_Value                    56796 non-null float64
Data_Value_Alt                56796 non-null float64
Data_Value_Footnote_Symbol    6232 non-null object
Data_Value_Footnote           6232 non-null object
Low_Confidence_Limit          56796 non-null float64
High_Confidence_Limit         56796 non-null float64
Sample_Size                   56796 non-null object
Total     

In [18]:
# subset obeisty values
#use Question contains to subset  
## df[df['A'].str.contains("hello")]

#explore if there are ways to transpose rows to columns grouping by topics and sub topics and show their values
#trying df.pivot(column='var',values = 'val') that spread rows into columns

#by_topic = us_obesity_df.pivot(columns='Topic', values='Data_Value')
#it does not work as there are no matching values for each column

In [19]:
#try groupby

by_topic = us_obesity_df.groupby(by='Topic')

In [20]:
by_topic.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2012,2012,WY,Wyoming,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(43.235541343, -108.109830353)",OWS,OWS1,Q037,VALUE,56,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
1,2012,2012,DC,District of Columbia,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(38.890371385, -77.031961127)",OWS,OWS1,Q036,VALUE,11,Education,Less than high school,EDU,EDUHS
2,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.840571122, -86.631860762)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
3,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Gender,Female,GEN,FEMALE
4,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),18 - 24,AGEYR,AGEYR1824
5,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),25 - 34,AGEYR,AGEYR2534
6,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),55 - 64,AGEYR,AGEYR5564
7,2015,2015,RI,Rhode Island,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(41.708280193, -71.522470314)",OWS,OWS1,Q037,VALUE,44,Race/Ethnicity,Hispanic,RACE,RACEHIS
8,2012,2012,WY,Wyoming,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(43.235541343, -108.109830353)",PA,PA1,Q047,VALUE,56,Income,"Less than $15,000",INC,INCLESS15
10,2011,2011,WA,Washington,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(47.522278629, -120.47001079)",OWS,OWS1,Q037,VALUE,53,Gender,Male,GEN,MALE


In [None]:
#figure out duplicate in excel file 