In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [63]:
# Read the air-quality / asthma CSV from the project's data folder into a DataFrame.
asthma_csv_path = 'data/airq_FNOS_asthma.csv'
airq_asthma_df = pd.read_csv(asthma_csv_path)

In [64]:
# Preview the first five rows to confirm the file loaded with the expected columns.
airq_asthma_df.head(n=5)

Unnamed: 0,year,borough,geo_place_name,mean_fpm,mean_no,ozone mean (ppb),mean_so2,ed_annual_adult_rate_per10k
0,2011,brooklyn,greenpoint,11.51,25.84,31.6,1.95,65.4
1,2013,staten island,south beach - tottenville,7.82,11.51,32.7,0.73,33.9
2,2014,staten island,south beach - tottenville,8.2,12.59,32.0,0.22,32.4
3,2009,staten island,south beach - tottenville,9.63,14.71,27.8,1.01,36.6
4,2012,staten island,south beach - tottenville,8.28,12.47,35.3,0.39,44.9


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
airq_asthma_df.describe()

Unnamed: 0,year,mean_fpm,mean_no,ozone mean (ppb),mean_so2,ed_annual_adult_rate_per10k
count,252.0,252.0,252.0,252.0,252.0,252.0
mean,2011.5,9.973452,23.077976,30.259127,2.50754,111.55119
std,1.711224,1.36701,5.520205,4.00106,1.970882,90.307775
min,2009.0,7.36,11.51,16.1,0.22,11.7
25%,2010.0,8.97,19.4175,27.575,1.0325,45.8
50%,2011.5,9.81,22.345,31.1,1.9,74.3
75%,2013.0,10.78,25.7525,33.2,3.3625,152.575
max,2014.0,15.03,43.32,40.3,10.51,375.4


In [66]:
# Seed 'asthma_rate_category' with an independent copy of the numeric ED rate;
# a later cell replaces these values with category labels.
rate_copy = airq_asthma_df['ed_annual_adult_rate_per10k'].copy()
airq_asthma_df['asthma_rate_category'] = rate_copy

In [67]:
# Check that the new column currently mirrors 'ed_annual_adult_rate_per10k'.
airq_asthma_df.head(n=5)

Unnamed: 0,year,borough,geo_place_name,mean_fpm,mean_no,ozone mean (ppb),mean_so2,ed_annual_adult_rate_per10k,asthma_rate_category
0,2011,brooklyn,greenpoint,11.51,25.84,31.6,1.95,65.4,65.4
1,2013,staten island,south beach - tottenville,7.82,11.51,32.7,0.73,33.9,33.9
2,2014,staten island,south beach - tottenville,8.2,12.59,32.0,0.22,32.4,32.4
3,2009,staten island,south beach - tottenville,9.63,14.71,27.8,1.01,36.6,36.6
4,2012,staten island,south beach - tottenville,8.28,12.47,35.3,0.39,44.9,44.9


In [68]:
#transform asthma rates to categorical variable

def category_map(column, low_cutoff=126.0, high_cutoff=252.0):
    """Label each asthma rate as 'low', 'medium' or 'high'.

    Parameters
    ----------
    column : iterable of numbers
        Rates to categorise (for example a pandas Series).
    low_cutoff : float, default 126.0
        Rates in ``[0, low_cutoff)`` are labelled ``'low'``.
    high_cutoff : float, default 252.0
        Rates in ``[low_cutoff, high_cutoff)`` are labelled ``'medium'``;
        rates at or above ``high_cutoff`` are labelled ``'high'``.

    Returns
    -------
    list
        One entry per input value, in input order. Missing (None / NaN) or
        negative rates yield ``None`` rather than a category, so invalid data
        is not silently counted as 'high'.
    """
    # The labels live in a list created on every call. Collecting them in a
    # module-level list made each additional call (or notebook cell re-run)
    # return the labels of all previous calls as well.
    labels = []
    for rate in column:
        if pd.isna(rate) or rate < 0.0:
            labels.append(None)
        elif rate < low_cutoff:
            labels.append('low')
        elif rate < high_cutoff:
            labels.append('medium')
        else:
            labels.append('high')

    return labels

In [69]:
# Categorise the numeric ED rates. The rates are read from
# 'ed_annual_adult_rate_per10k' rather than from 'asthma_rate_category':
# that column is overwritten with string labels further down, so reading it
# here would make a re-run of this cell compare strings against numbers.
asthma_rate_series = category_map(airq_asthma_df['ed_annual_adult_rate_per10k'])

In [70]:
# Exactly one label per row is needed before the labels are written back to
# the frame; enforce that instead of relying on a visual check of the count.
assert len(asthma_rate_series) == len(airq_asthma_df), (
    'category_map returned a different number of labels than there are rows'
)
len(asthma_rate_series)

252

In [71]:
# Replace the numeric placeholder values in 'asthma_rate_category' with the
# labels returned by category_map (one label per row, in row order).
airq_asthma_df['asthma_rate_category'] = asthma_rate_series

In [72]:
# Confirm that 'asthma_rate_category' now holds labels instead of numeric rates.
airq_asthma_df.head(n=5)

Unnamed: 0,year,borough,geo_place_name,mean_fpm,mean_no,ozone mean (ppb),mean_so2,ed_annual_adult_rate_per10k,asthma_rate_category
0,2011,brooklyn,greenpoint,11.51,25.84,31.6,1.95,65.4,low
1,2013,staten island,south beach - tottenville,7.82,11.51,32.7,0.73,33.9,low
2,2014,staten island,south beach - tottenville,8.2,12.59,32.0,0.22,32.4,low
3,2009,staten island,south beach - tottenville,9.63,14.71,27.8,1.01,36.6,low
4,2012,staten island,south beach - tottenville,8.28,12.47,35.3,0.39,44.9,low


In [77]:
# Keep only the two columns that feed the borough-by-category contingency table.
selected_columns = ['borough', 'asthma_rate_category']
borough_rate_df = airq_asthma_df[selected_columns]

In [78]:
# Preview the two-column frame.
borough_rate_df.head(n=5)

Unnamed: 0,borough,asthma_rate_category
0,brooklyn,low
1,staten island,low
2,staten island,low
3,staten island,low
4,staten island,low


In [79]:
# Count the rows in each (borough, category) pair, then flatten the grouped
# result into a regular frame whose counts live in a 'count' column.
grouped_rates = borough_rate_df.groupby(['borough', 'asthma_rate_category'])
pair_counts = grouped_rates.size()
borough_rate_freq = pair_counts.reset_index(name='count')

In [80]:
# Show the counts per (borough, category) pair; pairs with no rows do not appear.
borough_rate_freq

Unnamed: 0,borough,asthma_rate_category,count
0,bronx,high,18
1,bronx,low,9
2,bronx,medium,15
3,brooklyn,high,1
4,brooklyn,low,44
5,brooklyn,medium,21
6,manhattan,high,12
7,manhattan,low,48
8,queens,low,59
9,queens,medium,1


In [83]:
# Reshape the counts into a borough x category contingency table for the
# chi-square test. aggfunc='sum' is spelled out because the values are counts:
# pivot_table aggregates with the mean by default, which only gives the right
# numbers while every (borough, category) pair occupies exactly one row.
# fill_value=0 records pairs that never occur as zero observations.
borough_rate_pivot = pd.pivot_table(
    borough_rate_freq,
    index='borough',
    columns='asthma_rate_category',
    values='count',
    aggfunc='sum',
    fill_value=0,
)

In [84]:
# Show the contingency table: one row per borough, one column per rate category.
borough_rate_pivot

asthma_rate_category,high,low,medium
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bronx,18,9,15
brooklyn,1,44,21
manhattan,12,48,0
queens,0,59,1
staten island,0,18,6


In [86]:
from scipy.stats import chi2_contingency, ttest_ind

In [88]:
# Chi-square test of independence on the borough x category contingency table.
# The result unpacks into the statistic, p-value, degrees of freedom and the
# table of expected frequencies.
pivot_test_result = chi2_contingency(borough_rate_pivot)
chi2, p, dof, ex = pivot_test_result
print('verification_status: p-value of chisquare test =', p)

verification_status: p-value of chisquare test = 4.4814008582364374e-20


#### The chi-square test is significant (p ≈ 4.5e-20), so we can reject the null hypothesis that the distribution of asthma rate categories is the same across all boroughs.

In [90]:
# Flip the contingency table so categories are rows and boroughs are columns.
borough_rate_transpose = borough_rate_pivot.transpose()

In [91]:
# Show the transposed table: one row per rate category, one column per borough.
borough_rate_transpose

borough,bronx,brooklyn,manhattan,queens,staten island
asthma_rate_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
high,18,1,12,0,0
low,9,44,48,59,18
medium,15,21,0,1,6


In [92]:
# Repeat the chi-square test on the transposed table; the p-value printed here
# matches the one obtained from the untransposed table.
transpose_test_result = chi2_contingency(borough_rate_transpose)
chi2, p, dof, ex = transpose_test_result
print('verification_status: p-value of chisquare test =', p)

verification_status: p-value of chisquare test = 4.4814008582364374e-20
