In [18]:
import pyreadr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly
import plotly.graph_objects as go
import plotly.express as px
from scipy.stats import chi2_contingency

In [4]:
# Load the cleaned survey extracts. The frame is pulled out of each read_r
# result under the key None (presumably because an .rds file holds a single
# unnamed object -- confirm against the pyreadr docs).
data_2014 = pyreadr.read_r('cleaned_data/data14.rds')
df_14=data_2014[None]

data_2015_18 = pyreadr.read_r('cleaned_data/data1518.rds')
df_1518=data_2015_18[None]

In [5]:
# Column labels available in the 2014 frame.
titles = df_14.columns
titles

Index(['xstate', 'imonth', 'genhlth', 'physhlth', 'poorhlth', 'hlthpln1',
       'persdoc2', 'medcost', 'checkup1', 'exerany2', 'sex', 'educa',
       'employ1', 'income2', 'flushot6', 'howlong', 'lastpap2', 'lastsig3',
       'hadsigm3', 'xhcvu651', 'xtotinda', 'drvisits', 'medicare', 'hlthcvr1',
       'nocov121', 'medscost', 'carercvd', 'year'],
      dtype='object')

In [6]:
# --- chi2 test on the general health column ---
# Pull the 'genhlth' column out of the 2014 frame.
gen_health14 = df_14.loc[:, 'genhlth']
gen_health14

0        1.0
1        3.0
2        2.0
3        1.0
4        3.0
        ... 
51853    3.0
51854    2.0
51855    2.0
51856    1.0
51857    2.0
Name: genhlth, Length: 51858, dtype: float64

In [7]:
# Same column from the 2015-2018 frame.
gen_health1518 = df_1518.loc[:, 'genhlth']
gen_health1518

0         3.0
1         3.0
2         3.0
3         1.0
4         3.0
         ... 
193644    3.0
193645    3.0
193646    1.0
193647    2.0
193648    1.0
Name: genhlth, Length: 193649, dtype: float64

In [8]:
#account for much larger data by taking random sample of same size of 2014 dataset
# The sample size is derived from the 2014 series instead of being hard-coded,
# so it stays correct if the input data changes. random_state keeps the draw
# reproducible.
# NOTE(review): a chi2 contingency test does not need equal group sizes;
# down-sampling discards data -- consider testing on the full 2015-2018 set.
sample1518 = gen_health1518.sample(n = len(gen_health14), random_state = 1)
sample1518

120187    4.0
74314     2.0
10751     5.0
34734     2.0
83776     5.0
         ... 
56529     4.0
101909    3.0
36504     2.0
26896     2.0
126199    4.0
Name: genhlth, Length: 51858, dtype: float64

In [9]:
#get counts for each value of general health in 2014 data set
# Only the codes listed in `columns` are tallied; every other value (missing,
# or any code outside 1-5) is dropped. A manual `counts[value-1] += 1` loop
# guarded by `value < 7` would raise IndexError on code 6, silently wrap to
# the last slot on codes <= 0, and fail on NaN.
columns = [1, 2, 3, 4, 5]
counts14 = (
    gen_health14.dropna().astype(int)   # truncate to integer codes, as int(row) did
    .value_counts()
    .reindex(columns, fill_value=0)     # fixed order 1..5, zero for absent codes
    .tolist()
)
counts14
    

[10706, 18539, 14412, 5803, 2254]

In [10]:
#get counts for each value of general health in 2015-2018 data set sample
# Only the codes listed in `columns` are tallied; every other value (missing,
# or any code outside 1-5) is dropped. A manual `counts[value-1] += 1` loop
# guarded by `value < 7` would raise IndexError on code 6, silently wrap to
# the last slot on codes <= 0, and fail on NaN.
columns = [1, 2, 3, 4, 5]
counts1518 = (
    sample1518.dropna().astype(int)     # truncate to integer codes, as int(row) did
    .value_counts()
    .reindex(columns, fill_value=0)     # fixed order 1..5, zero for absent codes
    .tolist()
)
counts1518
    

[10274, 18518, 14806, 5884, 2257]

In [11]:
# Number of tallied answers per group, i.e. excluding refused / not sure.
total14, total1518 = np.sum(counts14), np.sum(counts1518)

print("2014 total", total14)
print("2015-2018 total",total1518)

2014 total 51714
2015-2018 total 51739


In [12]:
# Observed counts as a 2 x 5 table: one row per survey period,
# one column per general-health rating.
genhealth = pd.DataFrame.from_dict(
    {"2014": counts14, "2015-2018": counts1518},
    orient="index",
    columns=["Excellent", "Very Good", "Good", "Fair", "Poor"],
)
genhealth

Unnamed: 0,Excellent,Very Good,Good,Fair,Poor
2014,10706,18539,14412,5803,2254
2015-2018,10274,18518,14806,5884,2257


In [26]:
#chart to view data and see if results match up
# Grouped bars: one bar per survey period for each rating. A title and axis
# labels are set so the figure can be read on its own.
fig = go.Figure(data = [go.Bar(name = '2014', x = genhealth.columns, y = genhealth.iloc[0]),
                       go.Bar(name = '2015-18', x = genhealth.columns, y = genhealth.iloc[1])])
fig.update_layout(barmode='group',
                  title='General health (genhlth) responses: 2014 vs 2015-2018 sample',
                  xaxis_title='General health rating',
                  yaxis_title='Number of respondents')
fig.show()

In [13]:
# Chi-square test of independence between survey period and health rating.
# The echoed result is (statistic, p-value, degrees of freedom, expected counts).
test = chi2_contingency(observed = genhealth)
test

(14.77760328295258,
 0.005185422461637733,
 4,
 array([[10487.46503243, 18524.02248364, 14605.46965289,  5842.08788532,
          2254.95494572],
        [10492.53496757, 18532.97751636, 14612.53034711,  5844.91211468,
          2256.04505428]]))

In [14]:
# Expected counts under independence, labelled like the observed table and
# rounded to two decimals for display.
# Comparing with the observed counts, general health in 2014 looks, if
# anything, slightly better than in the 2015-2018 sample drawn above.
df = test[3]

ev = pd.DataFrame(df, index = genhealth.index, columns = genhealth.columns).round(2)

ev

Unnamed: 0,Excellent,Very Good,Good,Fair,Poor
2014,10487.47,18524.02,14605.47,5842.09,2254.95
2015-2018,10492.53,18532.98,14612.53,5844.91,2256.05


In [15]:
# First element of the chi2_contingency result: the test statistic.
chi2 = test[0]
chi2

14.77760328295258

In [16]:
# Note: with this many data points per group, the p-value may be artificially
# low -- small differences in proportions can still come out as significant.
# Does the p-value indicate a significant difference? (about 0.005 in the
# recorded run)
pvalue = test[1]
pvalue

0.005185422461637733

In [17]:
# Third element of the chi2_contingency result: degrees of freedom
# (4 in the recorded run for this 2 x 5 table).
dof = test[2]
dof

4

In [43]:
# --- chi2 test on the cost data ---
# Pull the 'medcost' column out of the 2014 frame.
cost2014 = df_14.loc[:, 'medcost']

In [44]:
# Same column from the 2015-2018 frame.
cost201518 = df_1518.loc[:, 'medcost']

In [45]:
#take random sample same size as 2014 data
# The sample size is derived from the 2014 series instead of being hard-coded.
# Note this rebinds `sample1518`, which held the general-health sample earlier.
sample1518 = cost201518.sample(n = len(cost2014), random_state = 1)

In [31]:
#1 corresponds to not being able to see a doctor and 2 is being able to see a doctor
# Only codes 1 and 2 are tallied; every other value (missing, or any other
# code) is dropped. A manual `counts[value-1] += 1` loop guarded by
# `value < 7` would raise IndexError on codes 3-6 with a two-slot list,
# silently wrap on codes <= 0, and fail on NaN.
counts14 = (
    cost2014.dropna().astype(int)       # truncate to integer codes, as int(row) did
    .value_counts()
    .reindex([1, 2], fill_value=0)      # fixed order [code 1, code 2]
    .tolist()
)
counts14

[4177, 47546]

In [33]:
# Tally codes 1 and 2 in the 2015-2018 sample; every other value (missing, or
# any other code) is dropped. A manual `counts[value-1] += 1` loop guarded by
# `value < 7` would raise IndexError on codes 3-6 with a two-slot list,
# silently wrap on codes <= 0, and fail on NaN.
counts1518 = (
    sample1518.dropna().astype(int)     # truncate to integer codes, as int(row) did
    .value_counts()
    .reindex([1, 2], fill_value=0)      # fixed order [code 1, code 2]
    .tolist()
)
counts1518

[4097, 47633]

In [35]:
# Number of tallied answers per group, i.e. excluding refused / not sure.
total14, total1518 = np.sum(counts14), np.sum(counts1518)

print("2014 total:", total14)
print("2015-2018 total:",total1518)

2014 total: 51723
2015-2018 total: 51730


In [36]:
# Observed counts as a 2 x 2 table: one row per survey period,
# one column per answer (code 1 -> "Yes", code 2 -> "No").
cost = pd.DataFrame.from_dict(
    {"2014": counts14, "2015-2018": counts1518},
    orient="index",
    columns=["Yes", "No"],
)
cost

Unnamed: 0,Yes,No
2014,4177,47546
2015-2018,4097,47633


In [38]:
#chart to view data and see if results match up
# Grouped bars: one bar per survey period for each answer. A title and axis
# labels are set so the figure can be read on its own.
fig = go.Figure(data = [go.Bar(name = '2014', x = cost.columns, y = cost.iloc[0]),
                       go.Bar(name = '2015-18', x = cost.columns, y = cost.iloc[1])])
fig.update_layout(barmode='group',
                  title='Cost (medcost) responses: 2014 vs 2015-2018 sample',
                  xaxis_title='Response',
                  yaxis_title='Number of respondents')
fig.show()

In [39]:
# Chi-square test of independence between survey period and the cost answer.
# NOTE(review): per the scipy docs, Yates' continuity correction is applied by
# default when the table has one degree of freedom, as this 2 x 2 table does.
test1 = chi2_contingency(observed = cost)
test1

(0.8315231241843959,
 0.361832471245386,
 1,
 array([[ 4136.72007578, 47586.27992422],
        [ 4137.27992422, 47592.72007578]]))

In [40]:
# Expected counts under independence (fourth element of the test result),
# labelled like the observed table and rounded to two decimals for display.
df = test1[3]

ev = pd.DataFrame(df, index = cost.index, columns = cost.columns).round(2)

ev

Unnamed: 0,Yes,No
2014,4136.72,47586.28
2015-2018,4137.28,47592.72


In [42]:
# Summary statistics for the cost test.
# Only one degree of freedom here; the p-value indicates no significant difference.
chi2, pvalue, dof = test1[:3]
print("chi2:", chi2, "p-value:", pvalue, "dof:", dof)

chi2: 0.8315231241843959 p-value: 0.361832471245386 dof: 1
