In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Read the NHANES 2015-2016 CSV into a DataFrame.
df = pd.read_csv("data/nhanes_2015_2016.csv")

# Frequency of each raw code stored in the SMQ020 column.
df["SMQ020"].value_counts()

2    3406
1    2319
9       8
7       2
Name: SMQ020, dtype: int64

In [3]:
# Recode SMQ020 into a labelled column: 1 -> "Yes", 2 -> "No".
# Codes 7 and 9 are turned into NaN so they drop out of later counts
# (presumably "refused" / "don't know" -- confirm against the NHANES codebook).
df["SMQ020x"] = df["SMQ020"].replace(
    {
        1: "Yes",
        2: "No",
        7: np.nan,
        9: np.nan,
    }
)

# Tally of the recoded answers; value_counts() leaves NaN out by default.
df["SMQ020x"].value_counts()

No     3406
Yes    2319
Name: SMQ020x, dtype: int64

In [4]:
# Labelled gender column: RIAGENDR code 1 -> "Male", 2 -> "Female".
df["RIAGENDRx"] = df["RIAGENDR"].replace({1: "Male", 2: "Female"})

# Gender (rows) by SMQ020x answer (columns) contingency table of counts.
ctab = pd.crosstab(df["RIAGENDRx"], df["SMQ020x"])

In [11]:
# Smoking statistics per gender: the proportion answering "Yes" to SMQ020
# and the number of non-missing answers in each group.

# Numeric indicator: 1 for "Yes", 0 for "No". Anything else (including NaN)
# maps to NaN, so missing answers are excluded from both the mean and the
# count below. `.map` is used instead of `.replace` to avoid the silent
# object -> numeric downcast that recent pandas versions warn about.
df["SMQ020xx"] = df["SMQ020x"].map({"Yes": 1, "No": 0})

# String aggregation names dispatch to the built-in groupby reductions;
# passing the callable `np.mean` is deprecated in recent pandas releases.
dx = df.groupby(["RIAGENDRx"]).agg({"SMQ020xx": ["mean", "count"]})
dx

Unnamed: 0_level_0,SMQ020xx,SMQ020xx
Unnamed: 0_level_1,mean,count
RIAGENDRx,Unnamed: 1_level_2,Unnamed: 2_level_2
Female,0.304845,2972
Male,0.513258,2753


In [56]:
# Proportion answering "Yes" to SMQ020 within each (age band, gender) cell.

# Keep only rows with a recorded Yes/No answer. `.copy()` makes the filtered
# frame independent of the original, so adding a column below does not raise
# SettingWithCopyWarning or write into a temporary slice.
df = df[df["SMQ020x"].notna()].copy()

# Bin age into right-closed intervals: (18, 30], (30, 40], ..., (70, 80].
# NOTE(review): with pd.cut's defaults the lowest edge is exclusive, so an age
# of exactly 18 (or anything outside 18-80) gets agegrp = NaN and is silently
# dropped by the groupby below. Pass include_lowest=True if those respondents
# should be counted -- confirm the intended population.
df["agegrp"] = pd.cut(df["RIDAGEYR"], [18, 30, 40, 50, 60, 70, 80])

# Long form: one row per (agegrp, RIAGENDRx) with a single 'proportions'
# column. `observed=False` pins the categorical-grouping behaviour explicitly
# instead of relying on a default that differs between pandas versions.
dz = (
    df.groupby(["agegrp", "RIAGENDRx"], observed=False)["SMQ020x"]
    .agg(lambda answers: np.mean(answers == "Yes"))
    .to_frame("proportions")
)

# Wide form: age bands as rows, one column per gender.
dz1 = dz["proportions"].unstack()
dz1

RIAGENDRx,Female,Male
agegrp,Unnamed: 1_level_1,Unnamed: 2_level_1
"(18, 30]",0.226974,0.349265
"(30, 40]",0.287526,0.503282
"(40, 50]",0.268924,0.448878
"(50, 60]",0.422175,0.572687
"(60, 70]",0.37415,0.655963
"(70, 80]",0.325183,0.655779


In [57]:
# Number of non-missing SMQ020x answers in each (age band, gender) cell,
# reshaped so that age bands are rows and genders are columns.
answers_by_cell = df.groupby(["agegrp", "RIAGENDRx"])["SMQ020x"]
dc = answers_by_cell.count().unstack()
dc

RIAGENDRx,Female,Male
agegrp,Unnamed: 1_level_1,Unnamed: 2_level_1
"(18, 30]",608,544
"(30, 40]",473,457
"(40, 50]",502,401
"(50, 60]",469,454
"(60, 70]",441,436
"(70, 80]",409,398


In [58]:
# Standard error of each proportion: sqrt(p * (1 - p) / n), where p comes
# from dz1 and n from dc. The two tables are aligned on their row and column
# labels by pandas.
proportion_variance = dz1 * (1 - dz1) / dc
dstd = np.sqrt(proportion_variance)
dstd

RIAGENDRx,Female,Male
agegrp,Unnamed: 1_level_1,Unnamed: 2_level_1
"(18, 30]",0.016988,0.02044
"(30, 40]",0.020811,0.023389
"(40, 50]",0.01979,0.024838
"(50, 60]",0.022806,0.023217
"(60, 70]",0.023043,0.022751
"(70, 80]",0.023163,0.023815


In [59]:
# Combine the per-gender standard errors in quadrature, one value per age
# band. This is the standard error of the difference between the female and
# male proportions when the two groups are treated as independent.
female_se = dstd["Female"]
male_se = dstd["Male"]
dstd_diff = np.sqrt(female_se ** 2 + male_se ** 2)
dstd_diff

agegrp
(18, 30]    0.026578
(30, 40]    0.031307
(40, 50]    0.031758
(50, 60]    0.032545
(60, 70]    0.032382
(70, 80]    0.033222
dtype: float64