In [1]:
###############################################################################
# Modules import
###############################################################################

import numpy as np
import pandas as pd
from pyarrow import feather
import re
from collections import defaultdict
from IPython.core.display import display, HTML


In [None]:
## (a)

# NHANES demographic variables (SAS names) retained from every cohort file.
cols_to_keep = ['SEQN', 'RIDAGEYR', 'RIDRETH3', 'DMDEDUC2', 'DMDMARTL',
                'RIDSTATR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'WTINT2YR',
                ]

# Readable column names, listed in the same order as cols_to_keep.
new_cols = ['ids', 'age', 'race and ethnicity', 'education',
            'marital status', 'interview/examination status',
            'masked variance pseudo-psu', 'masked variance pseudo-stratum',
            'full sample 2 year mec exam weight',
            'full sample 2 year interview weight']


def read_demo_cohort(xpt_file, cohort):
    """Read one demographic XPT file and tag its rows with the cohort label.

    Parameters
    ----------
    xpt_file : str
        Path of the SAS transport file to read.
    cohort : str
        Value stored in the added 'cohort years' column.

    Returns
    -------
    pandas.DataFrame
        The `cols_to_keep` columns of the file plus 'cohort years'.
    """
    # .copy() makes the subset an independent frame before a column is added.
    cohort_frame = pd.read_sas(xpt_file)[cols_to_keep].copy()
    cohort_frame['cohort years'] = cohort
    return cohort_frame


data2011_2012 = read_demo_cohort('DEMO_G.XPT', '2011-2012')
data2013_2014 = read_demo_cohort('DEMO_H.XPT', '2013-2014')
data2015_2016 = read_demo_cohort('DEMO_I.XPT', '2015-2016')
data2017_2018 = read_demo_cohort('DEMO_J.XPT', '2017-2018')

# ignore_index=True gives the stacked frame a fresh 0..n-1 index, which
# to_feather requires (it rejects a non-default index).
demo2011_2018 = pd.concat([data2011_2012, data2013_2014,
                           data2015_2016, data2017_2018],
                          ignore_index=True)

# convert variable types
# Every renamed column except the two trailing sample weights is stored as an
# integer. NOTE: missing values are replaced by 0 before the cast, so 0 acts
# as the "missing" marker in these columns.
int_cols = new_cols[:-2]
output_demo = (
    demo2011_2018
    .rename(columns=dict(zip(cols_to_keep, new_cols)))
    .fillna({col: 0 for col in int_cols})
    .astype({col: 'int64' for col in int_cols})
)

output_demo.to_feather('demo_2011_2018.feather')

In [None]:
## (b)

ohxden2011_2012 = pd.read_sas('OHXDEN_G.XPT')
ohxden2013_2014 = pd.read_sas('OHXDEN_H.XPT')
ohxden2015_2016 = pd.read_sas('OHXDEN_I.XPT')
ohxden2017_2018 = pd.read_sas('OHXDEN_J.XPT')

# Respondent id and dentition status code, followed by the per-tooth columns
# discovered from the 2011-2012 file: OHXxxTC (tooth count) and OHXxxCTC
# (coronal caries tooth count). fullmatch anchors both ends of the name so
# only columns of exactly this form are selected.
cols_to_keep = ['SEQN', 'OHDDESTS']
cols_to_keep2 = [name for name in ohxden2011_2012.columns
                 if re.fullmatch(r'OHX\d+TC', name)]
cols_to_keep3 = [name for name in ohxden2011_2012.columns
                 if re.fullmatch(r'OHX\d+CTC', name)]
selected_cols = cols_to_keep + cols_to_keep2 + cols_to_keep3


def select_dentition_columns(raw_frame, cohort):
    """Keep `selected_cols` of one cohort's frame and tag it with the cohort.

    Parameters
    ----------
    raw_frame : pandas.DataFrame
        Frame as read from one OHXDEN XPT file.
    cohort : str
        Value stored in the added 'cohort years' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with `selected_cols` plus 'cohort years'; `raw_frame`
        itself is not modified.
    """
    subset = raw_frame[selected_cols].copy()
    subset['cohort years'] = cohort
    return subset


ohxden2011_2012 = select_dentition_columns(ohxden2011_2012, '2011-2012')
ohxden2013_2014 = select_dentition_columns(ohxden2013_2014, '2013-2014')
ohxden2015_2016 = select_dentition_columns(ohxden2015_2016, '2015-2016')
ohxden2017_2018 = select_dentition_columns(ohxden2017_2018, '2017-2018')

# ignore_index=True gives the stacked frame a fresh 0..n-1 index, which
# to_feather requires (it rejects a non-default index).
ohxden2011_2018 = pd.concat([ohxden2011_2012, ohxden2013_2014,
                             ohxden2015_2016, ohxden2017_2018],
                            ignore_index=True)

# Build readable names from the tooth number embedded in each SAS name,
# e.g. 'OHX01TC' -> 'tooth count: #01'.
OHXxxTC_name = ['tooth count: #' + re.search(r'\d+', OHXxxTC).group()
                for OHXxxTC in cols_to_keep2]
OHXxxCTC_name = ['coronal cavities: tooth count #'
                 + re.search(r'\d+', OHXxxCTC).group()
                 for OHXxxCTC in cols_to_keep3]

# OHDDESTS is the dentition status code, not an age, so it is labelled as
# such (it was previously renamed to 'age').
new_cols = ['ids', 'dentition status code'] + OHXxxTC_name + OHXxxCTC_name

## convert variable type
# Ids, status code and tooth counts are stored as integers; the coronal
# caries columns are left as read. NOTE: missing values are replaced by 0
# before the cast, so 0 acts as the "missing" marker in these columns.
int_cols = ['ids', 'dentition status code'] + OHXxxTC_name
output_ohxden = (
    ohxden2011_2018
    .rename(columns=dict(zip(selected_cols, new_cols)))
    .fillna({col: 0 for col in int_cols})
    .astype({col: 'int64' for col in int_cols})
)

output_ohxden.to_feather('ohxden_2011_2018.feather')

In [None]:
## (c)

# Report the number of rows (cases) in each combined dataset.
for description, frame in (
    ('The number of cases in the demographic datasets is', output_demo),
    ('The number of cases in the oral health and dentition data is',
     output_ohxden),
):
    print(description, len(frame.index))