In [2]:
import pandas as pd
import numpy as np
import os
import math
import matplotlib.pyplot as plt
from scipy import stats
from itertools import product

In [3]:
# Variables kept from every demographic file.
index_list1 = ['SEQN', 'RIDAGEYR', 'RIDRETH3', 'DMDEDUC2', 'DMDMARTL',
              'RIDSTATR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'WTINT2YR']

# Read every .XPT file in the demographic folder and add a cohort column.
path1 = r"./demographic"
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the concatenated frame reproducible across machines and re-runs.
files1 = sorted(os.listdir(path1))
dat1 = []
for file in files1:
    if file.endswith('.XPT'):
        # Keep only the specified columns. The explicit copy makes the subset
        # an independent frame, so adding YEAR below does not trigger
        # SettingWithCopyWarning.
        a = pd.read_sas(path1+'/'+file)[index_list1].copy()
        # Add the column identifying to which cohort each case belongs:
        # characters 6-9 of the file name, broadcast to every row.
        # NOTE(review): presumably the cycle year embedded in the file name;
        # confirm against the naming scheme of the files in this folder.
        a['YEAR'] = file[6:10]
        dat1.append(a)

# Stack all cohorts into one frame with a fresh 0..n-1 index.
df1 = pd.concat(dat1, ignore_index=True)

# Give selected variables readable names, then lower-case every column label
# (including the ones that were not renamed).
df1 = (
    df1
    .rename(columns={
        'SEQN': 'id',
        'RIDAGEYR': 'age',
        'RIDRETH3': 'race',
        'DMDEDUC2': 'education',
        'DMDMARTL': 'marital_status',
        'SDMVPSU': 'psu',
        'SDMVSTRA': 'stratum',
    })
    .rename(columns=str.lower)
)

# Dtype plan for the demographic frame:
#   id                                         -> object (holding ints)
#   age, psu, stratum                          -> int
#   race, education, marital_status, ridstatr  -> category
df1['id'] = df1['id'].astype(int).astype(object)

l1_1 = ['age', 'psu', 'stratum']
for col in l1_1:
    df1[col] = df1[col].astype(int)

l1_2 = ['race', 'education', 'marital_status', 'ridstatr']
for col in l1_2:
    # Missing values are replaced by the sentinel -1 before the cast to the
    # nullable Int8 dtype; the resulting codes become the categories.
    df1[col] = pd.Categorical(df1[col].fillna(-1).astype('Int8'))

In [4]:
# Columns to keep from every oral-health file: 'SEQN', 'OHDDESTS' and every
# column whose name starts with 'OHX' and ends with 'TC'. The list is derived
# once from the G2011 file, which acts as the reference schema for all files
# read below (it was previously rebuilt, identically, on every iteration).
dat = pd.read_sas('./ohxden/OHXDEN_G2011.XPT')
all_index = dat.columns
index_list2 = ['SEQN', 'OHDDESTS'] + [
    col for col in all_index
    if col.startswith('OHX') and col.endswith('TC')
]

# Read every .XPT file in the ohxden folder and add a cohort column.
path2 = r"./ohxden"
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the concatenated frame reproducible across machines and re-runs.
files2 = sorted(os.listdir(path2))
dat2 = []
for file in files2:
    if file.endswith('.XPT'):
        # Keep only the specified columns. A file lacking any of them raises
        # KeyError here. The explicit copy makes the subset an independent
        # frame, so adding YEAR below does not trigger SettingWithCopyWarning.
        a = pd.read_sas(path2+'/'+file)[index_list2].copy()
        # Add the column identifying to which cohort each case belongs: the
        # four characters preceding the 4-character extension, broadcast to
        # every row.
        a['YEAR'] = file[-8:-4]
        dat2.append(a)

# Stack all cohorts into one frame with a fresh 0..n-1 index.
df2 = pd.concat(dat2, ignore_index=True)

# Rename the identifier column, then lower-case every column label.
df2 = df2.rename(columns={'SEQN': 'id'}).rename(columns=str.lower)

# Dtype plan for the oral-health frame:
#   id                 -> object (holding ints)
#   ohxXXctc           -> category of UTF-8 decoded strings
#   ohxXXtc, ohddests  -> category of integer codes
columnNames2 = df2.columns

## id
df2['id'] = df2['id'].astype(int).astype(object)

## ohxXXctc
l2_1 = [name for name in columnNames2 if name.endswith('ctc')]
for name in l2_1:
    decoded = df2[name].str.decode('utf8', errors='strict')
    df2[name] = decoded.astype('category')

## ohxXXtc & ohddests
# Columns ending in 'tc' but not in 'ctc', plus the 'ohddests' column.
l2_2 = [name for name in columnNames2
        if name.endswith('tc') and not name.endswith('ctc')] + ['ohddests']
for name in l2_2:
    # Missing values are replaced by the sentinel -1 before the integer cast;
    # the resulting codes become the categories.
    df2[name] = pd.Categorical(df2[name].fillna(-1).astype(int))

In [None]:
# Persist both cleaned frames as pickle files in the working directory.
for frame, target in ((df1, './demographic.pkl'), (df2, './ohxden.pkl')):
    frame.to_pickle(target)