Imports

In [1]:
import numpy as np
import os
import os.path
from functools import reduce
import pickle

Logistics

In [2]:
# Root of the exported .npy files; NHANES_PROJECT_ROOT must be set in the environment.
all_data_path = os.path.join(os.environ['NHANES_PROJECT_ROOT'], 'data/npy')
# One sub-directory per data source.
diet_path, demo_path, exam_path, lab_path = (
    os.path.join(all_data_path, sub_dir) for sub_dir in ('diet', 'demo', 'exam', 'lab')
)

# File-name stems of the label (column header) arrays, keyed by component.
label_names = {
    'demo': 'demographic_column_headers',
    'blood': 'blood_pressure_column_headers',
    'body': 'body_measurement_column_headers',
    'lab': 'quantized_dense_labdata_info',
    'diet': 'dietary_labels_2007-2013',
}
# File-name stems of the data arrays, keyed by component.
data_names = {
    'demo': 'demographic_data',
    'blood': 'blood_pressure_data',
    'body': 'body_measurement_data',
    'lab': 'dense_labdata',
    'diet': 'dietary_data_2007-2013',
}
# Two-year cycles used as file-name suffixes: '2007-2008' ... '2013-2014'.
years = ['{}-{}'.format(start, start + 1) for start in range(2007, 2014, 2)]


Helpers

In [3]:
# Plain linear scan per lookup -- not fast, but sufficient here.
def indexOf(labels, label):
    """Return the position of the first entry whose 'name' equals `label`.

    labels: indexable sequence of mappings, each with a 'name' key.
    label:  the name to search for.
    Returns -1 when no entry matches.
    """
    for position, entry in enumerate(labels):
        if label == entry['name']:
            return position
    return -1

Merge lab all years

In [None]:
# Lab variable names to extract, in output column order.
sel_chems = ['URXUHG', 'URXUCD', 'URXUPB', 'LBXBGM','LBXBCD',  'LBXBPB', 'LBXTHG', 'URXUAS']
comp = 'lab'
all_datas = []
# Value written into the column of any variable that is absent in a given cycle.
fill_val = 0
for year in years:
    data_path = os.path.join(all_data_path, comp, '{}_{}.npy'.format(data_names[comp], year))
    label_path = os.path.join(all_data_path, comp, '{}_{}.npy'.format(label_names[comp], year))
    data_year = np.load(data_path)
    # The label entries are indexed with ['name'] below, so they are presumably
    # pickled dicts in an object array; NumPy >= 1.16.3 refuses to load those
    # unless allow_pickle=True. Only use this on trusted, locally produced files.
    labels_year = np.load(label_path, allow_pickle=True)

    # Column index of every selected variable in this cycle (-1 when absent).
    indices = [indexOf(labels_year, chem_name) for chem_name in sel_chems]
    missing = np.array(indices) == -1

    # Keep only index 1 along the last axis (the original note: "discard first X0").
    data_year = data_year[:, :, 1]

    # Fancy indexing copies, so the fill below does not touch the loaded array.
    # An index of -1 temporarily picks the last column; it is overwritten next.
    data_year = data_year[:, indices]

    # Fill columns of variables that were not found in this cycle.
    data_year[:, missing] = fill_val
    all_datas.append(data_year)

    # Take the labels from the last cycle in `years` (assumed identical across
    # cycles). NOTE(review): if a selected variable is absent in that cycle its
    # label entry is the last label of the file (index -1) -- confirm all
    # selected variables exist in the final cycle.
    if year == years[-1]:
        lab_labels = labels_year[indices]
lab_data = np.vstack(all_datas)

Merge demo all years

In [5]:
# Stack the demographic arrays of every cycle row-wise; the column headers are
# taken from the last cycle in `years`.
comp = 'demo'
all_labels, all_datas = [], []
for year in years:
    suffix = '_{}.npy'.format(year)
    year_data = np.load(os.path.join(all_data_path, comp, data_names[comp] + suffix))
    year_labels = np.load(os.path.join(all_data_path, comp, label_names[comp] + suffix))
    all_datas.append(year_data)
    all_labels.append(year_labels)
demo_data = np.vstack(all_datas)
# Wrap each header in a {'name': ...} dict, the entry format indexOf reads.
demo_labels = np.array([{'name': header} for header in all_labels[-1]], dtype=object)

Merge blood all years

In [6]:
# Load the blood-pressure data and header files of every cycle, then stack the
# data row-wise. Headers come from the last cycle in `years`.
comp = 'blood'
all_labels = [
    np.load(os.path.join(all_data_path, comp, '{}_{}.npy'.format(label_names[comp], year)))
    for year in years
]
all_datas = [
    np.load(os.path.join(all_data_path, comp, '{}_{}.npy'.format(data_names[comp], year)))
    for year in years
]
blood_data = np.vstack(all_datas)
# Wrap each header in a {'name': ...} dict, the entry format indexOf reads.
latest_headers = all_labels[-1]
blood_labels = np.array([{'name': header} for header in latest_headers], dtype=object)

Merge body all years

In [7]:
# Stack the body-measurement arrays of every cycle row-wise; the column headers
# are taken from the last cycle in `years`.
comp = 'body'
body_dir = os.path.join(all_data_path, comp)
all_labels = []
all_datas = []
for year in years:
    data_file = '{}_{}.npy'.format(data_names[comp], year)
    label_file = '{}_{}.npy'.format(label_names[comp], year)
    year_data = np.load(os.path.join(body_dir, data_file))
    year_labels = np.load(os.path.join(body_dir, label_file))
    all_datas.append(year_data)
    all_labels.append(year_labels)
body_data = np.vstack(all_datas)
# Wrap each header in a {'name': ...} dict, the entry format indexOf reads.
body_labels = np.array(
    [{'name': header} for header in all_labels[-1]],
    dtype=object,
)

Merge diet

In [8]:
# The dietary files carry the whole 2007-2013 range in their names, so they are
# loaded once rather than per cycle.
comp = 'diet'
data_path = os.path.join(all_data_path, comp, data_names[comp] + '.npy')
label_path = os.path.join(all_data_path, comp, label_names[comp] + '.npy')
diet_data = np.load(data_path)
# Wrap each raw label in a {'name': ...} dict, the entry format indexOf reads.
diet_labels = np.array(
    [{'name': raw_label} for raw_label in np.load(label_path)],
    dtype=object,
)

Merge all

In [9]:
# Sanity check: print the data shape followed by the labels shape of every
# component (lab, demo, blood, body, diet).
shape_pairs = [
    (lab_data, lab_labels),
    (demo_data, demo_labels),
    (blood_data, blood_labels),
    (body_data, body_labels),
    (diet_data, diet_labels),
]
print(*[arr.shape for pair in shape_pairs for arr in pair])

# Column-wise concatenation order: demo, blood, body, diet, lab.
# `labels_arr` must follow the same order as `data_arr`.
data_arr = [demo_data, blood_data, body_data, diet_data, lab_data]
labels_arr = [demo_labels, blood_labels, body_labels, diet_labels, lab_labels]
labels = np.hstack(labels_arr)
data = np.hstack(data_arr)

(40617, 8) (8,) (40617, 134) (134,) (40617, 102) (102,) (40617, 44) (44,) (40617, 32332) (32332,)


Make info

In [231]:
# Starting column of each component inside the concatenated `data` / `labels`
# arrays. The order (demo, blood, body, diet, lab) must match the order used
# when the arrays were concatenated.
# Built step by step: a dict literal cannot read its own keys while it is being
# constructed, so referring to info['offsets'] inside the literal raises
# NameError on a fresh kernel (or silently reuses a stale `info`).
offsets = {'demo': 0}
offsets['blood'] = offsets['demo'] + demo_labels.size
offsets['body'] = offsets['blood'] + blood_labels.size
offsets['diet'] = offsets['body'] + body_labels.size
offsets['lab'] = offsets['diet'] + diet_labels.size
info = {'offsets': offsets}

Filter out children (keep adults only)

In [2]:
# Column names of the two child age groups (0-9 and 10-19 years).
children_labs = ['RIDAGEYR_0-9', 'RIDAGEYR_10-19']
indices = [indexOf(labels, child_lab) for child_lab in children_labs]
# indexOf reports "not found" as -1, which NumPy would silently interpret as
# the last column; fail loudly instead of filtering on the wrong column.
missing_labs = [lab for lab, idx in zip(children_labs, indices) if idx == -1]
if missing_labs:
    raise ValueError('age-group columns not found in labels: {}'.format(missing_labs))
# Keep the rows whose child age-group columns sum to zero
# (assumes these columns are 0/1 indicators -- TODO confirm).
adult_ind = np.sum(data[:, indices], 1) == 0
adult_data = data[adult_ind, :]
# Presumably one SEQN entry per row of `data`, in the same order; verify.
seqn = np.load(os.path.join(all_data_path, 'diet', 'dietary_seqn_2007-2013.npy'))
adult_seqn = seqn[adult_ind]

NameError: name 'indexOf' is not defined

Save stuff

In [239]:
# Save data, labels, seqn (adult subset) and the column-offset info.
out_path = os.path.join(all_data_path, 'all')
# Neither np.save nor open() creates missing directories.
os.makedirs(out_path, exist_ok=True)
# np.save appends the '.npy' extension to these names.
np.save(os.path.join(out_path, 'data_adult_2007-2014'), adult_data)
# `labels` is not row-filtered: the adult filter drops rows, not columns.
np.save(os.path.join(out_path, 'labels_adult_2007-2014'), labels)
np.save(os.path.join(out_path, 'seqn_adult_2007-2014'), adult_seqn)
# Context manager so the file is flushed and closed even if pickling fails.
with open(os.path.join(out_path, 'info.pkl'), 'wb') as info_file:
    pickle.dump(info, info_file)