forked from vynguyen92/nhanes_mortality_associations
-
Notifications
You must be signed in to change notification settings - Fork 0
/
f - compile_chemicals_dataset.R
194 lines (158 loc) · 9.72 KB
/
f - compile_chemicals_dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTION - COMPILE THE NHANES CHEMICALS DATASET ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Purpose: This function builds one chemicals dataset per cycle by merging every table of that cycle on SEQN,
#          collapses the duplicated columns created by those merges, and then joins the cycle-specific
#          datasets into one combined chemicals dataset.
#
# Inputs: main_directory - the directory that contains one folder per cycle. Each cycle-specific folder
#                          contains one file per chemical dataset; the file name without its ".XPT"
#                          extension is the table name passed to nhanes().
#         current_directory - the directory of the folder where the function and main scripts of the
#                             project are housed. It becomes the working directory when the function
#                             finishes successfully.
#
# Outputs: returns a dataframe with the merged chemicals dataset. The column study_year holds the position
#          (1, 2, ...) of the cycle folder, in the order returned by list.files(), that a row came from.
#
# Errors: stops if main_directory does not exist or is empty, if a cycle folder contains no files, or if
#         nhanes() returns NULL for a table.
compile_chemicals_dataset <- function(main_directory, current_directory) {
  if (!dir.exists(main_directory)) {
    stop("main_directory does not exist: ", main_directory, call. = FALSE)
  }

  # Obtain a vector of folder names, one for each cycle. The folders are read by path, so the working
  # directory is never moved into the data folders and an error cannot leave the session stranded there.
  cycle_folders <- list.files(main_directory)
  if (length(cycle_folders) == 0) {
    stop("main_directory contains no cycle folders: ", main_directory, call. = FALSE)
  }

  # One dataframe of biomarker measurements per cycle
  datasets_by_cycle <- vector("list", length(cycle_folders))

  for (i in seq_along(cycle_folders)) {
    cycle_directory <- file.path(main_directory, cycle_folders[i])
    print(cycle_directory)

    # Every file in the cycle folder names one table to extract
    table_names <- .xpt_table_name(list.files(cycle_directory))
    if (length(table_names) == 0) {
      stop("Cycle folder contains no files: ", cycle_directory, call. = FALSE)
    }

    # The first table seeds the cycle-specific dataset; every later table is merged into it by SEQN.
    # table_names[-1] is empty for a single-file folder, so that case simply skips the loop.
    cycle_dataset <- .fetch_nhanes_table(table_names[1])
    for (table_name in table_names[-1]) {
      # Message to know which chemical dataset is being extracted
      print(table_name)
      cycle_dataset <- merge(cycle_dataset,
                             .fetch_nhanes_table(table_name),
                             all = TRUE,
                             by = "SEQN")
    }

    # Fold the ".x"/".y" copies of a biomarker that the merges created back into a single column
    cycle_dataset <- .collapse_duplicated_columns(cycle_dataset)

    # Append the study year (position of the cycle) of every participant in the ith cycle
    study_year <- rep(i, nrow(cycle_dataset))
    datasets_by_cycle[[i]] <- data.frame(cycle_dataset, study_year)
  }

  # There will be an error in the join if the data type of a column differs between cycles, so these
  # columns are forced to numeric in the cycles (by position) where they are known to differ.
  # Cycles or columns that are not present are skipped instead of raising an error.
  numeric_codenames_by_cycle <- list(
    "2" = c("URXUIO", "URXP02"),
    "5" = "URXCRS",
    "6" = c("URXP01", "URXP04"),
    "7" = c("URXP04", "URXNO3"),
    "8" = "URXNO3"
  )
  for (cycle_label in names(numeric_codenames_by_cycle)) {
    datasets_by_cycle <- .coerce_columns_to_numeric(datasets_by_cycle,
                                                    as.integer(cycle_label),
                                                    numeric_codenames_by_cycle[[cycle_label]])
  }

  # Start from the first cycle and join every subsequent cycle on all shared codenames (by = NULL).
  # seq_along(...)[-1] is empty when there is a single cycle, unlike 2:1.
  merged_nhanes_datasets <- datasets_by_cycle[[1]]
  for (k in seq_along(datasets_by_cycle)[-1]) {
    merged_nhanes_datasets <- full_join(merged_nhanes_datasets,
                                        datasets_by_cycle[[k]],
                                        by = NULL)
    # Message to relay which cycle has been merged in
    print(paste0("Merge in Cycle ", k))
  }

  # Callers expect to be back in the project folder once the dataset has been compiled
  setwd(current_directory)

  # Return the merged chemical biomarker dataset
  merged_nhanes_datasets
}

# Helper: strip a trailing ".XPT" extension (any letter case) from file names to get the table names.
# The dot is escaped and the pattern is anchored so that only a real extension is removed.
.xpt_table_name <- function(file_names) {
  sub("\\.XPT$", "", file_names, ignore.case = TRUE)
}

# Helper: extract one table with nhanes() and stop with the table name if nothing came back, so the
# failure is reported here rather than as an obscure error inside the following merge.
.fetch_nhanes_table <- function(table_name) {
  table_data <- nhanes(table_name)
  if (is.null(table_data)) {
    stop("nhanes() returned no data for table: ", table_name, call. = FALSE)
  }
  table_data
}

# Helper: collapse the duplicated columns that merge() created for a biomarker present in several tables.
# merge() names such columns "<codename>.x" and "<codename>.y" (and a later table can add a plain
# "<codename>" again). For each such codename, every non-missing measurement of the later copies is
# written into the first copy (a later copy overrides an earlier one), the first copy is renamed to the
# bare codename and the other copies are removed.
.collapse_duplicated_columns <- function(dataset) {
  suffix_pattern <- "\\.(x|y)$"
  column_names <- colnames(dataset)
  duplicated_codenames <- unique(sub(suffix_pattern,
                                     "",
                                     column_names[grepl(suffix_pattern, column_names)]))

  for (codename in duplicated_codenames) {
    # Positions are recomputed for every codename because removing columns shifts them
    positions <- which(sub(suffix_pattern, "", colnames(dataset)) == codename)
    first_position <- positions[1]
    extra_positions <- positions[-1]

    for (extra_position in extra_positions) {
      measurements <- dataset[[extra_position]]
      has_measurement <- which(!is.na(measurements))
      if (length(has_measurement) > 0) {
        dataset[has_measurement, first_position] <- measurements[has_measurement]
      }
    }

    # The surviving column carries the codename without any extraneous suffix (".x" or ".y")
    colnames(dataset)[first_position] <- codename

    if (length(extra_positions) > 0) {
      # drop = FALSE keeps a dataframe even if a single column is left
      dataset <- dataset[, -extra_positions, drop = FALSE]
    }
  }

  dataset
}

# Helper: force the given columns of the cycle at position cycle_index to numeric. Nothing happens when
# that cycle or a column does not exist. Factors go through as.character() first because as.numeric()
# on a factor returns the level codes rather than the values.
.coerce_columns_to_numeric <- function(datasets, cycle_index, codenames) {
  if (cycle_index > length(datasets)) {
    return(datasets)
  }

  for (codename in intersect(codenames, colnames(datasets[[cycle_index]]))) {
    values <- datasets[[cycle_index]][[codename]]
    if (is.factor(values)) {
      values <- as.character(values)
    }
    datasets[[cycle_index]][[codename]] <- as.numeric(values)
  }

  datasets
}