-
Notifications
You must be signed in to change notification settings - Fork 0
/
HgData_preprocess.R
390 lines (282 loc) · 13.6 KB
/
HgData_preprocess.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
require(dplyr)
require(ggplot2)
require(modern)
require(TDPanalysis)
# --------------------------------------------------------------------------
# Before running the preprocessing script, data have to be requested from
# the ICP Forests Database (http://icp-forests.net/page/data-requests)
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
#Correction of Hg concentrations using available drying factors
#Explanation: samples were dried at various temperatures (40 C - 80 C),
#thus measured concentration values have to be normalized to dry
#weight after drying at 105 C
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
#Exclude Polish samples from drying factor calculation, since samples from
#Poland were already dried at 105 C
dat_Poland <- dat %>% dplyr::filter(Country == "Poland")
dat <- dat %>% dplyr::filter(Country != "Poland")
#Correct Hg conc. for which drying factors are available
for (i in 1:length(dat$Hg_ng_g)) {
if(!is.na(dat$drying_factor[i])){
dat$Hg_ng_g[i] <- dat$Hg_ng_g[i]*dat$drying_factor[i]
}
}
#Filter dataset for available drying factors (old_dryfs) and for
#missing drying factors (dat_new_dryfs)
dat_old_dryfs <- dat %>%
dplyr::filter(!is.na(drying_factor))
dat_new_dryfs <- dat %>% dplyr::filter(is.na(drying_factor)) %>%
dplyr::mutate(key = paste(Species, Needle_age_class))
#Calculate average drying factors per tree species and needle age class
#to use for correction of Hg concentration values, for which no
#drying factor was determined
dryfs <- dat %>%
dplyr::filter(!is.na(drying_factor)) %>%
dplyr::group_by(Species, Needle_age_class) %>%
dplyr::summarise(mean_df = mean(drying_factor),
sd_df = sd(drying_factor),
n_df = length(drying_factor)) %>%
dplyr::mutate(key = paste(Species, Needle_age_class))
#Calculate average drying factor per tree species group (e.g. oak)
dryfs_species_short <- dat %>%
dplyr::filter(!is.na(drying_factor)) %>%
dplyr::group_by(Species_short, Needle_age_class) %>%
dplyr::summarise(mean_df = mean(drying_factor),
sd_df = sd(drying_factor),
n_df = length(drying_factor)) %>%
dplyr::mutate(key = paste(Species_short, Needle_age_class))
#Calculate average drying factor per foliage type (leaf/needle)
dryfs_foliage_type <- dat %>%
dplyr::filter(!is.na(drying_factor)) %>%
dplyr::group_by(Foliage_type, Needle_age_class) %>%
dplyr::summarise(mean_df = mean(drying_factor),
sd_df = sd(drying_factor),
n_df = length(drying_factor)) %>%
dplyr::mutate(key = paste(Foliage_type, Needle_age_class))
#Assign the correct drying factor of tree species and needle age class to
#respective sample, for which no drying factor is availabe (dat_new_dryfs)
for (i in 1:length(dat_new_dryfs$drying_factor)) {
for (j in 1:length(dryfs$mean_df)) {
if(dat_new_dryfs$key[i] == dryfs$key[j]){
dat_new_dryfs$drying_factor[i] <- dryfs$mean_df[j]
}
}
}
#For samples of some tree species (e.g. Abies borisii regis) no specific
#drying factor were available (dat_rest_dryfs). Assign drying factor per
#species group and needle age class (e.g. fir, current-season) to these samples
dat_rest_dryfs <- dat_new_dryfs %>%
dplyr::filter(is.na(drying_factor)) %>%
dplyr::mutate(key = paste(Species_short, Needle_age_class))
for (i in 1:length(dat_rest_dryfs$drying_factor)) {
for (j in 1:length(dryfs_species_short$mean_df)) {
if(dat_rest_dryfs$key[i] == dryfs_species_short$key[j]){
dat_rest_dryfs$drying_factor[i] <- dryfs_species_short$mean_df[j]
}
}
}
#For poplar there was no tree species or tree species group specific
#drying factor available. Assign average drying factor of deciduous leaves
#to poplar
dryfs_foliage_type <- as.data.frame(dryfs_foliage_type)
for (i in 1:length(dat_rest_dryfs$Species_short)) {
if(dat_rest_dryfs$Species_short[i] == "poplar"){
dat_rest_dryfs$drying_factor[i] =
(dryfs_foliage_type %>%
dplyr::filter(Foliage_type == "leaf") %>%
dplyr::select(mean_df))
}
}
#Merge datasets with calculated drying factors
dat_new_dryfs <- dat_new_dryfs %>% dplyr::select(-c("key"))
dat_new_dryfs <- dat_new_dryfs %>%
dplyr::filter(!is.na(drying_factor))
dat_rest_dryfs <- dat_rest_dryfs %>% dplyr::select(-c("key"))
dat_new_dryfs <- rbind(dat_new_dryfs, dat_rest_dryfs)
dat_new_dryfs$drying_factor <- unlist(dat_new_dryfs$drying_factor)
#Correct Hg content for drying temperature at 105 C by multiplication
#with drying factor
dat_new_dryfs$Hg_ng_g <- dat_new_dryfs$Hg_ng_g*dat_new_dryfs$drying_factor
#Merge data frames
dat_dryfs_corrected <- rbind(dat_old_dryfs, dat_new_dryfs)
#Correct values of C and N of three ICP Forests members
#Explanation: C and N values of Baden-Wurttemberg, Switzerland and UK had
#not been corrected for drying temperature previously
dat_dryfs_corrected$Corg_mg_g <-
as.numeric(dat_dryfs_corrected$Corg_mg_g)
dat_dryfs_corrected$Ntot_mg_g <-
as.numeric(dat_dryfs_corrected$Ntot_mg_g)
for (i in 1:length(dat_dryfs_corrected$Sample_code)) {
if((dat_dryfs_corrected$Country[i] == "Baden-Wurttemberg") |
(dat_dryfs_corrected$Country[i] == "Switzerland") |
(dat_dryfs_corrected$Country[i] == "UK")){
dat_dryfs_corrected$Corg_mg_g[i] <-
dat_dryfs_corrected$Corg_mg_g[i]*dat_dryfs_corrected$drying_factor[i]
dat_dryfs_corrected$Ntot_mg_g[i] <-
dat_dryfs_corrected$Ntot_mg_g[i]*dat_dryfs_corrected$drying_factor[i]
}
}
#Merge to data set, of which all values (Hg, C, N) are corrected for
#drying temperature
dat_final_dryf_corrected <- rbind(dat_dryfs_corrected, dat_Poland) %>%
dplyr::arrange(Country)
dat <- dat_final_dryf_corrected
remove(dat_final_dryf_corrected, dat_dryfs_corrected, dat_new_dryfs,
dat_old_dryfs, dat_Poland, dat_rest_dryfs, dryfs,
dryfs_foliage_type, dryfs_species_short)
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
#Calculation of daily foliar Hg uptake rates from foliar Hg concentrations
#by dividing Hg conc. by the number of days between the beginning of the
#growing season and the sampling date (or in the case of a few winter
#needles the 15th of November instead of the sampling date).
#To this, the appropriate start-of-season date has to be assigned to the
#respective foliage sample. This is the ICP Forests start-of-season date, where
#available, the start-of-season date of the closest observation location within
#the phenological database PEP725 for needles (BeginGS_PEP725_DOY) and the
#start-of-season date resulting from PROBA-V LAI modelling for leaves
#(BeginGS_LAImethod_DOY).
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
# Assign appropriate DOY for beginning of growing season to each foliage sample
#Initialize
dat <- dat %>% dplyr::mutate(BeginGS_final_DOY = 0)
#
for (i in 1:length(dat$BeginGS_final_DOY)) {
if (!is.na(dat$BeginGS_ICPF_DOY[i])){ #if there exists an ICPF value, assign
dat$BeginGS_final_DOY[i] <- dat$BeginGS_ICPF_DOY[i]
}
else if ((dat$Foliage_type[i] == "needle") && #assign PEP725 value to needles
(is.na(dat$BeginGS_ICPF_DOY[i]))) {
dat$BeginGS_final_DOY[i] <- dat$BeginGS_PEP725_DOY[i]
} #assign LAImethod value to leaves
else {dat$BeginGS_final_DOY[i] <- dat$BeginGS_LAImethod_DOY[i]}
}
#Round DOY
dat$BeginGS_final_DOY <- round(dat$BeginGS_final_DOY, digits = 0)
#Account for special case:
#there are four unreasonable start of growing season dates:
#SoS at one oak site in Northwestern Germany (Ehrhorn_Ei) 2015 is in July
#SoS at one oak site in NRW (Stadtlohn) 2015 is in almost June
#SoS at one beech site in France (Maron) 2015 is in June
#SoS at one oak site in Belgium (Gontrode) 2017 is in early March
#SoS at Ehrhorn_Ei, Stadtlohn, Maron and Gontrode will be set to
#29th of April (DOY: 119)
#Ehrhorn_Ei key: 4-308 2015
#Stadtlohn key: 4-620 2015
#Maron key: 1-60 2015
#Gontrode key: 2-16 2017
#Create key to identify the right forest plots of the right sampling year
dat$key <- paste(dat$ICPF_country_plot, dat$Sampling_year)
#Correction of Ehrhorn_Ei, Stadtlohn, Maron and Gontrode
for (i in 1:length(dat$BeginGS_final_DOY)) {
if((dat$key[i] == "4-308 2015") |
(dat$key[i] == "4-620 2015") |
(dat$key[i] == "1-60 2015") |
(dat$key[i] == "2-16 2017")){
dat$BeginGS_final_DOY[i] <- 119
}
}
dat$key <- NULL
# Evaluate average Start of Season DOY
#for sampling years and species group
Avg_SoS_DOY <- dat %>% dplyr::mutate(Site_year =
paste(as.character(Site_name),
as.character(Sampling_year),
sep = "")) %>%
dplyr::distinct(Site_year, .keep_all = TRUE) %>%
dplyr::group_by(Species_short) %>%
dplyr::summarise(Avg_DOY = round(mean(BeginGS_final_DOY)),
Std_DOY = round(sd(BeginGS_final_DOY)),
n_sites = n())
# Calculate daily Hg uptake
#Convert DOY of Begin GS to Date Begin GS for improved data handling
dat$origin <- "NA"
for (i in 1:length(dat$origin)) {
dat$origin[i] <- paste(as.character(dat$Sampling_year[i]-1),"-12-31", sep = "")
}
#Initialize BeginGS date column and format as date
dat$BeginGS_date <- as.Date("1900-01-01")
for (i in 1:length(dat$BeginGS_date)) {
dat$BeginGS_date[i] <- as.Date(dat$BeginGS_final_DOY[i], origin = dat$origin[i])
}
# Account for needle samples of different age classes
#Beginning of growing season is antedated for one year (365 d) for needle
#age class 1, two years (730 d) for needle age class 2 etc.
dat$BeginGS_date <- dat$BeginGS_date - dat$Needle_age_class * 365
#Define the date when foliar Hg uptake ends
#in most cases this is the sampling date
#in cases of needle sampling in winter (e.g. January) define
#end of uptake as 15th Nov (DOY = 319)
#Convert Sampling Date to DOY
dat$Sampling_date_character <-
as.character.Date(dat$Sampling_date, format = "%d/%m/%Y")
dat <- dat %>%
dplyr::mutate(Sampling_date_DOY =
date.to.DOY(Sampling_date_character, format = "dd/mm/yyyy"))
#Delete rows without sampling date
dat <- dat[-c(which(is.na(dat$Sampling_date_DOY))),]
#Initialize column with date when Hg uptake ends and format
dat$EndDate <- as.Date("01.01.2014", format = "%d.%m.%Y")
for (i in 1:length(dat$EndDate)) {
if ((dat$Sampling_date_DOY[i] > 319) |
(dat$Sampling_date_DOY[i] < 100)){
dat$EndDate[i] <- as.Date(paste("15.11.",as.character(dat$Sampling_year[i]),
sep = ""),
format = "%d.%m.%Y")
}
else {dat$EndDate[i] <- dat$Sampling_date[i]}
}
#Calculate difference in days
dat$Sampling_interval_d <- dat$EndDate - dat$BeginGS_date
dat$Sampling_interval_d <- as.numeric(dat$Sampling_interval_d)
#Calculate daily Hg uptake rate
dat$Hg_daily <- dat$Hg_ng_g / dat$Sampling_interval_d
dat <- dat %>% dplyr::select(-c("origin", "Sampling_date_character"))
#Account for special case:
#some sites in Baden-Wuerttemberg have negative day differences
#due to a sampling date before the defined start of season
for (i in 1:length(dat$Sampling_interval_d)) {
if((!is.na(dat$Sampling_interval_d[i])) &
(dat$Sampling_interval_d[i] < 0)) {
dat$Sampling_interval_d[i] <- abs(dat$Sampling_interval_d[i]) + 365
}
}
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
# Outlier detection within daily foliar Hg uptake values using method of
#modified Z scores after Iglewicz and Hoaglin with a threshold of 3.5
#function iglewicz_hoaglin within R modern package
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
#Filter NA
dat <- dat[-c(which(is.na(dat$Hg_daily))),]
#Calculate modified Z scores within each species group and needle year class
modZ_scores <- dat %>% dplyr::group_by(Species_short, Needle_age_class) %>%
dplyr::arrange(Sample_code, .by_group = TRUE) %>%
dplyr::do(modZ = iglewicz_hoaglin(.$Hg_daily, threshold = 3.5,
return_scores = TRUE)) %>%
tidyr::unnest(cols = c(modZ))
dat_arranged <- dat %>% dplyr::group_by(Species_short, Needle_age_class) %>%
dplyr::arrange(Sample_code, .by_group = TRUE) %>%
tidyr::unnest()
dat_modZ <- cbind(dat_arranged, modZ_scores[,3])
#Test to check if sample is assigned to correctly corresponding modified Z score
# ash <- dat %>% dplyr::filter(Species_short == "ash") %>%
# dplyr::arrange(Sample_code)
# test_ash <- iglewicz_hoaglin(ash$Hg_daily, threshold = 3.5,
# return_scores = TRUE)
dat_outlier_corrected <- dat_modZ %>% #remove values with mod. Z score of
dplyr::filter(modZ >= -3.5) %>% # < -3.5 and
dplyr::filter(modZ <= 3.5) # > 3.5
percent_outlier_values = (1 - (length(dat_outlier_corrected$Sample_code)/
length(dat$Sample_code)))*100
#Please note:
#the percent_outlier_values does not correspond exactly to the percentage
#mentioned in the paper which is due to the fact that foliage samples from
#the Austrian Bio-Indicator Grid are excluded from the data set
dat_processed <- dat_outlier_corrected
remove(dat, dat_arranged, dat_modZ, dat_outlier_corrected, modZ_scores)
dat_processed <- as.data.frame(dat_processed)