-
Notifications
You must be signed in to change notification settings - Fork 1
/
2_create_db.Rmd
530 lines (423 loc) · 13.4 KB
/
2_create_db.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
---
title: "Create database and filter data"
author:
- Damiano Oldoni
- Peter Desmet
date: "`r Sys.Date()`"
output:
html_document:
toc: true
toc_depth: 3
toc_float: true
number_sections: true
---
In this document we transform the GBIF download - a csv file containing the occurrences of species for Belgium - into a sqlite database, to handle the large volume of data. Then we filter on issues and occurrence status. Note: some of these steps can take a long time.
# Setup
```{r setup, include=FALSE}
# Global knitr options: show code, hide warnings and messages in the output
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```
Load libraries:
```{r load_libraries}
library(tidyverse) # To do data science
library(tidylog) # To provide feedback on dplyr functions
library(here) # To find files
library(rgbif) # To use GBIF services
library(glue) # To write queries
library(RSQLite) # To interact with SQLite databases
```
# Transform CSV to sqlite file
## Get CSV file from GBIF
Download the occurrences from GBIF, based on the download key returned in `download.Rmd`:
```{r get_occ_file}
# Download key returned by the GBIF occurrence download in `download.Rmd`
key <- "0031758-231002084531237"
zip_filename <- paste0(key, ".zip")
zip_path <- here::here("data", "raw", zip_filename)
# Fetch the download only once: skip if the zip is already on disk.
# The return value of occ_download_get() was previously assigned to an
# unused variable `occ`; the side effect (file on disk) is all we need.
if (!file.exists(zip_path)) {
  occ_download_get(
    key = key,
    path = here::here("data", "raw")
  )
}
```
Unzip the occurrence text file from the download file:
```{r unzip_csv}
# Extract occurrence.txt from the download zip and rename it to
# <key>_occurrence.txt so several downloads can coexist in data/raw
occ_file <- paste(key, "occurrence.txt", sep = "_")
occ_path <- here::here("data", "raw", occ_file)
# Reuse `occ_path` instead of rebuilding the same path a second time
if (!file.exists(occ_path)) {
  unzip(zipfile = zip_path,
        files = "occurrence.txt",
        exdir = here::here("data", "raw"))
  file.rename(from = here::here("data", "raw", "occurrence.txt"),
              to = occ_path
  )
}
```
Column names:
```{r get_cols}
# Read only the header row (n_max = 1) to retrieve the column names;
# quote = "" because fields may contain unescaped quote characters
cols_occ_file <- read_delim(occ_path, "\t", n_max = 1, quote = "")
cols_occ_file <- names(cols_occ_file)
cols_occ_file
```
Number of columns:
```{r n_cols}
# Number of columns in the raw occurrence file
length(cols_occ_file)
```
## Create sqlite file
### Database name, path and table name
Define name and path of `.sqlite` file:
```{r sqlite_path}
# The database file is named after the GBIF download key
sqlite_file <- paste(key, "occurrence.sqlite", sep = "_")
sqlite_path <- here::here("data", "interim", sqlite_file)
```
And table name:
```{r define_table_name}
# Table holding the raw (unfiltered) occurrences
table_name <- "occ_all"
```
### Define storage class for each column
The standard storage class is `TEXT`:
```{r def_default_cols_type}
# Named character vector mapping each column to its storage class;
# every column defaults to TEXT
field_types <- setNames(rep("TEXT", length(cols_occ_file)), cols_occ_file)
```
The following columns should be of storage class `INTEGER`:
1. `*Key`, e.g. `taxonKey`, `speciesKey`, but no `datasetKey`
2. `*DayOfYear`: `startDayOfYear` and `endDayOfYear`
3. `year`
4. `month`
5. `day`
```{r set_to_integer}
# Build one logical mask: *Key columns (except datasetKey, which is a UUID),
# *DayOfYear columns, and the year/month/day columns become INTEGER
col_names <- names(field_types)
is_integer_col <- (str_detect(col_names, "Key") & col_names != "datasetKey") |
  str_detect(col_names, "DayOfYear") |
  col_names %in% c("year", "month", "day")
field_types[is_integer_col] <- "INTEGER"
```
The following columns should be of storage class `REAL`:
1. `decimal*`: `decimalLatitude` and `decimalLongitude`
2. `coordinate*`: `coordinateUncertaintyInMeters` and `coordinatePrecision`
3. `pointRadiusSpatialFit`
```{r set_to_real}
# Floating point columns: decimal* (coordinates), coordinate* (uncertainty,
# precision) and pointRadiusSpatialFit become REAL
real_col_names <- names(field_types)
is_real_col <- str_detect(real_col_names, "decimal") |
  str_detect(real_col_names, "coordinate") |
  real_col_names == "pointRadiusSpatialFit"
field_types[is_real_col] <- "REAL"
```
### Write csv to sqlite
Open connection to database:
```{r open_connection_to_db}
# Opens (and creates, if absent) the sqlite database file
sqlite_occ <- dbConnect(SQLite(), dbname = sqlite_path)
```
Fill the database with occurrences from the text file. This step reads the large occurrence file in chunks and transfers them to the sqlite file. It can take a long time the first time you run it:
```{r fill_sqlite_file}
# Import the tab-separated occurrence file into table `occ_all`, but only if
# the table does not exist yet (the import is slow on first run).
# Passing a file path as `value` triggers RSQLite's chunked file import.
# NOTE(review): `overwrite = TRUE` is redundant inside this guard — the
# branch only runs when the table is absent.
if (!table_name %in% dbListTables(sqlite_occ)) {
  dbWriteTable(
    conn = sqlite_occ,
    name = table_name,
    sep = "\t",
    value = occ_path,
    row.names = FALSE,
    header = TRUE,
    field.types = field_types,
    overwrite = TRUE
  )
}
```
## Overview
Number of columns present:
```{r check_fields_present}
# Should equal the number of columns in the source file
cols_occ_db <- dbListFields(sqlite_occ, table_name)
length(cols_occ_db)
```
Number of occurrences:
```{r n_occs_raw}
# Total number of rows imported in `occ_all`.
# `{`table`}` (backticks) makes glue_sql emit a quoted identifier; a bare
# `{table}` would interpolate the table name as a string literal.
query <- glue_sql(
  "SELECT COUNT(1) FROM {`table`}",
  table = table_name,
  .con = sqlite_occ
)
n_occs_total <- dbGetQuery(sqlite_occ, query)
n_occs_total
```
Preview first 100 rows from table `occ_all`:
```{r preview_df}
# Quick sanity check on the imported data (first 100 rows).
# Backticks make glue_sql treat the table name as an identifier.
query <- glue_sql("SELECT * FROM {`table`} LIMIT 100",
  table = table_name,
  .con = sqlite_occ
)
preview_df <- dbGetQuery(conn = sqlite_occ, query)
preview_df
```
Are there occurrences without geographical coordinates? If yes (likely due to parsing issues) they should be filtered out in next section.
```{r check_issues_coords}
# Occurrences lacking coordinates (likely parsing problems); these should be
# removed by the filters later in this document.
# Backticks make glue_sql treat the table name as an identifier.
query <- glue_sql("SELECT * FROM {`table`} WHERE
                  decimalLatitude IS NULL OR
                  decimalLongitude IS NULL",
  table = table_name,
  .con = sqlite_occ)
suspect_coords_df <- dbGetQuery(sqlite_occ, query)
suspect_coords_df
```
# Filter data
## Define columns to select
We define a subset of columns, `cols_to_use`, we are interested in:
```{r columns_to_use}
# Taxonomic, spatial, temporal and data-quality columns used downstream
cols_to_use <- c(
  "gbifID", "scientificName", "kingdom", "phylum", "class", "order", "family",
  "genus", "specificEpithet", "infraspecificEpithet", "taxonRank",
  "taxonomicStatus", "datasetKey", "basisOfRecord", "occurrenceStatus",
  "lastInterpreted", "hasCoordinate", "hasGeospatialIssues", "decimalLatitude",
  "decimalLongitude", "coordinateUncertaintyInMeters", "coordinatePrecision",
  "pointRadiusSpatialFit", "verbatimCoordinateSystem", "verbatimSRS",
  "eventDate", "startDayOfYear", "endDayOfYear", "year", "month", "day",
  "verbatimEventDate", "samplingProtocol", "samplingEffort", "issue",
  "identificationVerificationStatus", "taxonKey", "acceptedTaxonKey",
  "kingdomKey", "phylumKey", "classKey", "orderKey", "familyKey", "genusKey",
  "subgenusKey", "speciesKey", "species"
)
```
Columns in occurrence file not in `cols_to_use`:
```{r cols_in_cols_to_use_not_present_in_cols_occ_db}
# Requested columns that are missing from the database table
cols_to_use[which(!cols_to_use %in% cols_occ_db)]
```
will be removed from the selection:
```{r remove_cols_not_in_cols_occ_db}
# Keep only the columns that actually exist in the database table
cols_to_use <- cols_to_use[cols_to_use %in% cols_occ_db]
```
Final number of columns to select:
```{r n_cols_to_use}
# Final number of columns to copy into the filtered table
length(cols_to_use)
```
Storage class of these columns:
```{r define_field_type_subset}
# Storage classes (TEXT/INTEGER/REAL) restricted to the selected columns
field_types_subset <- field_types[which(names(field_types) %in% cols_to_use)]
field_types_subset
```
## Define filters on occurrences
Occurrences containing the following issues should be filtered out:
```{r issues}
# GBIF issue flags indicating unusable coordinates; matched later with LIKE
# because the `issue` column is a semicolon-separated list
issues_to_discard <- c(
  "ZERO_COORDINATE",
  "COORDINATE_OUT_OF_RANGE",
  "COORDINATE_INVALID",
  "COUNTRY_COORDINATE_MISMATCH"
)
```
Occurrences with the following occurrence status should be filtered out as well:
```{r occurrenceStatus}
# Compared in lowercase against LOWER(occurrenceStatus) in the SQL filter
occurrenceStatus_to_discard <- c(
  "absent",
  "excluded"
)
```
We won't take unverified observations into account either:
```{r identificationVerificationStatus}
# Compared in lowercase against LOWER(identificationVerificationStatus)
identificationVerificationStatus_to_discard <- c(
  "unverified",
  "unvalidated",
  "not validated",
  "under validation",
  "not able to validate",
  "control could not be conclusive due to insufficient knowledge",
  "uncertain",
  "unconfirmed",
  "unconfirmed - not reviewed",
  "validation requested"
)
```
We create an index based on these three columns if not already present:
```{r create_idx_occStatus_issue}
# Composite index speeding up the three WHERE clauses used to fill `occ`
idx_occStatus_issue <- "idx_verifStatus_occStatus_issue"
# get indexes on table
query <- glue_sql(
  "PRAGMA index_list({table_name})",
  table_name = table_name,
  .con = sqlite_occ
)
indexes_all <- dbGetQuery(sqlite_occ, query)
# create index if not present; {`idx`} / {`cols_idx`*} interpolate as quoted
# identifiers, the `*` collapses the column vector to a comma-separated list
if (!idx_occStatus_issue %in% indexes_all$name) {
  query <- glue_sql(
    "CREATE INDEX {`idx`} ON {table_name} ({`cols_idx`*})",
    idx = idx_occStatus_issue,
    table_name = table_name,
    cols_idx = c("identificationVerificationStatus",
                 "occurrenceStatus",
                 "issue"),
    .con = sqlite_occ
  )
  dbExecute(sqlite_occ, query)
}
```
As issues are semicolon separated (multiple issues can occur), we have to wrap each issue in `'%...%'` for SQLite string matching (`LIKE` operator):
```{r add_%_}
# Wrap each issue in % wildcards plus SQL string quotes for LIKE matching
issues_to_discard <- paste0("'%", issues_to_discard, "%'")
```
We create the subquery for filtering on issue conditions:
```{r subquery_issues}
# Combine into: issue NOT LIKE '%A%' AND issue NOT LIKE '%B%' AND ...
issue_condition <- paste("issue NOT LIKE", issues_to_discard, collapse = " AND ")
issue_condition
```
## Create new table with filtered data
New table name: `occ`
```{r new_table_name}
# Table holding the filtered occurrences
table_name_subset <- "occ"
```
We create the new table with selected columns and filtered data on `occurrenceStatus` and `issue`:
```{r make_new_table_subset}
# Create `occ` with the selected columns (only if absent), then copy over the
# rows that survive all three filters.
if (!table_name_subset %in% dbListTables(sqlite_occ)) {
  dbCreateTable(conn = sqlite_occ,
                name = table_name_subset,
                fields = field_types_subset)
  # Verification status and occurrence status are compared in lowercase
  # against the discard lists ({unverified*} / {bad_status*} collapse to
  # quoted, comma-separated lists); `issue_condition` is the pre-built
  # "issue NOT LIKE '%...%' AND ..." fragment appended as a second literal.
  query <- glue_sql(
    "INSERT INTO {small_table} SELECT {`some_cols`*} FROM {big_table} WHERE
    LOWER(identificationVerificationStatus) NOT IN ({unverified*}) AND LOWER(occurrenceStatus) NOT IN ({bad_status*}) AND ", issue_condition,
    small_table = table_name_subset,
    some_cols = names(field_types_subset),
    big_table = table_name,
    unverified = identificationVerificationStatus_to_discard,
    bad_status = occurrenceStatus_to_discard,
    .con = sqlite_occ
  )
  dbExecute(sqlite_occ, query)
}
```
## Overview and control filtered data table
### Structure of `occ` table
Check whether the table `occ` has been made:
```{r check_new_table}
# TRUE if the filtered table `occ` exists
table_name_subset %in% dbListTables(sqlite_occ)
```
Columns present:
```{r cols_in_occ}
# Columns of the filtered table; should match `cols_to_use`
dbListFields(sqlite_occ, name = table_name_subset)
```
Number of occurrences:
```{r n_occs_filtered}
# Count rows in the filtered table `occ`
query <- glue_sql(
  "SELECT COUNT(1) FROM {`table`}",
  table = table_name_subset,
  .con = sqlite_occ
)
n_occs <- dbGetQuery(sqlite_occ, query)
# Bug fix: report the filtered count (`n_occs`), not `n_occs_total`
# (the raw count from `occ_all`) as the original code did
n_occs$`COUNT(1)`
```
### Check filtered data
Are there occurrences without geographical coordinates? There should be none.
```{r check_coords_filtered}
# Chunk renamed: the original label `check_issues_coords` duplicated an
# earlier chunk's label, which makes knitr fail to render the document.
query <- glue_sql("SELECT * FROM {`table`} WHERE
                  decimalLatitude IS NULL OR
                  decimalLongitude IS NULL",
  table = table_name_subset,
  .con = sqlite_occ)
invalid_coords <- dbGetQuery(sqlite_occ, query)
invalid_coords
```
We create an index on `occurrenceStatus` to retrieve occurrence status values faster:
```{r create_idx_occStatus}
idx_occStatus <- "idx_occStatus"
# get indexes present on table
query <- glue_sql(
  "PRAGMA index_list({table_name})",
  table_name = table_name_subset,
  .con = sqlite_occ
)
indexes <- dbGetQuery(sqlite_occ, query)
# create index if not present; backtick-quote the interpolations (as done
# for idx_verifStatus_occStatus_issue) so glue_sql emits identifiers
# instead of string literals
if (!idx_occStatus %in% indexes$name) {
  query <- glue_sql(
    "CREATE INDEX {`idx`} ON {`table_name`} ({`cols_idx`*})",
    idx = idx_occStatus,
    table_name = table_name_subset,
    cols_idx = c("occurrenceStatus"),
    .con = sqlite_occ
  )
  dbExecute(sqlite_occ, query)
}
```
Occurrence status left in the filtered data:
```{r check_occurrenceStatus_values}
# Distinct occurrence status values remaining after filtering;
# backticks make glue_sql treat the table name as an identifier
query <- glue_sql(
  "SELECT DISTINCT occurrenceStatus FROM {`table`}",
  table = table_name_subset,
  .con = sqlite_occ
)
dbGetQuery(sqlite_occ, query)
```
We create an index on `issue` as well:
```{r idx_issue}
idx_issue <- "idx_issue"
# create index on `issue` if not present; backtick-quote the interpolations
# so glue_sql emits identifiers instead of string literals
if (!idx_issue %in% indexes$name) {
  query <- glue_sql(
    "CREATE INDEX {`idx`} ON {`table_name`} ({`cols_idx`*})",
    idx = idx_issue,
    table_name = table_name_subset,
    cols_idx = c("issue"),
    .con = sqlite_occ
  )
  dbExecute(sqlite_occ, query)
}
```
Issues left in the filtered data:
```{r check_issues_values}
# Distinct `issue` values (semicolon-separated lists) left after filtering;
# backticks make glue_sql treat the table name as an identifier
query <- glue_sql(
  "SELECT DISTINCT issue FROM {`table`}",
  table = table_name_subset,
  .con = sqlite_occ
)
issues_left <- dbGetQuery(sqlite_occ, query)
issues_left
```
Check presence of the unwanted issues in the issues left:
```{r check_filter_on_issue}
# Bug fix: `issues_to_discard` was wrapped in '%...%' for the SQL LIKE
# filter, so matching those patterns against `issue` values could never
# succeed and the check was vacuous. Strip the quotes/wildcards first and
# match the raw issue names literally (fixed(), not regex).
# Expected result: FALSE (no unwanted issue remains).
raw_issues <- str_remove_all(issues_to_discard, "['%]")
any(map_lgl(raw_issues,
            function(issue) {
              any(str_detect(issues_left$issue, fixed(issue)), na.rm = TRUE)
            }))
```
We create an index on `identificationVerificationStatus`:
```{r idx_identificationVerificationStatus}
# Use a dedicated variable (the original reused `idx_issue`, which was
# misleading) and backtick-quote identifiers for glue_sql
idx_verification_status <- "idx_identificationVerificationStatus"
if (!idx_verification_status %in% indexes$name) {
  query <- glue_sql(
    "CREATE INDEX {`idx`} ON {`table_name`} ({`cols_idx`*})",
    idx = idx_verification_status,
    table_name = table_name_subset,
    cols_idx = c("identificationVerificationStatus"),
    .con = sqlite_occ
  )
  dbExecute(sqlite_occ, query)
}
```
Identification verification status left in the filtered data:
```{r check_identificationVerificationStatus_values}
# Frequency table of verification statuses remaining after filtering;
# backticks make glue_sql treat the table name as an identifier
query <- glue_sql("SELECT identificationVerificationStatus, COUNT(*)
                  FROM {`table`}
                  GROUP BY identificationVerificationStatus
                  ORDER BY 2 DESC",
  table = table_name_subset,
  .con = sqlite_occ)
status_verification_left <- dbGetQuery(sqlite_occ, query)
status_verification_left
```
Overview of all indexes present on `occ`:
```{r index_filtered_table}
# List all indexes created on the filtered table `occ`
query <- glue_sql(
  "PRAGMA index_list({table_name})",
  table_name = table_name_subset,
  .con = sqlite_occ
)
dbGetQuery(sqlite_occ, query)
```
Close connection:
```{r close_connection}
# Release the database connection
dbDisconnect(sqlite_occ)
```