/
filter_fis.R
137 lines (127 loc) · 5.28 KB
/
filter_fis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Fis
#' @title Fis filter
#' @description
#' TODO
#' @param approach Character. By \code{"SNP"} or by \code{"haplotype"}.
#' The function will consider the SNP or haplotype statistics to filter the marker.
#' Default by \code{"haplotype"}.
#' @param fis.min.threshold Number.
#' @param fis.max.threshold Number.
#' @param fis.diff.threshold Number (0 - 1)
#' @param pop.threshold Fixed number of pop required to keep the locus.
#' @param percent Is the threshold a percentage ? TRUE or FALSE.
#' @param filename (optional) The function uses \code{\link[fst]{write.fst}},
#' to write the tidy data frame in
#' the folder created in the working directory. The file extension appended to
#' the \code{filename} provided is \code{.rad}.
#' Default: \code{filename = NULL}.
#' @inheritParams tidy_genomic_data
#'
#' @details
#' The Fis calculated uses the ratio of averages (1-mean(Ho)/mean(Hs))
#' and NOT THE AVERAGE OF RATIOS (Nei 1987).
#' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press.
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
#' @rdname filter_fis
#' @export
filter_fis <- function(data, approach = "haplotype", fis.min.threshold, fis.max.threshold, fis.diff.threshold, pop.threshold, percent, filename) {
# Cleanup-------------------------------------------------------------------
# obj.keeper <- c(ls(envir = globalenv()), "fis.filter")
verbose <- TRUE
radiator_function_header(f.name = "filter_fis", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "filter_fis", start = FALSE, verbose = verbose), add = TRUE)
# on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))
if (is.vector(data)) {
data <- readr::read_tsv(data, col_names = TRUE)
}
pop.number <- dplyr::n_distinct(data$POP_ID)
if (stringi::stri_detect_fixed(pop.threshold, ".") & pop.threshold < 1) {
multiplication.number <- 1/pop.number
message("Using a proportion threshold...")
threshold.id <- "of proportion"
} else if (stringi::stri_detect_fixed(percent, "T")) {
multiplication.number <- 100/pop.number
message("Using a percentage threshold...")
threshold.id <- "percent"
} else {
multiplication.number <- 1
message("Using a fixed threshold...")
threshold.id <- "population as a fixed"
}
if (missing(approach) | approach == "haplotype") {
message("Approach selected: haplotype")
fis.filter <- data %>%
dplyr::select(LOCUS, POS, POP_ID, FIS) %>%
dplyr::group_by (LOCUS, POP_ID) %>%
dplyr::summarise(
FIS_MIN = min(FIS),
FIS_MAX = max(FIS),
FIS_DIFF = FIS_MAX-FIS_MIN
) %>%
dplyr::filter(FIS_MIN >= fis.min.threshold) %>%
dplyr::filter(FIS_MAX <= fis.max.threshold) %>%
dplyr::filter(FIS_DIFF <= fis.diff.threshold) %>%
dplyr::group_by(LOCUS) %>%
dplyr::tally(.) %>%
dplyr::filter((n * multiplication.number) >= pop.threshold) %>%
dplyr::select(LOCUS) %>%
dplyr::left_join(data, by = "LOCUS") %>%
dplyr::arrange(LOCUS, POP_ID)
} else {
message("Approach selected: SNP")
fis.filter <- data %>%
dplyr::select(LOCUS, POS, POP_ID, FIS) %>%
dplyr::group_by(LOCUS, POS, POP_ID) %>%
dplyr::summarise(
FIS_MIN = min(FIS),
FIS_MAX = max(FIS),
FIS_DIFF = FIS_MAX-FIS_MIN
) %>%
dplyr::filter(FIS_MIN >= fis.min.threshold) %>%
dplyr::filter(FIS_MAX <= fis.max.threshold) %>%
dplyr::filter(FIS_DIFF <= fis.diff.threshold) %>%
dplyr::group_by(LOCUS, POS) %>%
dplyr::tally(.) %>%
dplyr::filter((n * multiplication.number) >= pop.threshold) %>%
dplyr::select(LOCUS, POS) %>%
dplyr::left_join(data, by = c("LOCUS", "POS")) %>%
dplyr::arrange(LOCUS, POS, POP_ID)
}
if (missing(filename) == "FALSE") {
message("Saving the file in your working directory...")
readr::write_tsv(fis.filter, filename, append = FALSE, col_names = TRUE)
saving <- paste("Saving was selected, the filename:", filename, sep = " ")
} else {
saving <- "Saving was not selected..."
}
invisible(cat(sprintf(
"Fis filter:
Fis min >= %s or Fis max <= %s or
difference along the read/haplotype between the max and min Fis > %s,
all in %s percent of the sampling sites/pop were removed\n
The number of SNP removed by the Fis filter = %s SNP
The number of LOCI removed by the Fis filter = %s LOCI
The number of SNP before -> after the Fis filter: %s -> %s SNP
The number of LOCI before -> after the Fis filter: %s -> %s LOCI\n
%s\n
Working directory:
%s",
fis.min.threshold, fis.max.threshold, fis.diff.threshold, pop.threshold,
dplyr::n_distinct(data$POS) - dplyr::n_distinct(fis.filter$POS),
dplyr::n_distinct(data$LOCUS) - dplyr::n_distinct(fis.filter$LOCUS),
dplyr::n_distinct(data$POS), dplyr::n_distinct(fis.filter$POS),
dplyr::n_distinct(data$LOCUS), dplyr::n_distinct(fis.filter$LOCUS),
saving, getwd()
)))
return(fis.filter)
}