-
Notifications
You must be signed in to change notification settings - Fork 0
/
mod_nanopolish.R
38 lines (33 loc) · 1.41 KB
/
mod_nanopolish.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
suppressMessages(require(tidyverse))
load_file.nanopolish <- function(filename) {
cols <- c("chromosome", "start", "read_name", "strand", "log_lik_ratio",
"num_motifs", "sequence")
coltypes <- list(chromosome = 'c', start = 'i', read_name = 'c',
log_lik_ratio = 'd', num_motifs = 'i', sequence = 'c',
strand = col_factor(levels=c('+', '-')))
vroom(filename, delim='\t', col_select=cols, col_types=coltypes) %>%
dplyr::rename(seqname = chromosome,
pos = start,
read_id = read_name)
}
preprocess.nanopolish <- function(df, motif="CG") {
## Assign same log-lik-ratio to all bases covered in motif
offset <- unlist(gregexpr(pattern=motif, df$sequence)) - 1
data <- df %>%
uncount(num_motifs) %>%
mutate(pos = pos - 5 + offset,
log_lik_ratio = -1 * log_lik_ratio) %>%
select(-sequence)
## Upon inspecting, a single read sometimes produces more than one statistic at a position
## Assuming statistic in the middle contribute the most, use weights from binomial distribution
data <- data %>%
group_by_at(vars(-log_lik_ratio)) %>%
summarise(log_lik_ratio = sum(
dbinom(0:(n()-1), n()-1, 0.5) * log_lik_ratio
)) %>%
ungroup() %>%
mutate(read_id = sub("_Basecall_1D_template", "", read_id),
pos = pos + 1,
prob_mod = 1 / (1 + exp(log_lik_ratio)))
data
}