In [2]:
library(igraph)
library(dplyr)
library(parallel)
library(stringr)
library(randomcoloR)
library(ggplot2)
library(Biostrings)


Attaching package: ‘igraph’


The following objects are masked from ‘package:stats’:

    decompose, spectrum


The following object is masked from ‘package:base’:

    union



Attaching package: ‘dplyr’


The following objects are masked from ‘package:igraph’:

    as_data_frame, groups, union


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


“package ‘randomcoloR’ was built under R version 4.2.3”
Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:igraph’:

    normalize, path, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    c

In [1]:
# Load the adapter-hit table; column names are assigned in the next cell.
# NOTE(review): the file name suggests hits found with up to 2 mismatches - confirm.
tsv <- read.table(
    file = "../data/T47_MAS_SEQ_20240104/edlibR/adapter.list.T47.mismatch2.df.tsv"
)

In [3]:
# Name the five columns of the adapter-hit table.
colnames(tsv) <- c(
    "start",
    "end",
    "width",
    "type",
    "fqname"
)

In [None]:
# Read the whole FASTQ file into a DNAStringSet.
sci_fq_T47 <- readDNAStringSet(
    filepath = "../fastq/T47_MAS_SEQ_20240104.fastq.gz",
    format = "fastq"
)

In [None]:
# Read names, one record per line; the first column (V1) is used later as the
# names of `sci_fq_T47`.
fqnames <- read.table(file = "../fastq/T47_MAS_SEQ_20240104.fq.names")

In [None]:
# Stacked bar chart of read counts per number of adapter hits, coloured by the
# adapter-chain classification of each read.
#
# df:    data frame of adapter hits, one row per hit, with at least the columns
#        `fqname` (read id) and `type` (adapter name).
#        NOTE(review): the chain is built in row order, so rows are presumably
#        already ordered by position within each read - confirm.
# title: plot title.
#
# Returns a ggplot object.
# Depends on the globals `classify_read` and `all.classification`.
plot_adapter_num <- function(df, title) {
    # Fixed seed keeps the colours reproducible. Note that this resets the
    # session RNG state as a side effect.
    set.seed(43)
    palette <- distinctColorPalette(450)
    # Only the first length(all.classification) colours receive a name; the
    # rest would be NA-named and can never match a fill level, so drop them.
    names(palette) <- all.classification
    palette <- palette[!is.na(names(palette))]

    # One pass per read: number of hits and the comma-joined adapter chain.
    per_read <- df %>%
        group_by(fqname) %>%
        summarise(
            adapter_count = n(),
            adapter_chain = paste(type, collapse = ",")
        )
    # vapply guarantees exactly one label per read.
    per_read$classification <- vapply(
        per_read$adapter_chain,
        classify_read,
        character(1),
        USE.NAMES = FALSE
    )

    count_table <- data.frame(
        table(per_read$adapter_count, per_read$classification)
    )
    colnames(count_table) <- c("adapter_count", "adapter_classification", "count")
    count_table <- count_table %>% arrange(adapter_classification)

    ggplot(
        count_table,
        aes(fill = adapter_classification, y = count, x = adapter_count)
    ) +
        geom_bar(position = "stack", stat = "identity") +
        scale_fill_manual(values = palette) +
        theme_bw() +
        guides(fill = guide_legend(ncol = 2)) +
        ggtitle(title)
}

# Build the per-read adapter chain and classify it.
#
# df:       data frame of adapter hits, one row per hit, with at least the
#           columns `fqname` (read id) and `type` (adapter name).
#           NOTE(review): the chain is built in row order, so rows are
#           presumably already ordered by position within each read - confirm.
# mc.cores: number of worker processes handed to parallel::mclapply
#           (defaults to the previously hard-coded 80).
#
# Returns a data frame with one row per read and the columns `fqname`,
# `adapter_chain` (adapter names joined by ",") and `classification`
# (the label produced by `classify_read`).
adapter_num <- function(df, mc.cores = 80) {
    chains <- df %>%
        group_by(fqname) %>%
        summarise(adapter_chain = paste(type, collapse = ","))
    labels <- mclapply(chains$adapter_chain, classify_read, mc.cores = mc.cores)
    # as.character() keeps the column present (length 0) when there are no
    # reads; unlist() alone would yield NULL and silently drop the column.
    chains$classification <- as.character(unlist(labels))
    chains
}

# Put the "-"-separated parts of a label into sorted order, so that e.g.
# "b-a" and "a-b" collapse to the same string.
order_suffix <- function(x) {
    parts <- str_split(x, pattern = "-")[[1]]
    ordered_parts <- sort(parts)
    paste(ordered_parts, collapse = "-")
}

# Label the adapters of one strand (forward or reverse-complement) of a read.
#
# Returns "NA" for no adapters, "Singleton" for one, otherwise judges every
# consecutive pair with `test_paths`: "Wrong direction" if any pair is
# unreachable in the graph, "Successive" if every pair is exactly one edge
# apart, and "Jump" for anything else.
.classify_strand <- function(strand_adapters, adapters_graph) {
    n <- length(strand_adapters)
    if (n == 0) {
        return("NA")
    }
    if (n == 1) {
        return("Singleton")
    }
    hops <- vapply(seq_len(n - 1L), function(i) {
        as.numeric(
            test_paths(adapters_graph, strand_adapters[i], strand_adapters[i + 1L])
        )
    }, numeric(1))
    if (any(is.infinite(hops))) {
        "Wrong direction"
    } else if (all(hops == 1)) {
        "Successive"
    } else {
        "Jump"
    }
}

# Classify one read from the chain of adapters detected on it.
#
# adapter_chain:  single string of adapter names joined by ",", in the order
#                 they were listed for the read. Names ending in "rc" are
#                 treated as reverse-complement adapters.
# adapters_graph: directed graph of allowed adapter successions, passed on to
#                 `test_paths`; defaults to the global `adapter_graph`.
#
# Returns a single string:
#   * "Singleton" when the chain holds at most one adapter;
#   * otherwise "<prefix>;<suffix>", where
#       prefix = "No-RC"         all adapters are on the same strand,
#                "RC-successive" both strands present, each occupying one
#                                contiguous run of positions in the chain,
#                "RC-mosaic"     both strands present and interleaved;
#       suffix = the forward and reverse strand labels ("NA", "Singleton",
#                "Successive", "Jump", "Wrong direction") joined by "-" and
#                put in sorted order by `order_suffix`.
classify_read <- function(adapter_chain, adapters_graph = adapter_graph) {
    adapters <- strsplit(adapter_chain, ",", fixed = TRUE)[[1]]
    if (length(adapters) <= 1) {
        return("Singleton")
    }

    # Literal suffix test. The former pattern "*rc$" began with a quantifier
    # that had nothing to repeat, so it only worked as "ends with rc" by
    # accident of the regex engine.
    is_rc <- endsWith(adapters, "rc")
    forward.index <- which(!is_rc)
    reverse.index <- which(is_rc)

    if (length(forward.index) == 0 || length(reverse.index) == 0) {
        prefix <- "No-RC"
    } else if (all(diff(forward.index) == 1) && all(diff(reverse.index) == 1)) {
        prefix <- "RC-successive"
    } else {
        prefix <- "RC-mosaic"
    }

    forward.adapters <- adapters[forward.index]
    reverse.adapters <- adapters[reverse.index]

    if (length(forward.adapters) == 1 && length(reverse.adapters) == 1) {
        suffix <- "Singleton-Singleton"
    } else {
        suffix <- order_suffix(paste(
            .classify_strand(forward.adapters, adapters_graph),
            .classify_strand(reverse.adapters, adapters_graph),
            sep = "-"
        ))
    }

    # Return the label visibly (the old trailing assignment returned it
    # invisibly).
    paste(prefix, suffix, sep = ";")
}

# Directed shortest-path length (following outgoing edges) from `from` to `to`.
#
# g:    igraph graph.
# from: source vertex (name or id).
# to:   target vertex (name or id).
#
# Returns the path length as a plain numeric vector, with every non-finite
# result reported as Inf (i.e. `to` is not reachable from `from`).
test_paths <- function(g, from, to) {
    # Query the graph once; distances() is the current name of the
    # deprecated shortest.paths().
    path_length <- c(igraph::distances(g, v = from, to = to, mode = "out"))
    ifelse(is.finite(path_length), path_length, Inf)
}

# Adapter names: adapterA..adapterP followed by their ".rc" counterparts.
forward_adapter_names <- paste0("adapter", LETTERS[1:16])
reverse_adapter_names <- paste0(forward_adapter_names, ".rc")
adapters <- c(forward_adapter_names, reverse_adapter_names)

# Forward edges run A -> B -> ... -> P.
links <- data.frame(
    from = head(forward_adapter_names, -1),
    to = tail(forward_adapter_names, -1)
)

# Reverse-complement edges run the other way: B.rc -> A.rc, ..., P.rc -> O.rc.
links2 <- data.frame(
    from = tail(reverse_adapter_names, -1),
    to = head(reverse_adapter_names, -1)
)

links <- rbind(links, links2)

# Directed graph used by classify_read()/test_paths() to judge adapter order.
adapter_graph <- graph_from_data_frame(links, directed = TRUE)

In [None]:
# Per-read adapter chain and classification for the whole hit table.
T47_new_adapter <- adapter_num(df = tsv)

In [None]:
# Number of adapter hits per read.
T47_new_adapter.count <- tsv %>%
    group_by(fqname) %>%
    summarise(adapter_count = n())

In [None]:
# classification definition
# Suffix: every ordered pair of per-strand labels except "NA-NA".
suffix <- expand.grid(
    c("Wrong direction", "Successive", "Jump", "NA", "Singleton"),
    c("Wrong direction", "Successive", "Singleton", "Jump", "NA")
) %>%
    filter(!(Var1 == "NA" & Var2 == "NA")) %>%
    apply(., 1, function(pair) {
        paste(pair[1], pair[2], sep = "-")
    })
# Prefix: how forward and reverse-complement adapters are mixed in a read.
prefix <- c("RC-successive", "RC-mosaic", "No-RC")
# Full label set: every "<prefix>;<suffix>" combination plus plain "Singleton".
all.classification <- c(
    expand.grid(prefix, suffix) %>%
        apply(., 1, function(pair) {
            paste(pair[1], pair[2], sep = ";")
        }),
    "Singleton"
)

In [None]:
# Fix the RNG seed so the generated colours are the same on every run.
set.seed(43)
# 100 colours from randomcoloR::distinctColorPalette; names are attached to
# them in a later cell.
# NOTE(review): the variable name `palette` shadows grDevices::palette().
palette <- distinctColorPalette(100)

In [None]:
# Attach the classification labels to the colours (unused colours stay unnamed).
names(palette) <- all.classification

# Cross-tabulate adapter count against classification in long form, ordered by
# classification and restricted to reads with 1 to 30 adapter hits.
T47_new_adapter.count.table <- table(
    T47_new_adapter.count$adapter_count,
    T47_new_adapter$classification
) %>%
    data.frame() %>%
    setNames(c("adapter_count", "adapter_classification", "count")) %>%
    arrange(adapter_classification) %>%
    filter(adapter_count %in% 1:30)

In [None]:
# Wide canvas so the two-column legend fits beside the bars.
options(repr.plot.width = 20, repr.plot.height = 10)

# Stacked bars: reads per adapter count, filled by classification.
ggplot(
    T47_new_adapter.count.table,
    aes(x = adapter_count, y = count, fill = adapter_classification)
) +
    geom_bar(stat = "identity", position = "stack") +
    scale_fill_manual(values = palette) +
    theme_bw() +
    guides(fill = guide_legend(ncol = 2)) +
    ggtitle("Mismatch2")

In [None]:
# Total reads with 1-6 adapter hits ("short") versus more ("long").
T47_new_adapter.count.table %>%
    mutate(type = ifelse(adapter_count %in% 1:6, "short", "long")) %>%
    group_by(type) %>%
    summarise(count = sum(count))

In [None]:
# Same short/long totals, restricted to classifications containing "mosaic".
T47_new_adapter.count.table %>%
    filter(grepl("mosaic", adapter_classification)) %>%
    mutate(type = ifelse(adapter_count %in% 1:6, "short", "long")) %>%
    group_by(type) %>%
    summarise(count = sum(count))

In [107]:
# Hard-coded ratios typed in by hand.
# NOTE(review): the denominators 1796525 and 305486 are presumably the "short"
# and "long" read totals from the summaries above, and the numerators the
# matching per-class counts - confirm against the cell outputs.
321853/1796525
23295/305486

113924/1796525
92339/305486

In [127]:
# Replace the read names with the ids from the .fq.names file.
# The names come from a separate file, so check there is exactly one per read;
# a shorter vector would otherwise be padded with NA names without warning.
# NOTE(review): presumably listed in the same order as the FASTQ records - confirm.
stopifnot(length(sci_fq_T47) == nrow(fqnames))
names(sci_fq_T47) <- fqnames$V1

In [140]:
# Reads on which no adapter hit was recorded.
notrim.fq <- sci_fq_T47[
    setdiff(
        names(sci_fq_T47),
        unique(T47_new_adapter$fqname)
    )
]

In [147]:
# Reads with at most two adapter hits.
notrim.fq2 <- sci_fq_T47[
    T47_new_adapter.count %>%
        filter(adapter_count <= 2) %>%
        pull(fqname)
]

In [151]:
# Reads without hits followed by reads with at most two hits.
notrim.fq3 <- c(
    notrim.fq,
    notrim.fq2
)

In [149]:
# Reads with more than two adapter hits.
to_deal.fq <- sci_fq_T47[
    T47_new_adapter.count %>%
        filter(adapter_count > 2) %>%
        pull(fqname)
]

In [153]:
# Print the DNAStringSet of reads with more than two adapter hits.
to_deal.fq

DNAStringSet object of length 1045251:
           width seq                                        names               
      [1]   7224 [47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mC[39m[49m[47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mC[39m[49m[47m[30mA[39m[49m[47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m...[47m[30mC[39m[49m[47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mC[39m[49m[47m[30mA[39m[49m[47m[30mG[39m[49m[47m[30mC[39m[49m[47m[30mA[39m[49m[47m[30mG[39m[49m[47m[30mA[39m[49m[47m[30mG[39m[49m[47m[30mC[39m[49m[47m[30mA[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mG[39m[49m[47m[30mT[39m[49m 000000d6-25b9-433...
      [2]   1572 [