In [21]:
library(igraph)
library(dplyr)
library(parallel)
library(stringr)

In [1]:
tsv = read.table("../data/T47_MAS_SEQ_20240104/edlibR/adapter.list.T47.mismatch2.df.tsv")

In [16]:
colnames(tsv) = c("start","end","width","type","fqname")

In [30]:
plot_adapter_num = function(df,title){
    set.seed(43)
    palette <- distinctColorPalette(450)
    adapter.list.edlibR.mismatch.count = df %>% group_by(fqname) %>% summarise(adapter_count = n())
    df2 = df %>% group_by(fqname) %>%
        summarise(adapter_chain = paste(type,collapse = ","))
    df2$classification = unlist(lapply(df2$adapter_chain,classify_read))
    adapter.list.edlibR.mismatch.count.table = data.frame(table(adapter.list.edlibR.mismatch.count$adapter_count,df2$classification))
    colnames(adapter.list.edlibR.mismatch.count.table) = c("adapter_count","adapter_classification","count")
    adapter.list.edlibR.mismatch.count.table = adapter.list.edlibR.mismatch.count.table %>% arrange(adapter_classification)
    names(palette) = all.classification
    ggplot(adapter.list.edlibR.mismatch.count.table, aes(fill=adapter_classification, y=count, x=adapter_count)) + 
        geom_bar(position="stack", stat="identity") + scale_fill_manual(values = palette) +
    theme_bw() + guides(fill=guide_legend(ncol=2)) + ggtitle(title)
}

adapter_num = function(df){
    adapter.list.edlibR.mismatch.count = df %>% group_by(fqname) %>% summarise(adapter_count = n())
    df2 = df %>% group_by(fqname) %>%
        summarise(adapter_chain = paste(type,collapse = ","))
    unlist(mclapply(df2$adapter_chain,classify_read,mc.cores = 80))
}

order_suffix = function(x){
    paste(sort(str_split(x,"-")[[1]]),sep = "-")
}

classify_read = function(adapter_chain,adapters_graph = adapter_graph){
    adapters = str_split(adapter_chain,pattern = ",")[[1]]
    all.forward = all(!grepl(adapters,pattern = "*rc$"))
    all.reverse = all(grepl(adapters,pattern = "*rc$"))
    #This no.rc means that the read has no palindrome sequence.
    rc_mix = !(all.forward | all.reverse)
    forward.index = which(!grepl(adapters,pattern = "*rc$"))
    reverse.index = grep(adapters,pattern = "*rc$")

    rc_mix = ifelse(rc_mix,ifelse(all(diff(forward.index)==1) & all(diff(reverse.index)==1),"RC-successive","RC-mosaic"),"No-RC")
    forward.adapters = adapters[forward.index]
    reverse.adapters = adapters[reverse.index]
    
    if((length(forward.adapters)+length(reverse.adapters))>1){
        if(length(forward.adapters) == 1 & length(reverse.adapters) == 1){
            suffix = "Singleton-Singleton"
        }else if(length(forward.adapters) <= 1){
            reverse.adapters = data.frame(from = reverse.adapters[1:(length(reverse.adapters)-1)],
                                      to = reverse.adapters[2:length(reverse.adapters)])
            reverse.dist = apply(reverse.adapters,1,function(reverse.adapter){
                test_paths(adapters_graph,reverse.adapter[1],reverse.adapter[2])
            })
            reverse.adapter_sequence = ifelse(any(is.infinite(reverse.dist)),"Wrong direction",ifelse(all(reverse.dist==1),"Successive","Jump"))
            forward.adapter_sequence = ifelse(length(forward.adapters)==0,"NA","Singleton")
            suffix = paste(forward.adapter_sequence,reverse.adapter_sequence,sep = "-")
            suffix = order_suffix(suffix)
        }else if(length(reverse.adapters) <= 1){
            forward.adapters = data.frame(from = forward.adapters[1:(length(forward.adapters)-1)],
                                  to = forward.adapters[2:length(forward.adapters)])
            forward.dist = apply(forward.adapters,1,function(forward.adapter){
                test_paths(adapters_graph,forward.adapter[1],forward.adapter[2])
            })
            reverse.adapter_sequence = ifelse(length(reverse.adapters)==0,"NA","Singleton")
            forward.adapter_sequence = ifelse(any(is.infinite(forward.dist)),"Wrong direction",ifelse(all(forward.dist==1),"Successive","Jump"))
            suffix = paste(forward.adapter_sequence,reverse.adapter_sequence,sep = "-")
            suffix = order_suffix(suffix)
        }else{
            forward.adapters = data.frame(from = forward.adapters[1:(length(forward.adapters)-1)],
                                  to = forward.adapters[2:length(forward.adapters)])
            forward.dist = apply(forward.adapters,1,function(forward.adapter){
                test_paths(adapters_graph,forward.adapter[1],forward.adapter[2])
            })

            reverse.adapters = data.frame(from = reverse.adapters[1:(length(reverse.adapters)-1)],
                                      to = reverse.adapters[2:length(reverse.adapters)])
            reverse.dist = apply(reverse.adapters,1,function(reverse.adapter){
                test_paths(adapters_graph,reverse.adapter[1],reverse.adapter[2])
            })
            forward.adapter_sequence = ifelse(any(is.infinite(forward.dist)),"Wrong direction",ifelse(all(forward.dist==1),"Successive","Jump"))
            reverse.adapter_sequence = ifelse(any(is.infinite(reverse.dist)),"Wrong direction",ifelse(all(reverse.dist==1),"Successive","Jump"))
            suffix = paste(forward.adapter_sequence,reverse.adapter_sequence,sep = "-")
            suffix = order_suffix(suffix)
        }
        final = paste(rc_mix,suffix,sep = ";")
    }else{
        final = "Singleton"
    }
}

test_paths <- function(g, from, to){
    ifelse(is.finite(c(shortest.paths(g, from,to,mode = "out"))),c(shortest.paths(g, from,  to,mode = "out")),Inf)
}

adapters = c(paste("adapter",LETTERS[1:16],sep  = ""),paste("adapter",LETTERS[1:16],".rc",sep  = ""))

links = data.frame(from = adapters[1:15],to = adapters[2:16])

links2 = data.frame(from = adapters[18:32],to = adapters[17:31])

links = rbind(links,links2)

adapter_graph = graph_from_data_frame(links,directed = T)

In [31]:
a = adapter_num(tsv[1:1000,])

ERROR: [1m[33mError[39m in `$<-`:[22m
[1m[22m[33m![39m Assigned data `unlist(mclapply(df2$adapter_chain, classify_read,
  mc.cores = 80))` must be compatible with existing data.
[31m✖[39m Existing data has 286 rows.
[31m✖[39m Assigned data has 480 rows.
[36mℹ[39m Only vectors of size 1 are recycled.
[1mCaused by error in `vectbl_recycle_rhs_rows()`:[22m
[33m![39m Can't recycle input of size 480 to size 286.


In [26]:
a

fqname,adapter_chain
<chr>,<chr>
000000d6-25b9-433a-be9f-a47958002048,"adapterC,adapterD,adapterE,adapterF,adapterG,adapterJ,adapterJ.rc"
000001c0-e4f8-487c-86ec-c1fb54d55a4e,"adapterP.rc,adapterO.rc,adapterN.rc"
000018d2-cd68-4109-82db-726ea92c1fdf,"adapterA,adapterB,adapterC,adapterD.rc,adapterD,adapterF,adapterG,adapterH"
00002219-6961-4207-afbf-2db56725b61c,adapterN.rc
000026b5-ecb4-44d0-8a24-f36231839d63,"adapterD,adapterE,adapterF,adapterG.rc,adapterF.rc,adapterE.rc,adapterD.rc,adapterD.rc,adapterC.rc"
00002c26-5eea-48d9-871f-232d34632482,"adapterA,adapterB,adapterC,adapterD,adapterD.rc"
00002e4b-c436-4437-84d5-dd0c0fc6ee5c,"adapterL,adapterM,adapterN,adapterO,adapterO.rc,adapterN.rc,adapterM.rc,adapterL.rc,adapterK.rc,adapterJ.rc,adapterH.rc"
00003b02-a74f-4a45-8db0-5cdab3dd780f,"adapterH.rc,adapterG.rc"
00004cf3-2404-4994-add6-213b35a215c0,adapterP
00004ecb-4c51-4111-8b7b-b231e51d67e8,adapterA.rc
