In [1]:
library(data.table)
library(plyr)
library(dplyr)
library(Seurat)
library(ggplot2)
library(repr)
library(stringr)



Attache Paket: 'dplyr'


Die folgenden Objekte sind maskiert von 'package:plyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


Die folgenden Objekte sind maskiert von 'package:data.table':

    between, first, last


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


Lade nötiges Paket: SeuratObject

Lade nötiges Paket: sp

'SeuratObject' was built under R 4.3.1 but the current version is
4.3.2; it is recomended that you reinstall 'SeuratObject' as the ABI
for R may have changed

'SeuratObject' was built with package 'Matrix' 1.6.3 but the current
version is 1.6.4; it is recomended that you reinstall 'SeuratObject' as
the ABI for 'Matrix' may have changed


Attache Paket: 'SeuratObject'


Das folgende Objekt ist maskiert 'package:base':

    intersect




In [2]:
# Font setup: make system fonts available to R so that plot themes can request
# them by family name (the shared theme below asks for "Myriad Pro").
library(extrafont)
# Scan the hard-coded directory /Library/Fonts for fonts without an interactive
# confirmation prompt. NOTE(review): this path is machine-specific.
font_import(paths = "/Library/Fonts", prompt = FALSE)
# Register the fonts known to extrafont with R's graphics devices.
loadfonts()
# Show the font families that are now available.
fonts()


Registering fonts with R

Scanning ttf files in /Library/Fonts ...

Extracting .afm files from .ttf files...

/System/Library/Fonts/Supplemental/Arial Unicode.ttf
 : ArialUnicodeMS already registered in fonts database. Skipping.

/Library/Fonts/FontsFree-Net-MYRIADPRO-REGULAR.ttf
 : MyriadPro-Regular already registered in fonts database. Skipping.

Found FontName for 0 fonts.

Scanning afm files in /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/extrafontdb/metrics

Arial Unicode MS already registered with pdfFont().

Myriad Pro already registered with pdfFont().

Arial Unicode MS already registered with postscriptFont().

Myriad Pro already registered with postscriptFont().



In [3]:
# Shared ggplot2 theme for the figures in this notebook.
#
# Starts from theme_bw() and then removes the panel border and grid lines,
# sets large "Myriad Pro" text, rotates the x-axis tick labels by 45 degrees,
# draws solid black axis lines and applies custom plot margins (in cm).
#
# Returns a ggplot2 theme object that can be added to a plot with `+`.
plot_theme <- function() {
  figure_margin <- margin(t = 1, b = 0.1, r = 0.3, l = 0.6, unit = "cm")

  overrides <- theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 24, family = "Myriad Pro"),
    axis.text = element_text(size = 26),
    axis.text.x = element_text(vjust = 0.5, angle = 45),
    axis.title = element_text(size = 30),
    legend.text = element_text(size = 24),
    legend.title = element_text(size = 26),
    legend.key.size = unit(0.8, "cm"),
    plot.title = element_text(size = 30, hjust = 0.5),
    axis.line = element_line(linewidth = 0.7, colour = "black"),
    plot.margin = figure_margin
  )

  theme_bw() + overrides
}

# Default width and height used when plots are rendered inline via repr.
options(repr.plot.width = 10, repr.plot.height = 9)


In [4]:
#' Summarise assignment percentages per dataset for a stacked bar plot.
#'
#' Reads a comparison table with a `Barcode` column plus one column per
#' dataset, reshapes it to long format, counts how often each assignment occurs
#' in each dataset and converts the counts to percentages within the dataset.
#'
#' Dataset names have the substring " Dataset" removed and are then turned into
#' a factor with the levels "Full", "Hash451 Downsampled", ...,
#' "Hash456 Downsampled"; any other dataset name becomes `NA`.
#'
#' @param assignment Path to the comparison file (read with
#'   `data.table::fread`; empty strings are treated as `NA`).
#' @return A data.frame with the columns `Dataset`, `Assignment`, `Count`,
#'   `Percentage` (rounded to 2 decimals) and `label_ypos`. `label_ypos` is the
#'   cumulative percentage minus half of the row's own percentage, i.e. the
#'   vertical midpoint of the segment when segments are stacked in row order.
#'   Rows with a count of zero are removed.
prepare_df_for_plot <- function(assignment) {
    compare <- fread(assignment, na.strings = "")

    # Long format: one row per (Barcode, Dataset) pair; missing assignments
    # are dropped before counting.
    summary_compare <- melt(
        compare,
        id.vars = "Barcode",
        variable.name = "Dataset",
        value.name = "Assignment"
    )
    summary_compare <- na.omit(summary_compare)

    # Contingency table Dataset x Assignment (the Barcode column is dropped).
    summary_compare <- as.data.frame(table(summary_compare[, -1]))
    colnames(summary_compare)[3] <- "Count"

    # Reverse factor order so that the cumulative sums below are accumulated
    # from the last assignment level towards the first.
    summary_compare <-
        summary_compare[order(summary_compare$Assignment, decreasing = TRUE), ]

    summary_compare$Dataset <-
        gsub(" Dataset", "", summary_compare$Dataset)
    summary_compare$Dataset <-
        factor(
            summary_compare$Dataset,
            levels = c("Full", paste0("Hash45", 1:6, " Downsampled"))
        )

    # Percentages within each dataset. The verbs are namespace-qualified
    # because plyr and dplyr both export `mutate`; the result is ungrouped and
    # converted back to a plain data.frame before being handed to plyr.
    summary_compare <- summary_compare %>%
        dplyr::group_by(Dataset) %>%
        dplyr::mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
        dplyr::ungroup() %>%
        as.data.frame()

    # Label position = midpoint of each stacked segment.
    summary_compare_cumsum <-
        plyr::ddply(
            summary_compare,
            "Dataset",
            transform,
            label_ypos = cumsum(Percentage) - 0.5 * Percentage
        )

    # Empty categories would only add "0" labels to the plot.
    summary_compare_cumsum[summary_compare_cumsum$Count != 0, ]
}


In [5]:
# For every method, draw a stacked bar chart with the percentage of each
# assignment per dataset, labelled with the percentage values.
for (i in c("vireo", "souporcell", "freemuxlet", "scsplit")) {
    compare <-
        prepare_df_for_plot(paste0("../data/downsampled/downsampled_compare_", i, "_all.csv"))

    # Fill colours, category order and legend labels shared by all methods.
    hash_ids <- paste0("Hash45", 1:6)
    fill_values <- c(
        "#FFD92F",
        "#9971ab",
        "#A6D854",
        "#FC8D62",
        "#6BAED6",
        "#E5C494",
        "#E78AC3",
        "#B3B3B3"
    )
    fill_limits <- c("doublet", "negative", paste0(hash_ids, "_TotalSeqA"))
    fill_labels <- c("Doublet", "Negative", hash_ids)

    # The scsplit plot has an additional "Not matched" category, placed right
    # after "negative" with its own colour.
    if (i == "scsplit") {
        fill_values <- append(fill_values, "#6a6a6a", after = 2)
        fill_limits <- append(fill_limits, "Not matched", after = 2)
        fill_labels <- append(fill_labels, "Not matched", after = 2)
    }

    # plot_theme() already starts from theme_bw(), so no separate theme_bw()
    # call is needed.
    gp <-
        ggplot(compare, aes(x = Dataset, y = Percentage, fill = Assignment)) +
        geom_bar(
            stat = "identity",
            width = 0.8,
            colour = "grey20"
        ) +
        xlab(paste0("Dataset (", i, ")")) +
        ylab("Percentage (%)") +
        plot_theme() +
        geom_text(
            aes(y = label_ypos, label = Percentage),
            vjust = 0.5,
            color = "white",
            size = 5
        ) +
        scale_fill_manual(
            values = fill_values,
            limits = fill_limits,
            labels = fill_labels
        )

    # Values are not auto-printed inside a for loop, so the plot has to be
    # printed explicitly to be rendered.
    print(gp)

    # ggsave(
    #     paste0(
    #         "fig5/downsample_",
    #         i,
    #         ".pdf"
    #     ),
    #     plot = gp,
    #     dpi = 300,
    #     height = 9,
    #     width = 10
    # )
}


In [6]:
# Build one comparison table per hashtag (compare_hash451 ... compare_hash456).
# Each table holds, for the barcodes listed in the corresponding downsampled
# barcode file, the `Barcode` column, the second column of the comparison files
# and one column per method with that method's column number i + 2.
downsampled_dir <- "../data/downsampled"

# `pattern` is a regular expression (not a glob): anchor it so that only the
# per-method "<...>_all.csv" tables are picked up and hidden or sidecar files
# are ignored.
compare_file_pattern <- "^downsampled_compare_(.*)_all\\.csv$"
downsampled_assignment <- list.files(
  path = downsampled_dir,
  pattern = compare_file_pattern
)
if (length(downsampled_assignment) == 0) {
  stop(
    "No downsampled_compare_*_all.csv files found in ", downsampled_dir,
    call. = FALSE
  )
}

# Method name = file name without the fixed prefix and suffix.
method_names <- sub(compare_file_pattern, "\\1", downsampled_assignment)

# Read every comparison table once instead of once per hashtag.
assignment_tables <- lapply(downsampled_assignment, function(f) {
  read.csv(file.path(downsampled_dir, f), na.strings = "")
})

# The tables are combined column-wise below, which is only meaningful when all
# files list the barcodes in the same order.
barcodes_aligned <- vapply(
  assignment_tables,
  function(x) identical(x[[1]], assignment_tables[[1]][[1]]),
  logical(1)
)
if (!all(barcodes_aligned)) {
  warning(
    "Barcode columns differ between comparison files; ",
    "column-wise combination may misalign rows.",
    call. = FALSE
  )
}

for (i in 1:6) {
  downsampled_barcodes <-
    fread(
      file.path(
        downsampled_dir,
        paste0("downsampled_Hash45", i, "_TotalSeqA_0.5.tsv")
      ),
      header = FALSE
    )$V1

  # Columns 1 and 2 plus the column for hashtag i, renamed to the method.
  df_list <- lapply(seq_along(assignment_tables), function(k) {
    x <- assignment_tables[[k]][, c(1, 2, i + 2)]
    colnames(x)[3] <- method_names[k]
    x
  })

  compare <- do.call(cbind, df_list)
  # Columns 1 and 2 are repeated for every file; keep the first occurrence.
  compare <- compare[, !duplicated(colnames(compare))]
  compare <- compare[compare$Barcode %in% downsampled_barcodes, ]

  # Later cells retrieve these tables by name via get().
  assign(paste0("compare_hash45", i), compare)
  print(paste0("compare_hash45", i))
}


[1] "compare_hash451"
[1] "compare_hash452"
[1] "compare_hash453"
[1] "compare_hash454"
[1] "compare_hash455"
[1] "compare_hash456"


In [7]:
# For every hashtag, compare each method column against `Full.Dataset`,
# summarise the share of matching / mismatching barcodes per method and draw
# it as a stacked bar chart. The intermediate tables are stored as
# compare_mismatch_hash45<i> and match_table_hash45<i>.
for (i in 1:6) {
  compare <- get(paste0("compare_hash45", i))
  # Barcodes called "negative" or "doublet" in Full.Dataset are excluded.
  compare <-
    compare[!compare$Full.Dataset %in% c("negative", "doublet"), ]

  # Every column other than the two key columns is a method column.
  method_cols <- setdiff(colnames(compare), c("Barcode", "Full.Dataset"))

  # One logical column per method. lapply() keeps the column structure even
  # when only a single row (or none) is left, unlike sapply().
  comparison_results <- as.data.frame(
    lapply(compare[method_cols], function(col) {
      col == compare$Full.Dataset
    }),
    optional = TRUE
  )
  comparison_results <-
    cbind(compare[, c("Barcode", "Full.Dataset")], comparison_results)
  # A missing assignment counts as a mismatch.
  comparison_results[is.na(comparison_results)] <- FALSE
  assign(paste0("compare_mismatch_hash45", i), comparison_results)

  # data.table's melt() only has a method for data.tables, so convert first.
  long_df <- melt(
    as.data.table(comparison_results),
    id.vars = "Barcode",
    measure.vars = method_cols,
    variable.name = "Method",
    value.name = "Match"
  )
  match_table <- as.data.frame(table(long_df$Method, long_df$Match))
  colnames(match_table) <- c("Method", "Assignment", "Count")
  # table() turned the logical values into the factor levels "FALSE"/"TRUE".
  match_table$Assignment <-
    ifelse(match_table$Assignment == "TRUE", "Match", "Mismatch")

  total_counts <- aggregate(Count ~ Method, data = match_table, sum)
  match_table <-
    merge(match_table,
      total_counts,
      by = "Method",
      suffixes = c("", "_Total")
    )

  match_table$Method <- str_to_title(match_table$Method)

  match_table$Percentage <-
    round((match_table$Count / match_table$Count_Total) * 100, 2)
  # Label position = midpoint of each stacked segment.
  match_table <-
    ddply(match_table,
      "Method",
      transform,
      label_ypos = cumsum(Percentage) - 0.5 * Percentage
    )
  # Manual label offset applied to all hashtags except 3 and 6.
  if (i %% 3 != 0) {
    match_table$label_ypos <- match_table$label_ypos + 5
  }
  assign(paste0("match_table_hash45", i), match_table)

  # plot_theme() already starts from theme_bw(), so no separate theme_bw()
  # call is needed.
  gp <-
    ggplot(match_table, aes(x = Method, y = Percentage, fill = Assignment)) +
    geom_bar(
      stat = "identity",
      width = 0.8,
      colour = "grey20"
    ) +
    xlab(paste0("Method: Hash45", i)) +
    ylab("Percentage (%)") +
    plot_theme() +
    scale_fill_manual(
      values = c(
        "Mismatch" = "#e1eaf8",
        "Match" = "#66a0da"
      )
    ) +
    theme(axis.text.x = element_text(vjust = 0.5, angle = 45)) +
    geom_text(
      aes(y = label_ypos, label = Percentage),
      vjust = 0.5,
      color = "white",
      size = 10
    )

  # Values are not auto-printed inside a for loop, so the plot has to be
  # printed explicitly to be rendered.
  print(gp)

  # ggsave(paste0("fig5/mismatch_hash45", i, ".pdf"), plot = gp, dpi = 300, height = 9, width = 10)
}


