In [None]:
suppressWarnings(suppressPackageStartupMessages({
    library(Seurat)
    library(EnsDb.Hsapiens.v86)
    library(dplyr)
    library(ggplot2)
    library(bedr)
    library(SeuratDisk)
    library(scales)
    library(reshape2)
    library(Hmisc)
    library(tidyr)
    library(tidyverse)
    library(crayon)
    library(readr)
    library(future)
    library("TxDb.Hsapiens.UCSC.hg38.knownGene")
    library(GenomicRanges)
    library(GenomicFeatures)
    library(rsnps)
    library(biomaRt)
    library(ggpubr)
    library(gridExtra)
    library(patchwork)
    library(EnhancedVolcano)
    library(rlist)
    library(purrr)
    library(edgeR)
}))
# Raise R's connection timeout (seconds) so long downloads/queries do not abort.
options(timeout = 100000)
# Load project helpers from a separate file. Presumably this provides
# get_guides_by_subclass() and mark_vector_pos_target_neg(), which are called
# later in this notebook but not defined in it — confirm.
source("helper_functions_for_diffex.r")


# Definitions etc

In [None]:
# Fix the RNG seed so random draws in this notebook are reproducible.
set.seed(123456)
# Print numbers with 2 significant digits (affects displayed values only).
options(digits = 2)
# Empty placeholder (c() is NULL).
# NOTE(review): `stats` is not used in the visible part of this notebook.
stats <- c()


In [None]:
# Maximum total size of globals exported to future workers:
# 250 * 1024^3 bytes, i.e. 250 GiB.
options(future.globals.maxSize = 250 * 1024^3) # 250 GiB
# plan()

# Enable parallelization
# Reset to the sequential plan first, then switch to the "multicore" plan
# with a fixed number of 64 workers.
plan(sequential)
plan("multicore", workers = 64)


In [None]:
# Default parameters, set only when no `papermill_run` object exists
# (presumably papermill injects that flag together with these parameters when
# the notebook is run in batch — confirm against the pipeline).
if (!exists("papermill_run")) {
    # Project name; used in the volcano plot titles.
    prj_name <- "Screen1"
    # Root of the secondary analysis; THRESHOLDS.tsv is read from below it.
    secondary_a_path <- "S1/analysis/secondary/"
    # NOTE(review): not used in the visible part of this notebook.
    save_seurat_h5 <- "YES"
    # "NEIGHBORHOOD" restricts reported DE results to each target's neighbor
    # genes; any other value reports all tested genes.
    de_testing <- "NEIGHBORHOOD"
}


In [None]:
# Shared ggplot2 theme tweaks: centred 16 pt plot title, 0.7 cm legend keys
# and 14 pt legend text. Built inside local() so the intermediate pieces do
# not leak into the global environment.
custom_theme <- local({
    centered_title <- element_text(size = 16, hjust = 0.5)
    legend_key <- unit(0.7, "cm")
    legend_labels <- element_text(size = 14)

    theme(
        plot.title = centered_title,
        legend.key.size = legend_key,
        legend.text = legend_labels
    )
})


### Load guide df

In [None]:
# Guide annotation table (tab-separated, with a header row).
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
df_guide <- read.table(
    "primary/S1_resources/66CRISPRi_ref_for_diffex.txt",
    sep = "\t", header = TRUE, strip.white = TRUE
)
# Location of the integrated Seurat object (h5Seurat format).
integrated_h5_path <-
    "S1/analysis/secondary/integrated/seurat_objects/integrated.h5seurat"
# Neighbor genes per target; the main loop indexes this list by target alias
# or by SNP id / gene name.
neighbors_list <- list.load(
    "primary/S1_resources/neighbors_list.rds"
)

# Show six random rows of the guide table as a sanity check.
sample_n(df_guide, 6)

### Load Seurat file

In [None]:
# Load the integrated Seurat object from disk.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
seurat_combined <- LoadH5Seurat(integrated_h5_path, verbose = FALSE)
# seurat_combined    <- subset(seurat_combined, subset=status=='singlet')


In [None]:
# Tabulate the `n_guides` metadata column: number of cells per distinct value.
table(seurat_combined@meta.data$n_guides)

Normalize RNA counts by <font color='red'>NormalizeData</font>  before running FindMarkers

In [None]:
# Build an RNA-only Seurat object from the "RNA" assay of the integrated
# object and normalize it.
seurat_rna <- CreateSeuratObject(seurat_combined[["RNA"]])
seurat_rna <- NormalizeData(seurat_rna)
# Carry over the full cell metadata from the integrated object.
# NOTE(review): this replaces the metadata table wholesale and presumably
# relies on both objects holding the same cells in the same order — confirm.
seurat_rna@meta.data <- seurat_combined@meta.data


### Load THRESHOLDS.tsv

In [None]:
# Per-guide, per-batch sgRNA count thresholds.
thr_f_name <- paste0(secondary_a_path, "/integrated/THRESHOLDS.tsv")
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
df_thresholds <- read.table(thr_f_name, header = TRUE, strip.white = TRUE)
# Reshape from long to wide: one column per `batch` value holding that
# batch's `threshold`, so a threshold can be looked up as
# df_thresholds[row, <batch name>].
df_thresholds <- df_thresholds %>%
    pivot_wider(names_from = batch, values_from = threshold)
# Back to a plain data.frame so [i, name] indexing returns a bare value.
df_thresholds <- as.data.frame(df_thresholds)


## Select perturbed/control cells
Find guide+ (perturbed) and guide- (unperturbed) cells for each guide

In [None]:
# Split the integrated object into one Seurat object per library so that
# library-specific thresholds can be applied to each.
libraries <- unique(seurat_combined$library)
seurat_libs <- list()
for (i in seq_along(libraries)) {
    lib <- libraries[i]
    # `library == lib` is evaluated by subset() against the object
    # (presumably the metadata column named `library` — confirm), not the
    # base function library().
    seurat_libs[[i]] <- subset(seurat_combined, subset = library == lib)
}
# Name the entries by library so they can be fetched as seurat_libs[[lib]].
names(seurat_libs) <- libraries


In [None]:
# Multiplier applied to every per-library guide threshold.
THRESHOLD_MULTIPLIER <- 1
cat(red("Using ", THRESHOLD_MULTIPLIER, "x threshold"))

# One entry per row of df_thresholds (i.e. per guide): the cells, pooled over
# all libraries, whose sgRNA count for that guide reaches the library's
# threshold. Preallocated instead of grown inside the loop.
perturbed_cells_by_guide <- vector("list", nrow(df_thresholds))

# seq_len() keeps the loop from running over c(1, 0) when the table is empty.
for (i in seq_len(nrow(df_thresholds))) {
    perturbed_cells_in_all_libs <- list()
    guide <- df_thresholds$guide[i]
    # Loop over libraries
    for (lib in libraries) {
        threshold <- THRESHOLD_MULTIPLIER * df_thresholds[i, lib]
        # No threshold for this guide in this library: nothing to select,
        # so skip before touching the count matrix.
        if (is.na(threshold)) {
            next
        }
        seurat_lib <- seurat_libs[[lib]]
        cells_in_lib <- Cells(seurat_lib)
        sgrna_counts <- seurat_lib[["sgRNA"]]@counts
        select_perturbed <- sgrna_counts[guide, cells_in_lib] >= threshold
        perturbed_cells_in_library <- cells_in_lib[select_perturbed]
        perturbed_cells_in_all_libs <-
            append(perturbed_cells_in_all_libs, perturbed_cells_in_library)
    }
    perturbed_cells_by_guide[[i]] <- perturbed_cells_in_all_libs
}
names(perturbed_cells_by_guide) <- df_thresholds$guide


# Check DE for genes near SNPs

In [None]:
# Flag the guides whose subclass is "ckd_de".
select_distal <- df_guide$subclass == "ckd_de"
# Keep the distinct (gene, alias) pairs among those guides.
df_snps <- df_guide[select_distal, c("gene", "alias")]
df_snps <- df_snps[!duplicated(df_snps), , drop = FALSE]


# Run diffex on pseudobulked samples using EdgeR

### Helper functions

In [None]:
# Build the design matrix and the cluster contrast matrix for a pseudobulk
# object.
#
# y: object whose `samples` table has a `sample` column (used as the library
#    factor) and a `cluster` column.
# Returns a list with
#   design   - model matrix for ~ cluster + library; the intercept column is
#              named "Int" and the "library" prefix is stripped from names.
#   contrast - one column per cluster level ("cluster<level>"), one row per
#              design column.
prepare_design_and_contrast <- function(y) {
    sample_info <- y$samples
    # These two names are kept because model.matrix() derives its column
    # names from the variable names used in the formula.
    cluster <- as.factor(sample_info$cluster)
    library <- factor(sample_info$sample)

    design <- model.matrix(~ cluster + library)
    colnames(design) <- c("Int", gsub("library", "", colnames(design))[-1])

    n_cluster <- nlevels(cluster)
    n_coef <- ncol(design)

    # Start from zeros, fill the cluster rows with 1 / (1 - n_cluster), put 1
    # on the diagonal and finally blank out the intercept row.
    contr <- matrix(
        0,
        nrow = n_coef, ncol = n_cluster,
        dimnames = list(colnames(design), paste0("cluster", levels(cluster)))
    )
    contr[seq_len(n_cluster), ] <- 1 / (1 - n_cluster)
    diag(contr) <- 1
    contr[1, ] <- 0

    list(design = design, contrast = contr)
}

# Helper function to perform differential expression analysis
#
# Estimates dispersions, fits the quasi-likelihood GLM and tests the first
# column of `contrast`. Prints the common dispersion and draws the QL
# dispersion plot as side effects.
#
# y          : pseudobulk count object passed to estimateDisp()/glmQLFit()
# design     : design matrix
# contrast   : contrast matrix; only its first column is tested
# neighbors  : gene names to report when de_testing == "NEIGHBORHOOD"
# target     : unused here; kept so existing callers keep working
# de_testing : "NEIGHBORHOOD" to report only `neighbors`, anything else to
#              report every tested gene
#
# Returns the rows of the test table sorted by ascending PValue. In
# NEIGHBORHOOD mode only neighbors that are present in the table are
# returned, so the result can have zero rows.
perform_de_analysis <- function(
    y, design, contrast, neighbors, target, de_testing) {
    y <- estimateDisp(y, design, robust = TRUE)
    print(y$common.dispersion)

    fit <- glmQLFit(y, design, robust = TRUE)
    qlf <- glmQLFTest(fit, contrast = contrast[, 1])
    plotQLDisp(fit)

    results <- qlf$table
    if (de_testing == "NEIGHBORHOOD") {
        # Indexing a data.frame with a row name it does not contain yields an
        # all-NA row named "NA", "NA.1", ...; neighbors that are absent from
        # the table would otherwise show up as such bogus rows.
        tested_neighbors <- neighbors[neighbors %in% rownames(results)]
        results <- results[tested_neighbors, , drop = FALSE]
    }

    results %>% arrange(PValue)
}

# Helper function to process cells with the given target and perturbation status
#
# Keeps only the pos_label / neg_label cells, pseudobulks them per library and
# cluster (cluster 0 = pos_label cells, cluster 1 = neg_label cells), records
# the CPM of `diffex_gene`, runs the differential expression test and appends
# the results to the accumulator data frames.
#
# seurat_dummy    : Seurat object whose identities include pos_label/neg_label
# target          : target name written to the `target` column of the results
# vector_id       : vector/plasmid id written to the result tables
# neighbors       : genes reported in "NEIGHBORHOOD" mode
# diffex_gene     : gene whose per-sample CPM is recorded ("" to skip)
# df_cell_counts  : accumulator for per-sample CPM rows (may be NULL)
# df_neighbor_de  : accumulator for DE result rows (may be NULL)
# pos_label       : identity of perturbed cells
# neg_label       : identity of control cells
# target_subclass : subclass written to the result tables
# is_de, perturbed_cells_by_guide, guides_on_plasmid, guides_4_target :
#                   not used in the body; kept for interface compatibility
#
# NOTE(review): `de_testing` and `label` are not parameters; they are read
# from the enclosing (global) environment and must be set by the caller.
#
# Returns list(df_cell_counts, df_neighbor_de, n_tests_performed).
# The RNG state of the global environment is restored on exit, so calling
# this function does not advance the caller's random stream.
process_target_cells <- function(
    seurat_dummy, target, vector_id, is_de, neighbors, diffex_gene,
    perturbed_cells_by_guide, guides_on_plasmid, guides_4_target,
    df_cell_counts, df_neighbor_de, pos_label, neg_label, target_subclass) {
    # Save the current random seed
    if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
        oldseed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
    } else {
        oldseed <- NULL
    }
    # Restore the random seed via on.exit() so it also happens when one of
    # the steps below throws an error.
    on.exit(
        {
            if (!is.null(oldseed)) {
                assign(".Random.seed", oldseed, envir = .GlobalEnv)
            } else if (
                exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
            ) {
                # There was no seed before the call: remove the one created
                # in the meantime (guarded so rm() cannot warn).
                rm(".Random.seed", envir = .GlobalEnv)
            }
        },
        add = TRUE
    )

    # Filter out the vector_neg_target_pos cells, bc they mess up the design matrix
    # (`idents` is spelled out instead of relying on partial matching.)
    seurat_dummy <- subset(seurat_dummy, idents = c(pos_label, neg_label))
    seurat_dummy$perturbation_status <- Idents(seurat_dummy)
    # Print the number of perturbed cells per library.
    is_perturbed <- seurat_dummy@meta.data$perturbation_status == pos_label
    print(table(seurat_dummy@meta.data$library[which(is_perturbed)]))

    # Recode the labels: control cells -> cluster 1, perturbed -> cluster 0.
    cluster_list <- as.character(seurat_dummy$perturbation_status)
    cluster_list[cluster_list == neg_label] <- 1
    cluster_list[cluster_list == pos_label] <- 0
    seurat_dummy$cluster <- as.factor(cluster_list)
    # One pseudobulk sample per (library, cluster) combination.
    y <- Seurat2PB(seurat_dummy, sample = "library", cluster = "cluster")

    # Add target+/- counts to df_cell_counts
    # if (!is_de | diffex_gene != "") {
    if (diffex_gene != "") {
        df_dummy <- as.data.frame(cpm(y)[diffex_gene, ])
        colnames(df_dummy) <- "cpm"
        df_dummy$target <- diffex_gene
        df_dummy$vector <- vector_id
        df_dummy$label <- rownames(df_dummy)
        df_dummy$subclass <- target_subclass
        df_samples <- y$samples
        # by = 0 joins on row names (the pseudobulk sample names).
        df_dummy <- merge(df_dummy, df_samples, by = 0)
        df_cell_counts <- rbind(df_cell_counts, df_dummy)
    }

    # Filter out small samples, lowly expressed genes.
    keep.samples <- y$samples$lib.size > 5e4
    y <- y[, keep.samples]
    keep.genes <- filterByExpr(y,
        group = y$samples$cluster,
        min.count = 3,
        min.total.count = 3
    )
    # Library sizes are recomputed after dropping genes; the argument name is
    # written in full instead of the partially matched `keep`.
    y <- y[keep.genes, , keep.lib.sizes = FALSE]
    y <- normLibSizes(y)

    design_and_contrast <- prepare_design_and_contrast(y)
    markers <- perform_de_analysis(
        y,
        design_and_contrast$design,
        design_and_contrast$contrast,
        neighbors,
        target,
        de_testing
    )

    if (nrow(markers) == 0) {
        cat(red("No DE genes", target, "\n"))
    } else {
        markers$de_gene <- rownames(markers)
        markers$target <- target
        markers$subclass <- target_subclass
        markers$label <- label
        markers$plasmid <- vector_id
        rownames(markers) <- NULL
        df_neighbor_de <- rbind(df_neighbor_de, markers)
        print(head(markers))
        cat("-------------------------------------------------------\n\n")
    }

    # Number of tests to account for in the multiple-testing correction.
    if (de_testing == "NEIGHBORHOOD") {
        n_tests_performed <- length(neighbors)
    } else {
        n_tests_performed <- nrow(y)
    }

    return(
        list(
            df_cell_counts = df_cell_counts,
            df_neighbor_de = df_neighbor_de,
            n_tests_performed = n_tests_performed
        )
    )
}


In [None]:
# Re-seed so the random control-gene draws in the main loop are reproducible.
set.seed(1234567)
source("helper_functions_for_diffex.r")
# One row per target/vector.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
df_targets <- read.table(
    "primary/S1_resources/66CRISPRi_TARGETS.txt",
    sep = "\t", header = TRUE, strip.white = TRUE
)

# Identity labels for perturbed and control cells.
pos_label <- "vector_positive"
neg_label <- "target_negative"

# Accumulators filled by the main loop.
df_neighbor_de <- NULL
df_random_de <- NULL
df_cell_counts <- NULL
# NOTE(review): p_val_cutoff_estimates and test_use are not used in the
# visible part of this notebook.
p_val_cutoff_estimates <- c()
test_use <- "LR" # "MAST"  #"wilcox" "LR"
guides_to_skip <- c() # c("DE6", "DE15")
# Used as the fold-change cutoff in the volcano plots.
logfc_threshold <- 0.01
n_tests <- 0
n_random_tests <- 0

# Genes that are a neighbor of at least one target, and the remaining genes
# of the RNA assay, from which random control genes are drawn.
neighbor_genes <- unique(flatten(neighbors_list))
all_gene_names <- as.vector(rownames(seurat_rna[["RNA"]]@counts))
non_neighbor_genes <- all_gene_names[!(all_gene_names %in% neighbor_genes)]

# Main loop
# For every row of df_targets: pick the neighbor genes, label the cells,
# run the pseudobulk DE test and, for "DE<number>" targets, repeat the test
# on an equally sized random set of non-neighbor genes.
# seq_len() avoids iterating over c(1, 0) when df_targets has no rows.
for (i in seq_len(nrow(df_targets))) {
    target <- df_targets[i, "alias"]
    target_subclass <- df_targets[i, "subclass"]
    # `label` is read as a global by process_target_cells().
    label <- df_targets[i, "label"]
    vector_id <- gsub("a", "", df_targets[i, "guide1"])
    diffexed_gene <- df_targets[i, "diffexed_gene"]
    snp_id_or_gene_name <- df_targets[i, "gene"]

    # Targets whose alias is "DE<number>" look up their neighbors by the
    # `gene` column; all other targets look them up by alias.
    is_de <- str_detect(target, regex("^DE\\d+$"))
    if (is_de) {
        neighbors <- neighbors_list[[snp_id_or_gene_name]]
    } else {
        neighbors <- neighbors_list[[target]]
    }

    # Scalar condition, so the short-circuiting `||` is used.
    if ((target %in% guides_to_skip) || length(neighbors) == 0) {
        cat(
            red(
                "Skipping",
                target,
                snp_id_or_gene_name,
                length(neighbors),
                "\n"
            )
        )
        next
    }

    cat(green("Diffexed gene", "'", diffexed_gene, "'\n"))
    if (diffexed_gene != "") {
        guides_4_target <-
            get_guides_by_subclass(df_targets, "diffexed_gene", diffexed_gene)
    } else {
        guides_4_target <-
            get_guides_by_subclass(df_targets, "alias", target)
    }

    guides_on_plasmid <- unlist(
        as.list(t(df_targets[i, c("guide1", "guide2")]))
    )
    cat(blue(target, ":"), paste(guides_on_plasmid, collapse = ","), "\n")
    cat(blue("Guides4target:"), paste(guides_4_target, collapse = ","), "\n")

    seurat_dummy <- mark_vector_pos_target_neg(
        seurat_rna,
        perturbed_cells_by_guide,
        df_targets,
        guides_on_plasmid,
        guides_4_target,
        print_counts = TRUE,
        pos_label = pos_label,
        neg_label = neg_label
    )

    # Process cells with the given target and perturbation status
    # (only when at least one identity level equals pos_label).
    if (pos_label %in% levels(Idents(seurat_dummy))) {
        dummy <- process_target_cells(
            seurat_dummy, target, vector_id, is_de, neighbors, diffexed_gene,
            perturbed_cells_by_guide, guides_on_plasmid, guides_4_target,
            df_cell_counts, df_neighbor_de, pos_label, neg_label,
            target_subclass
        )
        df_cell_counts <- dummy$df_cell_counts
        df_neighbor_de <- dummy$df_neighbor_de
        n_tests <- n_tests + dummy$n_tests_performed

        # -- Now test random genes to estimate a cutoff for the p-value --
        # -- Run this test only for DE genes ---
        if (is_de) {
            n_neighbors <- length(neighbors)
            random_non_neighbors <- sample(
                non_neighbor_genes,
                n_neighbors,
                replace = FALSE
            )

            # The CPM rows of this control run are collected separately and
            # not added to df_cell_counts.
            df_dummy_cell_counts <- NULL
            df_dummy_neighbor_de <- NULL
            dummy2 <- process_target_cells(
                seurat_dummy, target, vector_id, is_de, random_non_neighbors,
                diffexed_gene, perturbed_cells_by_guide, guides_on_plasmid,
                guides_4_target, df_dummy_cell_counts, df_random_de, pos_label,
                neg_label, target_subclass
            )
            df_dummy_cell_counts <- dummy2$df_cell_counts
            df_random_de <- dummy2$df_neighbor_de
            n_random_tests <- n_random_tests + dummy2$n_tests_performed
        }
        # -- end --
    } else {
        cat(
            red("No cells found:"),
            paste(guides_on_plasmid, collapse = ","),
            "\n"
        )
    }
}


In [None]:
# Show the test counts accumulated by the main loop: for the target runs and
# for the random control-gene runs.
n_tests
n_random_tests


### P-value correction

In [None]:
# n_tests = length(flatten(neighbors_list)) * length(df_targets)
# Bonferroni-correct the target results using the number of tests counted in
# the main loop.
print(n_tests)
df_neighbor_de$adjusted_p_val <- p.adjust(
    p = df_neighbor_de$PValue,
    method = "bonferroni",
    n = n_tests
)

# Select only negative logFCs from random gene tests
cat(
    red(
        "Selecting only negative logFCs," %+%
        "assuming this is a CRISPRi dataset!!!!!\n"
    )
)
# NOTE(review): the random-gene p-values are also corrected with n_tests
# rather than n_random_tests — presumably so that the cutoff is on the same
# scale as df_neighbor_de$adjusted_p_val; confirm this is intended.
df_random_de$adjusted_p_val <- p.adjust(
    p = df_random_de$PValue,
    method = "bonferroni",
    n = n_tests
)

# Cutoff = smallest adjusted p-value among random genes with negative logFC.
# NOTE(review): min() returns Inf with a warning if no such row exists.
df_dummy <- df_random_de %>% filter(logFC < 0)
p_val_cutoff <- min(df_dummy$adjusted_p_val)

cat(green("Corrected p-value cutoff:", p_val_cutoff))


In [None]:
# Preview the target results, then the 25 random-gene results with the
# smallest raw p-values.
head(df_neighbor_de)
df_random_de %>%
    arrange(PValue) %>%
    head(25)


In [None]:
# Keep an untouched copy; the plotting cells start from it each time they
# rewrite the labels in df_cell_counts.
df_cell_counts_backup <- df_cell_counts


In [None]:
# Preview the backup and the working copy of the per-sample CPM table.
head(df_cell_counts_backup)
head(df_cell_counts)

# Visualize

### Plot each vector separately

In [None]:
# Start from the untouched CPM table and turn the pseudobulk sample names
# into readable labels: strip the "Lib_<x>_" prefix, then rename the cluster
# suffixes. Also add a combined "target:vector" key for faceting.
df_cell_counts <- df_cell_counts_backup %>%
    mutate(
        label = label %>%
            str_replace("Lib_._", "") %>%
            str_replace("cluster0", "vector(+)") %>%
            str_replace("cluster1", " control") %>%
            str_replace("clustervector_B", "(O)"),
        t_and_v = paste0(target, ":", vector)
    )

# One table per target subclass.
df_cell_counts_tss <- df_cell_counts %>% filter(subclass == "tss")
df_cell_counts_de <- df_cell_counts %>% filter(subclass == "de_control")
df_cell_counts_ckd_de <- df_cell_counts %>% filter(subclass == "ckd_de")

# Jittered per-sample CPM by label, coloured by sample, with one free-scale
# facet per level of `facets` laid out in `n_col` columns.
plot_cpm_by_label <- function(counts, facets, n_col) {
    ggplot(counts, aes(x = factor(label), y = cpm)) +
        geom_jitter(aes(color = sample), width = 0.2) +
        facet_wrap(facets, scales = "free", ncol = n_col) +
        scale_y_continuous(
            expand = expansion(mult = c(0, .1)),
            limits = c(0, NA)
        ) +
        theme_bw() +
        theme(
            axis.text.x = element_text(
                angle = 90, vjust = 0.5, hjust = 1, size = 12
            ),
            strip.text = element_text(size = 10)
        )
}

options(repr.plot.width = 8, repr.plot.height = 3)
plot_cpm_by_label(df_cell_counts_tss, ~vector, 6)

options(repr.plot.width = 10, repr.plot.height = 6)
plot_cpm_by_label(df_cell_counts_de, ~vector, 7)

options(repr.plot.width = 12, repr.plot.height = 16)
plot_cpm_by_label(df_cell_counts_ckd_de, ~t_and_v, 7)



### Plot vectors side-by-side

In [None]:
# Start from the untouched CPM table and turn the pseudobulk sample names
# into readable labels: strip the "Lib_<x>_" prefix, then rename the cluster
# suffixes. Also add a combined "target:vector" key.
df_cell_counts <- df_cell_counts_backup %>%
    mutate(
        label = label %>%
            str_replace("Lib_._", "") %>%
            str_replace("cluster0", "vector(+)") %>%
            str_replace("cluster1", " control") %>%
            str_replace("clustervector_B", "(O)"),
        t_and_v = paste0(target, ":", vector)
    )

# **** Comment this part out to plot one vector for each plot
# ** Also change 'factor(new_label)' to 'factor(label)'
# ** and facet_wrap(~ vector)  to ~target or ~t_and_v (for ckd_de)
# ** in ggplot calls

select_vecpos <- df_cell_counts$label == "vector(+)"
select_tarneg <- df_cell_counts$label == " control"

# Perturbed samples get a vector-specific label ("<vector>(+)") so that the
# vectors of one target appear side by side; all other labels are kept.
df_cell_counts$new_label <- df_cell_counts$label
df_cell_counts$new_label[select_vecpos] <-
    paste0(df_cell_counts$vector[select_vecpos], "(+)")
# **** End

# One table per target subclass.
df_cell_counts_tss <- df_cell_counts %>% filter(subclass == "tss")
df_cell_counts_de <- df_cell_counts %>% filter(subclass == "de_control")
df_cell_counts_ckd_de <- df_cell_counts %>% filter(subclass == "ckd_de")

# Jittered per-sample CPM by new_label, coloured by sample, with one
# free-scale facet per level of `facets` laid out in `n_col` columns.
plot_cpm_by_new_label <- function(counts, facets, n_col) {
    ggplot(counts, aes(x = factor(new_label), y = cpm)) +
        geom_jitter(aes(color = sample), width = 0.2) +
        facet_wrap(facets, scales = "free", ncol = n_col) +
        scale_y_continuous(
            expand = expansion(mult = c(0, .1)),
            limits = c(0, NA)
        ) +
        theme_bw() +
        theme(
            axis.text.x = element_text(
                angle = 90, vjust = 0.5, hjust = 1, size = 12
            ),
            strip.text = element_text(size = 10)
        )
}

options(repr.plot.width = 8, repr.plot.height = 3)
plot_cpm_by_new_label(df_cell_counts_tss, ~vector, 6)

options(repr.plot.width = 10, repr.plot.height = 3)
plot_cpm_by_new_label(df_cell_counts_de, ~target, 7)

options(repr.plot.width = 10, repr.plot.height = 9)
plot_cpm_by_new_label(df_cell_counts_ckd_de, ~target, 7)


## Inspect DE results

Print top high-confidence hits

In [None]:
# The 25 down-regulated (logFC < 0) results with the smallest raw p-values.
head(arrange(filter(df_neighbor_de, logFC < 0), PValue), 25)


## Volcano Plot

In [None]:
# Point colour per result row, chosen by target subclass ("tss" -> black,
# "de_control" -> gray, everything else -> green) and named by the row's
# label.
colors <- setNames(
    ifelse(
        df_neighbor_de$subclass == "tss",
        "black",
        ifelse(df_neighbor_de$subclass == "de_control", "gray", "green")
    ),
    df_neighbor_de$label
)


In [None]:
# Volcano plot of all results, coloured by target subclass via `colors`.
# NOTE(review): the y axis uses the Bonferroni-adjusted p-values although the
# axis label reads "p".
options(repr.plot.width = 10, repr.plot.height = 12)
EnhancedVolcano(df_neighbor_de,
    lab = df_neighbor_de$de_gene,
    x = "logFC",
    y = "adjusted_p_val",
    title = paste(prj_name, "hits by vector"),
    subtitle = "Pseudobulk, EdgeR",
    colCustom = colors,
    drawConnectors = TRUE,
    arrowheads = FALSE,
    # NOTE(review): 10e-04 equals 1e-3, not 1e-4 — confirm the intended cutoff.
    pCutoff = 10e-04,
    FCcutoff = logfc_threshold,
    pointSize = 5.0,
    labSize = 4.0
) +
    xlim(-2, 2) +
    xlab(expression(paste("Average ", Log[2], " Fold Change"))) +
    ylab(expression(paste("\u2013", Log[10], " ", italic(p))))


### Plot just the control TSS

In [None]:
# Left panel (p1): TSS-subclass rows where the tested gene is the target
# itself.
de_filter <- (df_neighbor_de$subclass == "tss") &
    (df_neighbor_de$target == df_neighbor_de$de_gene)
df_neighbor_de_subset <- df_neighbor_de[de_filter, ]

options(repr.plot.width = 20, repr.plot.height = 12)
p1 <- EnhancedVolcano(df_neighbor_de_subset,
    lab = df_neighbor_de_subset$de_gene,
    x = "logFC",
    y = "adjusted_p_val",
    title = paste(prj_name, "hits by vector"),
    subtitle = "only TSS controls, only hit==target, pseudobulk, EdgeR",
    col = "gray",
    drawConnectors = TRUE,
    arrowheads = FALSE,
    # NOTE(review): 10e-0 equals 10, so every p-value lies below this cutoff —
    # confirm that is intended.
    pCutoff = 10e-0,
    FCcutoff = logfc_threshold,
    pointSize = 5.0,
    labSize = 6.0
) +
    xlim(-2.5, 1) +
    xlab(expression(paste("Average ", Log[2], " Fold Change"))) +
    ylab(expression(paste("\u2013", Log[10], " ", italic(p))))


# Right panel (p2): all TSS-subclass rows. de_filter and
# df_neighbor_de_subset are overwritten here and keep this value afterwards.
de_filter <- (df_neighbor_de$subclass == "tss")
df_neighbor_de_subset <- df_neighbor_de[de_filter, ]
p2 <- EnhancedVolcano(df_neighbor_de_subset,
    lab = df_neighbor_de_subset$de_gene,
    x = "logFC",
    y = "adjusted_p_val",
    title = paste(prj_name, "hits by vector"),
    subtitle = "only TSS controls, pseudobulk, EdgeR",
    col = "gray",
    drawConnectors = TRUE,
    arrowheads = FALSE,
    # NOTE(review): 10e-5 equals 1e-4 — confirm the intended cutoff.
    pCutoff = 10e-5,
    FCcutoff = logfc_threshold,
    pointSize = 5.0,
    labSize = 4.0
) +
    xlim(-2.5, 1) +
    xlab(expression(paste("Average ", Log[2], " Fold Change"))) +
    ylab(expression(paste("\u2013", Log[10], " ", italic(p))))

# Show both panels next to each other (patchwork).
p1 + p2


In [None]:
# Preview the most recently built subset of the DE results.
head(df_neighbor_de_subset)


### Plot just the control DE

In [None]:
# Left panel (p1): "de_control" rows where the tested gene is the target
# itself.
de_filter <- (df_neighbor_de$subclass == "de_control") &
    (df_neighbor_de$target == df_neighbor_de$de_gene)
df_neighbor_de_subset <- df_neighbor_de[de_filter, ]

options(repr.plot.width = 20, repr.plot.height = 12)
p1 <- EnhancedVolcano(df_neighbor_de_subset,
    lab = df_neighbor_de_subset$de_gene,
    x = "logFC",
    y = "adjusted_p_val",
    title = paste(prj_name, "hits by vector"),
    subtitle = "only DE controls, only hit==target, pseudobulk, EdgeR",
    col = "gray",
    drawConnectors = TRUE,
    arrowheads = FALSE,
    # NOTE(review): 10e-1 equals 1 — confirm the intended cutoff.
    pCutoff = 10e-1,
    FCcutoff = logfc_threshold,
    pointSize = 5.0,
    labSize = 6.0
) +
    xlim(-2, 1) +
    xlab(expression(paste("Average ", Log[2], " Fold Change"))) +
    ylab(expression(paste("\u2013", Log[10], " ", italic(p))))


# Right panel (p2): all "de_control" rows. de_filter and
# df_neighbor_de_subset are overwritten here and keep this value afterwards.
de_filter <- (df_neighbor_de$subclass == "de_control")
df_neighbor_de_subset <- df_neighbor_de[de_filter, ]
p2 <- EnhancedVolcano(df_neighbor_de_subset,
    lab = df_neighbor_de_subset$de_gene,
    x = "logFC",
    y = "adjusted_p_val",
    title = paste(prj_name, "hits by vector"),
    subtitle = "only DE controls, pseudobulk, EdgeR",
    col = "gray",
    drawConnectors = TRUE,
    arrowheads = FALSE,
    # NOTE(review): 10e-1 equals 1 — confirm the intended cutoff.
    pCutoff = 10e-1,
    FCcutoff = logfc_threshold,
    pointSize = 5.0,
    labSize = 6.0
) +
    xlim(-2, 1) +
    xlab(expression(paste("Average ", Log[2], " Fold Change"))) +
    ylab(expression(paste("\u2013", Log[10], " ", italic(p))))

# Show both panels next to each other (patchwork).
p1 + p2


### Plot just the CKD DE

In [None]:
# Rows whose label is "CKD DE"; points are labelled "<gene>:<plasmid>" so the
# same gene can be told apart across vectors.
de_filter <- (df_neighbor_de$label == "CKD DE")
df_neighbor_de_subset <- df_neighbor_de[de_filter, ]
df_neighbor_de_subset$plot_label <- paste0(
    df_neighbor_de_subset$de_gene, 
    ":", 
    df_neighbor_de_subset$plasmid
)

options(repr.plot.width = 12, repr.plot.height = 16)
# p1: full view with a p-value cutoff of 1e-5.
p1 <- EnhancedVolcano(df_neighbor_de_subset,
    lab = df_neighbor_de_subset$plot_label,
    x = "logFC",
    y = "adjusted_p_val",
    title = paste(prj_name, "hits by vector"),
    subtitle = "only CKD DE, pseudobulk, EdgeR",
    col = "gray",
    drawConnectors = TRUE,
    arrowheads = FALSE,
    pCutoff = 1e-5,
    FCcutoff = logfc_threshold,
    pointSize = 5.0,
    labSize = 4.0
) +
    xlim(-1.5, 0.5) +
    xlab(expression(paste("Average ", Log[2], " Fold Change"))) +
    ylab(expression(paste("\u2013", Log[10], " ", italic(p))))


# p2: same data with a p-value cutoff of 0.05 and narrower axis limits
# (x from -1 to 0.5, y from 0 to 22).
p2 <- EnhancedVolcano(df_neighbor_de_subset,
    lab = df_neighbor_de_subset$plot_label,
    x = "logFC",
    y = "adjusted_p_val",
    title = paste(prj_name, "hits by vector"),
    subtitle = "only CKD DE, pseudobulk, EdgeR",
    col = "gray",
    drawConnectors = TRUE,
    arrowheads = FALSE,
    pCutoff = 0.05,
    FCcutoff = logfc_threshold,
    pointSize = 5.0,
    labSize = 4.0
) +
    xlim(-1., 0.5) +
    ylim(0, 22) +
    xlab(expression(paste("Average ", Log[2], " Fold Change"))) +
    ylab(expression(paste("\u2013", Log[10], " ", italic(p))))

# Display the two plots one after the other.
p1
p2
