In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(Seurat)
    library(Signac)
    library(EnsDb.Hsapiens.v86)
    library(dplyr)
    library(ggplot2)
    library(bedr)
    library(SeuratDisk)
    library(scales)
    library(reshape2)
    library(Hmisc)
    library(tidyr)
    library(tidyverse)
    library(crayon)
    library(readr)
    library(reticulate)
    library(gridExtra)
    library(grid)
    library(ggpubr)
    library(scales)
}))
# Select the "R42" conda environment for reticulate before sourcing any Python.
use_condaenv("R42")
# Make the definitions in mixture.py callable from this R session.
source_python("/home/vsevim/prj/workflows/ckd/secondary/mixture.py")
# Load the R helper-function file used by this workflow.
source('/home/vsevim/prj/workflows/ckd/secondary/helper_functions_for_diffex.r')

# Definitions etc

In [2]:
# Session-wide settings: fixed RNG seed, 2 significant digits when printing,
# and an empty (NULL) `stats` accumulator.
set.seed(1234)
options(digits = 2)
stats <- NULL

In [3]:
# Default parameters, applied only when `papermill_run` is not defined
# (presumably papermill injects its own values otherwise).
if (!exists("papermill_run")) {
    prj_name <- "Screen1_66guides"
    # data_subdir_suffixes = "L1,L2,L3,L4"
    secondary_a_path <- "/home/vsevim/prj/1012-ckd/S1/analysis/secondary/"
    save_seurat_h5 <- "YES"

    # Alternative parameter set, kept for reference:
    # prj_name = "Screen2_101vecs"
    # # data_subdir_suffixes = "L1,L2,L3,L4"
    # secondary_a_path = "/home/vsevim/prj/1012-ckd/S2/analysis/secondary/"
    # save_seurat_h5 = "NO"
}


In [4]:
# Shared ggplot2 theme tweaks: centred size-16 title, 0.7 cm legend keys,
# size-14 legend text.
custom_theme <- theme(
    plot.title      = element_text(size = 16, hjust = 0.5),
    legend.key.size = unit(0.7, "cm"),
    legend.text     = element_text(size = 14)
)

In [62]:
#' Collect the perturbed cells for every guide across all libraries
#'
#' For each guide (row of `df_thresholds`) and each library (element of
#' `seurat_obj_libs`), a cell is called perturbed when its count for that guide
#' in the "sgRNA" assay is greater than or equal to the library-specific
#' threshold. Libraries whose threshold is NA are skipped for that guide.
#'
#' @param seurat_obj_libs Named list of per-library Seurat objects. Every name
#'   must be a threshold column of `df_thresholds`.
#' @param df_thresholds Data frame with a `guide` column and one threshold
#'   column per library.
#' @return Named list with one element per guide (named by
#'   `df_thresholds$guide`); each element is a list of the cell barcodes that
#'   passed the threshold, pooled over all libraries.
get_all_perturbed_cells_by_guide <- function(seurat_obj_libs = NULL, df_thresholds = NULL) {
    if (!is.data.frame(df_thresholds) || !("guide" %in% names(df_thresholds))) {
        stop("`df_thresholds` must be a data frame with a `guide` column", call. = FALSE)
    }
    libraries <- names(seurat_obj_libs)
    missing_libs <- setdiff(libraries, names(df_thresholds))
    if (length(missing_libs) > 0) {
        stop("No threshold column for: ", paste(missing_libs, collapse = ", "), call. = FALSE)
    }

    n_guides <- nrow(df_thresholds)
    perturbed_cells_by_guide <- vector("list", n_guides)

    # seq_len() keeps the loop empty for a zero-row table (1:0 would not).
    for (i in seq_len(n_guides)) {
        # Read the guide from the argument, not from a global threshold table.
        guide <- df_thresholds$guide[i]
        perturbed_cells_in_all_libs <- list()

        for (lib in libraries) {
            # `[[lib]][i]` yields a scalar for data frames and tibbles alike.
            threshold <- df_thresholds[[lib]][i]
            if (is.na(threshold)) {
                next
            }
            seurat_lib <- seurat_obj_libs[[lib]]
            cells_in_lib <- Cells(seurat_lib)
            sgrna_counts <- seurat_lib[["sgRNA"]]@counts
            select_perturbed <- sgrna_counts[guide, cells_in_lib] >= threshold
            perturbed_cells_in_all_libs <- append(
                perturbed_cells_in_all_libs,
                cells_in_lib[select_perturbed]
            )
        }
        perturbed_cells_by_guide[[i]] <- perturbed_cells_in_all_libs
    }
    names(perturbed_cells_by_guide) <- df_thresholds$guide
    perturbed_cells_by_guide
}


### Load Seurat file

In [18]:
# Paths of the two integrated objects under `secondary_a_path`: the one
# without guide calls (loaded here) and the updated one.
integrated_h5_path <- paste0(
    secondary_a_path,
    "integrated/seurat_objects/integrated_wo_guide_calls.h5seurat"
)
updated_integrated_h5_path <- paste0(
    secondary_a_path,
    "integrated/seurat_objects/integrated.h5seurat"
)
seurat1 <- LoadH5Seurat(integrated_h5_path, verbose = FALSE)

Validating h5Seurat file

"Adding a command log without an assay associated with it"


Use only singlets (actually, I didn't include the non-singlets in the integrated file, so this step is unnecessary)

In [19]:
# Keep only the cells whose `status` metadata field equals 'singlet'.
seurat1 <- subset(seurat1, subset = status == 'singlet')

# ----

In [20]:
# Load the second integrated object for comparison with seurat1. Reuse the
# path derived from `secondary_a_path` instead of a hard-coded S1 location, so
# the comparison follows whichever project the parameters select.
integrated_h5_path <- updated_integrated_h5_path
seurat2 <- LoadH5Seurat(integrated_h5_path, verbose = FALSE)
#seurat_combined_debug    <- subset(seurat_combined, subset=status=='singlet')

Validating h5Seurat file

"Adding a command log without an assay associated with it"


### Load thresholds

In [31]:
# Read the threshold table (one row per guide/batch pair) and reshape it to
# one row per guide with one threshold column per batch. The result is
# converted back to a plain data.frame.
thr_f_name <- paste0(secondary_a_path, "/integrated/THRESHOLDS.tsv")
df_thresh <- read.table(thr_f_name, header = TRUE, strip.white = TRUE) %>%
    pivot_wider(names_from = batch, values_from = threshold) %>%
    as.data.frame()

# Inspect two files

In [22]:
# Row counts of the two objects.
# NOTE(review): rows of a Seurat object are presumably features, not cells — confirm.
nrow(seurat1)
nrow(seurat2)

In [23]:
# Distinct `status` labels present in each object.
unique(seurat1$status)
unique(seurat2$status)

In [25]:
# Preview the first rows of the threshold table.
head(df_thresh)

Unnamed: 0_level_0,guide,Lib_1,Lib_2,Lib_3,Lib_4
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>
1,CD13-1a,13,9,44,18
2,CD13-1b,8,7,22,15
3,CD55-1a,13,9,27,15
4,CD55-1b,26,21,80,27
5,CD73-1a,8,7,28,24
6,CD73-1b,14,8,29,12


# Run assignment

In [26]:
# Split seurat1 into one Seurat object per library, named by library.
libraries <- unique(seurat1$library)
seurat_libs1 <- vector("list", length(libraries))
for (i in seq_along(libraries)) {
    lib <- libraries[i]
    seurat_libs1[[i]] <- subset(seurat1, subset = library == lib)
}
names(seurat_libs1) <- libraries

In [27]:
# Split seurat2 into one Seurat object per library, named by library.
libraries <- unique(seurat2$library)
seurat_libs2 <- vector("list", length(libraries))
for (i in seq_along(libraries)) {
    lib <- libraries[i]
    seurat_libs2[[i]] <- subset(seurat2, subset = library == lib)
}
names(seurat_libs2) <- libraries

In [63]:
# Threshold-based perturbed-cell calls per guide, computed separately for each
# Seurat object with the same threshold table.
pert1 = get_all_perturbed_cells_by_guide(seurat_libs1, df_thresh)
pert2 = get_all_perturbed_cells_by_guide(seurat_libs2, df_thresh)

### How many common and different cells in each set?

In [64]:
# Number of perturbed cells called for guide DE6-1b in each object ...
length(pert1$'DE6-1b')
length(pert2$'DE6-1b')
# ... how many are called in both ...
length(intersect(pert1$'DE6-1b', pert2$'DE6-1b'))
# ... and how many are called in pert1 but not in pert2.
length(setdiff(pert1$'DE6-1b', pert2$'DE6-1b'))

In [43]:
# Cells called perturbed for DE6-1b in pert1 but not in pert2.
different_cells = setdiff(pert1$'DE6-1b', pert2$'DE6-1b')

In [54]:
# Compare the sgRNA counts of one discordant cell between the two objects.
# The count matrices are indexed directly rather than converting every cell
# to a data.frame first. Absolute differences are summed so that discrepancies
# of opposite sign cannot cancel out to a misleading 0.
cell <- different_cells[[1]]
counts1 <- seurat1[["sgRNA"]]@counts[, cell]
counts2 <- seurat2[["sgRNA"]]@counts[, cell]
sum(abs(counts1 - counts2))

### Run test on just one guide

In [102]:
# Step-by-step re-run of the thresholding for one guide in one library, on
# both Seurat objects, keeping every intermediate value for inspection.
libraries <- names(seurat_libs1)
guide <- "DE6-1b"
lib <- "Lib_1"

# The threshold depends only on (guide, lib), so both objects share it.
threshold <- df_thresh[df_thresh$guide == guide, lib]
#cat(blue(guide, lib, threshold, "\n"))

# First object
seurat_lib1 <- seurat_libs1[[lib]]
cells_in_lib1 <- Cells(seurat_lib1)
sgrna_counts1 <- seurat_lib1[["sgRNA"]]@counts
select_perturbed1 <- sgrna_counts1[guide, cells_in_lib1] >= threshold
perturbed_cells_in_library1 <- cells_in_lib1[select_perturbed1]

# Second object
seurat_lib2 <- seurat_libs2[[lib]]
cells_in_lib2 <- Cells(seurat_lib2)
sgrna_counts2 <- seurat_lib2[["sgRNA"]]@counts
select_perturbed2 <- sgrna_counts2[guide, cells_in_lib2] >= threshold
perturbed_cells_in_library2 <- cells_in_lib2[select_perturbed2]

# Number of perturbed cells found in each object
length(perturbed_cells_in_library1)
length(perturbed_cells_in_library2)


The Seurat file from integration and the one saved by the guide-calling notebook have different cell barcodes in each library, even though the counts are the same.

In [106]:
# Do both objects hold exactly the same barcodes for this library?
# identical() gives a single TRUE/FALSE. An element-wise `==` would print one
# value per cell and silently recycle when the two vectors differ in length.
identical(sort(cells_in_lib1), sort(cells_in_lib2))

In [112]:
# Barcodes present in the first object's library subset but not the second's ...
length(setdiff(cells_in_lib1,cells_in_lib2))
# ... and barcodes shared by both.
length(intersect(cells_in_lib1,cells_in_lib2))

In [115]:
# Same check on the full objects: barcodes in seurat1 that are absent from seurat2.
length(setdiff(Cells(seurat1), Cells(seurat2)))

In [143]:
# Re-derive the subsets for library `lib` straight from the full objects and
# report how many cells each one contains.
s1_l1 <- subset(seurat1, subset = library == lib)
s2_l1 <- subset(seurat2, subset = library == lib)

length(Cells(s1_l1))
length(Cells(s2_l1))

In [144]:
# Number of barcodes shared by the two freshly derived library subsets.
length(intersect(Cells(s1_l1), Cells(s2_l1)))