In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(Seurat)
    library(Signac)
    library(EnsDb.Hsapiens.v86)
    library(dplyr)
    library(ggplot2)
    library(bedr)
    library(SeuratDisk)
    library(scales)
    library(reshape2)
    library(Hmisc)
    library(tidyr)
    library(tidyverse)
    library(crayon)
    library(readr)
    library(future)
    library(combinat)
    library("TxDb.Hsapiens.UCSC.hg38.knownGene")
    library(GenomicRanges)
    library(GenomicFeatures)
    library(rsnps)
    library(biomaRt)
    library(ggpubr)
    library(gridExtra)
    library(patchwork)
}))

source('/home/vsevim/prj/workflows/ckd/secondary/helper_functions_for_diffex.r')

# Definitions etc

In [2]:
# Session-wide defaults for this notebook.
set.seed(1234)       # fixed RNG seed so random steps are repeatable
options(digits = 2)  # print numbers with 2 significant digits
stats <- NULL        # start `stats` empty (c() and NULL are the same value)

In [3]:
options(future.globals.maxSize= 250 * 1024^3) # 250 GiB (250 * 1024^3 bytes)
# plan()

# Enable parallelization
# The plan is first reset to sequential, then switched to "multicore"
# with 64 workers.
plan(sequential)
plan("multicore", workers = 64)

In [4]:
# Fallback parameters for interactive use: they are only assigned when no
# `papermill_run` variable exists in the session.
if (!exists("papermill_run")) {
    prj_name <- "Screen1_66guides"
    data_subdir_suffixes <- "L1,L2,L3,L4"
    secondary_a_path <- "/home/vsevim/prj/1012-ckd/S1/analysis/secondary/"
    save_seurat_h5 <- "YES"
}

In [5]:
# Shared ggplot2 theme tweaks: centred 16-pt title, 0.7 cm legend keys,
# 14-pt legend text.
custom_theme <- theme(
    plot.title      = element_text(size = 16, hjust = 0.5),
    legend.key.size = unit(0.7, "cm"),
    legend.text     = element_text(size = 14)
)

### Load Seurat files

In [8]:
# Load the integrated Seurat objects saved before (seurat1) and after
# (seurat2) guide calling.
# Both paths are derived from `secondary_a_path` so that a parameterised run
# which overrides it reads the two objects from the same analysis directory;
# with the default `secondary_a_path` the resulting paths are unchanged.
integrated_h5_path <- paste0(
    secondary_a_path,
    "integrated/seurat_objects/integrated.h5seurat"
)
original_integrated_h5_path <- paste0(
    secondary_a_path,
    "integrated/seurat_objects/integrated_wo_guide_calls.h5seurat"
)
seurat1 <- LoadH5Seurat(original_integrated_h5_path, verbose = FALSE)
seurat2 <- LoadH5Seurat(integrated_h5_path, verbose = FALSE)

Validating h5Seurat file

"Adding a command log without an assay associated with it"
Validating h5Seurat file

"Adding a command log without an assay associated with it"


# Check consistency between post-integration and post-guide-calling H5s

In [17]:
# Number of cells in each object; the two values should be equal.
nrow(seurat1@meta.data)
nrow(seurat2@meta.data)
# Each comparison below prints FALSE when the two objects agree.
# Cell barcodes (metadata row names) must match in the same order.
any(rownames(seurat1@meta.data) != rownames(seurat2@meta.data))
# `object$column` gives a vector, and rownames() of a vector is NULL, so a
# rownames() comparison here would always print FALSE. Compare the per-cell
# values themselves instead.
any(seurat1$donor != seurat2$donor)
any(seurat1$library != seurat2$library)

Manually checked if souporcell calls match the donors in the dataframe

In [27]:
# Show one randomly chosen row of the cell metadata as a quick sanity check.
seurat2@meta.data %>% sample_n(1)


Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,nCount_sgRNA,nFeature_sgRNA,nCount_SCT,nFeature_SCT,orig.ident,library,percent.mt,status,donor,SCT_snn_res.0.5,seurat_clusters,guide_calls,plasmid_calls,n_guides,n_plasmids
Unnamed: 0_level_1,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,<fct>,<fct>,<chr>,<chr>,<dbl>,<dbl>
CGTTCTGTCCACAAGT-1_1,48093,8283,1651,19,25798,7056,Screen1_66guides,Lib_1,7.7,singlet,3,1,1,DE5-2a|DE5-2b,DE5-2,2,1


### Check if get_perturbed_cells works

In [32]:
# Smoke test of the helper pipeline on a single target (FTH1).
target <- "FTH1"

# Threshold and guide tables stored in the Seurat object's misc slot.
df_thresholds <- seurat2@misc$thresholds
df_guide <- seurat2@misc$guides

# Guides whose alias is the target, and the perturbed-cell lookup.
guides_4_target <- get_guides_by_subclass(df_guide, "alias", target)
perturbed_cells_by_guide <- get_perturbed_cells(seurat2, df_thresholds)

# Label cells for this target, printing the counts.
seurat_dummy <- mark_target_pos_neg(
    seurat2,
    perturbed_cells_by_guide,
    guides_4_target,
    print_counts = TRUE
)

FTH1-1a   624 
FTH1-2a   3330 
FTH1-1b   3503 
FTH1-2b   3503 
[34mGuide+ = 3503 ; Guide- = 23638 
[39m

#### Check if target+/- cells marked correctly

The guides for ANPEP and NT5E are named after different aliases (CD13 and CD73), so the name-based check below is expected to fail for these two targets.

In [97]:
# For every targeting alias, mark cells as target_positive / target_negative
# and verify the labels against the plasmid calls stored in the metadata:
#   * every positive cell must carry a plasmid call containing "<target>-"
#   * no negative cell may carry one
# Prints a green OK or a red FAIL per target.
targets <- unique(filter(df_guide, class == "targeting")$alias)
seurat_dummy$ident <- Idents(seurat_dummy)

# The perturbed-cell lookup takes only seurat2 and df_thresholds, neither of
# which is modified in the loop, so it is computed once up front.
# NOTE(review): assumes get_perturbed_cells() returns the same result for the
# same inputs and has no side effects -- confirm against the helper source.
perturbed_cells_by_guide <- get_perturbed_cells(seurat2, df_thresholds)

for (target in targets) {
    cat(target, " ")
    guides_4_target <- get_guides_by_subclass(df_guide, "alias", target)
    seurat_dummy <- mark_target_pos_neg(
        seurat2,
        perturbed_cells_by_guide,
        guides_4_target,
        print_counts = TRUE
    )
    seurat_dummy$ident <- Idents(seurat_dummy)

    # Index the metadata vector directly rather than subsetting the whole
    # Seurat object: this avoids copying the object twice per target and does
    # not error when one of the two classes has no cells.
    select_positives <- which(seurat_dummy$ident == "target_positive")
    select_negatives <- which(seurat_dummy$ident == "target_negative")
    positives_plasmid_calls <- seurat_dummy$plasmid_calls[select_positives]
    negatives_plasmid_calls <- seurat_dummy$plasmid_calls[select_negatives]

    # attach a dash to target name so DE1 wont match DE10 etc
    search_term <- paste0(target, "-")
    # fixed = TRUE: match the alias literally, not as a regular expression.
    pos_check <- all(grepl(search_term, positives_plasmid_calls, fixed = TRUE))
    neg_check <- !any(grepl(search_term, negatives_plasmid_calls, fixed = TRUE))
    if (neg_check && pos_check) {
        cat(green(" --- OK ----\n\n"))
    } else {
        cat(red(" --- FAIL ----\n\n"))
    }
    flush.console()
}

ANPEP  CD13-1a   868 
CD13-1b   1162 
[34mGuide+ = 1162 ; Guide- = 25979 
[39m[31m --- FAIL ----

[39mCD55  CD55-1a   796 
CD55-1b   1081 
[34mGuide+ = 1081 ; Guide- = 26060 
[39m[32m --- OK ----

[39mNT5E  CD73-1a   758 
CD73-1b   1021 
[34mGuide+ = 1021 ; Guide- = 26120 
[39m[31m --- FAIL ----

[39mFUBP1  FUBP1-1a   710 
FUBP1-1b   926 
[34mGuide+ = 926 ; Guide- = 26215 
[39m[32m --- OK ----

[39mC15orf40  C15orf40-1a   823 
C15orf40-1b   1077 
[34mGuide+ = 1077 ; Guide- = 26064 
[39m[32m --- OK ----

[39mMAFF  MAFF-1a   665 
MAFF-1b   871 
[34mGuide+ = 871 ; Guide- = 26270 
[39m[32m --- OK ----

[39mFTH1  FTH1-1a   624 
FTH1-2a   3330 
FTH1-1b   3503 
FTH1-2b   3503 
[34mGuide+ = 3503 ; Guide- = 23638 
[39m[32m --- OK ----

[39mPLIN3  PLIN3-1a   866 
PLIN3-2a   1585 
PLIN3-1b   1820 
PLIN3-2b   2060 
[34mGuide+ = 2060 ; Guide- = 25081 
[39m[32m --- OK ----

[39mTKT  TKT-1a   674 
TKT-2a   1497 
TKT-1b   1708 
TKT-2b   1978 
[34mGuide+ = 1978 ; Guide-