In [None]:
# Show the home directory of the R installation running this notebook.
R.home()

In [None]:
baizer::pkglib(baizer, tidyverse, jsonlite, yaml, ComplexHeatmap)

In [None]:
# Sample being processed, taken from the Snakemake wildcards.
sample <- snakemake@wildcards[['sample']]

# Replace the default config entries with the sample-specific ones.
all_config <- snakemake@config
config <- replace_item(all_config, all_config[[sample]])

In [None]:
# Filter modes from the config: iterated by name further down, each entry
# holding the libraries that the mode combines.
Lmode <- config[['flt_mode']]

In [None]:
# Destinations declared by the Snakemake rule.
Pfilter_dir <- snakemake@output[["filter_dir"]]
Pstat <- snakemake@output[["filter_stat"]]

# Directory the per-library statistics tables are read from.
Pstat_dir <- snakemake@params[["stat_dir"]]

# Create the filter output directory, including any missing parents.
dir.create(path = Pfilter_dir, recursive = TRUE)

In [None]:
# Entries of the statistics directory; used below to detect which libraries
# are available for this sample.
subdirs <- list.files(path = Pstat_dir)

In [None]:
# Row counts per library / per mode; written out as YAML at the end.
Lstat <- list()
# Per-library tables carrying their *_keep flag columns; joined per mode later.
Lflt <- list()

In [None]:
# mRNA
# Flag every cell on gene count, UMI count and mitochondrial percentage, write
# the passing cells to mRNA_flt.csv and record the raw / filtered row counts.
if ('mRNA' %in% subdirs) {
    mRNA <- read_csv(str_glue('{Pstat_dir}/mRNA/mRNA.csv'))

    # missing QC metrics are replaced by 0 before thresholding
    na_to_zero <- function(x) ifelse(is.na(x), 0, x)
    mRNA <- mutate(mRNA, across(c(nFeature_RNA, nCount_RNA, mt_percent), na_to_zero))

    # one flag per criterion; the overall flag is the product of the three
    mRNA <- mutate(
        mRNA,
        mRNA_gene_keep = nFeature_RNA > config[['mRNA_gene_flt']],
        mRNA_umi_keep = nCount_RNA > config[['mRNA_umi_flt']],
        mRNA_mt_percent_keep = mt_percent < config[['mRNA_mt_percent_flt']],
        mRNA_keep = mRNA_gene_keep * mRNA_umi_keep * mRNA_mt_percent_keep
    )

    # store every *_keep column as numeric
    mRNA <- mutate(mRNA, across(matches('_keep$'), as.numeric))

    mRNA_flt <- filter(mRNA, mRNA_keep == 1)
    write_excel_csv(mRNA_flt, str_glue('{Pfilter_dir}/mRNA_flt.csv'))

    # the full (unfiltered) table with its flags is what gets joined per mode
    Lflt[['mRNA']] <- mRNA
    Lstat[['mRNA']][['raw']] <- nrow(mRNA)
    Lstat[['mRNA']][['flt']] <- nrow(mRNA_flt)
}

In [None]:
# FB
# Flag every cell on its feature-barcode columns, write the passing cells to
# FB_flt.csv and record the raw / filtered row counts.
if ('FB' %in% subdirs) {
    FB <- read_csv(str_glue('{Pstat_dir}/FB/FB.csv'))

    # flag columns that decide FB_keep: one per name in the sample's id2seq config
    aim_keeps <- str_glue("FB_{names(config[[sample]][['id2seq']])}_keep")

    FB <- mutate(
        FB,
        FB_NC_keep = NC_load == FALSE,
        FB_HT_keep = sum_HT > config[['FB_umi_flt']],
        FB_BD_keep = sum_BD > config[['FB_umi_flt']]
    )

    # row-wise product of the selected flags, ignoring NA entries
    FB_keep_product <- apply(select(FB, all_of(aim_keeps)), 1, prod, na.rm = TRUE)
    FB[['FB_keep']] <- FB_keep_product

    # store every *_keep column as numeric
    FB <- mutate(FB, across(matches('_keep$'), as.numeric))

    FB_flt <- filter(FB, FB_keep == 1)
    write_excel_csv(FB_flt, str_glue('{Pfilter_dir}/FB_flt.csv'))

    # the full (unfiltered) table with its flags is what gets joined per mode
    Lflt[['FB']] <- FB

    Lstat[['FB']][['raw']] <- nrow(FB)
    Lstat[['FB']][['flt']] <- nrow(FB_flt)
}

In [None]:
# VDJB
# Flag every cell on the productive_cellranger and unique columns and on the
# umis_H / umis_L counts, write the passing cells to VDJB_flt.csv and record
# the raw / filtered row counts.
if ('VDJB' %in% subdirs) {
    VDJB <- read_csv(str_glue('{Pstat_dir}/VDJB/VDJB.csv'))

    # missing UMI counts are replaced by 0 before thresholding
    na_to_zero <- function(x) ifelse(is.na(x), 0, x)
    VDJB <- mutate(VDJB, across(c(umis_H, umis_L), na_to_zero))

    # one flag per criterion; the overall flag is the product of the four
    VDJB <- mutate(
        VDJB,
        VDJB_productive_cellranger_keep = productive_cellranger,
        VDJB_unique_keep = unique,
        VDJB_umi_H_keep = umis_H > config[['VDJB_umi_flt']],
        VDJB_umi_L_keep = umis_L > config[['VDJB_umi_flt']],
        VDJB_keep = VDJB_productive_cellranger_keep * VDJB_unique_keep * VDJB_umi_H_keep * VDJB_umi_L_keep
    )

    # store every *_keep column as numeric
    VDJB <- mutate(VDJB, across(matches('_keep$'), as.numeric))

    VDJB_flt <- filter(VDJB, VDJB_keep == 1)
    write_excel_csv(VDJB_flt, str_glue('{Pfilter_dir}/VDJB_flt.csv'))

    # the full (unfiltered) table with its flags is what gets joined per mode
    Lflt[['VDJB']] <- VDJB

    Lstat[['VDJB']][['raw']] <- nrow(VDJB)
    Lstat[['VDJB']][['flt']] <- nrow(VDJB_flt)
}

In [None]:
# VDJT
# Flag every cell on the productive_cellranger and unique columns and on the
# umis_H / umis_L counts, write the passing cells to VDJT_flt.csv and record
# the raw / filtered row counts.
if ('VDJT' %in% subdirs) {
    VDJT <- read_csv(str_glue('{Pstat_dir}/VDJT/VDJT.csv'))

    # missing UMI counts are replaced by 0 before thresholding
    na_to_zero <- function(x) ifelse(is.na(x), 0, x)
    VDJT <- mutate(VDJT, across(c(umis_H, umis_L), na_to_zero))

    # one flag per criterion; the overall flag is the product of the four
    VDJT <- mutate(
        VDJT,
        VDJT_productive_cellranger_keep = productive_cellranger,
        VDJT_unique_keep = unique,
        VDJT_umi_H_keep = umis_H > config[['VDJT_umi_flt']],
        VDJT_umi_L_keep = umis_L > config[['VDJT_umi_flt']],
        VDJT_keep = VDJT_productive_cellranger_keep * VDJT_unique_keep * VDJT_umi_H_keep * VDJT_umi_L_keep
    )

    # store every *_keep column as numeric
    VDJT <- mutate(VDJT, across(matches('_keep$'), as.numeric))

    VDJT_flt <- filter(VDJT, VDJT_keep == 1)
    write_excel_csv(VDJT_flt, str_glue('{Pfilter_dir}/VDJT_flt.csv'))

    # the full (unfiltered) table with its flags is what gets joined per mode
    Lflt[['VDJT']] <- VDJT

    Lstat[['VDJT']][['raw']] <- nrow(VDJT)
    Lstat[['VDJT']][['flt']] <- nrow(VDJT_flt)
}

In [None]:
# dedup function
#
# Keep only the cells that are the first occurrence of their CDR amino-acid
# sequence combination in TB.
#
# Arguments:
#   TB          data frame with a `cell` column and the cdr{1,2,3}_aa_{H,L}
#               columns required by the chosen mode.
#   dedup_mode  which columns define a duplicate:
#                 'CDR'          cdr1/2/3_aa_H and cdr1/2/3_aa_L
#                 'HCDR'         cdr1/2/3_aa_H
#                 'CDR3'         cdr3_aa_H and cdr3_aa_L
#                 'HCDR3'        cdr3_aa_H
#                 'CDR3_HCDR12'  a cell must be a first occurrence both for
#                                (cdr1_aa_H, cdr3_aa_H, cdr3_aa_L) and for
#                                (cdr2_aa_H, cdr3_aa_H, cdr3_aa_L)
#
# Returns the rows of TB whose `cell` survives deduplication.
# Stops with an explicit error when dedup_mode is not one of the modes above.
Antibody_dedup <- function(TB, dedup_mode='CDR3_HCDR12'){
    valid_modes <- c('CDR', 'HCDR', 'CDR3', 'HCDR3', 'CDR3_HCDR12')
    # Validate first: an unknown mode used to fall through every branch and
    # fail later with an unrelated "object not found" error.
    if (!is.character(dedup_mode) || length(dedup_mode) != 1 || !(dedup_mode %in% valid_modes)) {
        stop(
            "Unknown dedup_mode '", paste(dedup_mode, collapse = ', '),
            "'; expected one of: ", paste(valid_modes, collapse = ', '),
            call. = FALSE
        )
    }

    # cells of the first row seen for each distinct combination of the given columns
    first_cells <- function(...) {
        TB %>% distinct(..., .keep_all = TRUE) %>% select(cell) %>% unlist
    }

    # switch() only evaluates the branch that matches dedup_mode
    Vkeep_cell <- switch(
        dedup_mode,
        CDR = first_cells(cdr1_aa_H, cdr2_aa_H, cdr3_aa_H, cdr1_aa_L, cdr2_aa_L, cdr3_aa_L),
        HCDR = first_cells(cdr1_aa_H, cdr2_aa_H, cdr3_aa_H),
        CDR3 = first_cells(cdr3_aa_H, cdr3_aa_L),
        HCDR3 = first_cells(cdr3_aa_H),
        CDR3_HCDR12 = intersect(
            first_cells(cdr1_aa_H, cdr3_aa_H, cdr3_aa_L),
            first_cells(cdr2_aa_H, cdr3_aa_H, cdr3_aa_L)
        )
    )

    TB %>% filter(cell %in% Vkeep_cell)
}

In [None]:
# Drop each table's own `batch` column; the per-mode join below adds a single
# `batch` column back afterwards.
Lflt <- map(Lflt, function(tb) select(tb, -batch))

In [None]:
# output
#
# For every filter mode: join the per-library tables on `cell`, flag the cells
# that pass every library of the mode, and write the unfiltered table, the keep
# matrix, the filtered table and the deduplicated table. The keep / flt / dedup
# files are also copied into the statistics directory.
for (mode_name in names(Lmode)) {

    # A trailing '*' in the mode definition marks a library as mandatory.
    libs <- Lmode[[mode_name]]
    mandatory_lib <- str_subset(libs, '\\*$') %>% str_replace('\\*$', '')
    # Only libraries actually present for this sample take part in the mode.
    libs <- intersect(str_replace(libs, '\\*$', ''), subdirs)

    # Skip the mode when no library is available or a mandatory one is missing.
    if (length(libs) == 0 || any(mandatory_lib %nin% libs)) {
        next
    }

    # common unflt: cells present in every library of the mode
    TBunflt <- Lflt[libs] %>% reduce(inner_join, by='cell')
    TBunflt <- TBunflt %>% mutate(batch=snakemake@wildcards[['sample']], .after=cell)

    # a cell is kept when the overall flag of every library is TRUE
    Vkeep <- TBunflt %>%
        select(all_of(str_c(libs, '_keep'))) %>%
        mutate(across(everything(), as.logical)) %>%
        apply(1, all) %>%
        unname
    TBunflt$keep <- Vkeep

    # unflt
    TBunflt %>%
        write_excel_csv(str_glue('{Pfilter_dir}/{mode_name}_unflt.csv'))

    # keep matrix
    TBkeep <- TBunflt %>% select(cell, matches('_keep$'))
    TBkeep %>% write_excel_csv(str_glue('{Pfilter_dir}/{mode_name}_keep.csv'))

    # flt
    TBflt <- TBunflt %>% filter(keep==TRUE) %>% select(!matches('_keep$'))
    TBflt %>% write_excel_csv(str_glue('{Pfilter_dir}/{mode_name}_flt.csv'))

    # dedup
    # Decide per mode, not per sample: only a mode that joins the VDJB table
    # has the CDR columns Antibody_dedup needs. Testing `subdirs` here made
    # modes without VDJB fail whenever the sample had a VDJB directory.
    if ('VDJB' %in% libs) {
        TBflt_dedup <- Antibody_dedup(TBflt, dedup_mode='CDR3_HCDR12')
    } else {
        TBflt_dedup <- TBflt
    }
    TBflt_dedup %>% write_excel_csv(str_glue('{Pfilter_dir}/{mode_name}_dedup.csv'))

    Lstat[[mode_name]][['raw']] <- nrow(TBunflt)
    Lstat[[mode_name]][['flt']] <- nrow(TBflt)
    Lstat[[mode_name]][['dedup']] <- nrow(TBflt_dedup)

    # copy
    # overwrite = TRUE so a rerun refreshes the copies; without it file.copy
    # silently keeps an existing (stale) destination file.
    for (suffix in c('keep', 'flt', 'dedup')) {
        file.copy(str_glue('{Pfilter_dir}/{mode_name}_{suffix}.csv'),
                  str_glue('{Pstat_dir}/{mode_name}_{suffix}.csv'),
                  overwrite = TRUE)
    }

}

In [None]:
# write
# Dump the collected row counts as YAML, then place a copy of that file in the
# statistics directory (replacing any existing copy).
write_yaml(Lstat, file = Pstat)
Pstat_copy <- str_c(Pstat_dir, '/', basename(Pstat))
file.copy(from = Pstat, to = Pstat_copy, overwrite = TRUE)

In [None]:
# Bcell changeo flt
# Restrict the changeo table to the sequences referenced by the filtered cells,
# written separately for the sequence_id_H and sequence_id_L columns.
# NOTE(review): TBflt is whatever the last processed filter mode left behind in
# the output loop; confirm that this is the intended mode and that it carries
# the sequence_id_H / sequence_id_L columns.
if ('VDJB' %in% subdirs) {
    Bcell_changeo <- read_tsv(str_glue('{Pstat_dir}/VDJB/changeo_clone-pass_germ-pass.tsv'))

    changeo_H <- filter(Bcell_changeo, sequence_id %in% TBflt$sequence_id_H)
    write_tsv(changeo_H, snakemake@params[['Bcell_changeo_flt_H']])

    changeo_L <- filter(Bcell_changeo, sequence_id %in% TBflt$sequence_id_L)
    write_tsv(changeo_L, snakemake@params[['Bcell_changeo_flt_L']])
}