In [None]:
# show the R home directory of the running session
R.home()

In [None]:
baizer::pkglib(tidyverse, Biostrings, Peptides, baizer, genogamesh, yaml)

In [None]:
# name of the sample handled by this job, taken from the snakemake wildcards
sample <- snakemake@wildcards[['sample']]

# start from the default configs and replace items with the sample's own configs
config <- snakemake@config %>%
    replace_item(snakemake@config[[sample]])

In [None]:
# read the cellranger outputs found under count_dir:
# the airr table (tab-separated) and the annotation table (comma-separated);
# the relative paths come from config and are passed through str_glue
TBcr_airr <- read_tsv(
    file.path(snakemake@input[['count_dir']], str_glue(config[['count_VDJT_airr']]))
)
TBcr_anno <- read_csv(
    file.path(snakemake@input[['count_dir']], str_glue(config[['count_VDJT_anno']]))
)

In [None]:
##################################
### select cols
##################################

In [None]:
# keep (and rename) the columns of interest from the cellranger annotation table
TBcr_anno_sel <- TBcr_anno %>%
    select(
        # contig identity and per-contig metrics
        cell = barcode,
        sequence_id = contig_id,
        productive_cellranger = productive,
        chain,
        clone_cellranger = raw_clonotype_id,
        reads,
        umis,
        # gene calls
        c_gene,
        v_gene,
        d_gene,
        j_gene,
        # CDR regions, nucleotide and amino-acid
        cdr1_nt, cdr1_aa = cdr1,
        cdr2_nt, cdr2_aa = cdr2,
        cdr3_nt, cdr3_aa = cdr3,
        # framework regions, nucleotide and amino-acid
        fwr1_nt, fwr1_aa = fwr1,
        fwr2_nt, fwr2_aa = fwr2,
        fwr3_nt, fwr3_aa = fwr3,
        fwr4_nt, fwr4_aa = fwr4
    )

In [None]:
# keep (and rename) the sequence columns from the cellranger airr table;
# sequence_id stays first because it is the join key used later
TBcr_airr_sel <- TBcr_airr %>%
    select(
        sequence_id,
        seq_nt = sequence,
        seq_aa = sequence_aa,
        seq_align_nt = sequence_alignment
    )

In [None]:
##################################
### join
##################################

In [None]:
# add the airr sequence columns to every annotated contig (matched on
# sequence_id) and move them right behind the last region column
TBjoin <- left_join(TBcr_anno_sel, TBcr_airr_sel, by = 'sequence_id') %>%
    relocate(
        all_of(setdiff(colnames(TBcr_airr_sel), 'sequence_id')),
        .after = fwr4_aa
    )

In [None]:
##################################
### widen for HL
##################################

In [None]:
# label every contig as 'H' or 'L' according to its chain
# NOTE(review): TRA/TRG are labelled 'H' and TRB/TRD 'L' here; confirm this is
# the convention expected by the downstream steps
TBjoin <- TBjoin %>%
    mutate(
        HL = case_when(
            chain %in% c('TRA', 'TRG') ~ 'H',
            chain %in% c('TRB', 'TRD') ~ 'L'
        ),
        HL = factor(HL, levels = c('H', 'L'))
    ) %>%
    # contigs of any other (or missing) chain get no label and are dropped
    filter(!is.na(HL))

In [None]:
# per cell and H/L label: how many contigs there are, and whether it is exactly one
TBunique <- TBjoin %>%
    count(cell, HL, name = 'contig_num') %>%
    mutate(unique = contig_num == 1)

In [None]:
# when a cell has several contigs for the same H/L label, keep only the first
# row after sorting by umis in descending order
TBjoin <- TBjoin %>%
    arrange(desc(umis)) %>%
    group_by(cell, HL) %>%
    slice_head(n = 1) %>%
    ungroup() %>%
    # restore a deterministic row order
    arrange(HL, cell, sequence_id)

In [None]:
# bring the contig counts / uniqueness flags back onto the retained contigs
TBjoin <- left_join(TBjoin, TBunique, by = c('cell', 'HL')) %>%
    relocate(contig_num, unique, .after = umis)

In [None]:
# one row per cell: every column except the keys is spread into an _H and an
# _L version, then the sample name is recorded as the batch
keep_col <- c('cell', 'HL')
TBwider <- TBjoin %>%
    pivot_wider(names_from = 'HL', values_from = -all_of(keep_col)) %>%
    mutate(batch = sample, .after = cell)

In [None]:
# cell-level productive / unique / clone columns derived from the per-chain ones
TBwider <- TBwider %>%
    mutate(
        # productive only when BOTH chains are productive
        # (the L chain was previously never consulted: H was and-ed with itself)
        productive_cellranger = productive_cellranger_H & productive_cellranger_L,
        # unique only when each chain had exactly one contig
        unique = unique_H & unique_L,
        # clonotype of the H contig, falling back to the L contig when H is missing
        clone_cellranger = coalesce(clone_cellranger_H, clone_cellranger_L),
        .after = cell
    )

# cells with only one of the two chains: the combined flags above are NA or
# misleading, so force both to FALSE
TBwider <- TBwider %>%
    mutate(
        productive_cellranger = ifelse(is.na(seq_nt_H) | is.na(seq_nt_L), FALSE, productive_cellranger),
        unique = ifelse(is.na(seq_nt_H) | is.na(seq_nt_L), FALSE, unique)
    )

In [None]:
# stat: read the metrics table from count_dir, keep the 'VDJ T' library rows
# and give the metric name / value columns short names
TBmtx <- read_csv(
    file.path(snakemake@input[['count_dir']], str_glue(config[['count_VDJT_mtx']]))
) %>%
    filter(`Library Type` == 'VDJ T') %>%
    dplyr::rename(item = `Metric Name`, value = `Metric Value`)

In [None]:
# turn the metrics into a named list: one column per metric, then keep and
# rename the metrics listed below (any_of silently skips absent ones)
Lstat <- TBmtx %>%
    distinct(item, value) %>%
    pivot_wider(names_from = 'item', values_from = 'value') %>%
    select(any_of(c(
        cells = 'Estimated number of cells',
        mean_reads = 'Mean reads per cell',
        median_TRA = 'Median TRA UMIs per Cell',
        median_TRB = 'Median TRB UMIs per Cell',
        align_rate = 'Reads mapped to any V(D)J gene',
        total_reads = 'Number of reads',
        Q30_barcode = 'Q30 barcodes',
        Q30_umi = 'Q30 UMI',
        Q30_R1 = 'Q30 RNA read',
        Q30_R2 = 'Q30 RNA read 2'
    ))) %>%
    as.list()

In [None]:
# write
# make sure the parent directory of each output file exists;
# showWarnings = FALSE keeps dir.create quiet when a directory is already there
for (out_dir in unique(dirname(c(snakemake@output[['VDJT_csv']], snakemake@output[['VDJT_stat']])))) {
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
}
# per-cell table as csv, summary metrics as yaml
write_excel_csv(TBwider, snakemake@output[['VDJT_csv']])
write_yaml(Lstat, file = snakemake@output[['VDJT_stat']])

In [None]:
# copy both result files into stat_dir, keeping their base names
stat_dir <- snakemake@output[['stat_dir']]
# showWarnings = FALSE keeps dir.create quiet when stat_dir already exists
dir.create(stat_dir, recursive = TRUE, showWarnings = FALSE)

stat_src <- c(snakemake@output[['VDJT_csv']], snakemake@output[['VDJT_stat']])
copied <- file.copy(stat_src, file.path(stat_dir, basename(stat_src)), overwrite = TRUE)

# file.copy reports failure only through its return value, so surface it
if (!all(copied)) {
    warning(
        'failed to copy into stat_dir: ', paste(stat_src[!copied], collapse = ', '),
        call. = FALSE
    )
}