In [None]:
baizer::pkglib(tidyverse, Biostrings, Peptides, baizer, genogamesh, yaml)

In [None]:
# Sample name comes from the Snakemake wildcard of the current job
sample <- snakemake@wildcards[["sample"]]

# Start from the default configs and replace items with the ones given
# for this sample
config <- snakemake@config %>%
    replace_item(snakemake@config[[sample]])

In [None]:
# Input: directory with the cellranger files read below
Pcount <- snakemake@input[["VDJT_count_dir"]]

# Outputs: destination of the CSV table and of the YAML statistics
Pcsv <- snakemake@output[["VDJT_csv"]]
Pstat <- snakemake@output[["VDJT_stat"]]

In [None]:
# Load the two cellranger tables from the count directory:
# the AIRR rearrangement TSV and the contig annotation CSV
TBcr_airr <- read_tsv(file.path(Pcount, "airr_rearrangement.tsv"))
TBcr_anno <- read_csv(file.path(Pcount, "all_contig_annotations.csv"))

In [None]:
##################################
### select cols
##################################

In [None]:
# Subset of the cellranger annotation table; several columns are renamed
# on the way (barcode -> cell, contig_id -> sequence_id, region amino-acid
# columns get an `_aa` suffix)
TBcr_anno_sel <- select(
    TBcr_anno,
    cell = barcode,
    sequence_id = contig_id,
    productive_cellranger = productive,
    chain,
    clone_cellranger = raw_clonotype_id,
    reads,
    umis,
    c_gene,
    v_gene,
    d_gene,
    j_gene,
    cdr1_nt, cdr1_aa = cdr1,
    cdr2_nt, cdr2_aa = cdr2,
    cdr3_nt, cdr3_aa = cdr3,
    fwr1_nt, fwr1_aa = fwr1,
    fwr2_nt, fwr2_aa = fwr2,
    fwr3_nt, fwr3_aa = fwr3,
    fwr4_nt, fwr4_aa = fwr4
)

In [None]:
# Subset of the cellranger AIRR table: the contig id (join key) plus the
# nucleotide, amino-acid and aligned sequences
TBcr_airr_sel <- select(
    TBcr_airr,
    sequence_id,
    seq_nt = sequence,
    seq_aa = sequence_aa,
    seq_align_nt = sequence_alignment
)

In [None]:
##################################
### join
##################################

In [None]:
# Add the AIRR sequence columns to every annotated contig (matched on
# sequence_id) and place them right after fwr4_aa
TBjoin <- left_join(TBcr_anno_sel, TBcr_airr_sel, by = "sequence_id") %>%
    relocate(
        all_of(tail(colnames(TBcr_airr_sel), -1)),  # every column but the key
        .after = fwr4_aa
    )

In [None]:
##################################
### widen for HL
##################################

In [None]:
# Label each contig with an H/L slot derived from its chain, then drop the
# contigs whose chain is none of TRA/TRG/TRB/TRD (their HL stays NA).
# NOTE(review): TRA/TRG are labelled 'H' and TRB/TRD 'L' -- confirm this is
# the convention downstream consumers expect.
TBjoin <- TBjoin %>%
    mutate(
        HL = case_when(
            chain %in% c("TRA", "TRG") ~ "H",
            chain %in% c("TRB", "TRD") ~ "L"
        ),
        HL = factor(HL, levels = c("H", "L"))
    ) %>%
    filter(!is.na(HL))

In [None]:
# For every cell and H/L slot: the number of contigs, and a flag telling
# whether that slot holds exactly one contig
TBunique <- TBjoin %>%
    group_by(cell, HL) %>%
    summarise(contig_num = n(), unique = contig_num == 1) %>%
    ungroup()

In [None]:
# For cells with several contigs in the same H/L slot, keep only the contig
# with the most UMIs (on ties, the one that comes first in the current row
# order), then sort the result by HL, cell and sequence_id.
# `desc` is namespace-qualified like `slice`: IRanges (attached together with
# Biostrings) exports functions of both names, so the unqualified calls
# depend on package attach order.
TBjoin <- TBjoin %>%
    group_by(cell, HL) %>%
    arrange(dplyr::desc(umis)) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    arrange(HL, cell, sequence_id)

In [None]:
# Bring the per-slot contig count and uniqueness flag back onto the kept
# contigs and show them next to the UMI count
TBjoin <- left_join(TBjoin, TBunique, by = c("cell", "HL")) %>%
    relocate(contig_num, unique, .after = umis)

In [None]:
# pivot: one row per cell, every other column duplicated with an `_H` / `_L`
# suffix taken from HL
keep_col <- c('cell', 'HL')
TBwider <- TBjoin %>%
    pivot_wider(
        names_from = 'HL',
        values_from = -all_of(keep_col),
        # HL is a factor with levels H and L; expanding the names guarantees
        # that both the `_H` and `_L` column sets exist (filled with NA) even
        # when no contig carries one of the levels, so later code can rely
        # on e.g. `unique_L` being present
        names_expand = TRUE
    )

In [None]:
# Cell-level summary columns, inserted right after `cell`:
#   productive_cellranger - both the H and the L contig are productive
#   unique                - both the H and the L slot held exactly one contig
#   clone_cellranger      - clonotype id of the H contig, falling back to
#                           the L contig when the H one is missing
# `&` follows R's three-valued logic: if one side is NA the result is NA,
# unless the other side is FALSE (then it is FALSE).
TBwider <- TBwider %>%
    mutate(
        # the L chain must be part of the test (previously H was checked twice)
        productive_cellranger = productive_cellranger_H & productive_cellranger_L,
        unique = unique_H & unique_L,
        clone_cellranger = coalesce(clone_cellranger_H, clone_cellranger_L),
        .after = cell
    )

In [None]:
# stat: list that is written to the YAML statistics file below;
# no entries are added to it in the code shown here
Lstat <- list()

In [None]:
# Save the per-cell table as CSV and the statistics list as YAML
write_excel_csv(x = TBwider, Pcsv)
write_yaml(x = Lstat, file = Pstat)