In [1]:
from raptcr.analysis import Repertoire

In [2]:
import pandas as pd

In [7]:
# List the contents of the local databases directory (IPython shell escape).
!ls ~/UA/databases

mixcr_airr_example.tsv	vdjdb


In [52]:
class Repertoire():
    """
    Class for storing TCR-seq information and calculating useful properties.

    The underlying table is kept on the ``data`` attribute exactly as it was
    passed in (no copy is made).
    """
    def __init__(self, df) -> None:
        """
        Store the given table.

        Parameters
        ----------
        df
            Table holding the repertoire; it must support ``len()`` for
            ``__repr__`` to work.
        """
        self.data = df

    def __repr__(self) -> str:
        """Return a short summary that reports the number of stored rows."""
        return f"TCR repertoire of size {len(self.data)}"

    def to_df(self) -> pd.DataFrame:
        """Return the stored table (the same object held in ``self.data``)."""
        # The table lives on ``data``; there is no ``df`` attribute on instances.
        return self.data

In [53]:
# Path to the example TSV file that is passed to read_AIRR in a later cell.
fp = "~/UA/databases/mixcr_airr_example.tsv"

def read_AIRR(filepath: str, filter_productive:bool=True, filter_TRB:bool=True, filter_min_duplicate_count:int=0) -> Repertoire:
    """
    Read a tab-separated AIRR rearrangement file into a Repertoire.

    Only the columns ``sequence_id``, ``productive``, ``v_call``, ``j_call``,
    ``duplicate_count`` and ``junction_aa`` are loaded. ``sequence_id`` becomes
    the index and ``productive`` is dropped from the returned table.

    Parameters
    ----------
    filepath : str
        Location of the TSV file, handed to ``pandas.read_csv``.
    filter_productive : bool, default True
        Keep only rows flagged as productive.
    filter_TRB : bool, default True
        Keep only rows whose ``v_call`` or ``j_call`` contains ``"TRB"``.
    filter_min_duplicate_count : int, default 0
        When non-zero, keep only rows whose ``duplicate_count`` is strictly
        greater than this value. ``0`` disables the filter.

    Returns
    -------
    Repertoire
        Wrapper around the filtered table.

    Raises
    ------
    ValueError
        Raised by ``pandas.read_csv`` when one of the requested columns is
        absent from the file.
    """
    cols = ['sequence_id', 'productive', 'v_call', 'j_call', 'duplicate_count', 'junction_aa']
    df = pd.read_csv(filepath, sep='\t', usecols=cols).set_index('sequence_id')

    # Normalise the productive flag to a boolean mask. Text encodings such as
    # "T"/"F" need an explicit conversion; going through ``str`` also copes
    # with columns that mix Python bools and missing values, and with string
    # dtypes other than ``object``.
    productive = df['productive']
    if pd.api.types.is_bool_dtype(productive) or pd.api.types.is_numeric_dtype(productive):
        is_productive = productive.eq(True)
    else:
        is_productive = productive.astype(str).str.strip().str.upper().isin(['T', 'TRUE'])

    if filter_productive:
        # Positional mask, so a non-unique sequence_id index cannot misalign.
        df = df.loc[is_productive.to_numpy()]

    if filter_min_duplicate_count:
        # Strict comparison: rows equal to the threshold are removed.
        df = df.loc[(df['duplicate_count'] > filter_min_duplicate_count).to_numpy()]

    df = df.drop(columns='productive')

    if filter_TRB:
        # Literal substring match; missing gene calls count as "no match"
        # instead of producing NaN entries in the mask.
        v_is_trb = df['v_call'].astype(str).str.contains('TRB', regex=False)
        j_is_trb = df['j_call'].astype(str).str.contains('TRB', regex=False)
        df = df.loc[(v_is_trb | j_is_trb).to_numpy()]

    return Repertoire(df)

In [55]:
# Load the example file with the default filters and display the stored table.
read_AIRR(fp).data

Unnamed: 0_level_0,v_call,j_call,junction_aa,duplicate_count
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
clone.1,TRBV7-2*00,TRBJ2-7*00,CASSSPGREYDYEQYF,4051
clone.7,TRBV19*00,TRBJ2-7*00,CASSITPGQGTDEQYF,1615
clone.8,TRBV2*00,TRBJ1-4*00,CASIYQGSEKLFF,1480
clone.10,TRBV24-1*00,TRBJ2-2*00,CATYDGNTGELFF,1151
clone.12,TRBV29-1*00,TRBJ2-1*00,CSVDWPKNEQFF,991
...,...,...,...,...
clone.34118,TRBV29-1*00,TRBJ2-5*00,CSARGTTKETQYF,7
clone.34119,TRBV29-1*00,TRBJ1-2*00,CSDAQGYLGYTF,7
clone.34126,TRBV27*00,TRBJ2-1*00,RASSSSSSGYNEQFF,7
clone.34127,TRBV2*00,TRBJ1-3*00,RASSLQGPSGNTIYF,7


In [26]:
# Display the junction_aa column of the loaded table.
df["junction_aa"]

sequence_id
clone.0             CAVNLGYGQNFVF
clone.1          CASSSPGREYDYEQYF
clone.2              CAGYNYGQNFVF
clone.3              CIFYSGNTPLVF
clone.4           CALPYSGAGSYQLTF
                      ...        
clone.34125            PGYYGQNFVF
clone.34126       RASSSSSSGYNEQFF
clone.34127       RASSLQGPSGNTIYF
clone.34128    RASRLNRSRKYITDTQYF
clone.34129             QCRQ_MLTF
Name: junction_aa, Length: 34130, dtype: object

In [21]:
# Keep the rows where either the V or the J gene call contains "TRB".
df.query('v_call.str.contains("TRB") or j_call.str.contains("TRB")')

Unnamed: 0_level_0,sequence,rev_comp,productive,v_call,d_call,j_call,c_call,sequence_alignment,germline_alignment,complete_vdj,...,c_sequence_end,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,c_alignment_start,c_alignment_end,duplicate_count
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
clone.1,TGTGCCAGCAGCTCCCCAGGGCGGGAGTACGACTACGAGCAGTACTTC,F,T,TRBV7-2*00,TRBD2*00,TRBJ2-7*00,TRBC2*00,TGTGCCAGCAGCTCCCCAGGGCGGGAGTACGACTACGAGCAGTACTTC,TGTGCCAGCAGCTNNNNNNNGCGGGAGNNNNNCTACGAGCAGTACTTC,F,...,,1.0,13.0,21.0,27.0,33.0,48.0,,,4051
clone.7,TGTGCCAGTAGCATCACGCCGGGACAGGGGACGGATGAGCAGTACTTC,F,T,TRBV19*00,TRBD1*00,TRBJ2-7*00,TRBC2*00,TGTGCCAGTAGCATCACGCCGGGACAGGGGACGGATGAGCAGTACTTC,TGTGCCAGTAGNNNNNNNCCGGGACAGGGGNNNNNNGAGCAGTACTTC,F,...,,1.0,11.0,19.0,30.0,37.0,48.0,,,1615
clone.8,TGTGCCAGCATTTACCAGGGAAGTGAAAAACTGTTTTTT,F,T,TRBV2*00,TRBD1*00,TRBJ1-4*00,TRBC1*00,TGTGCCAGCATTTACCAGGGAAGTGAAAAACTGTTTTTT,TGTGCCAGCANNNNNCAGGGNNNTGAAAAACTGTTTTTT,F,...,,1.0,10.0,16.0,20.0,24.0,39.0,,,1480
clone.10,TGTGCCACCTACGACGGAAACACCGGGGAGCTGTTTTTT,F,T,TRBV24-1*00,,TRBJ2-2*00,TRBC2*00,TGTGCCACCTACGACGGAAACACCGGGGAGCTGTTTTTT,TGTGCCACCNNNNNNNNNAACACCGGGGAGCTGTTTTTT,F,...,,1.0,9.0,,,19.0,39.0,,,1151
clone.12,TGCAGCGTTGATTGGCCTAAGAATGAGCAGTTCTTC,F,T,TRBV29-1*00,,TRBJ2-1*00,TRBC2*00,TGCAGCGTTGATTGGCCTAAGAATGAGCAGTTCTTC,TGCAGCGTTGANNNNNNNNNNAATGAGCAGTTCTTC,F,...,,1.0,11.0,,,22.0,36.0,,,991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
clone.34118,TGCAGCGCACGGGGGACTACGAAAGAGACCCAGTACTTC,F,T,TRBV29-1*00,TRBD2*00,TRBJ2-5*00,TRBC2*00,TGCAGCGCACGGGGGACTACGAAAGAGACCCAGTACTTC,TGCAGCGNNNNNGGGACTANNNAAGAGACCCAGTACTTC,F,...,,1.0,7.0,13.0,19.0,23.0,39.0,,,7
clone.34119,TGCAGCGACGCACAGGGATATCTTGGCTACACCTTC,F,T,TRBV29-1*00,TRBD1*00,TRBJ1-2*00,TRBC1*00,TGCAGCGACGCACAGGGATATCTTGGCTACACCTTC,TGCAGCGNNNNACAGGGNNNNNNTGGCTACACCTTC,F,...,,1.0,7.0,12.0,17.0,24.0,36.0,,,7
clone.34126,CGTGCCAGCAGTTCGTCGTCTAGCGGCTACAATGAGCAGTTCTTC,F,T,TRBV27*00,TRBD2*00,TRBJ2-1*00,TRBC2*00,CGTGCCAGCAGTTCGTCGTCTAGCGGCTACAATGAGCAGTTCTTC,TGTGCCAGCAGTTNNNNNNCTAGCGGCTACAATGAGCAGTTCTTC,F,...,,1.0,13.0,20.0,26.0,27.0,45.0,,,7
clone.34127,CGTGCCAGCAGCCTACAGGGCCCCTCTGGAAACACCATATATTTT,F,T,TRBV2*00,"TRBD2*00,TRBD1*00",TRBJ1-3*00,TRBC1*00,CGTGCCAGCAGCCTACAGGGCCCCTCTGGAAACACCATATATTTT,TGTGCCAGCAGNNNNNAGGGCCCCTCTGGAAACACCATATATTTT,F,...,,1.0,11.0,17.0,23.0,24.0,45.0,,,7


In [5]:
    # Read the whole example table (sequence_id parsed as text) and index it by sequence_id.
    df = pd.read_csv(
        '~/UA/databases/mixcr_airr_example.tsv',
        sep='\t',
        dtype={'sequence_id': str},
    ).set_index('sequence_id')

In [8]:
# Check whether the productive column was loaded with object dtype.
df["productive"].dtype == 'O'

True

In [11]:


# The unfinished `if filter_productive: df = df.` scratch statement was dropped:
# it ends in a dangling attribute access (a SyntaxError) and `filter_productive`
# is not bound in any visible cell. Only the column listing is kept.
df.columns

Index(['sequence', 'rev_comp', 'productive', 'v_call', 'd_call', 'j_call',
       'c_call', 'sequence_alignment', 'germline_alignment', 'complete_vdj',
       'junction', 'junction_aa', 'np1', 'np2', 'cdr1', 'cdr1_aa', 'cdr2',
       'cdr2_aa', 'cdr3', 'cdr3_aa', 'fwr1', 'fwr1_aa', 'fwr2', 'fwr2_aa',
       'fwr3', 'fwr3_aa', 'fwr4', 'fwr4_aa', 'v_score', 'v_cigar', 'd_score',
       'd_cigar', 'j_score', 'j_cigar', 'c_score', 'c_cigar',
       'junction_length', 'np1_length', 'np2_length', 'v_germline_start',
       'v_sequence_start', 'v_germline_end', 'v_sequence_end',
       'd_germline_start', 'd_sequence_start', 'd_germline_end',
       'd_sequence_end', 'j_germline_start', 'j_sequence_start',
       'j_germline_end', 'j_sequence_end', 'c_germline_start',
       'c_sequence_start', 'c_germline_end', 'c_sequence_end',
       'v_alignment_start', 'v_alignment_end', 'd_alignment_start',
       'd_alignment_end', 'j_alignment_start', 'j_alignment_end',
       'c_alignment_start', 'c_

In [15]:
# Select the rows whose productive flag is the string "T".
df.query('productive == "T"')

Unnamed: 0_level_0,sequence,rev_comp,productive,v_call,d_call,j_call,c_call,sequence_alignment,germline_alignment,complete_vdj,...,c_sequence_end,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,c_alignment_start,c_alignment_end,duplicate_count
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
clone.0,TGTGCCGTGAACCTGGGTTATGGTCAGAATTTTGTCTTT,F,T,TRAV12-2*00,,TRAJ26*00,TRAC*00,TGTGCCGTGAACCTGGGTTATGGTCAGAATTTTGTCTTT,TGTGCCGTGAACNNNNNNTATGGTCAGAATTTTGTCTTT,F,...,,1.0,12.0,,,19.0,39.0,,,5235
clone.1,TGTGCCAGCAGCTCCCCAGGGCGGGAGTACGACTACGAGCAGTACTTC,F,T,TRBV7-2*00,TRBD2*00,TRBJ2-7*00,TRBC2*00,TGTGCCAGCAGCTCCCCAGGGCGGGAGTACGACTACGAGCAGTACTTC,TGTGCCAGCAGCTNNNNNNNGCGGGAGNNNNNCTACGAGCAGTACTTC,F,...,,1.0,13.0,21.0,27.0,33.0,48.0,,,4051
clone.2,TGTGCAGGGTATAACTATGGTCAGAATTTTGTCTTT,F,T,TRAV13-1*00,,TRAJ26*00,TRAC*00,TGTGCAGGGTATAACTATGGTCAGAATTTTGTCTTT,TGTGCAGNNNATAACTATGGTCAGAATTTTGTCTTT,F,...,,1.0,7.0,,,11.0,36.0,,,3050
clone.3,TGCATCTTCTATTCAGGAAACACACCTCTTGTCTTT,F,T,TRAV26-2*00,,TRAJ29*00,TRAC*00,TGCATCTTCTATTCAGGAAACACACCTCTTGTCTTT,TGCATCNNNNATTCAGGAAACACACCTCTTGTCTTT,F,...,,1.0,6.0,,,11.0,36.0,,,3010
clone.4,TGTGCTCTCCCATACTCTGGGGCTGGGAGTTACCAACTCACTTTC,F,T,TRAV19*00,,TRAJ28*00,TRAC*00,TGTGCTCTCCCATACTCTGGGGCTGGGAGTTACCAACTCACTTTC,TGTGCTCTNNCATACTCTGGGGCTGGGAGTTACCAACTCACTTTC,F,...,,1.0,8.0,,,11.0,45.0,,,2607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
clone.34124,CTCGGAGGAAGCCAAGGAAATCTCATCTTT,F,T,TRAV25*00,,TRAJ42*00,TRAC*00,GGAGGAAGCCAAGGAAATCTCATCTTT,GGAGGAAGCCAAGGAAATCTCATCTTT,F,...,,,,,,1.0,27.0,,,7
clone.34125,CCAGGGTACTATGGTCAGAATTTTGTCTTT,F,T,TRAV27*00,,TRAJ26*00,TRAC*00,ACTATGGTCAGAATTTTGTCTTT,ACTATGGTCAGAATTTTGTCTTT,F,...,,,,,,1.0,23.0,,,7
clone.34126,CGTGCCAGCAGTTCGTCGTCTAGCGGCTACAATGAGCAGTTCTTC,F,T,TRBV27*00,TRBD2*00,TRBJ2-1*00,TRBC2*00,CGTGCCAGCAGTTCGTCGTCTAGCGGCTACAATGAGCAGTTCTTC,TGTGCCAGCAGTTNNNNNNCTAGCGGCTACAATGAGCAGTTCTTC,F,...,,1.0,13.0,20.0,26.0,27.0,45.0,,,7
clone.34127,CGTGCCAGCAGCCTACAGGGCCCCTCTGGAAACACCATATATTTT,F,T,TRBV2*00,"TRBD2*00,TRBD1*00",TRBJ1-3*00,TRBC1*00,CGTGCCAGCAGCCTACAGGGCCCCTCTGGAAACACCATATATTTT,TGTGCCAGCAGNNNNNAGGGCCCCTCTGGAAACACCATATATTTT,F,...,,1.0,11.0,17.0,23.0,24.0,45.0,,,7


In [13]:
# Display the junction_aa column of the loaded table.
df["junction_aa"]

sequence_id
clone.0             CAVNLGYGQNFVF
clone.1          CASSSPGREYDYEQYF
clone.2              CAGYNYGQNFVF
clone.3              CIFYSGNTPLVF
clone.4           CALPYSGAGSYQLTF
                      ...        
clone.34125            PGYYGQNFVF
clone.34126       RASSSSSSGYNEQFF
clone.34127       RASSLQGPSGNTIYF
clone.34128    RASRLNRSRKYITDTQYF
clone.34129             QCRQ_MLTF
Name: junction_aa, Length: 34130, dtype: object