https://tcrdist3.readthedocs.io/en/latest/index.html

In [1]:
import os
import sys
import json
import collections

import numpy as np
import pandas as pd

from tcrdist.repertoire import TCRrep

# Make the project-local helper modules importable: they are expected in a
# "tcr" folder that sits beside the current working directory (i.e. inside
# its parent directory).
MY_SRC_DIR = os.path.join(os.path.dirname(os.getcwd()), "tcr")
assert os.path.isdir(MY_SRC_DIR), f"Source directory not found: {MY_SRC_DIR}"
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if MY_SRC_DIR not in sys.path:
    sys.path.append(MY_SRC_DIR)

import utils

In [2]:
# Read the tab-separated TCR table (read_table splits on tabs by default)
# and leave the frame as the cell's last expression so it is displayed.
df = pd.read_table("lcmv_test_tcrdist3.tsv")
df

Unnamed: 0,cdr3_a_aa,v_a_gene,j_a_gene,cdr3_b_aa,v_b_gene,j_b_gene,gp33_binding
0,CAVSLGSALGRLHF,TRAV9D-3*01,TRAJ18*01,CASSLDWVSYEQYF,TRBV10*01,TRBJ2-7*01,False
1,CATDASQGGRALIF,TRAV8-1*01,TRAJ15*01,CASSSGGSQNTLYF,TRBV3*01,TRBJ2-4*01,False
2,CAMREMDSNYQLIW,TRAV16*01,TRAJ33*01,CASSLGLGTNTGQLYF,TRBV12-2*01,TRBJ2-2*01,False
3,CIVTDMSSNNRIFF,TRAV2*01,TRAJ31*01,CASSETGTNSDYTF,TRBV13-1*01,TRBJ1-2*01,False
4,CAASENSGTYQRF,TRAV7-4*01,TRAJ13*01,CGAREGFEQYF,TRBV20*01,TRBJ2-7*01,True
...,...,...,...,...,...,...,...
2650,CALSDPGTQVVGQLTF,TRAV12D-3*01,TRAJ5*01,CASGDAGRETGNTLYF,TRBV13-2*01,TRBJ1-3*01,False
2651,CALSPPSSNTNKVVF,TRAV12-2*01,TRAJ34*01,CGARVGEAEVFF,TRBV20*01,TRBJ1-1*01,False
2652,CALSDRSSGSWQLIF,TRAV12D-2*01,TRAJ22*01,CASSEGGGNSPLYF,TRBV13-3*01,TRBJ1-6*01,False
2653,CSATVYTNKVVF,TRAV5-1*01,TRAJ34*01,CASGDRTGGNYAEQFF,TRBV13-2*01,TRBJ2-1*01,False


In [3]:
# For TCR-BERT, ranges from 350-2000 clusters
# Run for TRB only

from sklearn.cluster import AgglomerativeClustering

# Beta-chain-only repertoire built from the loaded table; distances are
# requested up front so that tr.pw_beta can be read right after construction.
tr = TCRrep(
    cell_df=df,
    organism="mouse",
    chains=["beta"],
    db_file="alphabeta_gammadelta_db.tsv",
    compute_distances=True,
)


def eval_distance_with_agg_cluster(dmat, k_values=np.arange(30, 2401, 40)):
    """
    Cluster a precomputed distance matrix at several cluster counts.

    For every k in k_values, average-linkage agglomerative clustering is fit
    on dmat (passed as a precomputed pairwise distance matrix) and the
    resulting labels are regrouped into lists of cluster members.

    Args:
        dmat: pairwise distance matrix, handed to ``fit_predict`` unchanged.
        k_values: iterable of cluster counts to evaluate.

    Returns:
        dict mapping ``int(k)`` to a list of clusters ordered by cluster
        label. Each cluster is a list of row positions of dmat, rendered as
        strings, in ascending order.
    """
    import inspect

    # scikit-learn renamed the `affinity` keyword to `metric` (and later
    # removed `affinity`), so use whichever name the installed version accepts.
    init_params = inspect.signature(AgglomerativeClustering).parameters
    distance_kwarg = "metric" if "metric" in init_params else "affinity"

    k_to_labels = {}
    for k in k_values:
        ac = AgglomerativeClustering(
            n_clusters=k, linkage='average', **{distance_kwarg: 'precomputed'}
        )
        cluster_labels = ac.fit_predict(dmat)
        # One pass over the labels instead of one full scan per cluster.
        members_by_label = collections.defaultdict(list)
        for idx, label in enumerate(cluster_labels):
            members_by_label[int(label)].append(str(idx))
        # Sort by label so the order of groups is deterministic.
        k_to_labels[int(k)] = [
            members_by_label[label] for label in sorted(members_by_label)
        ]
    return k_to_labels

k_to_labels = eval_distance_with_agg_cluster(tr.pw_beta)

with open("tcrdist3_aggcluster_results.json", 'w') as sink:
    json.dump(k_to_labels, sink, indent=4)

# Cluster members are identified by their row position in tr.pw_beta, so the
# truth labels must be keyed by that same position.
# NOTE(review): tcrdist3 is understood to compute distances over tr.clone_df,
# which may be deduplicated / filtered relative to df -- confirm against the
# tcrdist3 documentation. Prefer clone_df when it carries the label column.
if 'gp33_binding' in tr.clone_df.columns:
    truth_source = tr.clone_df.reset_index(drop=True)
else:
    truth_source = df
assert len(truth_source) == tr.pw_beta.shape[0], (
    f"Truth table has {len(truth_source)} rows but the distance matrix has "
    f"{tr.pw_beta.shape[0]}; cluster member ids would not line up with labels"
)
# Built before the file is opened so a failure above leaves no empty file.
truth_dict = {
    str(i): bool(is_binder)
    for i, is_binder in enumerate(truth_source['gp33_binding'])
}

with open("tcrdist3_aggcluster_truth.json", 'w') as sink:
    json.dump(truth_dict, sink, indent=4)


  self._validate_cell_df()


In [4]:
# For TCR-BERT, ranges from 350-2000 clusters
# Run for TRA and TRB

from sklearn.cluster import AgglomerativeClustering

# Paired-chain repertoire: distances are requested for both alpha and beta.
tr = TCRrep(cell_df = df, 
            organism = 'mouse', 
            chains = ['alpha', 'beta'], 
            db_file = 'alphabeta_gammadelta_db.tsv',
            compute_distances = True)

# The paired distance is the element-wise sum of the per-chain matrices.
dual_dmat = tr.pw_alpha + tr.pw_beta
k_to_labels = eval_distance_with_agg_cluster(dual_dmat)

with open("tcrdist3_aggcluster_dual_results.json", 'w') as sink:
    json.dump(k_to_labels, sink, indent=4)

# Cluster members are identified by their row position in the distance
# matrix, so the truth labels must be keyed by that same position.
# NOTE(review): tcrdist3 is understood to compute distances over tr.clone_df,
# which may be deduplicated / filtered relative to df -- confirm against the
# tcrdist3 documentation. Prefer clone_df when it carries the label column.
if 'gp33_binding' in tr.clone_df.columns:
    truth_source = tr.clone_df.reset_index(drop=True)
else:
    truth_source = df
assert len(truth_source) == dual_dmat.shape[0], (
    f"Truth table has {len(truth_source)} rows but the distance matrix has "
    f"{dual_dmat.shape[0]}; cluster member ids would not line up with labels"
)
# Built before the file is opened so a failure above leaves no empty file.
truth_dict = {
    str(i): bool(is_binder)
    for i, is_binder in enumerate(truth_source['gp33_binding'])
}

with open("tcrdist3_aggcluster_dual_truth.json", 'w') as sink:
    json.dump(truth_dict, sink, indent=4)

  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  attr ='cdrs')
  self.deduplicate()
