In [2]:
import pandas as pd
import Levenshtein as lev
from scipy.spatial.distance import pdist, squareform
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

import networkx as nx
import gravis as gv

# Note you will need to download muscle into the usr/bin and rename it simply as muscle
import biotite.sequence as seq
import biotite.application.muscle as muscle
import biotite.sequence.graphics as graphics

from scipy.stats import mode

In [3]:
from biotite.sequence.align import get_sequence_identity

# CDR3 Distance Metrics

Notebook to try out various distance metrics to compare CDR3 sequences

In [5]:
os.chdir('../..')

In [7]:
config = pd.read_csv("data/10X_bcr_vdj_example/sc5p_v2_hs_B_1k_b/filtered_contig_annotations.csv")

# Filter out any barcodes that don't have exactly two chains
barcode_counts = config.groupby("barcode").count()[["is_cell"]].reset_index()
barcode_to_filter = list(barcode_counts[barcode_counts['is_cell'] != 2]['barcode'])
config = config[~config['barcode'].isin(barcode_to_filter)]

# Filter out any barcodes with two of the same chains (ie only two heavy chains)
chain_counts = config.groupby(["barcode", "chain"]).count().reset_index()
chain_to_filter = list(chain_counts[chain_counts['is_cell'] == 2]['barcode'])
config = config[~config['barcode'].isin(chain_to_filter)]

# Extract out the relevant sequences per chain
# Rename columns so its clear which chain the cdr belongs to 
config_heavy = config[config['chain'] == "IGH"][['barcode', 'cdr1', 'cdr2', 'cdr3']]
config_heavy = config_heavy.rename(columns={"cdr1": "heavy_cdr1", "cdr2": "heavy_cdr2", "cdr3": "heavy_cdr3"})
config_light = config[config['chain'] != "IGH"][['barcode', 'cdr1', 'cdr2', 'cdr3']]
config_light = config_light.rename(columns={"cdr1": "light_cdr1", "cdr2": "light_cdr2", "cdr3": "light_cdr3"})

# Recombine the two dataframes together
cdr3_sequences = config_heavy.set_index("barcode").join(config_light.set_index("barcode")).reset_index()

# Generate randomized affinity column
cdr3_sequences['affinity'] = np.random.rand(cdr3_sequences.shape[0])

cdr3_sequences = cdr3_sequences.head(10)

cdr3_sequences

Unnamed: 0,barcode,heavy_cdr1,heavy_cdr2,heavy_cdr3,light_cdr1,light_cdr2,light_cdr3,affinity
0,AAACCTGAGGGCTCTC-1,GGSISSGSY,YTSGS,CARGDSSGWRGGNWFDPW,TGSSSNIGAGYDVH,GNSNRPS,CQSYDSSLSDVF,0.266858
1,AAACCTGGTAAGGATT-1,GYTLTGY,NTNSGG,CAMGYCINNNCYEGWFDPW,KSSQSVLYSSNKKNYLA,WASTRES,CQQYYDTPRTF,0.333075
2,AAACCTGGTAATAGCA-1,GYTFTSY,SAYNGN,CARAKRWGYSSSWCDYW,SGSSSNIGSNTVN,SNNQRPS,CAAWDDSLNGGVF,0.757931
3,AAACCTGGTACGCACC-1,GYTFTSY,NTNTGN,CARALGAIELFDYW,GSSTGAVTSGHYPY,DTSNKHS,CLLSYSGAHVVF,0.404049
4,AAACCTGTCTATGTGG-1,GYTFTSY,NTNTGN,CAREYPTVVPAALGYYGMDVW,SGSSSNIGSNTVN,SNNQRPS,CAAWDDSLNGWVF,0.587283
5,AAACGGGTCCAGTAGT-1,GFTFDDY,SWNSGS,CAKDSGVAGTGFSYGMDVW,RASQSISSYLN,AASSLQS,CQQSYSTLWTF,0.657014
6,AAACGGGTCGCAGGCT-1,GYTFTGY,NPNSGG,CASGPLKWELMIDYW,TGTSSDVGGYNYVS,DVSNRPS,CSSYTSSSTRVF,0.693784
7,AAAGATGAGATCTGCT-1,GFTFSSY,SYDGSN,CARDYRPDYYYGSGSYYNVPMDGMDVW,TGTSSDVGGYNYVS,DVSNRPS,CSSYTSSSTRVF,0.408279
8,AAAGATGCACCCTATC-1,GFSFNTY,RYDGKN,CAKDWGVSGDGGAYLDYW,SGRSSNIGSNNVY,KTNQRPS,CAAWDDSLGVVLF,0.901855
9,AAAGATGCATGCATGT-1,GGSISNSDY,YYGGN,CARSPCSGGSCPYWYFDLW,TGTSSDIGGYNYVS,DVSNRPS,CSSYTSGSTVF,0.557566


## Global Functions and Variables

In [8]:
def apply_distance_to_sequence(df, cdr, weight, distance_func):
    
    # Prepare 2 dimensional array M x N (M entries (len cdr3_sequences) with N dimensions (1)) 
    transformed_strings = np.array(df[cdr]).reshape(-1,1)
            
    # Apply distance function
    distance_calculations = pdist(transformed_strings, lambda x,y: distance_func(cdr, x[0], y[0]))
            
    # Square distance matrix
    distance_matrix = squareform(distance_calculations).astype(np.int64)
            
    # Multiple distance matrix by the weight
    distance_matrix = distance_matrix*weight   
    
    return distance_matrix

## Distance Metrics

Possible distance metrics to try out...

Levenshtein distance: https://en.wikipedia.org/wiki/Levenshtein_distance

In [9]:
def levenshtein(cdr, seq1, seq2):
    return lev.distance(seq1, seq2)

In [32]:
# Generate blosum matrix for alignment scoring
alph = seq.ProteinSequence.alphabet
blosum_matrix = seq.align.SubstitutionMatrix(alph, alph, "BLOSUM62")

def align_similarity_score(cdr, seq1, seq2):
    seq1 = seq.ProteinSequence(seq1)
    seq2 = seq.ProteinSequence(seq2)
    
    # Align the two sequences with global alignment
    alignment = seq.align.align_optimal(seq1, seq2, blosum_matrix, local=False)
    
    return alignment[0].score

In [33]:
align_similarity_score("hi", "GGSISSGSY", "GYTLTGY")

0

In [34]:
apply_distance_to_sequence(cdr3_sequences, "heavy_cdr3", 1, align_similarity_score)

array([[  0,  34,  35,  -3,  -3,  20,   6, -42,  30,  37],
       [ 34,   0,   6, -14,   6,  17,  -7, -46,  16,  41],
       [ 35,   6,   0,  15, -14,  11,   9, -50,  24,  22],
       [ -3, -14,  15,   0, -29, -12,  24, -87,   3,  -6],
       [ -3,   6, -14, -29,   0,  35, -27,  11,  14,  22],
       [ 20,  17,  11, -12,  35,   0,  -7, -16,  56,  24],
       [  6,  -7,   9,  24, -27,  -7,   0, -78,   5, -10],
       [-42, -46, -50, -87,  11, -16, -78,   0, -36, -27],
       [ 30,  16,  24,   3,  14,  56,   5, -36,   0,  18],
       [ 37,  41,  22,  -6,  22,  24, -10, -27,  18,   0]])