In [1]:
"""This script will use esm models to visualize embeddings of proteins from SKEMPI datasets"""
import torch
import esm
import pandas as pd
import scanpy as sc
import numpy as np
from fuzzywuzzy import fuzz

In [3]:
# retrieve all proteins from the dataset
df = pd.read_csv('skempiwmutants_nanincl.csv', index_col=0)
df = df[['pdb','Mutation(s)_PDB', 'Affinity_mut (M)', 'Affinity_wt (M)', 'Protein 1', 'Protein 2', 'wild_seq1', 'wild_seq2', 'mutant_seq']]
df

Unnamed: 0,pdb,Mutation(s)_PDB,Affinity_mut (M),Affinity_wt (M),Protein 1,Protein 2,wild_seq1,wild_seq2,mutant_seq
0,1CSE,LI45G,5.26E-11,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTGD...
1,1CSE,LI45S,8.33E-12,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTSD...
2,1CSE,LI45P,1.02E-07,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTPD...
3,1CSE,LI45I,1.72E-10,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTID...
4,1CSE,LI45D,1.92E-09,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTDD...
...,...,...,...,...,...,...,...,...,...
12244,3QIB,TP12A,>1.1E-03,5.5E-06,I-Ek plus MCC peptide,2B4 TCR,IKEEHTIIQAEFYLLPDKRGEFMFDFDGDEIFHVDIEKSETIWRLE...,GSGGGGSRPWFLEYCKSECHFYNGTQRVRLLVRYFYNLEENLRFDS...,
12245,3QIB,TP12S,3.38E-05,5.5E-06,I-Ek plus MCC peptide,2B4 TCR,IKEEHTIIQAEFYLLPDKRGEFMFDFDGDEIFHVDIEKSETIWRLE...,GSGGGGSRPWFLEYCKSECHFYNGTQRVRLLVRYFYNLEENLRFDS...,
12246,3QIB,TP12N,4.34E-05,5.5E-06,I-Ek plus MCC peptide,2B4 TCR,IKEEHTIIQAEFYLLPDKRGEFMFDFDGDEIFHVDIEKSETIWRLE...,GSGGGGSRPWFLEYCKSECHFYNGTQRVRLLVRYFYNLEENLRFDS...,
12247,3QIB,"YP7F,TP12S",4.29E-05,5.5E-06,I-Ek plus MCC peptide,2B4 TCR,IKEEHTIIQAEFYLLPDKRGEFMFDFDGDEIFHVDIEKSETIWRLE...,GSGGGGSRPWFLEYCKSECHFYNGTQRVRLLVRYFYNLEENLRFDS...,


In [4]:
# check and remove if there's any duplicates
df.drop_duplicates(inplace=True)

In [5]:
df

Unnamed: 0,pdb,Mutation(s)_PDB,Affinity_mut (M),Affinity_wt (M),Protein 1,Protein 2,wild_seq1,wild_seq2,mutant_seq
0,1CSE,LI45G,5.26E-11,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTGD...
1,1CSE,LI45S,8.33E-12,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTSD...
2,1CSE,LI45P,1.02E-07,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTPD...
3,1CSE,LI45I,1.72E-10,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTID...
4,1CSE,LI45D,1.92E-09,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTDD...
...,...,...,...,...,...,...,...,...,...
12243,3QIB,KP9R,2.4E-04,5.5E-06,I-Ek plus MCC peptide,2B4 TCR,IKEEHTIIQAEFYLLPDKRGEFMFDFDGDEIFHVDIEKSETIWRLE...,GSGGGGSRPWFLEYCKSECHFYNGTQRVRLLVRYFYNLEENLRFDS...,
12244,3QIB,TP12A,>1.1E-03,5.5E-06,I-Ek plus MCC peptide,2B4 TCR,IKEEHTIIQAEFYLLPDKRGEFMFDFDGDEIFHVDIEKSETIWRLE...,GSGGGGSRPWFLEYCKSECHFYNGTQRVRLLVRYFYNLEENLRFDS...,
12245,3QIB,TP12S,3.38E-05,5.5E-06,I-Ek plus MCC peptide,2B4 TCR,IKEEHTIIQAEFYLLPDKRGEFMFDFDGDEIFHVDIEKSETIWRLE...,GSGGGGSRPWFLEYCKSECHFYNGTQRVRLLVRYFYNLEENLRFDS...,
12246,3QIB,TP12N,4.34E-05,5.5E-06,I-Ek plus MCC peptide,2B4 TCR,IKEEHTIIQAEFYLLPDKRGEFMFDFDGDEIFHVDIEKSETIWRLE...,GSGGGGSRPWFLEYCKSECHFYNGTQRVRLLVRYFYNLEENLRFDS...,


In [6]:
# adjust the columns of wt_seq1 and wt_seq2, so that mutations will consistently apply to wt_seq2
sample_1 = 'TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLDLRYNRVRVFYNPGTNVVNHVPHVG'
sample_2 = 'TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTGDLRYNRVRVFYNPGTNVVNHVPHVG'
sample_3 = 'AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVGGASFVAGEAYNTDGNGHGTHVAGTVAALDNTTGVLGVAPSVSLYAVKVLNSSGSGSYSGIVSGIEWATTNGMDVINMSLGGASGSTAMKQAVDNAYARGVVVVAAAGNSGNSGSTNTIGYPAKYDSVIAVGAVDSNSNRASFSSVGAELE'

In [7]:
fuzz.ratio(sample_1, sample_3)

29

In [18]:
df_ = df.copy()
df_.dropna(inplace=True)
df_['wild_seq_1'] = df_['wild_seq1']
df_['wild_seq_2'] = df_['wild_seq2']
df_['Protein_1'] = df_['Protein 1']
df_['Protein_2'] = df_['Protein 2']
#df_ = df_.reindex(columns=df_.columns.values)
df_

Unnamed: 0,pdb,Mutation(s)_PDB,Affinity_mut (M),Affinity_wt (M),Protein 1,Protein 2,wild_seq1,wild_seq2,mutant_seq,wild_seq_1,wild_seq_2,Protein_1,Protein_2
0,1CSE,LI45G,5.26E-11,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTGD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
1,1CSE,LI45S,8.33E-12,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTSD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
2,1CSE,LI45P,1.02E-07,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTPD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
3,1CSE,LI45I,1.72E-10,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTID...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
4,1CSE,LI45D,1.92E-09,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTDD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12215,2WPT,"DA33A,FB86A",6.21E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,Colicin E2 immunity protein,Colicin E9 DNase
12216,2WPT,NA34A,6.49E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDAKLVREFERLTEH...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,Colicin E2 immunity protein,Colicin E9 DNase
12217,2WPT,SB84A,7.09E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,Colicin E2 immunity protein,Colicin E9 DNase
12218,2WPT,"SB84A,NA34A",3.92E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,Colicin E2 immunity protein,Colicin E9 DNase


In [17]:
df_ = df_.iloc[:100]
df_

Unnamed: 0,pdb,Mutation(s)_PDB,Affinity_mut (M),Affinity_wt (M),Protein 1,Protein 2,wild_seq1,wild_seq2,mutant_seq,wild_seq_1,wild_seq_2,Protein_1,Protein_2
0,1CSE,LI45G,5.26E-11,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTGD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
1,1CSE,LI45S,8.33E-12,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTSD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
2,1CSE,LI45P,1.02E-07,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTPD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
3,1CSE,LI45I,1.72E-10,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTID...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
4,1CSE,LI45D,1.92E-09,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTDD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,1IAR,RA85A,3.33E-10,1.62E-10,Interleukin-4,Interleukin-4 receptor,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEA...,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEA...,Interleukin-4,Interleukin-4 receptor
101,1IAR,RA85E,1.28E-09,1.62E-10,Interleukin-4,Interleukin-4 receptor,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEA...,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEA...,Interleukin-4,Interleukin-4 receptor
102,1IAR,RA88Q,1.92E-08,1.62E-10,Interleukin-4,Interleukin-4 receptor,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEA...,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEA...,Interleukin-4,Interleukin-4 receptor
103,1IAR,RA88D,n.b,1.62E-10,Interleukin-4,Interleukin-4 receptor,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEA...,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,HKCDITLQEIIKTLNSLTEQKTLCTELTVTDIFAASKNTTEKETFC...,FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEA...,Interleukin-4,Interleukin-4 receptor


In [19]:
"""
    This chunk should re-organize the dataframe so that:
    1. all the sequence listed in wt_seq2 will have positions mutated
    2. if the wt_seq2 and wt_seq1 flipped, so well columns [Protein1, Protein2]
"""
for index, row in df_.iterrows():
    if fuzz.ratio(row['wild_seq1'], row['mutant_seq']) > fuzz.ratio(row['wild_seq2'], row['mutant_seq']) :
        # then the two sequences are similar
        # flip the sequence 1 to sequence 2
            df_.at[index, 'wild_seq_1'] = row['wild_seq2']
            df_.at[index, 'wild_seq_2'] = row['wild_seq1']
            df_.at[index, 'Protein_1'] = row['Protein 2']
            df_.at[index, 'Protein_2'] = row['Protein 1']
    elif fuzz.ratio(row['wild_seq1'], row['mutant_seq'])  < fuzz.ratio(row['wild_seq2'], row['mutant_seq']) :
        pass
    else:
        print(index, fuzz.ratio(row['wild_seq1'], row['mutant_seq']), fuzz.ratio(row['wild_seq2'], row['mutant_seq']))
        print('mutate both sequences?')
df_

7647 100 100
mutate both sequences?


Unnamed: 0,pdb,Mutation(s)_PDB,Affinity_mut (M),Affinity_wt (M),Protein 1,Protein 2,wild_seq1,wild_seq2,mutant_seq,wild_seq_1,wild_seq_2,Protein_1,Protein_2
0,1CSE,LI45G,5.26E-11,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTGD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
1,1CSE,LI45S,8.33E-12,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTSD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
2,1CSE,LI45P,1.02E-07,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTPD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
3,1CSE,LI45I,1.72E-10,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTID...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
4,1CSE,LI45D,1.92E-09,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTDD...,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,Subtilisin Carlsberg,Eglin c
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12215,2WPT,"DA33A,FB86A",6.21E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,Colicin E2 immunity protein,Colicin E9 DNase
12216,2WPT,NA34A,6.49E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDAKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,Colicin E9 DNase,Colicin E2 immunity protein
12217,2WPT,SB84A,7.09E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,Colicin E2 immunity protein,Colicin E9 DNase
12218,2WPT,"SB84A,NA34A",3.92E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,Colicin E2 immunity protein,Colicin E9 DNase


In [70]:
df_.drop(columns=['Protein 1', 'Protein 2', 'wild_seq1', 'wild_seq2'], inplace=True)
cols = ['pdb', 'Mutation(s)_PDB', 'Affinity_mut (M)', 'Affinity_wt (M)','Protein_1',
        'Protein_2','wild_seq_1','wild_seq_2', 'mutant_seq']
df_ = df_[cols]
df_

Unnamed: 0,pdb,Mutation(s)_PDB,Affinity_mut (M),Affinity_wt (M),Protein_1,Protein_2,wild_seq_1,wild_seq_2,mutant_seq
0,1CSE,LI45G,5.26E-11,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTGD...
1,1CSE,LI45S,8.33E-12,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTSD...
2,1CSE,LI45P,1.02E-07,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTPD...
3,1CSE,LI45I,1.72E-10,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTID...
4,1CSE,LI45D,1.92E-09,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTDD...
...,...,...,...,...,...,...,...,...,...
12215,2WPT,"DA33A,FB86A",6.21E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...
12216,2WPT,NA34A,6.49E-07,8.2E-07,Colicin E9 DNase,Colicin E2 immunity protein,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDAKLVREFERLTEH...
12217,2WPT,SB84A,7.09E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...
12218,2WPT,"SB84A,NA34A",3.92E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...


In [64]:
seqs_tuples_wt1 = list(set(zip(df_.Protein_1, df_.wild_seq_1)))
print(len(seqs_tuples_wt1))
seqs_tuples_wt1

9


[('Subtilisin BPN',
  'AQSVPYGVSQIKAPALHSQGYTGSNVKVAVIDSGIDSSHPDLKVAGGASMVPSETNPFQDNNSHGTHVAGTVAALNNSIGVLGVAPSASLYAVKVLGADGSGQYSWIINGIEWAIANNMDVINMSLGGPSGSAALKAAVDKAVASGVVVVAAAGNEGTSGSSSTVGYPGKYPSVIAVGAVDSSNQRASFSSVGPELDVMAPGVSIQSTLPGNKYGAYNGTSMASPHVAGAAALILSKHPNWTNTQVRSSLENTTTKLGDSFYYGKGLINVQAAAQ'),
 ('Interleukin-4 receptor',
  'FKVLQEPTCVSDYMSISTCEWKMNGPTNCSTELRLLYQLVFLLSEAHTCIPENNGGAGCVCHLLMDDVVSADNYTLDLWAGQQLLWKGSFKPSEHVKPRAPGNLTVHTNVSDTLLLTWSNPYPPDNYLYNHLTYAVNIWSENDPADFRIYNVTYLEPSLRIAASTLKSGISYRARVRAWAQAYNTTWSEWSPSTKWHNSYREPFEQH'),
 ('Subtilisin BPN',
  'AQSVPYGVSQIKAPALHSQGYTGSNVKVAVIDSGIDSSHPDLKVAGGASMVPSETNPFQDNNSHGTHVAGTVAALNNSIGVLGVAPSASLYAVKVLGADGSGQYSWIINGIEWAIANNMDVINMSLGGPSGSAALKAAVDKAVASGVVVVAAAGNEGTSGSSSTVGYPGKYPSVIAVGAVDSSNQRASFSSVGPELDVMAPGVSIQSTLPGNKYGAYNGTSMASPHVAGAAALILSKHPNWTNTQVRSSLENTTTKLGDSFYYGKGLINVQAAAQHHHHHH'),
 ('Subtilisin Carlsberg',
  'AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVGGASFVAGEAYNTDGNGHGTHVAGTVAALDNTTGVLGVAPSVSLYAVKVLNSSGSGSYSGIVSGIEWATTNGMD

In [125]:
# Generate sequence embeddings to the proteins
seqs_wt1 = df_.wild_seq_1.values.tolist()
seqs_wt2 = df_.wild_seq_2.values.tolist()
seqs_mut = df_.mutant_seq.values.tolist()
print(len(seqs_wt1))
print(len(seqs_mut))
seqs_wt1 = set(seqs_wt1)
seqs_wt2 = set(seqs_wt2)
seqs_mut = set(seqs_mut)
print(len(seqs_wt1))
print(len(seqs_wt2))
print(len(seqs_mut))
seqs_mut

7187
7187
266
328
1525


{'FPTIPLSRLFDNAMLRAHRLHQLAFDTYQEFEEAYIPKEQKYSFLQNPQTSLCFSESIPTPSNREETQQKSNLELLRISLLLIQSWLEPVQFLRSVFANSLAYGASDSNVYDLLKDLEERIQTLMGRLEDGSPRTGQIFKQTYSKFDTNSHNDDALLKNYGLLYCFRKDMDKVETFLRIVQCRSVEGSCGF',
 'VDCSEYPKPACTHEYRPLCGSDNKTYGNKCNFCNAVVESNGTLTLSHFGKC',
 'MEGCVSNIMICNLAYSGKLDELKERILADKSLATRTDQASRTALHWACSAGHTEIVEFLLQLGVPVNDKDDAGWSPLHIAASAGRDEIVKALLVKGAHVNAVNQNGCTPLHYAASKNRHEIAVMLLEGGANPDAKDHYDATAMHRAAAKGNLKMVHILLFYKASTNIQDTEGNTPLHLACDEERVEEAKFLVTQGASIYIENKEEKTPLQVAKGGLGLILKRLAEGEEASM',
 'KKVVLGKKGDTVELTCTASQKKSIQFHWKNSNQIKILGNQGSFLTKGPSKLNDRADSRRSLWDQGNFPLIIKNLKIEDSDTYICEVEDQAEEVQLLVFGLTANSDTHLLQGQSLTLTLESPPGSSPSVQCRSPRGKNIQGGKTLSVSQLELQDSGTWTCTVLQNQKKVEFKIDIVVLAFQKASNT',
 'MLRDLFDRAVVLSHYIANLSSEMFSEFDKRYTHGRGFITKAINSCHTSSLATPEDKEQAQQMNQKDFLSLIVSILRSWNEPLYHLVTEVRGMQEAPEAILSKAVEIEEQTKRLLERMELIVSQVHPETKENEIYPVWSGLPSLQMADEESRLSAYYNLLHCLRRDSHKIDNYLKLLKCRIIHNNNC',
 'KFIYGDVDGNGSVNSIDAVLIRDYVLGKINEFPYEYGMLAADVDGNGSIKINDAVLVRDYVLGKIFLFPVEEKEE',
 'MAASRELMKELEEIRKCGMKNFRNIQVDEANLLTWQGLIVPDNPPYDKG

In [None]:
# 289 vs 266
# some of the proteins are mislabeled, they share same sequences, but protein names got mislabeled

In [72]:
seqs_labeled_wt1 = []
count = 0
for seq in seqs_wt1:
    seqs_labeled_wt1.append(tuple((str('seq' + str(count)), seq)))
    count += 1
seqs_labeled_wt1

[('seq0',
  'RSSNELHQVPSNCDCLNGGTCVSNKYFSNIHWCNCPKKFGGQHCEIDKSKTCYEGNGHFYRGKASTDTMGRPCLPWNSATVLQQTYHAHRSDALQLGLGKHNYCRNPDNRRRPWCYVQVGLKPLVQECMVHDCAD'),
 ('seq1', 'ALGLLAPLRETKSTNPASRVMEMEPETMETKSVIDSRV'),
 ('seq2',
  'THTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPQVKFNWYVDGVQVHNAKTKPREQQYNSTYRVVSVLTVLHQNWLDGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPG'),
 ('seq3',
  'MLSGVWFLSVLTVAGILQTESRKTAKDICKIRCLCEEKENVLNINCENKGFTTVSLLQPPQYRIYQLFLNGNLLTRLYPNEFVNYSNAVTLHLGNNGLQEIRPGAFSGLKTLKRLHLNNNKLEVLREDTFLGLESLEYLQADYNYISTIEAGAFSKLNKLKVLILNDNLLLSLPSNVFRFVLLTHLDLRGNRLKVMPFAGVLEHIGGIMEIQLEENPWNCTCDLLPLKAWLDTITVFVGEIVCETPFRLHGKDVTQLTRQDLCPRKHHHHHH'),
 ('seq4',
  'MDRRQKRLIFSTITSKMNLSEEVDLEDYVARPDKISGADINSICQESGMLAVRENRYIVLAKDFEKAYKTVIKKDEQEHEFYK'),
 ('seq5',
  'APLAHCDGRGGLKLSQDMDTCEDILPCVPFSVAKSVKSLYLGRMFSGTPVIRLRFKRLQPTRLVAEFDFRTFDPEGILLFAGGHQDSTWIVLALRAGRLELQLRYNGVGRVTSSGPVINHGMWQTISVEELARNLVIKVNRDAVMKIAVAGD

In [129]:
seqs_labeled_wt2 = []
count = 0
for seq in seqs_wt2:
    seqs_labeled_wt2.append(tuple((str('seq' + str(count)), seq)))
    count += 1
seqs_labeled_mut = []
count = 0
for seq in seqs_mut:
    seqs_labeled_mut.append(tuple((str('seq' + str(count)), seq)))
    count += 1

TypeError: 'str' object is not callable

In [77]:
# Load ESM-2 model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()  # disables dropout for deterministic results

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bia

In [73]:
# find longest sequence
len(max(list(zip(*seqs_labeled_wt1))[1], key=len))

1295

In [84]:
# alternative way to generate batches
from torch.utils.data import TensorDataset
from esm.constants import proteinseq_toks
from esm import Alphabet, FastaBatchedDataset, ProteinBertModel, pretrained
batch_size = 1000
dataset = FastaBatchedDataset(list(zip(*seqs_labeled_wt1))[0], list(zip(*seqs_labeled_wt1))[1])
batches = dataset.get_batch_indices(batch_size, extra_toks_per_seq=1)
data_loader = torch.utils.data.DataLoader(dataset, collate_fn=Alphabet.from_architecture("roberta_large").get_batch_converter(),
            batch_sampler=batches, pin_memory=True)
dataset_seq2 = FastaBatchedDataset(list(zip(*seqs_labeled_wt2))[0], list(zip(*seqs_labeled_wt2))[1])
batches_seq2 = dataset_seq2.get_batch_indices(batch_size, extra_toks_per_seq=1)
data_loader_seq2 = torch.utils.data.DataLoader(dataset_seq2, collate_fn=Alphabet.from_architecture("roberta_large").get_batch_converter(),
            batch_sampler=batches_seq2, pin_memory=True)
dataset_mut = FastaBatchedDataset(list(zip(*seqs_labeled_mut))[0], list(zip(*seqs_labeled_mut))[1])
batches_mut = dataset_mut.get_batch_indices(batch_size, extra_toks_per_seq=1)
data_loader_mut = torch.utils.data.DataLoader(dataset_mut, collate_fn=Alphabet.from_architecture("roberta_large").get_batch_converter(),
            batch_sampler=batches_mut, pin_memory=True)

In [85]:
torch.cuda.empty_cache()

In [80]:
if torch.cuda.is_available():
    model = model.cuda()
    print('Transferred model to GPU')

Transferred model to GPU


In [86]:
for batch_idx, (labels, strs, toks) in enumerate(data_loader):
    print(batch_idx,labels)

0 ['seq80', 'seq218', 'seq135', 'seq23', 'seq220', 'seq120', 'seq186', 'seq188', 'seq242', 'seq1', 'seq206', 'seq98', 'seq253', 'seq97', 'seq130', 'seq12']
1 ['seq230', 'seq146', 'seq26', 'seq84', 'seq155', 'seq51', 'seq63', 'seq102', 'seq226', 'seq87', 'seq46', 'seq66']
2 ['seq71', 'seq137', 'seq187', 'seq15', 'seq30', 'seq88', 'seq221', 'seq4', 'seq8', 'seq33', 'seq73']
3 ['seq162', 'seq142', 'seq202', 'seq56', 'seq58', 'seq96', 'seq139', 'seq20', 'seq93', 'seq101']
4 ['seq250', 'seq37', 'seq196', 'seq27', 'seq47', 'seq171', 'seq126', 'seq233', 'seq57']
5 ['seq100', 'seq179', 'seq248', 'seq24', 'seq83', 'seq131', 'seq70', 'seq207']
6 ['seq41', 'seq254', 'seq92', 'seq159', 'seq246', 'seq152', 'seq157', 'seq261']
7 ['seq10', 'seq176', 'seq154', 'seq191', 'seq54', 'seq228', 'seq6']
8 ['seq167', 'seq144', 'seq232', 'seq172', 'seq36', 'seq197', 'seq203']
9 ['seq205', 'seq209', 'seq0', 'seq52', 'seq105', 'seq141', 'seq234']
10 ['seq103', 'seq34', 'seq150', 'seq225', 'seq79', 'seq44']
11 ['

In [89]:
a = np.array([[[1,2,3],
               [4,5,6],
               [7,8,9],
              [10,11,12]],
              [[7,8,9],
              [10,11,12],
               [7,8,9],
              [10,11,12]]])
np.shape(a)

(2, 4, 3)

In [54]:
a

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]],

       [[ 7,  8,  9],
        [10, 11, 12],
        [ 7,  8,  9],
        [10, 11, 12]]])

In [57]:
a[1, 1:-1]

array([[10, 11, 12],
       [ 7,  8,  9]])

In [127]:
representation_store_dict = {}
for batch_idx, (labels, strs, toks) in enumerate(data_loader):
    if torch.cuda.is_available():
        toks = toks.to(device='cuda', non_blocking=True)
    with torch.no_grad():
        results = model(toks, repr_layers = [33], return_contacts = True)['representations'][33]
        #results = model(toks, repr_layers = [33], return_contacts = True)['logits']
    print(results.shape)
    results_cpu = results.to(device='cpu')
    for i, str_ in enumerate(strs):
        #if str not in representation_store_dict:
            #representation_store_dict[str] = []
        # only select representations relate to the sequence
        # rest of the sequences are paddings, check notebook
        # create dictionary {sequence: embeddings}
        representation_store_dict[str_] = results_cpu[i, 1: (len(strs[i])+1)].numpy()

torch.Size([16, 60, 1280])
torch.Size([12, 77, 1280])
torch.Size([11, 88, 1280])
torch.Size([10, 98, 1280])
torch.Size([9, 112, 1280])
torch.Size([8, 116, 1280])
torch.Size([8, 124, 1280])
torch.Size([7, 131, 1280])
torch.Size([7, 136, 1280])
torch.Size([7, 138, 1280])
torch.Size([6, 150, 1280])
torch.Size([6, 160, 1280])
torch.Size([6, 167, 1280])
torch.Size([5, 168, 1280])
torch.Size([5, 172, 1280])
torch.Size([5, 180, 1280])
torch.Size([5, 187, 1280])
torch.Size([5, 199, 1280])
torch.Size([4, 208, 1280])
torch.Size([4, 212, 1280])
torch.Size([4, 213, 1280])
torch.Size([4, 216, 1280])
torch.Size([4, 216, 1280])
torch.Size([4, 218, 1280])
torch.Size([4, 221, 1280])
torch.Size([4, 226, 1280])
torch.Size([4, 233, 1280])
torch.Size([4, 241, 1280])
torch.Size([4, 243, 1280])
torch.Size([4, 250, 1280])
torch.Size([3, 264, 1280])
torch.Size([3, 265, 1280])
torch.Size([3, 267, 1280])
torch.Size([3, 274, 1280])
torch.Size([3, 276, 1280])
torch.Size([3, 283, 1280])
torch.Size([3, 284, 1280])
t

In [115]:
representation_store_dict

{'ARTKQTARKS': array([[ 0.02447946,  0.01928901, -0.03956539, ...,  0.12661682,
         -0.09285621,  0.01617499],
        [ 0.10751038,  0.06919377,  0.12876596, ..., -0.00814786,
          0.02145161, -0.02784804],
        [ 0.00273106,  0.03329812,  0.06105419, ...,  0.07983748,
         -0.03923513, -0.00519595],
        ...,
        [ 0.07251852,  0.07814158,  0.15600905, ...,  0.02425219,
          0.00744778, -0.06048933],
        [-0.06360918,  0.06384679,  0.1498925 , ..., -0.01001437,
         -0.13084146,  0.01428795],
        [ 0.07733045,  0.06695724,  0.13977003, ...,  0.08957948,
         -0.2533254 ,  0.00227757]], dtype=float32),
 'YMDFDDDIPF': array([[ 0.13910443, -0.07379941, -0.29339817, ...,  0.00111225,
         -0.07300624, -0.07609034],
        [ 0.05488537, -0.12718895, -0.0194888 , ...,  0.01878373,
         -0.191497  ,  0.02732703],
        [ 0.06376954, -0.13735689, -0.09811898, ..., -0.12271725,
          0.07584549,  0.06119155],
        ...,
        [ 0

In [119]:
df_

Unnamed: 0,pdb,Mutation(s)_PDB,Affinity_mut (M),Affinity_wt (M),Protein_1,Protein_2,wild_seq_1,wild_seq_2,mutant_seq
0,1CSE,LI45G,5.26E-11,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTGD...
1,1CSE,LI45S,8.33E-12,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTSD...
2,1CSE,LI45P,1.02E-07,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTPD...
3,1CSE,LI45I,1.72E-10,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTID...
4,1CSE,LI45D,1.92E-09,1.12E-12,Subtilisin Carlsberg,Eglin c,AQTVPYGIPLIKADKVQAQGFKGANVKVAVLDTGIQASHPDLNVVG...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTLD...,TEFGSELKSFPEVVGKTVDQAREYFTLHYPQYNVYFLPEGSPVTDD...
...,...,...,...,...,...,...,...,...,...
12215,2WPT,"DA33A,FB86A",6.21E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...
12216,2WPT,NA34A,6.49E-07,8.2E-07,Colicin E9 DNase,Colicin E2 immunity protein,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MELKHSISDYTEAEFLEFVKKIARAEGATECDDAKLVREFERLTEH...
12217,2WPT,SB84A,7.09E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...
12218,2WPT,"SB84A,NA34A",3.92E-07,8.2E-07,Colicin E2 immunity protein,Colicin E9 DNase,MELKHSISDYTEAEFLEFVKKIARAEGATECDDNKLVREFERLTEH...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...,MESKRNKPGKATGKGKPVGDKWLDDAGKDSGAPIPDRIADKLRDKE...


In [93]:
seqs_labeled_wt1

[('seq0',
  'RSSNELHQVPSNCDCLNGGTCVSNKYFSNIHWCNCPKKFGGQHCEIDKSKTCYEGNGHFYRGKASTDTMGRPCLPWNSATVLQQTYHAHRSDALQLGLGKHNYCRNPDNRRRPWCYVQVGLKPLVQECMVHDCAD'),
 ('seq1', 'ALGLLAPLRETKSTNPASRVMEMEPETMETKSVIDSRV'),
 ('seq2',
  'THTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPQVKFNWYVDGVQVHNAKTKPREQQYNSTYRVVSVLTVLHQNWLDGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPG'),
 ('seq3',
  'MLSGVWFLSVLTVAGILQTESRKTAKDICKIRCLCEEKENVLNINCENKGFTTVSLLQPPQYRIYQLFLNGNLLTRLYPNEFVNYSNAVTLHLGNNGLQEIRPGAFSGLKTLKRLHLNNNKLEVLREDTFLGLESLEYLQADYNYISTIEAGAFSKLNKLKVLILNDNLLLSLPSNVFRFVLLTHLDLRGNRLKVMPFAGVLEHIGGIMEIQLEENPWNCTCDLLPLKAWLDTITVFVGEIVCETPFRLHGKDVTQLTRQDLCPRKHHHHHH'),
 ('seq4',
  'MDRRQKRLIFSTITSKMNLSEEVDLEDYVARPDKISGADINSICQESGMLAVRENRYIVLAKDFEKAYKTVIKKDEQEHEFYK'),
 ('seq5',
  'APLAHCDGRGGLKLSQDMDTCEDILPCVPFSVAKSVKSLYLGRMFSGTPVIRLRFKRLQPTRLVAEFDFRTFDPEGILLFAGGHQDSTWIVLALRAGRLELQLRYNGVGRVTSSGPVINHGMWQTISVEELARNLVIKVNRDAVMKIAVAGD

In [122]:
# take the average of the representations of the proteins
# for umaps, along axis 0
sequence_embeddings = {key: np.mean(value, axis=0, keepdims=True) for key, value in representation_store_dict.items()}
#print({key: value.shape for key, value in sequence_embeddings.items()})
sequence_embeddings

{'ARTKQTARKS': array([[ 0.0223621 ,  0.06744134,  0.06751989, ...,  0.0498579 ,
         -0.09143854,  0.00964986]], dtype=float32),
 'YMDFDDDIPF': array([[ 0.04857468, -0.00012262, -0.10690913, ..., -0.11258976,
         -0.04252142,  0.07379431]], dtype=float32),
 'CGVPAIQPVLSGL': array([[ 0.00409246,  0.07187316,  0.08544851, ..., -0.01148665,
         -0.04319402,  0.05069447]], dtype=float32),
 'ARTKQTARKSTGGKA': array([[-0.03295551,  0.05737768, -0.0015403 , ...,  0.09619958,
         -0.08495762,  0.02021424]], dtype=float32),
 'AVPIAQKSEPHSLSS': array([[ 0.01182264,  0.13893493,  0.05097224, ..., -0.09801121,
         -0.09132418, -0.00897526]], dtype=float32),
 'ARTKQTARKSTGGKAY': array([[-0.05625677,  0.04186503,  0.03291382, ...,  0.09167849,
         -0.06138652,  0.02887965]], dtype=float32),
 'KEPDYLDIPAFLRKQAD': array([[ 0.02545632,  0.01806677, -0.04201803, ..., -0.09341508,
         -0.05944117, -0.04591647]], dtype=float32),
 'XRRRRCPLYISYDPVCRRRRX': array([[-0.015403

In [None]:
representation_store_dict_seq2 = {}
for batch_idx, (labels, strs, toks) in enumerate(data_loader_seq2):
    if torch.cuda.is_available():
        toks = toks.to(device='cuda', non_blocking=True)
    with torch.no_grad():
        results = model(toks, repr_layers = [33], return_contacts = True)['representations'][33]
        #results = model(toks, repr_layers = [33], return_contacts = True)['logits']
    print(results.shape)
    results_cpu = results.to(device='cpu')
    for i, str_ in enumerate(strs):
        #if str not in representation_store_dict:
            #representation_store_dict[str] = []
        # only select representations relate to the sequence
        # rest of the sequences are paddings, check notebook
        # create dictionary {sequence: embeddings}
        representation_store_dict_seq2[str_] = results_cpu[i, 1: (len(strs[i])+1)].numpy()
sequence_embeddings_seq2 = {key: np.mean(value, axis=0, keepdims=True) for key, value in representation_store_dict_seq2.items()}
representation_store_dict_mut = {}
for batch_idx, (labels, strs, toks) in enumerate(data_loader_mut):
    if torch.cuda.is_available():
        toks = toks.to(device='cuda', non_blocking=True)
    with torch.no_grad():
        results = model(toks, repr_layers = [33], return_contacts = True)['representations'][33]
        #results = model(toks, repr_layers = [33], return_contacts = True)['logits']
    print(results.shape)
    results_cpu = results.to(device='cpu')
    for i, str_ in enumerate(strs):
        #if str not in representation_store_dict:
            #representation_store_dict[str] = []
        # only select representations relate to the sequence
        # rest of the sequences are paddings, check notebook
        # create dictionary {sequence: embeddings}
        representation_store_dict_mut[str_] = results_cpu[i, 1: (len(strs[i])+1)].numpy()
sequence_embeddings_mut = {key: np.mean(value, axis=0, keepdims=True) for key, value in representation_store_dict_mut.items()}


In [None]:
def update_embeddings(row, embedding_dict):
    """
    add embeddings to the metadata column.
    cannot do the reverse, because due to mislabel, several different protein names share the same sequences
    but as long as sequences are correct, so will the embeddings
    """
    for key, value in embedding_dict.items():
        if row == key:
            return value
df_['wild_seq_1_embeddings'] = df_['wild_seq_1'].apply(update_embeddings, sequence_embeddings)
df_['wild_seq_2_embeddings'] = df_['wild_seq_2'].apply(update_embeddings, sequence_embeddings_seq2)
df_['mutant_seq_embeddings'] = df_['mutant_seq'].apply(update_embeddings, sequence_embeddings_mut)
df_

In [None]:
pd.DataFrame(representation_store_dict['ARTKQTARKS'][0])

In [None]:
if torch.cuda.is_available():
    batch_tokens = batch_tokens.to(device='cuda', non_blocking=True)
    model = model.cuda()
    print('Transferred model to GPU')

In [None]:
output = self.model(tokens, repr_layers=repr_layers, return_contacts=return_contacts)

In [None]:
batch_labels, batch_strs, batch_tokens = batch_converter(seqs_tuples_wt1)
if torch.cuda.is_available():
    batch_tokens = batch_tokens.to(device='cuda', non_blocking=True)
    model = model.cuda()
    print('Transferred model to GPU')
with torch.no_grad():
    output = model(batch_tokens, repr_layers=[33], return_contacts=False)