In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 3.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 42.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
# default_exp prot_bert

In [None]:
#hide
from nbdev.showdoc import *

In [29]:
# export
from transformers import BertForMaskedLM, BertTokenizer, pipeline
import torch
import pandas as pd

# Bert new approach

In [26]:
class modelPredDF():
    """Per-position prediction table for a sequence.

    The resulting ``predDf`` DataFrame has one row per key of ``predDict`` and
    the columns ``wt`` (residue of ``seq`` at that row), ``wtIndex`` (1-based
    position), ``wtScore`` (normalised score of the wild-type residue) followed
    by one column per character of ``aas``.

    Args:
        predDict: mapping of row key -> list of scores, one per character of ``aas``.
        seq: the sequence the predictions were made for; must have one
            element per row of ``predDict``.
        aas: residue letters naming the score columns, in order.
    """

    def __init__(self, predDict, seq, aas):
        scores = pd.DataFrame.from_dict(predDict, orient="index", columns=list(aas))
        # Renormalise every row so the scores over `aas` sum to 1
        # (a row summing to 0 becomes NaN).
        self.predDf = scores.div(scores.sum(axis=1), axis=0)
        self.predDf.insert(0, "wt", list(seq))
        self.predDf.insert(1, "wtIndex", list(range(1, len(seq) + 1)))
        self.predDf.insert(2, "wtScore", self.wtScoreCol())

    def wtScoreCol(self):
        """Return, for each row, the normalised score of that row's ``wt`` residue.

        A residue that has no column in the table (i.e. is not in ``aas``)
        yields NaN instead of raising ``KeyError``.
        """
        missing = float("nan")
        return [
            row.get(row["wt"], missing)
            for row in self.predDf.to_dict(orient="records")
        ]

In [4]:
def load_model(model_path, tokenizerLM, maskedLM):
  """Instantiate a tokenizer and a model from the same checkpoint.

  Args:
    model_path: identifier passed to both `from_pretrained` calls.
    tokenizerLM: object exposing `from_pretrained` that builds the tokenizer.
    maskedLM: object exposing `from_pretrained` that builds the model.

  Returns:
    A `(tokenizer, model)` tuple.
  """
  # Tuple elements are evaluated left to right: tokenizer first, then model.
  return tokenizerLM.from_pretrained(model_path), maskedLM.from_pretrained(model_path)

I'm wondering how crazy it would be to generalize this further: if you just provide the names for `tokenizerLM` and `maskedLM`, the function itself could do the equivalent of `from transformers import tokenizerLM, maskedLM`. That sounds like crazy talk...

In [7]:
bert_tokenizer, bert_model = load_model("Rostlab/prot_bert",BertTokenizer,BertForMaskedLM)

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def run_model(model, inputs):
  """Call `model` on `inputs` with gradient tracking disabled and return its logits.

  Args:
    model: callable accepting the entries of `inputs` as keyword arguments
      and returning an object with a `logits` attribute.
    inputs: mapping that is unpacked into the model call.

  Returns:
    The `logits` attribute of the model output.
  """
  with torch.no_grad():
    output = model(**inputs)
  return output.logits

In [8]:
bert_tokenizer.mask_token

'[MASK]'

In [18]:
bert_tokenizer("M E N [MASK] E L")

{'input_ids': [2, 21, 9, 17, 4, 9, 5, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
def maskifySeq(seq, tokenizer, i , sep=""):
    """Join the residues of `seq` with `sep`, optionally masking one position.

    Args:
        seq: sequence to format, one character per residue.
        tokenizer: object whose `mask_token` attribute is the mask string.
        i: 0-based position to replace with the mask token, or None to leave
            the sequence unmasked.
        sep: string placed between residues.

    Returns:
        The joined sequence string.

    Raises:
        IndexError: if `i` is outside the sequence.
    """
    seqList = list(seq)
    if i is not None:
        seqList[i] = tokenizer.mask_token
    # Return outside the `if` so the unmasked case (i=None) also yields the
    # joined sequence rather than an implicit None.
    return sep.join(seqList)

In [20]:
def tokenizeSeq(seq, tokenizer, mask_index = None, sep="", return_tensors = "pt"):
  """Format `seq` with `maskifySeq` and pass the result through `tokenizer`.

  Args:
    seq: sequence to tokenize.
    tokenizer: callable tokenizer; also handed to `maskifySeq`.
    mask_index: position forwarded to `maskifySeq` as the index to mask.
    sep: separator forwarded to `maskifySeq`.
    return_tensors: forwarded unchanged to the tokenizer call.

  Returns:
    Whatever `tokenizer(...)` returns for the formatted sequence.
  """
  return tokenizer(
      maskifySeq(seq, tokenizer, mask_index, sep),
      return_tensors=return_tensors,
  )

In [27]:
mendel_mask3 = tokenizeSeq("MENDEL", bert_tokenizer, mask_index=3, sep = " ")

In [30]:
run_model(bert_model, mendel_mask3)

tensor([[[-1.9025e+01, -1.9418e+01, -1.9070e+01, -2.0259e+01, -2.1078e+01,
           6.8966e-01, -5.8320e-02,  9.9683e-02,  1.1744e-01,  9.2746e-01,
           1.7481e-01,  2.6523e-01,  1.0348e+00,  3.9002e-01, -1.5987e-01,
           9.6185e-02, -7.2865e-01,  2.0590e-01,  1.6183e-01, -6.3175e-01,
          -8.3162e-01,  2.3495e-01, -8.9098e-01, -1.4568e+00, -1.2295e+00,
          -5.1355e+00, -1.8929e+01, -1.8691e+01, -1.8911e+01, -1.9173e+01],
         [-1.8777e+01, -1.9759e+01, -1.9174e+01, -1.7491e+01, -2.1409e+01,
           4.5803e-01, -1.3080e-01, -4.1512e-01, -8.9476e-02,  4.4306e-01,
          -2.1111e-01, -2.7136e-01,  1.9480e-01, -3.5983e-01, -4.5520e-01,
          -4.6519e-01, -1.1010e+00, -3.5952e-01, -4.2683e-01, -8.3092e-01,
          -1.0543e+00,  3.9943e+00, -1.4092e+00, -1.9026e+00, -1.7119e+00,
          -3.4918e+00, -1.7906e+01, -1.8299e+01, -1.9515e+01, -1.9495e+01],
         [-2.1036e+01, -2.1122e+01, -2.1379e+01, -1.9386e+01, -2.2437e+01,
           3.5269e-01, 

So it seems like I'm getting to a decent generalized approach that works for both ESM and BERT. They have different mask tokens, which can be read from their respective tokenizers. They also have different separators: BERT expects the sequence to be space separated, which is strange. I don't know if ESM supports space-separated input, and I also don't know if the separator can be inherited from the tokenizer.

In [38]:
def naturalAAIndex(aas, tokenizer, sep = ""):
    """Return the tokenizer input ids for the residues in `aas`.

    The residues are tokenized as one unmasked sequence, and the first and
    last ids are dropped (presumably the special tokens the tokenizer adds
    at either end; confirm for other tokenizers).
    """
    encoded = tokenizeSeq(aas, tokenizer, return_tensors=None, sep = sep)
    token_ids = encoded["input_ids"]
    return token_ids[1:-1]

In [33]:
def getNatProbs(natAAList,probList):
    """Pick the entries of `probList` at the indices listed in `natAAList`.

    Returns a new list, ordered like `natAAList`.
    """
    return [probList[token_id] for token_id in natAAList]

In [34]:
def logits2prob(logits):
  """Apply a softmax over dimension 2 of `logits` and return the result."""
  return logits.softmax(dim=2)

In [39]:
naturalAAIndex("ACDEFGHIKLMNPQRSTVWY",bert_tokenizer, sep = " ")


[6, 23, 14, 9, 19, 7, 22, 11, 12, 5, 21, 17, 16, 18, 13, 10, 15, 8, 24, 20]

In [40]:
def bertPredictionDF(seq, tokenizer, model, aas = "ACDEFGHIKLMNPQRSTVWY"):
  """Mask each position of `seq` in turn and tabulate the model's residue probabilities.

  Args:
    seq: sequence to scan, one character per residue.
    tokenizer: tokenizer passed to `naturalAAIndex` and `tokenizeSeq`.
    model: model passed to `run_model`.
    aas: residue letters to report; their order defines the column order.

  Returns:
    The `predDf` attribute of `modelPredDF` built from the collected probabilities.
  """
  aa_token_ids = naturalAAIndex(aas, tokenizer, sep = " ")
  probs_by_position = {}
  for position in range(len(seq)):
    encoded = tokenizeSeq(seq, tokenizer, mask_index = position, sep = " ")
    probs = logits2prob(run_model(model, encoded))
    # First batch entry; the +1 presumably skips a leading special token
    # added by the tokenizer (confirm for other tokenizers).
    masked_probs = probs[0, position + 1]
    probs_by_position[position] = [p.item() for p in getNatProbs(aa_token_ids, masked_probs)]
  return modelPredDF(probs_by_position, seq, aas).predDf

In [41]:
bertPredictionDF("MENDEL", bert_tokenizer, bert_model)

Unnamed: 0,wt,wtIndex,wtScore,A,C,D,E,F,G,H,...,M,N,P,Q,R,S,T,V,W,Y
0,M,1,0.076602,0.036697,0.011504,0.048245,0.118906,0.024072,0.039202,0.012621,...,0.076602,0.072661,0.024722,0.038672,0.043105,0.07028,0.056544,0.049927,0.007781,0.021699
1,E,2,0.07483,0.045721,0.015662,0.041921,0.07483,0.037153,0.044325,0.018264,...,0.043581,0.062667,0.025277,0.036911,0.055543,0.064425,0.049955,0.056789,0.012691,0.029893
2,N,3,0.04199,0.043564,0.009685,0.16259,0.184364,0.033782,0.044661,0.012355,...,0.041484,0.04199,0.019992,0.025515,0.029433,0.048106,0.030303,0.054742,0.00743,0.024924
3,D,4,0.049748,0.042083,0.013244,0.049748,0.086194,0.039736,0.055911,0.016861,...,0.04008,0.060822,0.032024,0.039689,0.046228,0.062323,0.044901,0.058937,0.010875,0.026596
4,E,5,0.086915,0.046641,0.01877,0.079822,0.086915,0.050638,0.050466,0.022397,...,0.028962,0.062234,0.023879,0.030534,0.040489,0.065195,0.044938,0.068038,0.012156,0.038034
5,L,6,0.060736,0.038191,0.009217,0.065189,0.152547,0.02095,0.049525,0.013955,...,0.040042,0.096484,0.020712,0.035022,0.046888,0.049071,0.046247,0.048276,0.010486,0.022727


# Bert old approach

In [None]:
# export
from berteome import berteomeDF

In [None]:
# export
# Module-level setup for the pipeline-based approach: load the tokenizer and
# masked-LM model from the "Rostlab/prot_bert" checkpoint and wrap them in a
# fill-mask pipeline. `unmasker` is read as a global by bertPredictionDF below.
# NOTE(review): do_lower_case=False presumably keeps residue letters upper-case; confirm against the model card.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert")
unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# export
def spacifySeq(seq):
  """Return the elements of `seq` joined by single spaces."""
  separator = " "
  return separator.join(seq)

In [None]:
assert spacifySeq("MENDEL") == 'M E N D E L'

In [None]:
' '.join(list("MENDEL"))

'M E N D E L'

In [None]:
# export 
def maskifySeq(seq, pos, mask="[MASK]"):
  """Return `seq` as a space-separated string with the token at `pos` replaced by `mask`.

  The sequence is first spaced out with `spacifySeq` and split on whitespace,
  so `pos` indexes the resulting whitespace-separated tokens.
  """
  tokens = spacifySeq(seq).split()
  tokens[pos] = mask
  return " ".join(tokens)

In [None]:
assert maskifySeq("MENDEL", 3) == 'M E N [MASK] E L'

NameError: name 'maskifySeq' is not defined

In [None]:
# export
def allResidueCoordinates(seq,residue):
  """Return the 0-based positions in `seq` whose element equals `residue`."""
  positions = []
  for index, current in enumerate(seq):
    if current == residue:
      positions.append(index)
  return positions

This should be renamed to something along the lines of bertPredictionDict.

In [None]:
# export
def aaPosDict(aas):
    """Map each element of `aas` to its 0-based position.

    When an element occurs more than once, the last position wins.
    """
    return {aas[position]: position for position in range(len(aas))}

In [None]:
aaPosDict("ACDEFGHIKLMNPQRSTVWY")

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'Y': 19}

In [None]:
# export 
def bertPredictionDF(seq, aas="ACDEFGHIKLMNPQRSTVWY"):
  """Mask each position of `seq` in turn and tabulate the fill-mask pipeline's scores.

  Uses the module-level `unmasker` pipeline. For every position, the scores of
  the returned candidates whose `token_str` is in `aas` are stored in `aas`
  order.

  Args:
    seq: sequence to scan, one character per residue.
    aas: residue letters to report; their order defines the column order.

  Returns:
    The `predDf` attribute of `berteomeDF.modelPredDF` built from the collected scores.
  """
  aaDict = aaPosDict(aas)
  bertPredDict = {}
  for aaPos in range(len(seq)):
    maskPosSeq = maskifySeq(seq, aaPos)
    # NOTE(review): top_k=30 presumably asks for every token in the model's
    # vocabulary; confirm if a different checkpoint is used.
    predictions = unmasker(maskPosSeq, top_k=30)
    # Residues the pipeline does not return keep a score of 0.
    predList = [0]*len(aas)
    for prediction in predictions:
      predAA = prediction["token_str"]
      if predAA in aaDict:
        predList[aaDict[predAA]] = prediction["score"]
    bertPredDict[aaPos] = predList
  return berteomeDF.modelPredDF(bertPredDict, seq, aas).predDf

In [None]:
bertPredictionDF("MENDEL")

Unnamed: 0,wt,wtIndex,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,Y
0,M,1,0.036685,0.011501,0.048229,0.118868,0.024064,0.03919,0.012617,0.066477,...,0.07658,0.072637,0.024714,0.03866,0.043091,0.070257,0.056526,0.049911,0.007779,0.021692
1,E,2,0.045712,0.015659,0.041913,0.074816,0.037146,0.044317,0.01826,0.073063,...,0.043572,0.062655,0.025272,0.036905,0.055532,0.064412,0.049945,0.056779,0.012689,0.029887
2,N,3,0.043558,0.009684,0.162566,0.184336,0.033777,0.044654,0.012353,0.052622,...,0.041478,0.041984,0.019989,0.025511,0.029428,0.048098,0.030299,0.054734,0.007428,0.02492
3,D,4,0.042079,0.013243,0.049744,0.086189,0.039733,0.055907,0.01686,0.073291,...,0.040078,0.060817,0.032022,0.039686,0.046224,0.062319,0.044898,0.058933,0.010875,0.026594
4,E,5,0.046638,0.018769,0.079816,0.086908,0.050634,0.050462,0.022395,0.074495,...,0.02896,0.062229,0.023877,0.030532,0.040486,0.06519,0.044934,0.068032,0.012155,0.038031
5,L,6,0.035695,0.008615,0.060928,0.142576,0.019581,0.046287,0.013043,0.060374,...,0.037424,0.090177,0.019358,0.032733,0.043823,0.045863,0.043224,0.045121,0.0098,0.021241


I should probably think about this more, maybe I should go ahead and put the data into the dataframe format that I want from here. It probably doesn't make too much sense to make this structure (which has little utility) just to make the useful structure. Instead, I think I should go ahead and start iterating through predictions and add those predictions to a dict that will be easily converted to a dataframe.

I think I would just need to know the intended index for each of the 20 amino acids, since the order of the predictions is sorted by score.