In [1]:
import Phosformer

# Fetch the tokenizer and the sequence-classification model from the same
# 'waylandy/phosformer' pretrained checkpoint.
tokenizer = Phosformer.RobertaTokenizer.from_pretrained(
    'waylandy/phosformer'
)
model = Phosformer.RobertaForSequenceClassification.from_pretrained(
    'waylandy/phosformer'
)

# Put the model in evaluation mode so dropout is switched off and repeated
# predictions are deterministic. As the last expression of the cell, the call
# also echoes the model structure as the cell output.
model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(95, 768, padding_idx=1)
      (position_embeddings): Embedding(4000, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [2]:
# Kinase domain sequence handed to the predictor
kinase_sequence = 'YTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYL'

# Peptide sequence to evaluate against that kinase
peptide_sequence = 'LLKLASPELER'

# Score the single kinase/peptide pair with the loaded model and tokenizer;
# the returned value is displayed as the cell output
Phosformer.predict_one(
    kinase_sequence,
    peptide_sequence,
    model=model,
    tokenizer=tokenizer,
)

0.67370486

In [3]:
# NOTE(review): the inputs below are identical to the previous cell's --
# confirm whether this repeated example is intentional.

# Kinase domain sequence handed to the predictor
kinase_sequence = 'YTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYL'

# Peptide sequence to evaluate against that kinase
peptide_sequence = 'LLKLASPELER'

# Score the single kinase/peptide pair with the loaded model and tokenizer;
# the returned value is displayed as the cell output
Phosformer.predict_one(
    kinase_sequence,
    peptide_sequence,
    model=model,
    tokenizer=tokenizer,
)

0.67370486

In [4]:
import pandas as pd

# Read the bundled CSV of reference kinase domain sequences
kinase_csv = pd.read_csv('data/reference_human_kinases.csv')

# Pick the 'sequence' entry of the row whose 'uniprot' value is 'P28482';
# .item() raises unless exactly one row matches
kinase_sequence = kinase_csv.loc[
    kinase_csv['uniprot'] == 'P28482', 'sequence'
].item()

# Peptide sequence to evaluate against that kinase
peptide_sequence = 'LLKLASPELER'

# Score the single kinase/peptide pair with the loaded model and tokenizer;
# the returned value is displayed as the cell output
Phosformer.predict_one(
    kinase_sequence,
    peptide_sequence,
    model=model,
    tokenizer=tokenizer,
)


0.67370486

In [5]:
import pandas as pd

# Load the example input table. It presumably holds one kinase/peptide pair
# per row (the two columns are passed to predict_many side by side) -- verify
# against the CSV.
kinases = pd.read_csv('data/example_input_peptide.csv')

# Pull both columns out as NumPy arrays. Series.to_numpy() is the accessor
# pandas recommends over .values, which can return a pandas extension array
# instead of an ndarray depending on the column dtype.
kinase_sequences_list = kinases['kinase domain sequence'].to_numpy()
peptide_sequences_list = kinases['peptide sequence'].to_numpy()

# Run the batched prediction over the two sequence arrays; the returned array
# of scores is displayed as the cell output.
Phosformer.predict_many(
    kinase_sequences_list,
    peptide_sequences_list,
    model=model,
    tokenizer=tokenizer,
    batch_size=20,  # samples loaded at once; lower this if you run out of memory
    device='cpu',   # either "cpu" or "cuda"
    threads=1,      # number of threads to use; more can speed up CPU runs
)


array([0.6737049 , 0.5447024 , 0.60668427, 0.12248913, 0.08716565,
       0.67627823, 0.70279336, 0.50462765, 0.02995262, 0.14288309,
       0.6487391 , 0.5467956 , 0.83140016, 0.312455  , 0.05308862],
      dtype=float32)