```
# Copyright 2025 Vít Dohnálek.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```

In [None]:
#@title Prepare Tensorflow

# Install pinned, matching versions of TensorFlow and TensorFlow Text.
!python3 -m pip install -q -U tensorflow==2.8.2
!python3 -m pip install -q -U tensorflow-text==2.8.2
import tensorflow as tf
# NOTE(review): tensorflow_text is never referenced by name in this notebook;
# presumably it is imported for the side effect of registering ops that the
# SavedModel needs -- confirm before removing.
import tensorflow_text
import numpy as np
import re

import IPython.display
from absl import logging

# Explicitly request eager execution through the TF1 compatibility API.
tf.compat.v1.enable_eager_execution()

logging.set_verbosity(logging.ERROR)  # Turn down tensorflow warnings

def print_markdown(string):
  """Render `string` as Markdown in the notebook's output area."""
  rendered = IPython.display.Markdown(string)
  IPython.display.display(rendered)

In [None]:
#@title Prepare Functions

def query(seq):
  """Build the model prompt for one amino-acid sequence.

  Args:
    seq: The sequence text to embed after the `[sequence]` tag.

  Returns:
    The prompt string, with `<extra_id_0>` standing in for the protein name.
  """
  prompt = f"[protein_name_in_english] <extra_id_0> [sequence] {seq}"
  return prompt

# Matches a four-field, dot-separated EC number at the start of a string,
# e.g. "3.4.21.-" or "1.1.1.n1". The dots are escaped so that only a literal
# "." is accepted as the separator (an unescaped "." would match any
# character and accept strings such as "1a2b3c4").
EC_NUMBER_REGEX = r'(\d+)\.([\d\-n]+)\.([\d\-n]+)\.([\d\-n]+)'

def run_inference(seq):
  """Run the loaded model on one sequence and return its predictions.

  The model's 'output_0' entry is read as the candidate names and 'output_1'
  as their scores. Both lists are returned in reversed order relative to the
  model output, names starting with an EC-number pattern get an 'EC:' prefix,
  and every score is passed through `np.exp` (presumably the raw scores are
  log-probabilities -- TODO confirm).

  Args:
    seq: The sequence text to annotate.

  Returns:
    A `(names, scores)` tuple of two lists.
  """
  model_output = infer(tf.constant([query(seq)]))
  raw_names = model_output['output_0'][0].numpy().tolist()
  raw_scores = model_output['output_1'][0].numpy().tolist()
  beam_size = len(raw_names)

  names = []
  for raw_name in reversed(raw_names):
    # Drop the sentinel token that the model echoes before the name.
    name = raw_name.decode().replace('<extra_id_0> ', '')
    if re.match(EC_NUMBER_REGEX, name):
      name = 'EC:' + name
    names.append(name)

  # Walk the scores back-to-front, indexed by the number of names.
  scores = [np.exp(raw_scores[idx]) for idx in range(beam_size - 1, -1, -1)]
  return names, scores

In [None]:
#@title Load Model

!pip install biopython
! mkdir -p protnlm

! wget -nc https://storage.googleapis.com/brain-genomics-public/research/proteins/protnlm/uniprot_2022_04/savedmodel__20221011__030822_1128_bs1.bm10.eos_cpu/saved_model.pb -P protnlm -q --no-check-certificate
! mkdir -p protnlm/variables
! wget -nc https://storage.googleapis.com/brain-genomics-public/research/proteins/protnlm/uniprot_2022_04/savedmodel__20221011__030822_1128_bs1.bm10.eos_cpu/variables/variables.index -P protnlm/variables/ -q --no-check-certificate
! wget -nc https://storage.googleapis.com/brain-genomics-public/research/proteins/protnlm/uniprot_2022_04/savedmodel__20221011__030822_1128_bs1.bm10.eos_cpu/variables/variables.data-00000-of-00001 -P protnlm/variables/ -q --no-check-certificate

imported = tf.saved_model.load(export_dir="protnlm")
infer = imported.signatures["serving_default"]

In [None]:
#@title Mount Google Drive

from google.colab import drive
import glob

# Mount Google Drive at /content/gdrive; the default output path below
# points into it.
drive.mount('/content/gdrive')

output_path = "/content/gdrive/My Drive/ProtNLM_RESULTS.tsv" #@param {type:"string"}
#@markdown - Output path & name; results will be written into a .tsv file
#@markdown - Appends results after each prediction

# NOTE: this cell used to glob "*.pdb" files from `input_folder`, a name that
# is never defined in this notebook, so the cell raised NameError on a fresh
# kernel. The collected list was never used (the FASTA upload cell supplies
# the input), so that step has been removed.

In [None]:
#@title Upload fasta file
from google.colab import files

# Open the browser upload dialog; the result maps each uploaded filename to
# its content.
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when nothing was
# uploaded (e.g. the dialog was cancelled).
if not uploaded:
  raise ValueError("No file was uploaded; re-run this cell and select a FASTA file.")

# Use the first uploaded filename as the FASTA input.
fasta_file = next(iter(uploaded))

In [None]:
#@title 3. ProtNLM Predictions
from Bio import SeqIO
import os


# Header row of the results table: the protein ID followed by a name/score
# column pair for each of the five reported hits.
table_rows = [[
    "Protein ID",
    "Hit 1", "Hit 1 Score",
    "Hit 2", "Hit 2 Score",
    "Hit 3", "Hit 3 Score",
    "Hit 4", "Hit 4 Score",
    "Hit 5", "Hit 5 Score",
]]

# Normalise the configured output path to exactly one leading slash.
results_path = f"/{output_path.strip('/')}"

# (Re)create the results file and write the header line.
with open(results_path, "w") as out:
  out.write("\t".join(table_rows[0]) + "\n")

for record in SeqIO.parse(fasta_file, "fasta"):

  sequence_ID = record.id
  # Strip any spaces from the sequence text before querying the model.
  sequence = str(record.seq).replace(' ', '')

  names, scores = run_inference(sequence)

  # One output row: the ID, then (name, formatted score) for the first five hits.
  results = [sequence_ID]
  for name, score in zip(names[:5], scores[:5]):
    results.extend([name, f"{score:.03f}"])

  # Append right away so finished predictions are on disk before the next one starts.
  with open(results_path, "a") as out:
    out.write("\t".join(results) + "\n")

  print(f"ProtNLM prediction for {sequence_ID} has been done... ")