# Use trained model from the publication "Using Deep Learning to Annotate the Protein Universe".
[preprint link](https://doi.org/10.1101/626507)

This notebook is used to calculate embeddings of protein sequences using the trained ProtCNN model.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# !pip install -e /content/drive/MyDrive/_ruslan_project/protein_universe_annotate

In [3]:
# import sys
# sys.path.append('/content/drive/MyDrive/_ruslan_project/protein_universe_annotate')

In [3]:
import json
import numpy as np
import tensorflow.compat.v1 as tf
import os
import pandas as pd
import math
import tqdm

# Suppress noisy log messages.
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

In [5]:
from protein_universe_annotate.utils import get_top_k_values_indices
from protein_universe_annotate.constants import AMINO_ACID_VOCABULARY, _PFAM_GAP_CHARACTER
from protein_universe_annotate.data_processing import read_pfam_dataset
from protein_universe_annotate.inference.inference_misc import infer_predictions

In [None]:
from protein_universe_annotate.inference.inference_ProtCNN import residues_to_one_hot, pad_one_hot_sequence, batch_iterable

## Library functions: convert sequence to one-hot array (input to model)

## Download model and vocabulary

In [6]:
# Get a TensorFlow SavedModel
# !wget -qN https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/models/single_domain_per_sequence_zipped_models/seed_random_32.0/5356760.tar.gz
# unzip
# !tar xzf 5356760.tar.gz
# Get the vocabulary for the model, which tells you which output index means which family
# !wget https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/models/single_domain_per_sequence_zipped_models/trained_model_pfam_32.0_vocab.json

## Load the model into TensorFlow

In [7]:
model_path = '/models/trn-_cnn_random__random_sp_gpu-cnn_for_random_pfam-5356760'

In [9]:
# Create the graph first and bind the session to it explicitly. A bare
# `tf.Session()` is attached to the global default graph, which would leave
# `graph` empty and make every later `with graph.as_default():` a no-op.
# Binding them also means re-running this cell starts from a fresh, empty graph.
graph = tf.Graph()
sess = tf.Session(graph=graph)

In [10]:
# Load the SavedModel stored at `model_path` into `sess`, selecting the
# MetaGraph tagged 'serve'. The returned object exposes `signature_def`, which
# the cells below use to look up input/output tensor names.
with graph.as_default():
    trained_model = tf.saved_model.load(sess, ['serve'], model_path)

## Load tensors for getting the embedding of the trained model

In [12]:
# Both model inputs are declared on the 'confidences' signature: the encoded
# sequences themselves and their lengths.
confidences_inputs = trained_model.signature_def['confidences'].inputs
sequence_input_tensor_name = confidences_inputs['sequence'].name
sequence_lengths_input_tensor_name = confidences_inputs['sequence_length'].name

## Compute embedding on one sample

In [13]:
# Example input: a hemoglobin sequence, from which the globin domain
# (residues at 0-based positions 6..106) is cut out for embedding.
hemoglobin = (
    'MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR'
)
globin_domain = hemoglobin[slice(6, 107)]

In [20]:
# Look up the 'pooled_representation' signature of the loaded model and take
# the name of its 'output' tensor; this is the tensor fetched via `sess.run`
# to obtain the embedding of a sequence.
embedding_signature = trained_model.signature_def['pooled_representation']
embedding_signature_tensor_name = embedding_signature.outputs['output'].name

In [21]:
# Expect the first execution of this cell to be slow: TensorFlow compiles the
# XLA graph on first use and reuses it afterwards, so later runs are fast.
with graph.as_default():
    # The model takes a *batch* of sequences (batching speeds up inference
    # over many sequences); here the batch holds the single globin domain.
    globin_feed = {
        sequence_input_tensor_name: [residues_to_one_hot(globin_domain)],
        sequence_lengths_input_tensor_name: [len(globin_domain)],
    }
    embedding = sess.run(embedding_signature_tensor_name, globin_feed)

In [22]:
# Shape of embedding is (# seqs in batch, number of features in embedding space)
embedding.shape

(1, 1100)

In [23]:
embedding

array([[-17.9971   ,   1.3453426, -43.63362  , ...,  -9.863431 ,
        -33.738045 ,  23.25798  ]], dtype=float32)

In [24]:
def calc_embeddings(batch):
    """Compute model embeddings for a batch of protein sequences.

    Every sequence is one-hot encoded with ``residues_to_one_hot`` and padded
    with ``pad_one_hot_sequence`` to the length of the longest sequence in the
    batch. The padded batch and the true sequence lengths are then fed through
    the notebook-level session ``sess`` to fetch the tensor named by
    ``embedding_signature_tensor_name``.

    Args:
        batch: Iterable of sequences (anything supporting ``len`` that
            ``residues_to_one_hot`` accepts). It is materialised into a list
            first, so one-shot iterables such as generators are supported.

    Returns:
        The value returned by ``sess.run`` for the embedding tensor. In the
        single-sample example earlier in this notebook this is an array with
        one row per input sequence.

    Raises:
        ValueError: If ``batch`` contains no sequences.
    """
    # Materialise once: the batch is traversed more than once below.
    sequences = list(batch)
    if not sequences:
        raise ValueError('calc_embeddings() requires a non-empty batch of sequences')

    seq_lens = [len(seq) for seq in sequences]
    # Computed once instead of once per sequence inside the padding loop.
    max_len = max(seq_lens)
    padded_sequence_inputs = [
        pad_one_hot_sequence(residues_to_one_hot(seq), max_len)
        for seq in sequences
    ]

    with graph.as_default():
        return sess.run(
            embedding_signature_tensor_name,
            {
                sequence_input_tensor_name: padded_sequence_inputs,
                sequence_lengths_input_tensor_name: seq_lens,
            }
        )

## Compute embeddings for entire dataset

In [15]:
# Directory holding one sub-directory per dataset partition.
data_partitions_dirpath = '../data/'
available_partitions = os.listdir(data_partitions_dirpath)
print('Available dataset partitions: ', available_partitions)

Available dataset partitions:  ['test', 'train']


In [17]:
test_df = read_pfam_dataset('test', data_partitions_dirpath)

In [19]:
# Order the test sequences by length: batches of similar-length sequences
# need very little padding, which makes the forward pass faster.
test_df = test_df.sort_values(by='sequence', key=lambda col: col.map(len))

In [None]:
# Materialise the batches up front so the progress bar knows the total count.
batches = list(batch_iterable(test_df.sequence, 16))

embeddings_testset = []
for seq_batch in tqdm.tqdm(batches, position=0):
    batch_embeddings = calc_embeddings(seq_batch)
    # Append one entry per sequence (row) of the batch result.
    embeddings_testset += list(batch_embeddings)

100%|██████████| 7886/7886 [14:26<00:00,  9.10it/s]


In [None]:
len(embeddings_testset)

126171

In [None]:
# Persist the test-set embeddings as one array. Rows follow the order in which
# the sequences were embedded, i.e. the length-sorted order of `test_df`.
with open('/content/drive/MyDrive/_ruslan_project/testset_embeddings.npy', 'wb') as npy_file:
    np.save(npy_file, np.asarray(embeddings_testset))

## Compute the embeddings of the training data (e.g. to fit the KNN)

In [None]:
# Read the training partition from the same data directory as the test
# partition, instead of relying on whatever default `read_pfam_dataset` uses.
train_df = read_pfam_dataset('train', data_partitions_dirpath)

In [None]:
# Order the training sequences by length so that each batch needs little padding.
train_df = train_df.sort_values(by='sequence', key=lambda col: col.map(len))

In [None]:
batch_size = 16

In [None]:
batches = list(batch_iterable(train_df.sequence, batch_size))

In [None]:
# Pre-allocate a list with one placeholder slot (0) per training sequence.
# The embedding loop below overwrites these slots batch by batch via slice
# assignment.
embeddings_trainset = [0] * len(train_df)

In [None]:
count_batches = 0
curr_batch_position = 0

for seq_batch in tqdm.tqdm(batches, position=0):
    batch_embeddings = calc_embeddings(seq_batch)

    # Advance by the number of rows actually returned rather than by the
    # global `batch_size`: the final batch may be shorter, and `batch_size`
    # may have been changed after `batches` was built.
    batch_end = curr_batch_position + len(batch_embeddings)
    embeddings_trainset[curr_batch_position:batch_end] = batch_embeddings

    count_batches += 1
    curr_batch_position = batch_end

    # Process in blocks since embeddings_trainset is too huge to keep in limited RAM
    # embeddings_trainset, (len(train_data), 1100), where embedding of length 1100
    # NOTE: the block-wise save below resets the list and is therefore not
    # compatible with the pre-allocated, slice-assigned list used above.
    # if count_batches % 30000 == 0:
    #     with open(f'/content/drive/MyDrive/_ruslan_project/trainset_embeddings_{count_batches}.npy', 'wb') as f:
    #         np.save(f, np.array(embeddings_trainset))
    #     embeddings_trainset = []

# Every pre-allocated slot must have been overwritten by exactly one embedding;
# otherwise placeholders remain (or the list grew) and the saved array is wrong.
assert curr_batch_position == len(embeddings_trainset), (
    f'wrote {curr_batch_position} embeddings into {len(embeddings_trainset)} slots'
)

  np.save(f, np.array(embeddings_trainset))
100%|██████████| 67922/67922 [1:25:18<00:00, 13.27it/s]


In [None]:
# Persist the training-set embeddings as a single array in one .npy file.
with open('/content/drive/MyDrive/_ruslan_project/trainset_embeddings.npy', 'wb') as npy_file:
    np.save(npy_file, np.asarray(embeddings_trainset))

In [None]:
# Check the available RAM memory
import psutil
psutil.virtual_memory().available

3451133952

In [None]:
# Free up RAM memory
# del train_df

In [None]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so only load files you produced yourself. The dtype checks further
# down show plain float arrays, so allow_pickle=False would presumably be
# enough for this and the following chunk loads - TODO confirm.
# NOTE(review): no active cell in this notebook writes the chunk files
# (_3000, _60000, _remaining); presumably they come from an earlier block-wise
# save run - verify the file names before re-running.
embeddings_train_1 = np.load('/content/drive/MyDrive/_ruslan_project/trainset_embeddings_3000.npy', allow_pickle=True)

In [None]:
embeddings_train_2 = np.load('/content/drive/MyDrive/_ruslan_project/trainset_embeddings_60000.npy', allow_pickle=True)

In [None]:
embeddings_train_3 = np.load('/content/drive/MyDrive/_ruslan_project/trainset_embeddings_remaining.npy', allow_pickle=True)

In [None]:
embeddings_train_1.dtype

dtype('float64')

In [None]:
embeddings_train_2.dtype

dtype('float32')

In [None]:
embeddings_train_3.dtype

dtype('float32')

In [None]:
# Convert the dtype of the array from float64 to float32 -->
# Save memory on disk and RAM
embeddings_train_1 = embeddings_train_1.astype(np.float32)

In [None]:
# Concatenate the arrays along the first axis
train_embeddings = np.concatenate((embeddings_train_1, embeddings_train_2, embeddings_train_3), axis=0)

In [None]:
print(f"Memory usage: {train_embeddings.nbytes / (1024 ** 2):.2f} MB")

Memory usage: 4560.15 MB


In [None]:
# Write the concatenated training embeddings to a single .npy file.
with open('/content/drive/MyDrive/_ruslan_project/training_embeddings.npy', 'wb') as npy_file:
    np.save(npy_file, train_embeddings)