In [None]:
# Dependencies for this notebook. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel instead of whatever `pip` the shell finds.
# %pip install transformers
%pip install datasets
%pip install tqdm

In [4]:
import time
import scipy.sparse as sp
from transformers import EsmTokenizer, EsmModel
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

# Location of the dataset text files. load_data() builds file names with plain
# string concatenation (path + "<file>"), so the trailing slash is required.
path="./data/"

# loading data
def load_data(path):
    """
    Read the graph dataset stored under ``path`` and split it graph by graph.

    ``path`` is used as a plain string prefix (file names are appended with
    ``+``), so a directory has to be passed with its trailing separator.

    Files read, in this order: ``graph_indicator.txt`` (one graph id per
    node), ``edgelist.txt`` (comma-separated pairs of node indices),
    ``node_attributes.txt`` and ``edge_attributes.txt`` (comma-separated rows).

    The per-graph slicing below assumes that the nodes of a graph occupy a
    contiguous range of indices, in increasing graph-id order, and that no
    edge links two different graphs -- TODO confirm against the data files.

    Returns
    -------
    adj : list
        One sparse adjacency block per graph (edges stored in both directions).
    features : list
        Node-attribute rows of each graph.
    edge_features : list
        Edge-attribute rows of each graph, one row per stored directed edge.
    """
    node_graph_ids = np.loadtxt(path+"graph_indicator.txt", dtype=np.int64)
    n_nodes = node_graph_ids.size
    graph_size = np.unique(node_graph_ids, return_counts=True)[1]

    listed_edges = np.loadtxt(path+"edgelist.txt", dtype=np.int64, delimiter=",")
    # Append every edge with its endpoints swapped so the adjacency is symmetric.
    flipped = np.column_stack((listed_edges[:, 1], listed_edges[:, 0]))
    both_ways = np.vstack((listed_edges, flipped))

    # Order edges by their row-major linear index (source * n_nodes + target),
    # then drop duplicates; both index arrays are reused for the attributes.
    order = np.argsort(both_ways[:, 0] * n_nodes + both_ways[:, 1])
    unique_edges, first_seen = np.unique(
        both_ways[order, :], axis=0, return_index=True
    )
    A = sp.csr_matrix(
        (np.ones(unique_edges.shape[0]), (unique_edges[:, 0], unique_edges[:, 1])),
        shape=(n_nodes, n_nodes),
    )

    x = np.loadtxt(path+"node_attributes.txt", delimiter=",")
    edge_attr = np.loadtxt(path+"edge_attributes.txt", delimiter=",")
    # Apply the same duplicate / sort / dedupe steps to the edge attributes so
    # that row k of edge_attr keeps describing edge k of unique_edges.
    edge_attr = np.vstack((edge_attr, edge_attr))[order, :][first_seen, :]

    # Node range [lo, hi) of every graph, derived from the graph sizes.
    node_stop = np.cumsum(graph_size)
    node_start = node_stop - graph_size

    adj, features, edge_features = [], [], []
    edge_cursor = 0  # first edge-attribute row of the current graph
    for lo, hi in zip(node_start, node_stop):
        block = A[lo:hi, lo:hi]
        adj.append(block)
        edge_features.append(edge_attr[edge_cursor : edge_cursor + block.nnz, :])
        features.append(x[lo:hi, :])
        edge_cursor += block.nnz

    return adj, features, edge_features

In [5]:
# normalize the ADJ matrix
def normalize_adjacency(A):
    """
    Row-normalise an adjacency matrix after adding self-loops.

    Computes D^-1 (A + I), where D is the diagonal matrix holding the row
    sums of A + I. The caller's matrix is left untouched; a new sparse
    matrix is returned.
    """
    num_nodes = A.shape[0]
    with_self_loops = A + sp.identity(num_nodes)
    # Multiplying by a vector of ones yields the row sums (node degrees).
    row_sums = with_self_loops.dot(np.ones(num_nodes))
    inv_degree = sp.diags(np.power(row_sums, -1))
    return inv_degree.dot(with_self_loops)

# Sparse matrix to torch sparse
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """
    Convert a SciPy sparse matrix into a float32 torch sparse COO tensor.

    Parameters
    ----------
    sparse_mx : scipy.sparse matrix
        Any SciPy sparse format; it is converted to COO and cast to float32.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor with the same shape and stored values.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    # COO indices as a (2, nnz) int64 array: first row = row ids, second = column ids.
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)
    )
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse.FloatTensor(indices, values, shape) is the deprecated legacy
    # constructor; torch.sparse_coo_tensor is its documented replacement.
    return torch.sparse_coo_tensor(indices, values, shape)

In [9]:
path="data/"

# Load graphs
adj, features, edge_features = load_data(path)

# Normalize adjacency matrices (currently disabled)
# adj = [normalize_adjacency(A) for A in adj]

# Empty containers for the train / test split; they are filled while reading
# graph_labels.txt further down.
adj_train, features_train, y_train = [], [], []
adj_test, features_test, proteins_test = [], [], []

In [10]:
# Split the graphs into train / test sets using graph_labels.txt. Line i of
# the file describes graph i as "<protein>,<label>"; an empty label field marks
# a test protein, otherwise the label is the integer class of a training graph.
with open(path+"graph_labels.txt", "r") as f:
    for i, line in enumerate(f):
        # Strip the line terminator explicitly instead of dropping the last
        # character: a final line without a trailing newline would otherwise
        # lose the last digit of its label (or be mistaken for a test sample).
        t = line.rstrip("\r\n").split(",")
        label = t[1].strip()
        if len(label) == 0:
            proteins_test.append(t[0])
            adj_test.append(adj[i])
            features_test.append(features[i])
        else:
            adj_train.append(adj[i])
            features_train.append(features[i])
            y_train.append(int(label))

In [11]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### transformer

In [17]:
from transformers import EsmTokenizer, EsmForSequenceClassification ,EsmModel
import torch


# Load the tokenizer and the bare encoder (EsmModel) from the same pretrained
# checkpoint, so token ids and embedding weights match.
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t36_3B_UR50D") # alternative checkpoint name kept for reference: esm2_t6_8M_UR50D
model = EsmModel.from_pretrained("facebook/esm2_t36_3B_UR50D")

### using our fine-tuned model (disabled alternative to the pretrained encoder above)

# model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t33_650M_UR50D", num_labels=18,output_hidden_states=True)
# model.load_state_dict(torch.load("/notebooks/build model/best_esm_fine_tuned/checkpoint-2500/pytorch_model.bin"))

Some weights of the model checkpoint at facebook/esm2_t36_3B_UR50D were not used when initializing EsmModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'esm.contact_head.regression.bias', 'esm.contact_head.regression.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t36_3B_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on

In [18]:
# Move the model's weights onto the selected device (assignment keeps the cell silent)
model = model.to(device)

Tokenize the data

In [19]:
# from torch.utils.data import DataLoader
# Tokenize every sequence on its own; return_tensors="pt" makes the tokenizer
# return PyTorch tensors, so each list entry can be fed to the model directly.
# NOTE(review): sequences_train / sequences_test are not defined in any cell
# visible in this notebook -- confirm where they come from, otherwise this
# cell raises NameError on a fresh kernel.
samples=[tokenizer(sample, return_tensors="pt") for sample in sequences_train]
samples_test=[tokenizer(sample, return_tensors="pt") for sample in sequences_test]


In [11]:
## getting the embedding of each protein for the train set

# Feature extraction only, so gradients are disabled. Each entry of `samples`
# is one tokenized protein, i.e. the model is run one protein at a time.
with torch.no_grad():
    new_features=[]
    # Wrap the list itself (not the enumerate object) so tqdm knows the total
    # and can show a percentage and an ETA instead of a bare counter.
    for i, data in enumerate(tqdm(samples)):
        # Use the value returned by .to() rather than relying on it moving the
        # tensors in place.
        inputs = data.to(device)
        out = model(**inputs)
        # last_hidden_state[0] selects the first (only) sequence of the batch;
        # [1:-1] drops the first and last positions -- presumably the special
        # tokens added by the tokenizer, TODO confirm.
        out = out.last_hidden_state[0].detach().cpu().numpy()[1:-1]
        # Column-wise concatenation: the embedding rows must line up one-to-one
        # with the rows of features_train[i].
        new_features.append(np.concatenate((out,features_train[i]),axis=1))
        torch.cuda.empty_cache()

4888it [22:15,  3.66it/s]


In [20]:
# Same embedding extraction as for the train set, applied to the test proteins.
# Feature extraction only, so gradients are disabled; the model is run one
# tokenized protein at a time.
with torch.no_grad():
    new_features_test=[]
    # Wrap the list itself (not the enumerate object) so tqdm knows the total
    # and can show a percentage and an ETA instead of a bare counter.
    for i, data in enumerate(tqdm(samples_test)):
        # Use the value returned by .to() rather than relying on it moving the
        # tensors in place.
        inputs = data.to(device)
        out = model(**inputs)
        # last_hidden_state[0] selects the first (only) sequence of the batch;
        # [1:-1] drops the first and last positions -- presumably the special
        # tokens added by the tokenizer, TODO confirm.
        out = out.last_hidden_state[0].detach().cpu().numpy()[1:-1]
        # Column-wise concatenation: the embedding rows must line up one-to-one
        # with the rows of features_test[i].
        new_features_test.append(np.concatenate((out,features_test[i]),axis=1))
        torch.cuda.empty_cache()

1223it [05:20,  3.81it/s]


In [23]:
# Write both feature lists to disk with joblib (compress=1) so the embedding
# cells above do not have to be re-run.

import joblib
joblib.dump(new_features_test, 'new_features_test_3B_params.sav', compress=1)
joblib.dump(new_features, 'new_features_3B_params.sav', compress=1)

['new_features_test_3B_params.sav']

In this notebook we:

    - get the old features
    - use the pretrained model to get the embedding of each amino acid with respect to each protein
    - concatenate the old features [84] with the embedding [almost 640] to get new features of size [724] (the embedding width, and therefore the total size, depends on the ESM-2 checkpoint that is loaded)