In [1]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [2]:
import psutil
import torch
import os
import multiprocessing

# Report the host's RAM and CPU count, then let PyTorch use one thread per CPU.
cpu_total = multiprocessing.cpu_count()
ram_total_gib = psutil.virtual_memory().total / (1024 ** 3)  # bytes -> GiB

print('Total memory:', ram_total_gib, "GB")
print("Number of CPUs:", cpu_total)

# Size PyTorch's intra-op thread pool to the CPU count, then echo what it reports.
torch.set_num_threads(cpu_total)
print("PyTorch is using", torch.get_num_threads(), "threads.")

Total memory: 93.06243133544922 GB
Number of CPUs: 64
PyTorch is using 64 threads.


In [3]:
# Read the CSV and pull its 'seq' column out as a plain Python list.
data = pd.read_csv('filtered_data.csv')
sequences = data['seq'].tolist()

In [11]:
# Length (in characters) of the longest entry in `sequences`.
max_length = max(map(len, sequences))

## Embedding Generation Function

In [8]:
# Embed every sequence with the Nucleotide Transformer and mean-pool the last
# hidden layer over each sequence's non-padding tokens.
# Uses the GPU when CUDA is available and falls back to the CPU otherwise.

# Record in the cell output whether CUDA is visible to PyTorch.
print(torch.cuda.is_available())

tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # inference only; make evaluation mode explicit

sequences_short = sequences
batch_size = 64
batches = [sequences_short[i:i + batch_size] for i in range(0, len(sequences_short), batch_size)]

all_embeddings = []

with torch.no_grad():
    for batch in tqdm(batches, desc="Processing sequences"):
        # Pad each batch only to its own longest tokenised sequence.
        # The tokenizer's `max_length` argument is counted in tokens, whereas the
        # notebook's `max_length` variable is a character count, so padding to it
        # can make every batch much wider than necessary. Padded positions are
        # masked out below, so the pooled result does not depend on the padding
        # width (up to floating-point rounding).
        tokens_ids = tokenizer(batch, return_tensors="pt", padding="longest")["input_ids"]
        attention_mask = tokens_ids != tokenizer.pad_token_id

        #move tensors to the model's device
        tokens_ids = tokens_ids.to(device)
        attention_mask = attention_mask.to(device)

        #model forward pass
        torch_outs = model(
            tokens_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=attention_mask,
            output_hidden_states=True
        )

        # Last hidden layer; assumed to be (batch, tokens, hidden) -- TODO confirm.
        embeddings = torch_outs['hidden_states'][-1]

        # Masked mean over the token axis: zero out padded positions, sum, and
        # divide by the number of real tokens in each sequence.
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, dim=-2) / torch.sum(attention_mask, dim=1)

        # Offload each batch's result immediately so pooled embeddings do not
        # pile up in device memory for the whole run.
        all_embeddings.append(mean_sequence_embeddings.cpu())

# Stack the per-batch results into a single CPU tensor.
# Note: this stays a torch.Tensor; it is not converted to a NumPy array here.
all_embeddings = torch.cat(all_embeddings, dim=0)

print(f"All Embeddings shape: {all_embeddings.shape}")

True


Processing sequences: 100%|██████████| 850/850 [02:58<00:00,  4.76it/s]


All Embeddings shape: torch.Size([54383, 512])


In [10]:
# Write the embeddings to disk in two formats and report the pickle's size.
import pickle

# NumPy binary copy of the embeddings.
np.save('all_embeddings.npy', all_embeddings)

# Pickled copy of the same in-memory object.
with open('embeddings.pkl', 'wb') as pkl_file:
    pickle.dump(all_embeddings, pkl_file)

# Size of the pickle in bytes.
pickle_file_size = os.stat('embeddings.pkl').st_size

# Bare last expression so the notebook displays the value.
pickle_file_size

111376790

In [None]:
#CPU Implementation
# Same pipeline as the GPU cell, but the model is left on the device it is
# loaded on (no explicit .to(...)): embed each batch, then mean-pool the last
# hidden layer over the non-padding tokens.

tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
model.eval()  # inference only; make evaluation mode explicit

sequences_short = sequences
batch_size = 48
batches = [sequences_short[i:i + batch_size] for i in range(0, len(sequences_short), batch_size)]

all_embeddings = []

# no_grad: without it every forward pass builds an autograd graph that is never
# used, which costs memory and time during pure inference.
with torch.no_grad():
    for batch in tqdm(batches, desc="Processing sequences"):
        # Pad each batch only to its own longest tokenised sequence.
        # The tokenizer's `max_length` argument is counted in tokens, whereas the
        # notebook's `max_length` variable is a character count, so padding to it
        # can make every batch much wider than necessary. Padded positions are
        # masked out below, so the pooled result does not depend on the padding
        # width (up to floating-point rounding).
        tokens_ids = tokenizer(batch, return_tensors="pt", padding="longest")["input_ids"]
        attention_mask = tokens_ids != tokenizer.pad_token_id

        torch_outs = model(
            tokens_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=attention_mask,
            output_hidden_states=True
        )

        # Keep the hidden states as a torch tensor: the pooling below and the
        # final torch.cat both operate on tensors, so a NumPy round-trip is
        # unnecessary. Assumed to be (batch, tokens, hidden) -- TODO confirm.
        embeddings = torch_outs['hidden_states'][-1]

        # Masked mean over the token axis: zero out padded positions, sum, and
        # divide by the number of real tokens in each sequence.
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, dim=-2) / torch.sum(attention_mask, dim=1)

        all_embeddings.append(mean_sequence_embeddings)

# Stack the per-batch results into a single tensor (one row per sequence).
all_embeddings = torch.cat(all_embeddings, dim=0)

print(f"All Embeddings shape: {all_embeddings.shape}")

In [None]:
# Number of rows in the concatenated embeddings (size of the first dimension).
all_embeddings.shape[0]