In [1]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [2]:
import psutil
import torch
import os
import multiprocessing

print('Total memory:', psutil.virtual_memory().total / (1024 ** 3), "GB")
print("Number of CPUs:", multiprocessing.cpu_count())
torch.set_num_threads(multiprocessing.cpu_count())

print("PyTorch is using", torch.get_num_threads(), "threads.")

Total memory: 93.06242752075195 GB
Number of CPUs: 64
PyTorch is using 64 threads.


# Dataset 1

In [3]:
data = pd.read_csv('filtered_data.csv')
sequences = list(data['seq'])

In [4]:
max_length = max(len(seq) for seq in sequences)

# Regulon Dataset

## Embedding Generation Function

In [6]:
#if you're running on a GPU:

print(torch.cuda.is_available())

tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

sequences_short = sequences  
batch_size = 64 
batches = [sequences_short[i:i + batch_size] for i in range(0, len(sequences_short), batch_size)]

all_embeddings = []
print("gonna start")

with torch.no_grad(): 
    for batch in tqdm(batches, desc="Processing sequences"):
        tokens_ids = tokenizer.batch_encode_plus(batch, return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
        attention_mask = tokens_ids != tokenizer.pad_token_id

        #move tensors to GPU
        tokens_ids = tokens_ids.to(device)
        attention_mask = attention_mask.to(device)

        #model forward pass
        torch_outs = model(
            tokens_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=attention_mask,
            output_hidden_states=True
        )

        embeddings = torch_outs['hidden_states'][-1]
        attention_mask = torch.unsqueeze(attention_mask, dim=-1)
        mean_sequence_embeddings = torch.sum(attention_mask * embeddings, dim=-2) / torch.sum(attention_mask, dim=1)

        all_embeddings.append(mean_sequence_embeddings)

#move the final result to CPU and convert to NumPy array
all_embeddings = torch.cat(all_embeddings, dim=0).cpu()

print(f"All Embeddings shape: {all_embeddings.shape}")

True


Downloading tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

Downloading esm_config.py:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading modeling_esm.py:   0%|          | 0.00/58.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/392M [00:00<?, ?B/s]

gonna start


Processing sequences:   0%|          | 0/850 [00:00<?, ?it/s]


RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [8]:
#saving to a npy file:
import pickle

np.save('all_embeddings_500m', all_embeddings)

# with open('embeddings_500m_human.pkl', 'wb') as f:
#     pickle.dump(all_embeddings, f)

pickle_file_size = os.path.getsize('all_embeddings_500m.npy')

pickle_file_size

278441088

In [None]:
#CPU Implementation

tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)

sequences_short = sequences
batch_size = 48 
batches = [sequences_short[i:i + batch_size] for i in range(0, len(sequences_short), batch_size)]

all_embeddings = []

for batch in tqdm(batches, desc="Processing sequences"):
    tokens_ids = tokenizer.batch_encode_plus(batch, return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
    attention_mask = tokens_ids != tokenizer.pad_token_id

    torch_outs = model(
        tokens_ids,
        attention_mask=attention_mask,
        encoder_attention_mask=attention_mask,
        output_hidden_states=True
    )

    embeddings = torch_outs['hidden_states'][-1].detach().numpy()
    attention_mask = torch.unsqueeze(attention_mask, dim=-1)
    mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)

    all_embeddings.append(mean_sequence_embeddings)

all_embeddings = torch.cat(all_embeddings, dim=0)

print(f"All Embeddings shape: {all_embeddings.shape}")

In [None]:
len(all_embeddings)