In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [None]:
data = pd.read_csv('filtered_data.csv')
sequences = list(data['seq'])

In [3]:
max_length = max(len(seq) for seq in sequences)
sequences

['AAGGGCCGTCGGTGATTTTAGTCTTCCTCAGTGGTTACCAGGTCTCCCGTTCTCTCCACATTCTCTTCGCACTCGATCGGAGCGTAAGGTCCGTGCCAAGTCGCCGAGCGCTTTCGAGTGCCGTCTCTGGGTAGGGT',
 'AATCCGCAATTCTCACAGTGACGTGGGCGTGGCTCCCCACCAATGACAGACTCTCATTGGTGTCGCTCAGCAATCTCCCAGAATGCCCAGTCCATTTGTAGCCTCTAGTGAGACTACAGTCCAGGAGCCCCGCA',
 'CGAAGCGAGAGCACGGGAGACCAGAGTGGCCCCTAGGAGGGCCGTTAAGCCCGCTCAGATGCTCGAGCGACCCCGAGCGTCCTCACCTTCCTCGTAGCCTGAAAACCCGCTCCGCGGACGCCGGCTTCTCGTCCTGGT',
 'ATAGCTTAAGTGTGATTCACTCTGAGTCATTTACTGCTGCTGCTGCTGCTGCTATACTGCCTAGAGCTTGACATTGAACAAGTTGGCAGCCTGTATTGCTACATTTTTTCTAGGAACTCTGACACATTCATTCTTG',
 'AATCAAACCCTCTGTCGTCAAGACTCCCCGCCCCACCCACCTTTCCGGACCAATCGCTCGTGTCATACTTACGCTTCCGCCTCCTGCGTCCCTACGCGCCCGGCGCTTCGGCTGAGGAGGTGGGCTCCCTTTTGTACA',
 'GTGCTGTGACGATGCTCTCTGCCCTGCCCAGGGTCCTTGACAGGAACGGTTGCCATAACAACCATCTTACACCAATACAGAAATAGCGCAAGCTTTTTGCAACCACTCTTGACTTCATTAATTATGCCATTCCTAACT',
 'TCGATGAGGGATCTTCTTACTGACTTTAAAGAACGGTGACATTTCTGCTTCAGCATAAAAACTTAAGTGTGTGACATTTAATTCTGCCTAAACATAGTCTGCATAAATCCTCGCCACAGAGCCTGGCAGTGGTGGC',
 'CTTCTC

## Embedding Generation Function

In [4]:
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)

sequences_short = sequences[:1000]
batch_size = 50  # Define your batch size
batches = [sequences_short[i:i + batch_size] for i in range(0, len(sequences_short), batch_size)]

all_embeddings = []

# Process each batch
for batch in tqdm(batches, desc="Processing sequences"):
    tokens_ids = tokenizer.batch_encode_plus(batch, return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
    attention_mask = tokens_ids != tokenizer.pad_token_id
    
    torch_outs = model(
        tokens_ids,
        attention_mask=attention_mask,
        encoder_attention_mask=attention_mask,
        output_hidden_states=True
    )
    
    embeddings = torch_outs['hidden_states'][-1].detach().numpy()
    attention_mask = torch.unsqueeze(attention_mask, dim=-1)
    mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)
    
    all_embeddings.append(mean_sequence_embeddings)

# Combine embeddings from all batches
all_embeddings = torch.cat(all_embeddings, dim=0)

print(f"All Embeddings shape: {all_embeddings.shape}")

Processing sequences: 100%|█████████████████████| 20/20 [00:52<00:00,  2.65s/it]

All Embeddings shape: torch.Size([1000, 512])





In [None]:
len(all_embeddings)