In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Choose the compute device up front: CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-trained Caduceus checkpoint and its tokenizer.
# NOTE: trust_remote_code=True allows Python code shipped with the checkpoint
# repository to run locally.
model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)

# Place the model on the selected device and switch it to evaluation mode
model.to(device)
model.eval()

In [None]:
import pandas as pd

# Variant category to process; it selects the input CSV here and is reused
# in the output file name further down.
pathogenecity_type = 'noncoding'
# pathogenecity_type = 'missense'

df = pd.read_csv(f'dna_segment_{pathogenecity_type}.csv')

# `df_origin` is a second name for the same DataFrame object (not a copy)
df_origin = df
df_origin

In [None]:
# Parallelism and batch-size settings.
# `batch_size` is not referenced by the cells visible in this notebook.
batch_size = 3000
# Number of Dask partitions; the processing cell below assigns the same value again.
num_parallel = 10

In [None]:
# %%time

import dask.dataframe as dd
import pandas as pd
import jax.numpy as jnp
from dask.diagnostics import ProgressBar

# Vectorized tokenization function
def vectorized_tokenizer(subsequences, max_length=512):
    """Tokenize a batch of sequences and move the resulting tensors to `device`.

    Args:
        subsequences: Batch of sequence strings passed to the module-level
            `tokenizer` in a single call.
        max_length: Maximum number of tokens kept per sequence; longer inputs
            are truncated. Defaults to 512, the value that used to be
            hard-coded here.

    Returns:
        dict mapping each tokenizer output name to a PyTorch tensor located on
        the module-level `device`.
    """
    tokens = tokenizer(
        subsequences,
        return_tensors='pt',
        padding=True,      # pad every sequence to the longest one in this batch
        truncation=True,
        max_length=max_length,
    )

    # Move every returned tensor to the same device as the model
    return {key: val.to(device) for key, val in tokens.items()}

# Vectorized embedding function
def vectorized_embedding(tokens):
    """Return one mean-pooled last-layer embedding per sequence in the batch.

    Args:
        tokens: Mapping of keyword arguments forwarded to the module-level
            `model` (as produced by `vectorized_tokenizer`).

    Returns:
        Tensor holding the mean of the last hidden state over dimension 1.
        Assuming the last hidden state is (batch_size, seq_len, hidden_size),
        the result is (batch_size, hidden_size).
    """
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**tokens, output_hidden_states=True)
        last_layer_embeddings = outputs.hidden_states[-1]

    # NOTE(review): this is a plain mean over every position, so when sequences
    # in a batch differ in length the padded positions presumably contribute to
    # the average as well - confirm whether a padding-aware mean is wanted.
    #
    # The former trailing `.squeeze(dim=1)` was dropped: on a 2-D
    # (batch_size, hidden_size) tensor it does nothing unless hidden_size == 1,
    # in which case it would remove the hidden dimension that callers index
    # via `.shape[1]`.
    return torch.mean(last_layer_embeddings, dim=1)

# Tokenization and embedding combined in a batch-wise function
def process_batch(subsequences):
    """Tokenize a batch of sequences, then return their pooled embeddings."""
    return vectorized_embedding(vectorized_tokenizer(subsequences))


def apply_get_embeddings_dask(df):
    """Embed the 'sequence' column of a partition and append the embedding columns.

    Args:
        df: pandas DataFrame (one Dask partition) with a 'sequence' column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the input columns followed
        by one column per embedding dimension, labelled '1', '2', ...
    """
    sequence_list = df['sequence'].tolist()

    # One forward pass for the whole partition, then bring the result to host memory
    embedding_matrix = process_batch(sequence_list).cpu().numpy()

    column_labels = [str(position + 1) for position in range(embedding_matrix.shape[1])]
    embedding_frame = pd.DataFrame(embedding_matrix, columns=column_labels)

    # Both sides get a fresh positional index so rows line up one-to-one
    return pd.concat(
        [df.reset_index(drop=True), embedding_frame.reset_index(drop=True)],
        axis=1,
    )

In [None]:
%%time

import numpy as np
import pandas as pd
import dask.dataframe as dd
from datasets import load_dataset, load_from_disk
from datetime import datetime, timedelta

# Define chunk size (number of rows per chunk)
# chunksize = 10000  # Adjust chunk size according to your memory capacity
# num_parallel = 10

#================dataset -> df================
# code for loading homo_sapiens_dataset/train
# save_path="homo_sapiens_dataset"
# mydatasets = load_from_disk(save_path)
# # print(mydatasets)
# train = mydatasets['train']
# df = train.to_pandas()


# chunkid=0
# Prefix of the output file name
typename = "pathogenecity"

# NOTE(review): the loop over `chunkid` looks like a leftover from loading one
# dataset shard per iteration (e.g. f"{save_path}/train_{chunkid}"); with `df`
# loaded once in an earlier cell it runs exactly one iteration.
for chunkid in range(0, 1):

    #================df -> df's chunks================
    chunk_size = 10000  # Number of rows per chunk
    num_parallel = 10   # Number of Dask partitions per chunk

    num_chunks = int(np.ceil(len(df) / chunk_size))  # Calculate the number of chunks

    # Split the row positions rather than the DataFrame itself: this yields the
    # same near-equal chunk boundaries as np.array_split(df, num_chunks) while
    # avoiding NumPy's call into DataFrame.swapaxes, which newer pandas
    # deprecates. num_chunks never exceeds len(df) here, so no piece is empty.
    row_splits = np.array_split(np.arange(len(df)), num_chunks)
    chunks = [df.iloc[rows[0]:rows[-1] + 1] for rows in row_splits]

    # Processed chunks are collected here and concatenated once at the end
    processed_chunks = []

    #================process each chunk with dask's ddf================
    for chunk in chunks:
        ddf = dd.from_pandas(chunk, npartitions=num_parallel)  # Adjust 'npartitions' based on resources

        # Width of the embedding; presumably matches "d_model-256" in the
        # checkpoint name - confirm if the model is changed.
        num_embedding_columns = 256
        embedding_columns = [f'{i+1}' for i in range(num_embedding_columns)]

        # Dask only needs column names and dtypes, so describe the output with
        # an EMPTY frame. (Assigning the Python type `float` as a column value
        # would create object-dtype columns filled with the class itself.)
        # float32 is assumed because the model is loaded without a dtype
        # override - TODO confirm.
        meta = chunk.iloc[:0].drop(columns=['embedding'], errors='ignore')  # Drop 'embedding' if it exists
        meta = pd.concat(
            [
                meta,
                pd.DataFrame(columns=embedding_columns, index=meta.index, dtype=np.float32),
            ],
            axis=1,
        )

        # Apply the function to every partition using Dask
        ddf = ddf.map_partitions(apply_get_embeddings_dask, meta=meta)

        # Compute the result with progress tracking
        with ProgressBar():
            processed_chunk = ddf.compute()

        processed_chunks.append(processed_chunk)

    # Concatenate all processed chunks into a final DataFrame
    final_df = pd.concat(processed_chunks, ignore_index=True)

    now = datetime.now()
    formatted_time = now.strftime("%y-%m-%d-%H-%M-%S")

    # Drop the raw sequences, then move the first remaining column to the end
    final_df = final_df.drop(columns=['sequence'])
    first_col = final_df.iloc[:, 0]  # Get the first column
    final_df = final_df.drop(final_df.columns[0], axis=1)  # Drop the first column
    final_df[first_col.name] = first_col  # Add it back as the last column

    # Build the file name once so the file written and the message cannot drift apart
    output_name = f"{typename}_caduceus_{pathogenecity_type}_{chunkid}_{formatted_time}.csv"
    final_df.to_csv(output_name, index=False)
    print(f"{output_name} is created.")

final_df

In [None]:
import pandas as pd
# Read the GPN missense CSV by its relative path.
# NOTE(review): this rebinds `df`, the name the embedding cell above reads its
# input from - re-running that cell afterwards would process this file instead.
gpn_missense_csv = './pathogenecity_gpn_missense.csv'
df = pd.read_csv(gpn_missense_csv)
df.head()  # result is discarded: only a cell's last expression is rendered
len(df)