In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Hugging Face Hub checkpoint of the pre-trained Caduceus masked-LM.
model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"

# trust_remote_code=True lets transformers run the tokenizer/model code that
# ships with the checkpoint, so only use it with a source you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)

# Prefer CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Place the weights on the chosen device and switch to evaluation mode; the
# module returned by eval() is the value the cell displays.
model.to(device).eval()

CaduceusForMaskedLM(
  (caduceus): Caduceus(
    (backbone): CaduceusMixerModel(
      (embeddings): CaduceusEmbeddings(
        (word_embeddings): Embedding(16, 256)
      )
      (layers): ModuleList(
        (0-15): 16 x Block(
          (mixer): BiMambaWrapper(
            (mamba_fwd): Mamba(
              (in_proj): Linear(in_features=256, out_features=1024, bias=False)
              (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
              (act): SiLU()
              (x_proj): Linear(in_features=512, out_features=48, bias=False)
              (dt_proj): Linear(in_features=16, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=256, bias=False)
            )
            (mamba_rev): Mamba(
              (in_proj): Linear(in_features=256, out_features=1024, bias=False)
              (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
              (act): SiLU()
          

In [4]:
import pandas as pd

# Variant category to process; the same label is reused later when the output
# file name is assembled.
pathogenecity_type = 'noncoding'
# pathogenecity_type = 'missense'

# Read the labelled DNA segments for the selected category.
df = pd.read_csv(f'dna_segment_{pathogenecity_type}.csv')

# Second name bound to the very same frame (an alias, not a copy); it is the
# value displayed by the cell.
df_origin = df
df_origin

Unnamed: 0,sequence,y
0,GACGGTGGAGGAGATCCTCAACGTGGACCCGGTGCAGCACACGTAC...,0
1,GAGGAGATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCA...,0
2,AGGAGATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCAA...,0
3,GGAGATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCAAG...,0
4,GGAGATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCAAG...,0
...,...,...
95755,CATCATAGCTAAGGTCAAGCAATGGAAGGGGAAAGACAGAGAGAAA...,0
95756,AATGCTAACTGTTATAGATAGATTCAGTTGTTTGTACTTCTCTGCT...,0
95757,GAAGACTTTATCATCTTCTTTCTCCCTTTGACTGGTCTGATCATCA...,0
95758,ACCACTGTATCATAAACCTCAGCCTGGATGGTAGGACCTAGCAGAC...,0


In [5]:
# import dask.dataframe as dd
# from dask.diagnostics import ProgressBar
# import numpy as np

# # Set the parallelism parameter and the batch size
# num_parallel = 10
# batch_size = 3000

# # Create a DataFrame containing the DNA sequences
# # segments_df = pd.DataFrame(segments, columns=['sequence'])
# # segments_df = segments_df.head(200)

# # segments_df = df.head(200)


# # Use Dask to parallelize processing of the DataFrame
# segments_ddf = dd.from_pandas(segments_df, npartitions=num_parallel)

# # Define the function that computes the embeddings
# def process_embedding(df):
#     embeddings = []
#     for dna in df['sequence']:
#         tokens = tokenizer(dna, return_tensors='pt', padding='max_length', max_length=512, truncation=True)
#         tokens = {key: val.to(device) for key, val in tokens.items()}

#         with torch.no_grad():
#             outputs = model(**tokens, output_hidden_states=True)
#             hidden_states = outputs.hidden_states
#             last_layer_embeddings = hidden_states[-1]  # take the embeddings of the last layer
#             mean_embeddings = torch.mean(last_layer_embeddings, dim=1)  # average the embeddings over the token dimension
#             mean_embeddings = mean_embeddings.view(mean_embeddings.shape[0], -1)
#             embeddings.append(mean_embeddings.cpu().numpy())

#     # Stack the per-sequence embedding results into a single array
#     embeddings = np.vstack(embeddings)
#     return embeddings


In [6]:
# # Show a progress bar and run the parallel computation
# with ProgressBar():
#     ddf_embeddings = segments_ddf.map_partitions(process_embedding).compute()


# df_embeddings = pd.DataFrame(ddf_embeddings)
# df_embeddings

In [7]:
# Parallelism and batching settings.
batch_size = 3000   # batch size setting
num_parallel = 10   # degree of parallelism

# # Create a DataFrame containing the DNA sequences
# segments_df = pd.DataFrame(segments, columns=['sequence'])
# segments_df

In [8]:
# %%time

import dask.dataframe as dd
import pandas as pd
import jax.numpy as jnp
from dask.diagnostics import ProgressBar

# Vectorized tokenization function
def vectorized_tokenizer(subsequences):
    """Tokenize a batch of sequences and place the resulting tensors on ``device``.

    Args:
        subsequences: Batch of sequence strings handed to the module-level
            ``tokenizer`` in a single call.

    Returns:
        dict mapping each tokenizer output name to its tensor, moved to the
        module-level ``device``.
    """
    # One tokenizer call for the whole batch: pad to the longest sequence in
    # the batch and truncate anything longer than 512 tokens.
    encoded = tokenizer(
        subsequences,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

    # Transfer every returned tensor to the target device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    return on_device

# Vectorized embedding function
def vectorized_embedding(tokens):
    """Mean-pool the model's last hidden state for a batch of tokenized sequences.

    Args:
        tokens: Mapping of keyword arguments forwarded to the module-level
            ``model`` (for example the dict returned by ``vectorized_tokenizer``).

    Returns:
        2-D tensor with one row per sequence: the last entry of
        ``outputs.hidden_states`` averaged over dimension 1. Every position
        along that dimension contributes equally to the mean, so any padding
        positions present in the batch are included in the average.
    """
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = model(**tokens, output_hidden_states=True)
        # Last entry of the hidden-state tuple; expected to be
        # (batch_size, seq_len, hidden_size).
        last_layer_embeddings = outputs.hidden_states[-1]

    # Averaging over dim 1 already yields (batch_size, hidden_size). The former
    # trailing squeeze(dim=1) was a no-op for any hidden size other than 1 and,
    # for a hidden size of exactly 1, wrongly collapsed the result to 1-D, so
    # it has been removed.
    return torch.mean(last_layer_embeddings, dim=1)

# Tokenization and embedding combined in a batch-wise function
def process_batch(subsequences):
    """Tokenize a batch of sequences and return their embeddings.

    Args:
        subsequences: Batch of sequences passed to ``vectorized_tokenizer``.

    Returns:
        Whatever ``vectorized_embedding`` returns for the tokenized batch.
    """
    return vectorized_embedding(vectorized_tokenizer(subsequences))

# Function to apply on each Dask partition
# def apply_get_embeddings_dask(df):
#     # subsequences = df['ref_forward_sequence'].values  # Get all subsequences in the batch
#     subsequences = df['ref_forward_sequence'].tolist() 
#     embeddings = process_batch(subsequences)  # Process in a vectorized manner
#     embeddings_cpu = embeddings.cpu().numpy()
#     df['embedding'] = list(embeddings_cpu)  # Assign embeddings back to the DataFrame
#     return df


def apply_get_embeddings_dask(df):
    """Embed the ``sequence`` column of one partition and append the vectors as columns.

    Args:
        df: pandas DataFrame partition containing a ``sequence`` column.

    Returns:
        New DataFrame with a fresh 0..n-1 index holding the input columns
        followed by one column per embedding dimension, named ``'1'``, ``'2'``, ...
        The input frame itself is not modified.
    """
    # The whole partition is embedded in a single call to process_batch.
    sequences = df['sequence'].tolist()
    vectors = process_batch(sequences).cpu().numpy()

    # One column per embedding dimension, labelled with 1-based string names.
    column_names = [str(position) for position in range(1, vectors.shape[1] + 1)]
    embedding_frame = pd.DataFrame(vectors, columns=column_names)

    # Both frames carry a 0..n-1 index, so the column-wise concat lines up row by row.
    return pd.concat([df.reset_index(drop=True), embedding_frame], axis=1)

In [9]:
%%time

import numpy as np
import pandas as pd
import dask.dataframe as dd
from datasets import load_dataset, load_from_disk
from datetime import datetime, timedelta

# Define chunk size (number of rows per chunk)
# chunksize = 10000  # Adjust chunk size according to your memory capacity
# num_parallel = 10

#================dataset -> df================
# code for loading homo_sapiens_dataset/train
# save_path="homo_sapiens_dataset"
# mydatasets = load_from_disk(save_path)
# # print(mydatasets)
# train = mydatasets['train']
# df = train.to_pandas()


# chunkid=0
typename = "pathogenecity"

# Settings that never change between chunks, hoisted out of the loops.
chunk_size = 10000            # upper bound on rows handled per Dask computation
num_parallel = 10             # Dask partitions per chunk
num_embedding_columns = 256   # number of embedding columns declared in `meta`;
                              # must match what apply_get_embeddings_dask appends

# NOTE: `df` is whatever frame the name is bound to in the kernel when this cell
# runs, while the output file is labelled with `pathogenecity_type`. Both are
# expected to come from the same data-loading cell, so fail early and clearly
# if the frame is obviously not the one this cell needs.
if 'sequence' not in df.columns:
    raise KeyError("df has no 'sequence' column; re-run the data-loading cell first")
if df.empty:
    raise ValueError("df is empty; there is nothing to embed")

# Empty template describing the frame each partition returns: every input
# column followed by the embedding columns '1'..'256'. It replaces the previous
# meta, which was a full copy of the chunk with the Python type object `float`
# broadcast into 256 columns added one at a time.
# float32 is assumed for the embedding dtype (TODO confirm against the model
# output dtype).
embedding_columns = [f'{i+1}' for i in range(num_embedding_columns)]
meta = pd.concat(
    [
        df.iloc[:0].reset_index(drop=True),
        pd.DataFrame(
            np.empty((0, num_embedding_columns), dtype=np.float32),
            columns=embedding_columns,
        ),
    ],
    axis=1,
)

for chunkid in range(0, 1):

    #================df -> df's chunks================
    num_chunks = int(np.ceil(len(df) / chunk_size))

    # Split row positions rather than the DataFrame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes. Splitting the
    # positions yields the same near-equal, contiguous chunk boundaries.
    row_groups = np.array_split(np.arange(len(df)), num_chunks)
    chunks = [df.iloc[int(rows[0]):int(rows[-1]) + 1] for rows in row_groups]

    #================process each chunk with dask's ddf================
    processed_chunks = []
    for chunk in chunks:
        ddf = dd.from_pandas(chunk, npartitions=num_parallel)
        ddf = ddf.map_partitions(apply_get_embeddings_dask, meta=meta)

        # Compute the result with progress tracking
        with ProgressBar():
            processed_chunk = ddf.compute()

        processed_chunks.append(processed_chunk)

    # Concatenate all processed chunks into a final DataFrame
    final_df = pd.concat(processed_chunks, ignore_index=True)

    formatted_time = datetime.now().strftime("%y-%m-%d-%H-%M-%S")

    # Drop the raw sequences, then move the first remaining column to the end
    # so the embedding columns come first in the exported file.
    final_df = final_df.drop(columns=['sequence'])
    first_name = final_df.columns[0]
    final_df[first_name] = final_df.pop(first_name)

    # Build the file name once so the file written and the message always agree.
    output_path = f"{typename}_caduceus_{pathogenecity_type}_{chunkid}_{formatted_time}.csv"
    final_df.to_csv(output_path, index=False)
    print(f"{output_path} is created.")

final_df



[########################################] | 100% Completed | 7.68 ss




[########################################] | 100% Completed | 5.65 sms




[########################################] | 100% Completed | 5.64 sms




[########################################] | 100% Completed | 5.65 sms




[########################################] | 100% Completed | 5.68 sms




[########################################] | 100% Completed | 5.65 sms




[########################################] | 100% Completed | 5.65 sms




[########################################] | 100% Completed | 5.66 sms




[########################################] | 100% Completed | 5.65 sms




[########################################] | 100% Completed | 5.75 sms
pathogenecity_caduceus_noncoding_0_24-11-02-11-18-14.csv is created.
CPU times: user 1min 43s, sys: 3.06 s, total: 1min 46s
Wall time: 1min 51s


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,248,249,250,251,252,253,254,255,256,y
0,-0.000009,-0.000114,0.000584,-0.075901,-0.001930,0.001802,-0.001458,0.001981,-0.003654,-0.000131,...,0.000479,-0.003681,-0.007886,0.000181,0.047053,-0.000462,-0.000022,-0.000186,-0.000416,0
1,-0.000009,-0.000189,0.000618,-0.071603,-0.001984,0.001968,-0.001570,0.001917,-0.003725,-0.000122,...,0.000511,-0.004245,-0.007986,0.000162,0.045868,-0.000432,-0.000024,-0.000193,-0.000412,0
2,-0.000009,-0.000167,0.000632,-0.070145,-0.001952,0.001946,-0.001553,0.001906,-0.003593,-0.000114,...,0.000493,-0.004148,-0.008074,0.000159,0.047394,-0.000442,-0.000023,-0.000185,-0.000385,0
3,-0.000010,-0.000174,0.000640,-0.072530,-0.001988,0.001940,-0.001615,0.001977,-0.003650,-0.000124,...,0.000500,-0.004350,-0.008170,0.000154,0.048532,-0.000431,-0.000026,-0.000191,-0.000396,0
4,-0.000010,-0.000174,0.000640,-0.072530,-0.001988,0.001940,-0.001615,0.001977,-0.003650,-0.000124,...,0.000500,-0.004350,-0.008170,0.000154,0.048532,-0.000431,-0.000026,-0.000191,-0.000396,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95755,0.000012,-0.000063,0.000123,0.060489,0.000282,0.001207,-0.000444,0.001210,-0.006644,0.000141,...,0.001271,-0.005495,-0.005705,0.000195,0.023038,-0.000592,-0.000172,0.000121,-0.000602,0
95756,0.000010,-0.000175,0.000252,0.102112,0.000956,0.000616,-0.000352,0.000465,-0.006788,0.000126,...,0.001107,-0.005858,-0.005800,0.000157,0.077814,-0.000655,-0.000207,0.000135,-0.000448,0
95757,0.000011,-0.000145,0.000245,0.099015,0.000773,0.000902,-0.000382,0.000648,-0.007072,0.000087,...,0.001311,-0.005878,-0.006231,0.000164,0.062446,-0.000602,-0.000208,0.000163,-0.000442,0
95758,0.000010,-0.000124,0.000107,0.043410,0.000423,0.001018,-0.000567,0.001161,-0.005971,0.000149,...,0.001163,-0.004528,-0.005713,0.000201,0.020973,-0.000631,-0.000148,0.000116,-0.000511,0


In [10]:
import pandas as pd
# NOTE: this rebinds `df`, the same name the embedding cell above reads its
# input from. Re-run the data-loading cell before re-running the embedding
# cell, otherwise it would embed this frame instead.
df = pd.read_csv('./pathogenecity_gpn_missense.csv')

# Row count of the loaded file (displayed as the cell output). The former
# mid-cell `df.head()` was removed: only a cell's last expression is displayed,
# so its result was computed and thrown away.
len(df)

560187

In [5]:
# Preview the first rows of whichever frame `df` currently refers to.
df.head()

Unnamed: 0,sequence,y
0,TCCTGTCCAGTCCCGTCCCCGGCGCGGCCCGCGCGCTCCTCCGCCG...,0
1,CCTGTCCAGTCCCGTCCCCGGCGCGGCCCGCGCGCTCCTCCGCCGC...,0
2,GTCCCGTCCCCGGCGCGGCCCGCGCGCTCCTCCGCCGCCTCTCGCC...,0
3,CGGCGCGGCCCGCGCGCTCCTCCGCCGCCTCTCGCCTGCGCCATGG...,0
4,CGCGGCCCGCGCGCTCCTCCGCCGCCTCTCGCCTGCGCCATGGCCG...,0
