In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load pre-trained Caduceus model and tokenizer
model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Ensure the model is in evaluation mode
model.eval()

# Sample DNA sequence (adjust the length as needed for the specific model)
sequence = "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"

# Tokenize the sequence
tokens = tokenizer(sequence, return_tensors='pt', padding='max_length', max_length=512, truncation=True)

# Move tokens to GPU
tokens = {key: val.to(device) for key, val in tokens.items()}

# Forward pass to compute logits
with torch.no_grad():
    outputs = model(**tokens)
    logits = outputs[0]  # Accessing the first element of the tuple

# Print the logits
print(logits)
logits.shape

  from .autonotebook import tqdm as notebook_tqdm


tensor([[[-18.8940, -18.8245, -18.8113,  ..., -35.0860, -35.0857, -35.0853],
         [-17.3938, -17.3290, -17.3174,  ..., -32.2474, -32.2468, -32.2465],
         [-16.6716, -16.6126, -16.6009,  ..., -30.9116, -30.9106, -30.9104],
         ...,
         [ -8.7253,  -8.6407,  -8.6400,  ..., -15.5402, -15.5404, -15.5407],
         [ -5.2331,  -5.2614,  -5.2472,  ...,  -8.5717,  -8.5706,  -8.5695],
         [-11.8939, -11.8855, -11.8817,  ..., -22.6429, -22.6402, -22.6394]]],
       device='cuda:0')


torch.Size([1, 512, 16])

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load pre-trained Caduceus model and tokenizer
model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Ensure the model is in evaluation mode
model.eval()

# Sample DNA sequence (adjust the length as needed for the specific model)
sequence = "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"

# Tokenize the sequence
tokens = tokenizer(sequence, return_tensors='pt', padding='max_length', max_length=512, truncation=True)

# Move tokens to GPU
tokens = {key: val.to(device) for key, val in tokens.items()}

# Forward pass to compute last layer embeddings
with torch.no_grad():
    outputs = model(**tokens, output_hidden_states=True)  # Enable output of hidden states
    hidden_states = outputs.hidden_states  # Access all hidden states
    last_layer_embeddings = hidden_states[-1]  # Get the last layer embeddings

# Print the embeddings
print(last_layer_embeddings)
last_layer_embeddings.shape  # This will show the shape of the embeddings tensor


tensor([[[-1.0830e-05, -1.7574e-04,  1.6580e-03,  ...,  1.9041e-04,
          -2.8915e-04,  4.7309e-04],
         [-1.2126e-05, -1.6908e-04,  1.5073e-03,  ...,  3.0167e-04,
          -3.4832e-04,  4.9999e-04],
         [-1.4850e-05, -1.4137e-04,  1.3906e-03,  ...,  3.8891e-04,
          -3.7303e-04,  6.1229e-04],
         ...,
         [ 2.5308e-07, -3.9808e-04,  2.7370e-04,  ..., -3.9304e-04,
           6.0536e-04, -7.6688e-04],
         [ 1.1982e-05, -1.2080e-03, -3.1884e-04,  ..., -6.1354e-04,
           1.3279e-04, -4.4638e-04],
         [-1.3478e-05, -2.5554e-05,  7.1849e-04,  ...,  2.0613e-04,
           5.7910e-05,  3.0344e-04]]], device='cuda:0')


torch.Size([1, 512, 256])

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load pre-trained Caduceus model and tokenizer
model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Ensure the model is in evaluation mode
model.eval()

# Sample DNA sequence (adjust the length as needed for the specific model)
sequence = "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"

# Tokenize the sequence
tokens = tokenizer(sequence, return_tensors='pt', padding='max_length', max_length=512, truncation=True)

# Move tokens to GPU
tokens = {key: val.to(device) for key, val in tokens.items()}

# Forward pass to compute last layer embeddings
with torch.no_grad():
    outputs = model(**tokens, output_hidden_states=True)  # Enable output of hidden states
    hidden_states = outputs.hidden_states  # Access all hidden states
    last_layer_embeddings = hidden_states[-1]  # Get the last layer embeddings

# Compute the mean of the last layer embeddings across the token (sequence) dimension
# Dimension 1 corresponds to the token/sequence length, so we compute the mean along this axis
mean_embedding = torch.mean(last_layer_embeddings, dim=1)

# Unsqueeze to add an extra dimension (e.g., if you want to restore the token dimension)
mean_embedding_squeezed = mean_embedding.squeeze(dim=1)

# Print the result
print(mean_embedding_squeezed)
mean_embedding_squeezed.shape


  from .autonotebook import tqdm as notebook_tqdm


tensor([[-1.4744e-05, -4.7353e-05,  1.1077e-03,  6.9341e-02,  5.0779e-04,
          1.4715e-03, -4.3436e-04,  4.5703e-04, -7.2633e-03, -6.1166e-04,
          4.7888e-03, -1.9755e-03,  8.2193e-04,  1.2733e-03,  4.3967e-03,
          4.7374e-03,  2.5317e-03,  6.1662e-02, -3.2121e-03,  9.5049e-03,
         -7.1028e-04,  1.1912e-03, -3.5876e-03, -6.3549e-04,  2.2708e-04,
          2.5252e-03,  3.3616e-03, -3.0302e-03,  2.3135e-03, -9.9381e-04,
         -1.1784e-03, -4.8926e-04, -9.4225e-03, -6.0444e-03, -5.3644e-04,
          4.1137e-04, -4.8983e-03, -1.1070e-03,  3.3379e-03, -2.7270e-03,
         -8.1763e-05, -1.4678e-03,  9.8720e-04,  2.2008e-04, -3.1410e-03,
          1.1457e-02,  4.1260e-03, -4.2726e-03,  3.0358e-03,  2.4786e-02,
         -4.2216e-03,  3.1488e-04,  9.0541e-04, -8.1371e-03,  2.6645e-04,
          6.7988e-04, -1.2135e-03,  1.2914e-02,  1.9084e-03,  1.6925e-03,
         -5.4494e-04,  6.5336e-04, -1.0453e-03,  8.6360e-04,  2.6494e-02,
         -1.0172e-03, -4.1658e-03, -6.

torch.Size([1, 256])

## Basic demo code for obtaining embedding in batch.

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load pre-trained Caduceus model and tokenizer
model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Ensure the model is in evaluation mode
model.eval()

# Sample DNA sequences for batch input (adjust as needed)
sequences = [
    "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT",
    "TGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA",
    # Add more sequences as needed for the batch
]

# Tokenize the batch of sequences
tokens = tokenizer(sequences, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Move tokens to GPU
tokens = {key: val.to(device) for key, val in tokens.items()}

# Forward pass to compute last layer embeddings for the batch
with torch.no_grad():
    outputs = model(**tokens, output_hidden_states=True)  # Enable output of hidden states
    hidden_states = outputs.hidden_states  # Access all hidden states
    last_layer_embeddings = hidden_states[-1]  # Get the last layer embeddings (batch_size, seq_len, hidden_size)

# Compute the mean of the last layer embeddings across the token (sequence) dimension for each sequence in the batch
# Dimension 1 corresponds to the token/sequence length, so we compute the mean along this axis
mean_embeddings = torch.mean(last_layer_embeddings, dim=1)  # (batch_size, hidden_size)

# If needed, squeeze out any extra dimensions (though this shouldn't be necessary after mean calculation)
mean_embeddings_squeezed = mean_embeddings.squeeze(dim=1)

# Print the result for the batch
print(mean_embeddings_squeezed)
print(mean_embeddings_squeezed.shape)  # Shape will be (batch_size, hidden_size)


tensor([[-2.1096e-06, -3.7544e-04,  1.9827e-04,  3.5334e-02, -2.9696e-03,
         -1.8678e-04, -6.5049e-04, -3.4604e-03,  5.7491e-03, -6.4005e-06,
         -2.0113e-03,  1.7668e-03, -9.1775e-05, -3.6099e-04, -2.5900e-06,
         -1.7774e-02, -2.7480e-03,  7.0062e-02,  1.7794e-02, -2.4911e-03,
          1.9951e-04, -3.9112e-04, -5.6183e-04, -1.8959e-04, -5.1693e-05,
         -1.6575e-05, -5.9166e-03, -7.4726e-04, -1.1724e-03, -1.1964e-03,
          4.2923e-04,  7.2305e-05, -2.0687e-03,  4.0896e-03, -2.4146e-04,
         -3.9139e-05, -1.4967e-03, -1.8557e-04,  1.8382e-03,  1.1577e-03,
          3.4902e-05,  2.8084e-04, -2.9456e-04, -6.7833e-04,  7.9137e-04,
          4.0997e-03,  2.9320e-03, -4.0373e-04, -3.6454e-04, -1.1570e-03,
          3.1037e-03, -7.1050e-05,  1.1772e-04, -1.7331e-03, -1.7106e-04,
         -1.9987e-04,  1.2386e-03, -1.6647e-02, -9.4180e-05, -1.1572e-03,
         -7.5842e-05,  5.4473e-03,  1.4634e-03, -1.4866e-05,  1.2515e-02,
         -1.4388e-05, -5.9815e-05,  2.

## working code: for handling data with dask chunck, batch, parallellizing version

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load pre-trained Caduceus model and tokenizer
model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Ensure the model is in evaluation mode
model.eval()


  from .autonotebook import tqdm as notebook_tqdm
A new version of the following files was downloaded from https://huggingface.co/kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16:
- tokenization_caduceus.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16:
- configuration_caduceus.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16:
- modeling_rcps.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revisio

CaduceusForMaskedLM(
  (caduceus): Caduceus(
    (backbone): CaduceusMixerModel(
      (embeddings): CaduceusEmbeddings(
        (word_embeddings): Embedding(16, 256)
      )
      (layers): ModuleList(
        (0-15): 16 x Block(
          (mixer): BiMambaWrapper(
            (mamba_fwd): Mamba(
              (in_proj): Linear(in_features=256, out_features=1024, bias=False)
              (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
              (act): SiLU()
              (x_proj): Linear(in_features=512, out_features=48, bias=False)
              (dt_proj): Linear(in_features=16, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=256, bias=False)
            )
            (mamba_rev): Mamba(
              (in_proj): Linear(in_features=256, out_features=1024, bias=False)
              (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
              (act): SiLU()
          

In [2]:
# %%time

import dask.dataframe as dd
import pandas as pd
import jax.numpy as jnp
from dask.diagnostics import ProgressBar

# Vectorized tokenization function
def vectorized_tokenizer(subsequences):
    # Assuming tokenizer.batch_tokenize can handle a list of subsequences
    # tokens_ids = [b[1] for b in tokenizer.batch_tokenize(subsequences)]
    # return jnp.asarray(tokens_ids, dtype=jnp.int32)

    # Tokenize the batch of sequences
    tokens = tokenizer(subsequences, return_tensors='pt', padding=True, truncation=True, max_length=512)
    
    # Move tokens to GPU
    tokens = {key: val.to(device) for key, val in tokens.items()}
    return tokens

# Vectorized embedding function
def vectorized_embedding(tokens):
    # random_key = jax.random.PRNGKey(0)
    # # Assuming forward_fn.apply() can handle batches of tokens
    # outs = forward_fn.apply(parameters, random_key, tokens)
    # return outs["embeddings_20"][:, 0, :]


    # Forward pass to compute last layer embeddings for the batch
    with torch.no_grad():
        outputs = model(**tokens, output_hidden_states=True)  # Enable output of hidden states
        hidden_states = outputs.hidden_states  # Access all hidden states
        last_layer_embeddings = hidden_states[-1]  # Get the last layer embeddings (batch_size, seq_len, hidden_size)
    
    # Compute the mean of the last layer embeddings across the token (sequence) dimension for each sequence in the batch
    # Dimension 1 corresponds to the token/sequence length, so we compute the mean along this axis
    mean_embeddings = torch.mean(last_layer_embeddings, dim=1)  # (batch_size, hidden_size)
    
    # If needed, squeeze out any extra dimensions (though this shouldn't be necessary after mean calculation)
    mean_embeddings_squeezed = mean_embeddings.squeeze(dim=1)

    return mean_embeddings_squeezed

# Tokenization and embedding combined in a batch-wise function
def process_batch(subsequences):
    tokens = vectorized_tokenizer(subsequences)
    embeddings = vectorized_embedding(tokens)
    return embeddings

# Function to apply on each Dask partition
# def apply_get_embeddings_dask(df):
#     # subsequences = df['ref_forward_sequence'].values  # Get all subsequences in the batch
#     subsequences = df['ref_forward_sequence'].tolist() 
#     embeddings = process_batch(subsequences)  # Process in a vectorized manner
#     embeddings_cpu = embeddings.cpu().numpy()
#     df['embedding'] = list(embeddings_cpu)  # Assign embeddings back to the DataFrame
#     return df


def apply_get_embeddings_dask(df):
    # subsequences = df['ref_forward_sequence'].values  # Get all subsequences in the batch
    subsequences = df['ref_forward_sequence'].tolist() 
    embeddings = process_batch(subsequences)  # Process in a vectorized manner
    embeddings_cpu = embeddings.cpu().numpy()
    
    # df['embedding'] = list(embeddings_cpu)  # Assign embeddings back to the DataFrame
    df2 = pd.DataFrame(embeddings_cpu, columns=[f'{i+1}' for i in range(embeddings_cpu.shape[1])])
    df = pd.concat([df.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)
    return df

In [3]:
%%time

import numpy as np
import pandas as pd
import dask.dataframe as dd
from datasets import load_dataset, load_from_disk


# Define chunk size (number of rows per chunk)
# chunksize = 10000  # Adjust chunk size according to your memory capacity
# num_parallel = 10

#================dataset -> df================
# code for loading homo_sapiens_dataset/train
# save_path="homo_sapiens_dataset"
# mydatasets = load_from_disk(save_path)
# # print(mydatasets)
# train = mydatasets['train']
# df = train.to_pandas()


# chunkid=0
typename="noncoding"
# typename="missense"
for chunkid in range(0,4):
    save_path="pathgenicity_"+typename+"_multisets"
    chunk_save_path = f"{save_path}/train_{chunkid}"
    dataset_chunk = load_from_disk(chunk_save_path)
    df = dataset_chunk.to_pandas()
    
    
    # code for loading pathgenicity_noncoding_multisets/combined_dataset
    # save_path = "pathgenicity_noncoding_multisets"
    # df = load_from_disk(save_path+"/combined_dataset")
    
    
    #================df -> df's chunks================
    # Assume df is your huge DataFrame
    chunk_size = 10000  # Define the number of rows per chunk
    num_parallel = 10
    
    num_chunks = int(np.ceil(len(df) / chunk_size))  # Calculate the number of chunks
    
    # Split the DataFrame into chunks using array_split
    chunks = np.array_split(df, num_chunks)
    
    # Initialize an empty list to store the processed chunks
    processed_chunks = []
    # num_parallel=10
    
    #================process each chunk with dask's ddf================
    # Iterate over each chunk
    for chunk in chunks:
        # Process each chunk
        # print(chunk.head())  # Example: print the first few rows of the chunk
    
        ddf = dd.from_pandas(chunk, npartitions=num_parallel)  # Adjust 'npartitions' based on resources
    
        # Create metadata for Dask to understand the structure of the DataFrame
        # meta = chunk.copy()
        # meta['embedding'] = object  # Tokens will be of type object (adjust type accordingly)



        # sample_subsequences = chunk['ref_forward_sequence'].tolist()
        # sample_embeddings = process_batch(sample_subsequences).cpu().numpy()  # Get sample embeddings
        # num_embedding_columns = sample_embeddings.shape[1]
        num_embedding_columns = 256
        
        # Define metadata for the new DataFrame structure
        # Drop the 'embedding' column since it no longer exists
        meta = chunk.copy()
        meta = meta.drop(columns=['embedding'], errors='ignore')  # Drop 'embedding' if it exists
        # Add new embedding columns to the metadata
        for i in range(num_embedding_columns):
            meta[f'{i+1}'] = float  # Adjust type as necessary, float is common for embeddings


        
    
        # Apply the function in parallel using Dask
        ddf = ddf.map_partitions(apply_get_embeddings_dask, meta=meta)
    
        # Compute the result with progress tracking
        with ProgressBar():
            processed_chunk = ddf.compute()
    
        # Append processed chunk to list
        processed_chunks.append(processed_chunk)

        
    # Concatenate all processed chunks into a final DataFrame
    final_df = pd.concat(processed_chunks, ignore_index=True)
    
    final_df.to_csv(typename+'_train_'+str(chunkid)+'_with_embedding.csv', index=False)
    print(f"{typename}_train_{chunkid}_with_embedding.csv is created.")

final_df

  return bound(*args, **kwds)


[########################################] | 100% Completed | 15.99 s




[########################################] | 100% Completed | 6.70 ss




[########################################] | 100% Completed | 6.46 ss
noncoding_train_0_with_embedding.csv is created.


  return bound(*args, **kwds)


[########################################] | 100% Completed | 6.52 ss




[########################################] | 100% Completed | 6.57 ss




[########################################] | 100% Completed | 7.01 ss
noncoding_train_1_with_embedding.csv is created.


  return bound(*args, **kwds)


[########################################] | 100% Completed | 6.54 ss




[########################################] | 100% Completed | 6.47 ss




[########################################] | 100% Completed | 6.46 ss
noncoding_train_2_with_embedding.csv is created.


  return bound(*args, **kwds)


[########################################] | 100% Completed | 3.79 sms
noncoding_train_3_with_embedding.csv is created.
CPU times: user 1min 26s, sys: 755 ms, total: 1min 27s
Wall time: 1min 46s


Unnamed: 0,label,chromosome,ref_forward_sequence,alt_forward_sequence,position,1,2,3,4,5,...,247,248,249,250,251,252,253,254,255,256
0,0,22,GGAAGTGGATCAACCAGAACCAGCAGATACTCAGCCGGAGGATATT...,GGAAGTGGATCAACCAGAACCAGCAGATACTCAGCCGGAGGATATT...,41152025,0.000017,-0.000240,-0.000110,0.057381,-0.001361,...,0.005826,0.001369,-0.003496,-0.003906,0.000322,-0.011505,-0.000539,-0.000321,0.000190,-0.001063
1,0,22,CATGAGAAAGGGTGTTCAGATTACTGATTCCCAACTAGATATCTTT...,CATGAGAAAGGGTGTTCAGATTACTGATTCCCAACTAGATATCTTT...,41152192,0.000017,-0.000337,-0.000089,0.102056,-0.000506,...,0.006111,0.001604,-0.004834,-0.003429,0.000239,0.001787,-0.000494,-0.000286,0.000165,-0.000973
2,0,22,ATGAGAAAGGGTGTTCAGATTACTGATTCCCAACTAGATATCTTTG...,ATGAGAAAGGGTGTTCAGATTACTGATTCCCAACTAGATATCTTTG...,41152193,0.000017,-0.000305,-0.000080,0.102382,-0.000540,...,0.006139,0.001599,-0.004818,-0.003553,0.000241,0.001517,-0.000501,-0.000289,0.000172,-0.000983
3,0,22,ACTTCAGCTACCCAGTCATCTCCGGCTCCAGGACAGTCAAAGAAAA...,ACTTCAGCTACCCAGTCATCTCCGGCTCCAGGACAGTCAAAGAAAA...,41152366,0.000011,-0.000275,0.000160,0.007671,-0.001822,...,0.007020,0.001294,-0.003227,-0.006749,0.000325,0.009516,-0.000511,-0.000336,0.000217,-0.001074
4,0,22,TCAGCTACCCAGTCATCTCCGGCTCCAGGACAGTCAAAGAAAAAGA...,TCAGCTACCCAGTCATCTCCGGCTCCAGGACAGTCAAAGAAAAAGA...,41152369,0.000011,-0.000250,0.000160,0.001368,-0.001909,...,0.007053,0.001245,-0.003123,-0.006624,0.000338,0.006517,-0.000529,-0.000329,0.000212,-0.001080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5755,0,X,ACATCATAGCTAAGGTCAAGCAATGGAAGGGGAAAGACAGAGAGAA...,ACATCATAGCTAAGGTCAAGCAATGGAAGGGGAAAGACAGAGAGAA...,154991042,0.000014,-0.000334,-0.000022,0.071434,-0.000454,...,0.005395,0.001485,-0.004062,-0.004267,0.000270,0.011107,-0.000492,-0.000287,0.000193,-0.000951
5756,0,X,AAATGCTAACTGTTATAGATAGATTCAGTTGTTTGTACTTCTCTGC...,AAATGCTAACTGTTATAGATAGATTCAGTTGTTTGTACTTCTCTGC...,154992918,0.000013,-0.000320,0.000135,0.100437,0.000079,...,0.005537,0.001538,-0.004999,-0.004278,0.000259,0.073901,-0.000487,-0.000297,0.000270,-0.000904
5757,0,X,GGAAGACTTTATCATCTTCTTTCTCCCTTTGACTGGTCTGATCATC...,GGAAGACTTTATCATCTTCTTTCTCCCTTTGACTGGTCTGATCATC...,154993156,0.000016,-0.000294,0.000037,0.110021,-0.000296,...,0.005250,0.001535,-0.004957,-0.004208,0.000293,0.057179,-0.000498,-0.000335,0.000315,-0.000913
5758,0,X,GACCACTGTATCATAAACCTCAGCCTGGATGGTAGGACCTAGCAGA...,GACCACTGTATCATAAACCTCAGCCTGGATGGTAGGACCTAGCAGA...,154997113,0.000014,-0.000287,0.000014,0.043964,-0.001280,...,0.006462,0.001331,-0.003545,-0.005477,0.000327,0.003653,-0.000526,-0.000335,0.000234,-0.001057


## Demo code:Process huggingface dataset by dask and chunk

In [None]:
import dask.dataframe as dd
from datasets import load_dataset, load_from_disk

# # Load your huge Hugging Face dataset
# dataset = load_dataset('your_dataset_name', split='train')  # Replace with your actual dataset

# # Convert Hugging Face dataset to a Pandas DataFrame
# df = dataset.to_pandas()


save_path="homo_sapiens_dataset"
mydatasets = load_from_disk(save_path)
# print(mydatasets)
train = mydatasets['train']
df = train.to_pandas()


# Convert Pandas DataFrame to a Dask DataFrame (you can specify the chunk size)
dask_df = dd.from_pandas(df, npartitions=10)  # Adjust `npartitions` for the number of chunks/partitions you want

# Define a processing function for a chunk
def process_chunk(df_chunk):
    # Perform operations on the chunk (e.g., computation, filtering, transformation)
    df_chunk['new_column'] = df_chunk['existing_column'].apply(lambda x: x * 2)  # Example operation
    return df_chunk

# Apply the processing function to each partition
processed_dask_df = dask_df.map_partitions(process_chunk)

# Trigger the computation and convert back to Pandas (if needed)
final_df = processed_dask_df.compute()  # This will run the computation and collect results
print(final_df.head())  # Display the first few rows

# Optionally save the processed data to disk
final_df.to_csv('processed_hf_dataset.csv', index=False)


In [1]:
save_path="homo_sapiens_dataset"

from datasets import load_dataset, load_from_disk

mydatasets = load_from_disk(save_path)
# print(mydatasets)
train = mydatasets['train']

df = train.to_pandas()
df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,label,chromosome,ref_forward_sequence,alt_forward_sequence,position
0,0,7,GCAGAAGCCTGTGATATGGAACATTTATATCTTTCTTTCTGTTTCA...,GCAGAAGCCTGTGATATGGAACATTTATATCTTTCTTTCTGTTTCA...,116953616
1,0,12,AGGATTGCTTGAGTTCAGGAGATAGAGACCAGCCTGGGCAACATAG...,AGGATTGCTTGAGTTCAGGAGATAGAGACCAGCCTGGGCAACATAG...,54241830
2,0,20,GAAAGAACCTTGTTTTGTCACATTACCAGAATTGTTTTTCTGGTTC...,GAAAGAACCTTGTTTTGTCACATTACCAGAATTGTTTTTCTGGTTC...,38034023
3,0,15,AGAGAATCAGCCTGTTCTTCAAAGCTTTGAAGCCAGAAACTGACTT...,AGAGAATCAGCCTGTTCTTCAAAGCTTTGAAGCCAGAAACTGACTT...,82659635
4,0,9,AAAGGGCCCAGTAGCTGAGGCATCACCTGACTCCTTCCCCTCCCCT...,AAAGGGCCCAGTAGCTGAGGCATCACCTGACTCCTTCCCCTCCCCT...,131579572
...,...,...,...,...,...
34738,6,17,TTTTTTTTTTTTTGAGATAGGGTTTCTCTTTTGTTGCCCAGGCTGG...,TTTTTTTTTTTTTGAGATAGGGTTTCTCTTTTGTTGCCCAGGCTGG...,19189955
34739,6,17,CCAACTCTTGGCTACACTCATCCAGGACTTAGTCTCAGCAACAGGT...,CCAACTCTTGGCTACACTCATCCAGGACTTAGTCTCAGCAACAGGT...,19188365
34740,6,17,CCGCAGGATCCACTGTTGGAAAACCAGAATAGTATCAGAAATAGTG...,CCGCAGGATCCACTGTTGGAAAACCAGAATAGTATCAGAAATAGTG...,19557561
34741,6,17,TGGCAGAGGTCTCCTGTTCCTAGGCTTTTCTCCTAATGGGGGACCC...,TGGCAGAGGTCTCCTGTTCCTAGGCTTTTCTCCTAATGGGGGACCC...,19062290
