In [6]:
import numpy as np
import pandas as pd

def one_hot_encode_sequence(sequence):
    mapping = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0],
               'C': [0, 0, 1, 0], 'G': [0, 0, 0, 1]}
    valid_bases = [base for base in sequence if base in mapping]
    return [mapping[base] for base in valid_bases]
file_path = '/content/healthy_vs_mutated_sequences.csv'
data = pd.read_csv(file_path)

print(f"Original data shape: {data.shape}")

print("Number of missing values per column:")
print(data.isnull().sum())

if data.isnull().values.any():
    print("Data contains missing values. Filling missing values with 'N'.")
    data = data.fillna('N')

healthy_sequences = data['Healthy Sequence'].str.strip()
mutated_sequences = data['Mutated Sequence'].str.strip()

healthy_sequences = healthy_sequences[healthy_sequences.apply(lambda x: set(x).issubset({'A', 'T', 'C', 'G'}))]
mutated_sequences = mutated_sequences[mutated_sequences.apply(lambda x: set(x).issubset({'A', 'T', 'C', 'G'}))]

print(f"Filtered Healthy Sequences (Count): {len(healthy_sequences)}")
print(f"Filtered Mutated Sequences (Count): {len(mutated_sequences)}")

healthy_encoded = [one_hot_encode_sequence(seq) for seq in healthy_sequences]
mutated_encoded = [one_hot_encode_sequence(seq) for seq in mutated_sequences]

max_length = max(max(len(seq) for seq in healthy_encoded), max(len(seq) for seq in mutated_encoded))

healthy_encoded = [np.pad(seq, ((0, max_length - len(seq)), (0, 0)), 'constant') for seq in healthy_encoded]
mutated_encoded = [np.pad(seq, ((0, max_length - len(seq)), (0, 0)), 'constant') for seq in mutated_encoded]

healthy_encoded = np.array(healthy_encoded)
mutated_encoded = np.array(mutated_encoded)

print(f"Shape of healthy_encoded before reshape: {healthy_encoded.shape}")

if healthy_encoded.size > 0 and mutated_encoded.size > 0:
    healthy_encoded = healthy_encoded.reshape(healthy_encoded.shape[0], -1, 4)
    mutated_encoded = mutated_encoded.reshape(mutated_encoded.shape[0], -1, 4)

    print("Healthy Encoded Sequences Shape:", healthy_encoded.shape)
    print("Mutated Encoded Sequences Shape:", mutated_encoded.shape)

    # Print the first encoded sequence for both healthy and mutated
    if healthy_encoded.shape[0] > 0:
        print("First Healthy Encoded Sequence:\n", healthy_encoded[0])
    if mutated_encoded.shape[0] > 0:
        print("First Mutated Encoded Sequence:\n", mutated_encoded[0])


Original data shape: (101, 5)
Number of missing values per column:
Healthy Sequence            99
Mutated Sequence             1
Mutation Type                1
Mutated Nucleotide(s)      100
Mutated Nucleotide(s).1    101
dtype: int64
Data contains missing values. Filling missing values with 'N'.
Filtered Healthy Sequences (Count): 1
Filtered Mutated Sequences (Count): 97
Shape of healthy_encoded before reshape: (1, 6129, 4)
Healthy Encoded Sequences Shape: (1, 6129, 4)
Mutated Encoded Sequences Shape: (97, 6129, 4)
First Healthy Encoded Sequence:
 [[0 1 0 0]
 [1 0 0 0]
 [1 0 0 0]
 ...
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
First Mutated Encoded Sequence:
 [[0 1 0 0]
 [0 1 0 0]
 [0 1 0 0]
 ...
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]


In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Conv1D, Conv1DTranspose
from tensorflow.keras.optimizers import Adam

# Encoder
def build_encoder(input_shape, latent_dim):
    inputs = Input(shape=input_shape)
    x = Conv1D(32, kernel_size=3, activation='relu', padding='same')(inputs)
    x = Conv1D(64, kernel_size=3, activation='relu', padding='same')(x)
    x = Flatten()(x)
    latent = Dense(latent_dim, activation='relu')(x)
    return Model(inputs, latent, name='encoder')

# Decoder
def build_decoder(latent_dim, output_shape):
    latent_inputs = Input(shape=(latent_dim,))
    x = Dense(np.prod(output_shape), activation='relu')(latent_inputs)
    x = Reshape(output_shape)(x)
    x = Conv1DTranspose(64, kernel_size=3, activation='relu', padding='same')(x)
    x = Conv1DTranspose(32, kernel_size=3, activation='relu', padding='same')(x)
    outputs = Conv1DTranspose(4, kernel_size=3, activation='softmax', padding='same')(x)
    return Model(latent_inputs, outputs, name='decoder')

# Autoencoder
def build_autoencoder(input_shape, latent_dim):
    encoder = build_encoder(input_shape, latent_dim)
    decoder = build_decoder(latent_dim, input_shape)
    autoencoder = Model(encoder.input, decoder(encoder.output), name='autoencoder')
    return autoencoder, encoder, decoder


sequence_length = max_length
input_shape = (sequence_length, 4)
latent_dim = 64

autoencoder, encoder, decoder = build_autoencoder(input_shape, latent_dim)
autoencoder.compile(optimizer=Adam(), loss='binary_crossentropy')

autoencoder.summary()

In [8]:
noise_factor = 0.1
# Get latent representation of the healthy sequence
healthy_latent = encoder.predict(healthy_encoded)
# Add noise to the latent representation
mutated_latent = healthy_latent + noise_factor * np.random.normal(size=healthy_latent.shape)

mutated_sequences_reconstructed = decoder.predict(mutated_latent)

print("Original Healthy Sequence (One-Hot Encoded):")
print(healthy_encoded[0])

print("\nReconstructed Mutated Sequence (One-Hot Encoded):")
print(mutated_sequences_reconstructed[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step
Original Healthy Sequence (One-Hot Encoded):
[[0 1 0 0]
 [1 0 0 0]
 [1 0 0 0]
 ...
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]

Reconstructed Mutated Sequence (One-Hot Encoded):
[[0.25017077 0.25005585 0.24985592 0.24991748]
 [0.25042698 0.24974622 0.24988386 0.24994291]
 [0.25022727 0.25011373 0.24984291 0.24981605]
 ...
 [0.25005797 0.24997556 0.24993367 0.25003272]
 [0.2500031  0.2500094  0.24998389 0.25000358]
 [0.25000843 0.25000194 0.24999596 0.24999368]]


In [9]:
pip install transformers datasets tokenizers

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:

In [10]:
def generate_kmers(sequence, k=6):
    """Convert sequence into k-mers of length k."""
    kmers = [sequence[i:i+k] for i in range(len(sequence) - k + 1)]
    return " ".join(kmers)

# Example sequence
sequence = "ATGCGTACGTTAGC"
kmers_sequence = generate_kmers(sequence, k=6)
print(kmers_sequence)

ATGCGT TGCGTA GCGTAC CGTACG GTACGT TACGTT ACGTTA CGTTAG GTTAGC


In [11]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

sequences = ["ATGCGTACGTTAGC", "CGTAGCTAGCGT", "TGCGTACGTGCA"]
kmers_data = [generate_kmers(seq, k=6) for seq in sequences]

tokenizer.train_from_iterator(kmers_data, trainer)
tokenizer.save("dna_tokenizer.json")

tokenizer = Tokenizer.from_file("dna_tokenizer.json")
encoded = tokenizer.encode("ATGCGTACGTTAGC")
print(encoded.tokens)

['A', 'TGCGTA', 'CGT', 'TAGC']


In [12]:
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    vocab_size=tokenizer.get_vocab_size(),
    max_position_embeddings=512,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1
)

model = BertForMaskedLM(config)
model.save_pretrained("dna_bert_model")

In [13]:
from transformers import PreTrainedTokenizerFast, DataCollatorForLanguageModeling

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file="dna_tokenizer.json", model_max_length=512)

hf_tokenizer.add_special_tokens({
    'pad_token': '[PAD]',
    'mask_token': '[MASK]'
})

encoded_sequences = hf_tokenizer(kmers_data, truncation=True, padding=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=hf_tokenizer, mlm=True, mlm_probability=0.15)

print("Tokenization and data collator setup complete.")


Tokenization and data collator setup complete.




In [14]:
import torch
from torch.utils.data import Dataset

class DNADataset(Dataset):
    def __init__(self, encoded_sequences):
        self.input_ids = encoded_sequences['input_ids']
        self.attention_mask = encoded_sequences['attention_mask']
        self.labels = encoded_sequences['input_ids']  # For MLM, labels are the input ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

dna_dataset = DNADataset(encoded_sequences)


In [15]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(dna_dataset, batch_size=16, shuffle=True)


In [16]:
from transformers import BertForMaskedLM, Trainer, TrainingArguments

model = BertForMaskedLM.from_pretrained("bert-base-uncased")

model.resize_token_embeddings(len(hf_tokenizer))

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dna_dataset,
    data_collator=data_collator,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Step,Training Loss


TrainOutput(global_step=3, training_loss=3.2887792587280273, metrics={'train_runtime': 15.8087, 'train_samples_per_second': 0.569, 'train_steps_per_second': 0.19, 'total_flos': 41625012078.0, 'train_loss': 3.2887792587280273, 'epoch': 3.0})

In [17]:
trainer.save_model('./dna_bert_model')

In [18]:
import os
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="dna_tokenizer.json")

if tokenizer.mask_token is None:
    tokenizer.add_special_tokens({'mask_token': '[MASK]'})
    if not os.path.exists("dna_tokenizer"):
        os.makedirs("dna_tokenizer")

    tokenizer.save_pretrained("dna_tokenizer")

