# Part 1: Build CpG Detector

Here we have a simple problem, given a DNA sequence (of N, A, C, G, T), count the number of CpGs in the sequence (consecutive CGs).

We have defined a few helper functions / parameters for performing this task.

We need you to build a LSTM model and train it to complish this task in PyTorch.

A good solution will be a model that can be trained, with high confidence in correctness.

In [3]:
from typing import Sequence
from functools import partial
import random
import torch
import numpy as np
import random

In [5]:
# DO NOT CHANGE HERE
def set_seed(seed=13):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(13)

# Use this for getting x label
def rand_sequence(n_seqs: int, seq_len: int=128) -> Sequence[int]:
    for i in range(n_seqs):
        yield [random.randint(0, 4) for _ in range(seq_len)]

# Use this for getting y label
def count_cpgs(seq: str) -> int:
    cgs = 0
    for i in range(0, len(seq) - 1):
        dimer = seq[i:i+2]
        # note that seq is a string, not a list
        if dimer == "CG":
            cgs += 1
    return cgs

# Alphabet helpers   
alphabet = 'NACGT'
dna2int = { a: i for a, i in zip(alphabet, range(5))}
int2dna = { i: a for a, i in zip(alphabet, range(5))}

intseq_to_dnaseq = partial(map, int2dna.get)
dnaseq_to_intseq = partial(map, dna2int.get)

In [6]:
#understanding the code
dna_sequence = "ACGTN"
int_sequence = list(dnaseq_to_intseq(dna_sequence))
int_sequence


[1, 2, 3, 4, 0]

In [7]:
#understanding the code
for a, i in zip(alphabet, range(5)):
    print(a,i)
for a, i in zip(alphabet, range(5)):
    print(i,a)

N 0
A 1
C 2
G 3
T 4
0 N
1 A
2 C
3 G
4 T


In [9]:
#understanding the code
X_dna_seqs_train = list(rand_sequence(5))
print(X_dna_seqs_train)
temp = ["".join(intseq_to_dnaseq(seq)) for seq in X_dna_seqs_train]
temp

[[2, 2, 1, 1, 1, 1, 1, 1, 0, 4, 1, 2, 0, 3, 1, 4, 0, 2, 1, 0, 2, 3, 3, 1, 2, 2, 1, 3, 4, 4, 3, 2, 3, 2, 0, 2, 4, 2, 3, 4, 4, 1, 3, 3, 4, 1, 2, 1, 1, 4, 2, 2, 2, 3, 2, 4, 2, 3, 1, 4, 3, 4, 1, 4, 1, 1, 2, 1, 0, 3, 3, 3, 0, 3, 0, 1, 1, 3, 3, 2, 1, 3, 2, 2, 1, 2, 4, 1, 4, 1, 3, 1, 4, 0, 4, 4, 0, 2, 4, 4, 2, 1, 2, 1, 1, 4, 0, 1, 1, 0, 3, 4, 4, 4, 0, 3, 2, 0, 4, 0, 1, 2, 0, 3, 2, 2, 4, 4], [0, 3, 4, 2, 0, 0, 2, 2, 1, 3, 4, 4, 4, 3, 1, 1, 2, 0, 3, 4, 1, 1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 4, 2, 3, 1, 2, 3, 0, 2, 3, 3, 0, 0, 0, 4, 0, 2, 1, 3, 3, 4, 3, 2, 0, 2, 4, 3, 1, 1, 0, 0, 2, 1, 1, 2, 2, 4, 3, 2, 0, 2, 1, 0, 2, 3, 3, 4, 4, 3, 0, 2, 4, 2, 1, 0, 2, 1, 4, 1, 0, 1, 1, 0, 3, 2, 0, 2, 2, 1, 0, 1, 1, 4, 3, 1, 4, 3, 4, 0, 4, 0, 0, 4, 1, 4, 2, 4, 3, 2, 1, 4, 2, 2, 0, 2, 0, 0, 1], [0, 0, 4, 2, 0, 2, 3, 1, 4, 2, 0, 0, 0, 2, 3, 3, 3, 3, 2, 1, 1, 3, 2, 0, 4, 0, 4, 2, 2, 2, 1, 1, 2, 3, 1, 3, 3, 2, 1, 3, 4, 4, 3, 1, 3, 3, 1, 0, 4, 0, 3, 4, 1, 1, 0, 0, 4, 1, 2, 0, 3, 4, 4, 0, 1, 3, 0, 4, 0, 1, 2, 4, 1, 0, 4, 3

['CCAAAAAANTACNGATNCANCGGACCAGTTGCGCNCTCGTTAGGTACAATCCCGCTCGATGTATAACANGGGNGNAAGGCAGCCACTATAGATNTTNCTTCACAATNAANGTTTNGCNTNACNGCCTT',
 'NGTCNNCCAGTTTGAACNGTAACGACGTACATCGACGNCGGNNNTNCAGGTGCNCTGAANNCAACCTGCNCANCGGTTGNCTCANCATANAANGCNCCANAATGATGTNTNNTATCTGCATCCNCNNA',
 'NNTCNCGATCNNNCGGGGCAAGCNTNTCCCAACGAGGCAGTTGAGGANTNGTAANNTACNGTTNAGNTNACTANTGCAGNTNACTTNTGCATNTGATNNCAGCACACTNCAAATNNTATTNGCGTTGT',
 'TGNGTCCGNACGTNGGCNGGGGTNNCANATCCTAGTCNANNANTGACANAANNAGTAGNGTANNCTNTTANGCGCACCTTGCCANNANNCAATCNNGGTNNTTGAGAGCCNNGNNCGNGANTAACAGG',
 'NCCCCANAATAGNTGNGGTAATGNGGCGGCNTAGATTNGGNTTGATNTANAGNGCGNCAGGTAGAGTGNGGAATGANNNGTTGGTAGNGGTGNTTTNNGAANNGTAGGNGNACGTAAANATNCAGAGN']

In [10]:
# we prepared two datasets for training and evaluation
# training data scale we set to 2048
# we test on 512

def prepare_data(num_samples=100):
    # prepared the training and test data
    # you need to call rand_sequence and count_cpgs here to create the dataset
    # step 1
    X_dna_seqs_train = list(rand_sequence(num_samples))
    """
    hint:
        1. You can check X_dna_seqs_train by print, the data is ids which is your training X 
        2. You first convert ids back to DNA sequence
        3. Then you run count_cpgs which will yield CGs counts - this will be the labels (Y)
    """
    #step2
    temp = ["".join(intseq_to_dnaseq(seq)) for seq in X_dna_seqs_train] # use intseq_to_dnaseq here to convert ids back to DNA seqs
    #step3
    y_dna_seqs =[count_cpgs(seq) for seq in temp] # use count_cpgs here to generate labels with temp generated in step2
    
    return X_dna_seqs_train, y_dna_seqs
    
train_x, train_y = prepare_data(2048)
test_x, test_y = prepare_data(512)

In [12]:

LSTM_HIDDEN = 128          # Number of hidden units in LSTM layer (can be increased for more complex models)
LSTM_LAYER = 2             # Number of LSTM layers (2-3 is typical for moderate complexity)
batch_size = 64            # Size of each mini-batch during training
learning_rate = 0.001      # Learning rate for optimization (you can try reducing it for better accuracy)
epoch_num = 20 
input_size = 128
output_size = 1

In [13]:
# create data loader
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

# Convert data to PyTorch tensors
train_x = torch.tensor(train_x)
train_y = torch.tensor(train_y)
test_x = torch.tensor(test_x)
test_y = torch.tensor(test_y)

train_data_loader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size)
test_Data_loader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size)

  train_x = torch.tensor(train_x)
  train_y = torch.tensor(train_y)
  test_x = torch.tensor(test_x)
  test_y = torch.tensor(test_y)


In [29]:
print(type(train_data_loader))
for a in train_data_loader:
    print(len(a))
    print(type(a))
    break
   
# the

<class 'torch.utils.data.dataloader.DataLoader'>
2
<class 'list'>


In [30]:
# Model
# class CpGPredictor(torch.nn.Module):
#     ''' Simple model that uses a LSTM to count the number of CpGs in a sequence '''
#     def __init__(self):
#         super(CpGPredictor, self).__init__()
#         # TODO complete model, you are free to add whatever layers you need here
#         # We do need a lstm and a classifier layer here but you are free to implement them in your way
#         self.lstm = ???
#         self.classifier = ???

#     def forward(self, x):
#         # TODO complete forward function
#         return logits
    
class CpGPredictor(torch.nn.Module):
    ''' Simple model that uses a LSTM to count the number of CpGs in a sequence '''
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(CpGPredictor, self).__init__()
        # We do need a lstm and a classifier layer here but you are free to implement them in your way
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = torch.nn.Embedding(5, input_size)
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [35]:

import torch.nn as nn

class CpGPredictor(nn.Module):
    ''' Simple model that uses an LSTM to count the number of CpGs in a sequence '''
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(CpGPredictor, self).__init__()
        
        # LSTM Layer
        # input_size: Number of features in the input sequence (e.g., size of each element in the sequence)
        # hidden_size: Size of the hidden state (how much information each LSTM unit carries)
        # num_layers: Number of LSTM layers stacked together
        # batch_first=True: The input tensor should have shape (batch_size, sequence_length, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        # Classifier Layer (fully connected)
        # Takes the output from the LSTM and maps it to the final output size (e.g., number of classes)
        self.classifier = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        '''
        Forward pass for the model
        x: Input tensor of shape (batch_size, sequence_length, input_size)
        '''
        # Pass input through LSTM
        # lstm_out: The output of the LSTM, containing the hidden states for each timestep
        # _ : The hidden state and cell state (not used in this case)
        lstm_out, _ = self.lstm(x)
        
        # Take the output from the last timestep
        # We use the output from the final timestep to make predictions
        final_hidden_state = lstm_out[:, -1, :]
        
        # Pass the final hidden state through the classifier to get the logits
        logits = self.classifier(final_hidden_state)
        
        return logits


In [47]:
import torch
import torch.nn as nn

class CpGPredictor(nn.Module):
    ''' Simple model that uses an LSTM to count the number of CpGs in a sequence '''
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(CpGPredictor, self).__init__()
        
        # LSTM Layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        # Classifier Layer (fully connected)
        self.classifier = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # Pass input through LSTM
        lstm_out, _ = self.lstm(x)
        
        # Take the output from the last timestep
        # We use the output from the final timestep to make predictions
        # Ensure that we handle the case where the sequence length is 1
        if lstm_out.dim() == 2:
            final_hidden_state = lstm_out  # If no sequence dimension (e.g., batch_size=1)
        else:
            final_hidden_state = lstm_out[:, -1, :]  # Last timestep for each sequence in the batch
        
        # Pass the final hidden state through the classifier to get the logits
        logits = self.classifier(final_hidden_state)
        # Take the output from the last timestep
        
        # Pass through classifier to get the logits
        logits = self.classifier(final_hidden_state)
        
        return logits


In [62]:
class CpGPredictor(torch.nn.Module):
    ''' Simple model that uses an LSTM to count the number of CpGs in a sequence '''
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.5):
        super(CpGPredictor, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = torch.nn.Embedding(5, input_size)
        self.lstm = torch.nn.LSTM(
            input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_prob if num_layers > 1 else 0.0
        )
        # self.dropout = torch.nn.Dropout(dropout_prob)  # Dropout layer
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)  # Apply embedding
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))  # LSTM with initialized states
        # out = self.dropout(out[:, -1, :])  # Apply dropout to the output of the last LSTM step
        out = self.fc(out)  # Fully connected layer
        return out


In [63]:
# init model / loss function / optimizer etc.

# model = CpGPredictor(input_size, LSTM_HIDDEN, LSTM_LAYER, output_size)
# loss_fn = torch.nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Instantiate the model
model = CpGPredictor(input_size=input_size, hidden_size=LSTM_HIDDEN, num_layers=LSTM_LAYER, output_size=output_size)


loss_fn = torch.nn.MSELoss()

# Define the optimizer (Adam optimizer)
optimizer = optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)  # Reduced learning rate


In [None]:
# training (you can modify the code below)
t_loss = .0
model.train()
model.zero_grad()
for _ in range(epoch_num):
    for batch in train_data_loader:
        #TODO complete training loop
        t_loss += loss.item()
        loss.backward()

    print(t_loss)
    t_loss = .0

In [65]:

t_loss = 0.0
model.train()


for epoch in range(epoch_num):
    # Initialize the running total loss for this epoch
    epoch_loss = 0.0
    for batch in train_data_loader:
        inputs, targets = batch  # Unpack the batch (assuming each batch contains inputs and targets)    
        # inputs = inputs.float()
        targets = targets.view(-1, 1).float()  # Reshaping targets to (batch_size, 1)
        optimizer.zero_grad()# Zero the gradients from the previous step
        logits = model(inputs) # Forward pass: Compute predicted outputs by passing inputs to the model      
        loss = loss_fn(logits, targets)# Calculate the loss (use BCEWithLogitsLoss for binary classification) 
        epoch_loss += loss.item()# Accumulate the loss for the epoch      
        loss.backward() # Backward pass: Compute gradients of the loss w.r.t. model parameters  
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()  # Update the model parameters
    print(f"Epoch [{epoch+1}/{epoch_num}], Loss: {epoch_loss}")
    t_loss = 0.0 # Reset the epoch loss for the next epoch


RuntimeError: The size of tensor a (128) must match the size of tensor b (64) at non-singleton dimension 1

In [None]:
# eval (you can modify the code below)
model.eval()

res_gs = []
res_pred = []

for batch in test_data_loader:
    # TODO complete inference loop

In [None]:
# TODO complete evaluation of the model

# Part 2: what if the DNA sequences are not the same length

In [None]:
# hint we will need following imports
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [None]:
# DO NOT CHANGE HERE
random.seed(13)

# Use this for getting x label
def rand_sequence_var_len(n_seqs: int, lb: int=16, ub: int=128) -> Sequence[int]:
    for i in range(n_seqs):
        seq_len = random.randint(lb, ub)
        yield [random.randint(1, 5) for _ in range(seq_len)]


# Use this for getting y label
def count_cpgs(seq: str) -> int:
    cgs = 0
    for i in range(0, len(seq) - 1):
        dimer = seq[i:i+2]
        # note that seq is a string, not a list
        if dimer == "CG":
            cgs += 1
    return cgs


# Alphabet helpers   
alphabet = 'NACGT'
dna2int = {a: i for a, i in zip(alphabet, range(1, 6))}
int2dna = {i: a for a, i in zip(alphabet, range(1, 6))}
dna2int.update({"pad": 0})
int2dna.update({0: "<pad>"})

intseq_to_dnaseq = partial(map, int2dna.get)
dnaseq_to_intseq = partial(map, dna2int.get)

In [None]:
# TODO complete the task based on the change
def prepare_data(num_samples=100, min_len=16, max_len=128):
    # TODO prepared the training and test data
    # you need to call rand_sequence and count_cpgs here to create the dataset
    #step 1
    X_dna_seqs_train = list(rand_sequence_var_len(num_samples, min_len, max_len))
    #step 2
    temp = ???
    #step3
    y_dna_seqs = ???
    
    return X_dna_seqs_train, y_dna_seqs
    
    
min_len, max_len = 64, 128
train_x, train_y = prepare_data(2048, min_len, max_len)
test_x, test_y = prepare_data(512, min_len, max_len)

In [None]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, lists, labels) -> None:
        self.lists = lists
        self.labels = labels

    def __getitem__(self, index):
        return torch.LongTensor(self.lists[index]), self.labels[index]

    def __len__(self):
        return len(self.lists)

    
# this will be a collate_fn for dataloader to pad sequence  
class PadSequence:
    #TODO

In [None]:
# TODO complete the rest