In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import os
import math
import matplotlib.pyplot as plt
import numpy as np

from Load_Data import load_data

device = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
device = torch.device(device)
print(f"Using {device} device")

Using mps device


# Load Data

In [None]:
def get_data_loaders(batch_size=64):
    Data = load_data(os.path.join(os.getcwd(), 'Data', 'Parsed_Data'), 
                        train_val_data_to_load=1000, 
                        test_data_to_load=100
                        train_val_data_to_load=2000, 
                        test_data_to_load=1
                        )
    
    training_dataset, validation_dataset, testing_dataset = Data

    train_loader = DataLoader(dataset=training_dataset,
                              batch_size=batch_size,shuffle=True)
    val_loader = DataLoader(dataset=validation_dataset,
                              batch_size=batch_size,shuffle=True)
    test_loader = DataLoader(dataset=testing_dataset,
                              batch_size=batch_size,shuffle=True)
    return train_loader, val_loader, test_loader

train_loader, val_loader, test_loader = get_data_loaders()

Loading sequences from sequences.fasta
Loading coverage from SRX391990.faste
Loading coverage from SRX9770779.faste
Loading coverage from SRX9770784.faste
Loading coverage from SRX9770786.faste
Loading coverage from SRX391992.faste
Loading coverage from SRX391996.faste
Loading coverage from SRX9770782.faste
Loading coverage from SRX1098138.faste
Loading coverage from SRX9770780.faste
Loading coverage from SRX391994.faste
Loading coverage from SRX9770787.faste
Loading coverage from SRX391993.faste
Loading coverage from SRX9770778.faste
Loading coverage from SRX391991.faste
Loading coverage from SRX9770785.faste


# Build Model

In [86]:
class DnaCnn(nn.Module):
    def __init__(self, num_kernels=[512, 256, 128], kernel_size=[1028,256,128],
                 dropout=0):
        super(DnaCnn, self).__init__()
        self.input_channels=4
        self.num_kernels=num_kernels
        self.kernel_size=kernel_size
        self.dropout=dropout
        self.conv_block = nn.Sequential(
            # first layer
            nn.Conv1d(in_channels=self.input_channels,
                      out_channels=num_kernels[0],
                      kernel_size=kernel_size[0]),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.MaxPool1d(kernel_size=2),
        )
        # second layer
        # self.conv_block.append(nn.Sequential(
        #     nn.Conv1d(in_channels=self.num_kernels[0],
        #               out_channels=num_kernels[1],
        #               kernel_size=kernel_size[1]),
        #     nn.ReLU(),
        #     nn.MaxPool1d(kernel_size=2),
        #     nn.Dropout(p=self.dropout),            
        # ))
        # Add a third convolutional layer
        # self.conv_block.append(nn.Sequential(
        #     # second layer
        #     nn.Conv1d(in_channels=self.num_kernels[1],
        #               out_channels=num_kernels[2],
        #               kernel_size=kernel_size[2]),
        #     nn.ReLU(),
        #     nn.MaxPool1d(kernel_size=2),
        #     nn.Dropout(p=self.dropout),            
        # ))
        self.regression_block = nn.Sequential(
            nn.Linear(num_kernels[0], 37),
            nn.ReLU(),  # ReLU ensures positive outputs
            # nn.LogSoftmax(dim=1)  # Apply log softmax if necessary for your task
        )  

    def forward(self, x):
        x = self.conv_block(x)
        x,_ = torch.max(x, dim=2)        
        x = self.regression_block(x)
        return x

# Train Model

### Training functions

In [87]:
def train_epoch(dataloader, model, loss_fn, optimizer, epoch):
    size = len(dataloader)
    num_batches = len(dataloader)
    total_loss = 0
    # set the model to training mode - important when you have 
    # batch normalization and dropout layers
    model.train()
    for batch_idx, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        # Compute prediction and loss
        y_pred = model(X)
        loss = loss_fn(y_pred, y)
        # backpropagation
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 10 == 0 :
        print(f"training loss: {total_loss/num_batches:>7f}")
    return total_loss / num_batches

def validation(dataloader, model, loss_fn, epoch):
    # set the model to evaluation mode 
    model.eval()
    # size of dataset
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    validation_loss, correct = 0, 0
    # Evaluating the model with torch.no_grad() ensures that no gradients 
    # are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage 
    # for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            validation_loss += loss_fn(y_pred, y).item()
    validation_loss /= num_batches
    if epoch%10 == 0 :
        print(f"Validation Loss: {validation_loss:>8f} \n")
    return validation_loss

def train_model(train_loader, val_loader, model, optimizer):
    epochs = 1000
    loss_fn = nn.PoissonNLLLoss(log_input=True, full=True)
    patience = math.inf
    p = patience
    
    
    train_loss = []
    validation_loss = []
    best_loss = math.inf
    for t in range(epochs):
        if t % 10 == 0 :
            print(f"Epoch {t}\n-------------------------------")
        loss = train_epoch(train_loader, model, loss_fn, optimizer, t)
        train_loss.append(loss)
        loss = validation(val_loader, model, loss_fn, t)
        validation_loss.append(loss)
    
        if loss < best_loss:
            best_loss = loss    
            p = patience
        else:
            p -= 1
            if p == 0:
                print("Early Stopping!")
                break    
    print("Done!")

    def plot_loss(train_loss, validation_loss):
        plt.figure(figsize=(4,3))
        plt.plot(np.arange(len(train_loss)), train_loss, label='Training')
        plt.plot(np.arange(len(validation_loss)), validation_loss, label='Validation')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()
    plot_loss(train_loss, validation_loss)


### Train

In [89]:
model = DnaCnn().to(device)
lr = 0.00001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
train_model(train_loader, val_loader, model, optimizer)

Epoch 0
-------------------------------
training loss: 23923.468040
Validation Loss: 23127.523047 

Epoch 10
-------------------------------
training loss: 17688.430309
Validation Loss: 16815.430859 

Epoch 20
-------------------------------
training loss: 9894.751953
Validation Loss: 9261.502246 

Epoch 30
-------------------------------
training loss: 6178.538841
Validation Loss: 5938.173047 

Epoch 40
-------------------------------
training loss: 6084.803356
Validation Loss: 5859.529785 

Epoch 50
-------------------------------
training loss: 6031.431729
Validation Loss: 5864.342578 

Epoch 60
-------------------------------
training loss: 5963.293324
Validation Loss: 5834.663184 

Epoch 70
-------------------------------
training loss: 5718.509455
Validation Loss: 5653.118604 

Epoch 80
-------------------------------
training loss: 4149.658802
Validation Loss: 4095.127100 

Epoch 90
-------------------------------
training loss: 3920.775746
Validation Loss: 4028.951807 

Epoch 1

KeyboardInterrupt: 

### Save Model

In [90]:
torch.save(model, "model.pth")

### Train Model

In [91]:
model = torch.load("model.pth", weights_only=False)
model.to(device)
model.eval()

DnaCnn(
  (conv_block): Sequential(
    (0): Conv1d(4, 512, kernel_size=(1028,), stride=(1,))
    (1): ReLU()
    (2): Dropout(p=0, inplace=False)
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (regression_block): Sequential(
    (0): Linear(in_features=512, out_features=37, bias=True)
    (1): ReLU()
  )
)

In [92]:
input, y = next(iter(train_loader))
input = input.to(device)

output = model.forward(input)
print(output.shape)

torch.Size([64, 37])


In [93]:
i = 35
print('Tissue: Predicted, True')
for s, (y_p, y_t) in enumerate(zip(output[i], y[i])):
    print(f'{s}: {torch.exp(y_p):.1f}, {y_t:.1f}')


Tissue: Predicted, True
0: 1217.1, 1217.8
1: 852.3, 786.4
2: 864.1, 856.1
3: 944.2, 812.9
4: 828.1, 910.6
5: 903.6, 973.1
6: 1.0, 684.8
7: 786.9, 811.3
8: 454.0, 0.0
9: 891.8, 819.1
10: 779.4, 818.5
11: 907.4, 871.0
12: 800.8, 494.1
13: 707.8, 648.6
14: 385.8, 0.0
15: 867.6, 891.0
16: 792.5, 737.0
17: 1151.5, 1107.7
18: 747.1, 826.8
19: 613.8, 532.3
20: 768.8, 596.3
21: 1.0, 1016.2
22: 1163.0, 1646.8
23: 822.5, 654.2
24: 1.0, 356.1
25: 2030.9, 2050.0
26: 869.6, 790.0
27: 977.5, 1042.5
28: 1065.3, 1217.7
29: 1096.7, 1324.2
30: 1104.5, 1189.1
31: 846.7, 809.0
32: 930.2, 858.1
33: 1.0, 379.3
34: 847.8, 787.6
35: 1143.2, 1208.7
36: 1.0, 362.0
