In [1]:
import unittest
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

torch.manual_seed(42)

<torch._C.Generator at 0x7ff1f80730b0>

In [0]:
import unittest
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(42)


#print(torch.cuda.is_available())
#if torch.cuda.is_available() or True:
#    device = torch.device("cuda")          # a CUDA device object
#    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
#    x = x.to(device)                       # or just use strings ``.to("cuda")``
#    z = x + y
#    print(z)
#    print(z.to("cpu", torch.double))   

In [0]:
def random_seq(l: int, vocab: list) -> str:
    """Return a string of ``l`` symbols drawn at random, with replacement, from ``vocab``.

    Sampling uses NumPy's global random state (``np.random.choice``).
    """
    drawn = np.random.choice(vocab, size=l)
    return "".join(drawn)


def random_data(m: int, l: int, c: int, vocab: list = None) -> tuple:
    """
    Generates random dataset consisting of m sequences of length l with up to c cpg sites per sequence.

    Args:
        m: number of sequences to generate.
        l: length of every sequence.
        c: inclusive upper bound on the number of CpG sites per sequence; must satisfy 0 <= c < l.
        vocab: symbols to sample from. Defaults to ``["A", "C", "G", "T", "N"]``.

    Returns:
        A ``(seqs, cpgs, exprs)`` tuple of three lists of length m:
        ``seqs`` holds the sequence strings, ``cpgs`` holds one list of
        ``(location, value)`` pairs per sequence, and ``exprs`` holds one float
        (0.0 or 1.0) per sequence, drawn from a Bernoulli distribution whose
        success probability is the fraction of "N" symbols in the sequence.

    Raises:
        AssertionError: if c is negative or not smaller than l.
    """
    assert 0 <= c < l

    if vocab is None:
        vocab = list("ACGTN")

    seqs = [random_seq(l, vocab) for _ in range(m)]

    cpgs = []
    for seq in seqs:
        # A CpG value can only be attached to a cytosine, so candidate locations
        # are restricted to the positions that actually hold a "C".
        c_positions = [pos for pos, base in enumerate(seq) if base == "C"]
        # randint's upper bound is exclusive; c + 1 makes "up to c" inclusive and
        # keeps c == 0 valid instead of raising.
        n_sites = min(np.random.randint(0, c + 1), len(c_positions))
        if n_sites:
            # Sample without replacement so no location is listed twice.
            locs = sorted(np.random.choice(c_positions, size=n_sites, replace=False))
        else:
            locs = []
        cpgs.append([(int(loc), float(np.random.random())) for loc in locs])

    exprs = [float(np.random.binomial(n=1, p=seq.count("N") / len(seq))) for seq in seqs]

    return seqs, cpgs, exprs


def base2tensor(base: str) -> torch.Tensor:
    """One-hot encode a single base as a ``(1, VOCAB_SIZE)`` tensor.

    The column set to 1 is ``BASE2IDX[base]``; all other entries are 0.

    NOTE(review): VOCAB_SIZE and BASE2IDX are read as module-level names and are
    not defined in the visible cells -- they must exist before this is called.

    Raises:
        KeyError: if ``base`` is not a key of ``BASE2IDX``.
    """
    tensor = torch.zeros(1, VOCAB_SIZE)
    tensor[0, BASE2IDX[base]] = 1
    return tensor


def seq2tensor(seq: str) -> torch.Tensor:
    """One-hot encode a sequence as a ``(len(seq), 1, VOCAB_SIZE)`` tensor.

    For every position ``pos`` the entry ``[pos, 0, BASE2IDX[seq[pos]]]`` is set
    to 1; everything else stays 0. An empty string yields a tensor with a
    zero-length first dimension.

    NOTE(review): VOCAB_SIZE and BASE2IDX are read as module-level names and are
    not defined in the visible cells -- they must exist before this is called.

    Raises:
        KeyError: if a symbol of ``seq`` is not a key of ``BASE2IDX``.
    """
    tensor = torch.zeros(len(seq), 1, VOCAB_SIZE)
    for pos, base in enumerate(seq):
        tensor[pos, 0, BASE2IDX[base]] = 1
    return tensor


def methylate_seq(seq_tensor: torch.Tensor, loc: int, value: float, meth_idx: int, mask_idx: int) -> torch.Tensor:
    """Write a methylation value into the methylated-base channel at one position.

    ``seq_tensor`` is indexed as ``[position, 0, vocab_index]``, i.e. the
    ``(len, 1, vocab)`` layout produced by ``seq2tensor``. The tensor is modified
    in place and also returned.

    Args:
        seq_tensor: one-hot encoded sequence.
        loc: position in the sequence to methylate.
        value: methylation value to store.
        meth_idx: vocabulary index of the methylated base.
        mask_idx: vocabulary index of the mask base.

    Raises:
        AssertionError: if the position holds neither the methylated base nor the
            mask base.
    """
    is_meth_base = seq_tensor[loc, 0, meth_idx].item() == 1
    # NOTE(review): the original check ended in an unfinished `or ()` and never used
    # mask_idx; accepting a masked position is presumably the intended alternative.
    is_masked = seq_tensor[loc, 0, mask_idx].item() == 1
    assert is_meth_base or is_masked, (
        f"position {loc} holds neither the methylated base nor the mask base"
    )

    seq_tensor[loc, 0, meth_idx] = value
    return seq_tensor


class PromoterDataset(Dataset):
    """Map-style dataset of one-hot encoded sequences with CpG values and an expression target.

    ``data`` is a ``(seqs, cpgs, exprs)`` tuple of three equally long sequences:
    sequence strings, per-sequence lists of ``(location, value)`` CpG pairs, and
    per-sequence expression values.

    Each item is a ``(seq, expr)`` pair where ``seq`` is a
    ``(len(sequence), 1, vocab_size)`` tensor and ``expr`` is a scalar float tensor.
    """

    def __init__(self, data: tuple, meth_base: str = "C", mask_base: str = "N"):
        """Store the data and derive the vocabulary and index lookups.

        Args:
            data: ``(seqs, cpgs, exprs)`` tuple.
            meth_base: symbol whose channel receives CpG values.
            mask_base: symbol treated as the mask base.

        Raises:
            ValueError: if the three components of ``data`` differ in length.
        """
        self.seqs, self.cpgs, self.exprs = data
        if not (len(self.seqs) == len(self.cpgs) == len(self.exprs)):
            raise ValueError("seqs, cpgs and exprs must all have the same length")

        self.meth_base = meth_base  # set methylated base
        self.mask_base = mask_base

        # Vocabulary comes from the data itself; the two special bases are always
        # included so the index lookups below cannot fail on data lacking them.
        self.vocab = sorted(set("".join(self.seqs)) | {meth_base, mask_base})
        self.vocab_size = len(self.vocab)
        self.base_to_idx = {base: i for i, base in enumerate(self.vocab)}

        self.meth_idx = self.base_to_idx[self.meth_base]
        self.mask_idx = self.base_to_idx[self.mask_base]

    def __len__(self):
        """Number of examples."""
        return len(self.exprs)

    def _encode(self, seq: str) -> torch.Tensor:
        """One-hot encode ``seq`` with this dataset's own vocabulary, shape (len, 1, vocab_size)."""
        tensor = torch.zeros(len(seq), 1, self.vocab_size)
        for pos, base in enumerate(seq):
            tensor[pos, 0, self.base_to_idx[base]] = 1
        return tensor

    def __getitem__(self, idx):
        """Return ``(seq_tensor, expr_tensor)`` for example ``idx``.

        Raises:
            ValueError: if a CpG location points at a symbol that is neither the
                methylated base nor the mask base.
        """
        raw_seq = self.seqs[idx]
        seq = self._encode(raw_seq)
        expr = self.exprs[idx]

        for cpg_loc, cpg_value in self.cpgs[idx]:
            # Validate against the raw string rather than the tensor so a location
            # listed twice is still accepted (the last value wins).
            if raw_seq[cpg_loc] not in (self.meth_base, self.mask_base):
                raise ValueError(
                    f"CpG location {cpg_loc} holds {raw_seq[cpg_loc]!r}, "
                    f"expected {self.meth_base!r} or {self.mask_base!r}"
                )
            # insert cpg value in the channel of meth_base
            seq[cpg_loc, 0, self.meth_idx] = cpg_value

        return seq, torch.tensor(expr, dtype=torch.float32)
            

# Synthetic-data configuration.
m = 100  # number of sequences
l = 50   # length of each sequence
c = 5    # upper bound on CpG sites per sequence (random_data requires c < l)

# random_data samples from NumPy's global RNG, which torch.manual_seed does not
# affect; seed it here so the generated dataset is the same on every run.
np.random.seed(42)
data = random_data(m, l, c)

# test_loader, train_loader
train_set = PromoterDataset(data)
train_loader = DataLoader(train_set, batch_size=4, shuffle=True, num_workers=0)

In [0]:
class RNNRegressor(nn.Module):
    """Single-step recurrent cell with a tanh-squashed output.

    At every step the input and the previous hidden state are concatenated along
    dimension 1 and passed through two independent linear layers: ``i2h`` gives
    the next hidden state, ``i2o`` followed by tanh gives the output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        joint_dim = input_dim + hidden_dim
        self.i2h = nn.Linear(joint_dim, hidden_dim)
        self.i2o = nn.Linear(joint_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        """Run one step and return ``(output, next_hidden)``."""
        combined = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(combined)
        output = self.tanh(self.i2o(combined))
        return output, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_dim)``."""
        return torch.zeros((1, self.hidden_dim))


# One scalar output per step; the input width follows the dataset vocabulary.
# NOTE(review): HIDDEN_DIM is not defined in the visible cells.
rnn = RNNRegressor(
    input_dim=train_set.vocab_size,
    hidden_dim=HIDDEN_DIM,
    output_dim=1,
)

In [52]:
# Exercise both encoders once; the results of these two calls are discarded.
base2tensor("N")
seq2tensor("ACGTN")

# NOTE(review): `input` shadows the Python builtin of the same name at notebook scope.
input = seq2tensor('ACGTN')
hidden = torch.zeros(1, HIDDEN_DIM)

# Run a single step of the network on the first position of the sequence only.
output, next_hidden = rnn(input[0], hidden)
print(output)

tensor([[-0.0117]], grad_fn=<TanhBackward>)


In [114]:
# Mean-squared-error loss and the step size of the manual gradient update;
# both are read as notebook-level names by train().
criterion = nn.MSELoss()
lr = 0.005

def train(output_tensor, seq_tensor, model=None, loss_fn=None, learning_rate=None):
    """Run one plain-SGD training step on a single sequence.

    The sequence is fed through the model one position at a time; only the output
    of the last position is compared with the target.

    Args:
        output_tensor: target value(s); must hold as many elements as the model's
            final output. It is cast and reshaped to match that output.
        seq_tensor: encoded sequence whose first dimension indexes positions.
        model: network providing ``initHidden()`` and ``forward(input, hidden)``.
            Defaults to the notebook-level ``rnn``.
        loss_fn: loss callable. Defaults to the notebook-level ``criterion``.
        learning_rate: SGD step size. Defaults to the notebook-level ``lr``.

    Returns:
        ``(output, loss_value)``: the model output for the last position and the
        loss as a Python float.

    Raises:
        ValueError: if ``seq_tensor`` has no positions.
    """
    # Fall back to the notebook-level objects so two-argument calls keep working.
    model = rnn if model is None else model
    loss_fn = criterion if loss_fn is None else loss_fn
    learning_rate = lr if learning_rate is None else learning_rate

    n_steps = seq_tensor.size(0)
    if n_steps == 0:
        # Without this guard the loop never runs and `output` is never bound.
        raise ValueError("seq_tensor must contain at least one position")

    hidden = model.initHidden()
    model.zero_grad()

    for i in range(n_steps):
        output, hidden = model(seq_tensor[i], hidden)

    # Match the target to the output's dtype and shape explicitly instead of
    # relying on broadcasting between e.g. (1,) and (1, 1).
    target = torch.as_tensor(output_tensor, dtype=output.dtype).reshape(output.shape)
    loss = loss_fn(output, target)
    loss.backward()

    # Manual SGD update. The keyword form of add_ replaces the deprecated
    # add_(scalar, tensor) overload.
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()

# Sanity check: one training step on a single hand-written (target, sequence) pair.
train(torch.tensor([-0.0117]), seq2tensor("ACGTN"))

(tensor([[-0.0245]], grad_fn=<TanhBackward>), 0.0001632583880564198)

In [131]:
# Online training: one train() step per example, for the first n_iters examples.
n_iters = 50

# NOTE(review): current_loss is initialised here but never updated or read in this cell.
current_loss = 0
for i in range(n_iters):
    # NOTE(review): X_train / y_train are not defined in the visible cells; the loop
    # also assumes both hold at least n_iters entries -- confirm against their source.
    output_tensor, seq_tensor = torch.tensor(y_train[i]), seq2tensor(X_train[i])
    output, loss = train(output_tensor, seq_tensor)
    print(loss)

0.01711277849972248
0.008758061565458775
0.007915392518043518
0.017463384196162224
0.0056884572841227055
0.0034697987139225006
0.010249307379126549
0.009265918284654617
0.050086669623851776
0.011109867133200169
0.0008259211317636073
0.0031845399644225836
0.08377565443515778
0.004317635670304298
0.05308886244893074
0.008003836497664452
2.849230895662913e-07
0.00017776194727048278
0.0008829082362353802
7.3491000875947066e-06
0.011529708281159401
0.00040584205999039114
0.06075203791260719
0.0008441368117928505
0.009047843515872955
0.00704139145091176
0.010068253614008427
3.899871444446035e-06
0.004074526019394398
0.0028674909844994545
0.00014958357496652752
1.00400972366333
6.395148375304416e-07
1.2135016918182373
0.005960062611848116
0.00030325434636324644
0.0002610151714179665
0.10282783955335617
0.08487041294574738
2.831971869454719e-05
0.0010409540263935924
0.001631260267458856
0.005946222227066755
0.00013461834168992937
0.06925702095031738
0.0022358756978064775
0.000483204290503636
0