In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.utils as utils
import gzip

In [9]:
a_file = gzip.open("../iDeepS/datasets/clip/1_PARCLIP_AGO1234_hg19/30000/training_sample_0/sequences.fa.gz", "rb")
contents = a_file.readlines()

for i in range(10):
    print(contents[i])

b'> chr1,+,897185,897285; class:0\n'
b'TGCCTCTCGGTGCCCCGTAGACTCTGCTCCCAGCCGCCAGTCTCCTGCAGCTGAATGGCG\n'
b'TCCGAGACGCTTGCTGCAAGTTTCTACTGAGTCAGCTCGAC\n'
b'> chr1,+,897570,897670; class:0\n'
b'CTGGGCCCCCCAGGAGCCTCGTCTGTGGCTCCTGACTCTGCTCGGCCCCTCCCAGTATGA\n'
b'ACACTCAGCCCCCACCTGCTAACCCTCCCTCCTAGGCATCT\n'
b'> chr1,+,898779,898879; class:0\n'
b'GCCGGAGGTGTCCATGGGCACAAGGCGAAGCTGCCTGGGTGTGGCCGCCTTGCATGGACT\n'
b'CCTGTACTCGGCCGGCGGCTATGACGGGGCCTCCTGCCTGA\n'
b'> chr1,+,899530,899630; class:0\n'


In [14]:
def read_file_helper(file_path):
    res = []
    with gzip.open(file_path, "rb") as f:
        for line in f.readlines():
            line = str(line)[2:-3]
            
            if line.startswith(">"):
                indicator = 0
                tmp = []
                loc, y = line.strip().split(";")
                chr_num, sign, start, end = loc[2:].split(",")
                tmp.extend([chr_num, sign, int(start), int(end), y[-1]])
            
            else:
                indicator += 1
                tmp.append(line)
                if indicator == 2:
                    res.append(tmp)
    df = pd.DataFrame(res, columns = ["chr_num", "sign", "start", "end", "y", "seq_part1", "seq_part2"])
    df["seq"] = df["seq_part1"] + df["seq_part2"]
    df["y"] = df["y"].astype(int)
    return df
                

def embed(sequence, instance_len, instance_stride):
    instance_num = int((len(sequence) - instance_len) / instance_stride) + 1
    bag = []
    for i in range(instance_num):
        instance = sequence[i * instance_stride:i * instance_stride + instance_len]
        instance = one_hot_encode(instance)
        bag.append(instance)
    bag = np.stack(bag).astype(float)
    return bag

def one_hot_encode(seq):
    mapping = dict(zip("ACGTN", range(5)))
    seq2 = [mapping[i] for i in seq]
    return np.eye(5)[seq2]

def create_bag(seqs, instance_len=40, instance_stride=5):
    bags = []
    for seq in seqs:
        bags.append(embed(seq, instance_len, instance_stride)) 
        
    return np.array(bags).astype(float)

class LibriSamples(torch.utils.data.Dataset):
    def __init__(self, data_path):
        df = read_file_helper(data_path)
        df["seq"] = df["seq_part1"] + df["seq_part2"]
        self.X, self.Y = create_bag(df["seq"]), df["y"].to_numpy()
        
        assert len(self.X) == len(self.Y)
        
    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        return self.X[item], self.Y[item]

In [15]:
train_data_path = "../iDeepS/datasets/clip/1_PARCLIP_AGO1234_hg19/30000/training_sample_0/sequences.fa.gz"
valid_data_path = "../iDeepS/datasets/clip/9_PARCLIP_ELAVL1MNASE_hg19/30000/test_sample_0/sequences.fa.gz"
batch_size = 1

train_data = LibriSamples(train_data_path)
valid_data = LibriSamples(valid_data_path)

train_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_data, batch_size = batch_size, shuffle=True)

for x, y in train_loader:
    # print(x, y)
    print(x.shape, y.shape)
    break

torch.Size([1, 13, 40, 5]) torch.Size([1])


In [21]:
class WeakRM(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.embedding = nn.Sequential(
            nn.Conv2d(13, 32, kernel_size = 15, padding=7, stride=1),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),
            nn.Conv2d(32, 16, kernel_size = 5, padding=2, stride=1),
            nn.Dropout(0.25),
            nn.Flatten()
        )
     
        self.attention_v = nn.Sequential(
            nn.Linear(640, 256),
            nn.Tanh()
        )
        
        self.attention_u = nn.Sequential(
            nn.Linear(640, 256),
            nn.Sigmoid()
        )
        
        self.part_1 = nn.Sequential(
            nn.Linear(256, 1),
            nn.Softmax()
        )
        
        self.cls =  nn.Sequential(
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
        
    def forward(self, inputs, training=True, mask=None):
        embedding = self.embedding(inputs)
        
        attention_v = self.attention_v(embedding)
        attention_u = self.attention_v(embedding)
        
        print(attention_u.shape, attention_v.shape)
        gated_attention = self.part_1(torch.mul(attention_u, attention_v).permute((1, 0)))
        
        print(embedding.shape, gated_attention.shape)
        bag_probability = torch.matmul(gated_attention, embedding)
        
        return bag_probability, gated_attention





In [22]:
network = WeakRM().cuda()
for x, y in train_loader:
    print(x.permute((0, 1, 3, 2)).shape)
    x = x.permute((0, 1, 3, 2)).float().cuda()
    # print(x)
    output = network(x)
    print(output[0].shape, output[1].shape)
    break

torch.Size([1, 13, 5, 40])
torch.Size([1, 256]) torch.Size([1, 256])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x1 and 256x1)

In [10]:
df = read_file("../iDeepS/datasets/clip/1_PARCLIP_AGO1234_hg19/30000/training_sample_0/sequences.fa.gz")
df["seq"] = df["seq_part1"] + df["seq_part2"]
a, b = create_bag(df["seq"], df["y"])


In [11]:
a.shape

(390000, 40, 5)

In [12]:
len(b)

390000

In [56]:
df= read_file("../iDeepS/datasets/clip/1_PARCLIP_AGO1234_hg19/30000/training_sample_0/sequences.fa.gz")


In [8]:
[1]*4

[1, 1, 1, 1]

In [48]:
len(df["seq"][0])

101

In [55]:
df = read_file("../iDeepS/datasets/clip/9_PARCLIP_ELAVL1MNASE_hg19/30000/test_sample_0/sequences.fa.gz")
df.groupby("y").count()

Unnamed: 0_level_0,chr_num,sign,start,end,seq_part1,seq_part2
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8000,8000,8000,8000,8000,8000
1,2000,2000,2000,2000,2000,2000
