# AutoEncoder

   

# Import

In [31]:
import torch 
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import DataLoader,Dataset

In [4]:
torch.manual_seed(0)

<torch._C.Generator at 0x7fb2cef52330>

 
# 读取Excel文件

In [51]:
df = pd.read_excel('data.xlsx', sheet_name='I_T（干细胞-高精确版）')

# Dataset

In [164]:
class Dataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.dic = {'@':0, 'A':1, 'C':2, 'G':3, 'T':4} # @ for padding
        self.pad_length = 120

    def __getitem__(self, idx):
#         return int(self.df.iloc[idx, 0]), self.df.iloc[idx, 1]
        rna = self.df.iloc[0,0] 
        rna = rna[:self.pad_length] if len(rna) > self.pad_length else rna + (self.pad_length-len(rna)) * '@'
        rna=[self.dic[x] for x in rna]
        ratio = self.df.iloc[idx, 1]
        return  torch.tensor(rna, dtype=torch.float32), torch.tensor([ratio], dtype=torch.float32)

    def __len__(self):
        return len(self.df)

# Model structure

In [156]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(120, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, 16),
            nn.Tanh(),
            nn.Linear(16, 1),
#             nn.Tanh()
            nn.Sigmoid()
        )
    def forward(self, inputs):
        codes = self.encoder(inputs)
        return codes
    
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(1, 16),
            nn.Tanh(),
            nn.Linear(16, 32),
            nn.Tanh(),
            nn.Linear(32, 64),
            nn.Tanh(),
            nn.Linear(64, 120),
#             nn.Sigmoid()
        )
    def forward(self, inputs):
        outputs = self.decoder(inputs)
        return outputs

# Train

In [182]:
epochs = 20
batch_size = 64
lr = 0.001
use_cuda = 1
device = torch.device("cuda" if (torch.cuda.is_available() & use_cuda) else "cpu")
train_loader = torch.utils.data.DataLoader(
    dataset=Dataset(df[['context','ratio']]), 
    batch_size=batch_size, 
    shuffle=True)
val_loader = torch.utils.data.DataLoader(
    dataset=Dataset(df[['context','ratio']]), 
    batch_size=batch_size, 
    shuffle=True)

model_encoder = Encoder().to(device)
model_decoder = Decoder().to(device)

optimizer_En = torch.optim.Adam(model_encoder.parameters(), lr=lr)
optimizer_De = torch.optim.Adam(model_decoder.parameters(), lr=lr)
loss_function_de = nn.MSELoss().to(device)
loss_function_en = nn.MSELoss().to(device)

# Train
model_encoder.train()
model_decoder.train()
log_loss=[]
for epoch in range(epochs):
    total_loss = 0
    for rna, ratio in train_loader:
        rna = rna.to(device) 
        ratio = ratio.to(device) 
        
        model_encoder.zero_grad()
        model_decoder.zero_grad()
        
        codes = model_encoder(rna)
        decoded = model_decoder(ratio)
    
        loss_en = loss_function_en(codes, ratio)
        loss_de = loss_function_de(decoded, rna)
        
        loss_en.backward()
        loss_de.backward()
        
        optimizer_En.step()
        optimizer_De.step()
    if epoch % 2 ==0:
        print('[Cur: {}/ Total: {}] '.format(epoch+1, epochs))
torch.save(model_decoder, 'mode_AutoEncoder_Decoder.pth')

[Cur: 1/ Total: 20] 
[Cur: 3/ Total: 20] 
[Cur: 5/ Total: 20] 
[Cur: 7/ Total: 20] 
[Cur: 9/ Total: 20] 
[Cur: 11/ Total: 20] 
[Cur: 13/ Total: 20] 
[Cur: 15/ Total: 20] 
[Cur: 17/ Total: 20] 
[Cur: 19/ Total: 20] 


# Test

In [184]:
dataset = Dataset(df[['context','ratio']])
rna, ratio = dataset[0]

rna = rna.unsqueeze(dim=0)
ratio = ratio.unsqueeze(dim=0)

rna = rna.to(device) 
ratio = ratio.to(device) 

model_decoder = torch.load('mode_AutoEncoder_Decoder.pth')
model_decoder.eval()
outputs = model_decoder(ratio)
outputs = outputs.detach().cpu().numpy()
outputs = np.round(outputs)
outputs[outputs>4]=4
print('Generated RNA Seq by AE')
print(outputs)
print('Original RNA Seq')
print(rna.detach().cpu().numpy())

# plt.show()

Generated RNA Seq by AE
[[1. 4. 3. 2. 4. 4. 4. 3. 4. 3. 4. 3. 3. 1. 1. 1. 3. 2. 4. 4. 2. 1. 3. 4.
  3. 4. 3. 1. 3. 1. 1. 4. 3. 4. 1. 2. 1. 1. 3. 1. 3. 1. 4. 1. 2. 2. 4. 3.
  4. 1. 4. 4. 4. 3. 3. 1. 1. 4. 4. 3. 4. 3. 2. 2. 4. 3. 2. 4. 1. 4. 4. 1.
  4. 4. 2. 1. 1. 1. 2. 3. 2. 2. 4. 1. 3. 4. 2. 3. 1. 3. 3. 2. 1. 2. 2. 1.
  1. 1. 4. 3. 4. 4. 3. 3. 3. 3. 4. 3. 4. 3. 3. 1. 4. 4. 4. 2. 2. 1. 3. 2.]]
Original RNA Seq
[[1. 4. 3. 2. 4. 4. 4. 3. 4. 3. 4. 3. 3. 1. 1. 1. 3. 2. 4. 4. 2. 1. 3. 4.
  3. 4. 3. 1. 3. 1. 1. 4. 3. 4. 1. 2. 1. 1. 3. 1. 3. 1. 4. 1. 2. 2. 4. 3.
  4. 1. 4. 4. 4. 3. 3. 1. 1. 4. 4. 3. 4. 3. 2. 2. 4. 3. 2. 4. 1. 4. 4. 1.
  4. 4. 2. 1. 1. 1. 2. 3. 2. 2. 4. 1. 3. 4. 2. 3. 1. 3. 3. 2. 1. 2. 2. 1.
  1. 1. 4. 3. 4. 4. 3. 3. 3. 3. 4. 3. 4. 3. 3. 1. 4. 4. 4. 2. 2. 1. 3. 2.]]
