In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.switch_backend('agg')

import os
import re
import random

import sys
sys.path.insert(1, '/gscratch/ml4ml/sidlak/superfold/superfold');

In [2]:
# import pyrosetta
# from pyrosetta.rosetta.core.pose import Pose

# pyrosetta.init("-ignore_unrecognized_res true")

# def get_rmsd(design: Pose, prediction: Pose) -> float:
#     """Calculate Ca-RMSD of prediction to design"""
#     import pyrosetta
#     rmsd_calc = pyrosetta.rosetta.core.simple_metrics.metrics.RMSDMetric()
#     # https://graylab.jhu.edu/PyRosetta.documentation/pyrosetta.rosetta.core.scoring.html?highlight=rmsd_atoms#pyrosetta.rosetta.core.scoring.rmsd_atoms
#     rmsd_calc.set_rmsd_type(pyrosetta.rosetta.core.scoring.rmsd_atoms(3)) # change to the rmsd atom type desired, 3 = Ca only
#     rmsd_calc.set_run_superimpose(True)
#     rmsd_calc.set_comparison_pose(design)
#     rmsd = float(rmsd_calc.calculate(prediction))
#     return rmsd

# def get_tm_score(design: Pose, prediction: Pose) -> float:
#     """Calculate the TM-score of prediction against design (via TMalign)"""
#     import pyrosetta
#     tm_align = pyrosetta.rosetta.protocols.hybridization.TMalign()
#     tm_align.apply(prediction, design)
#     tm_score = tm_align.TMscore(len(design))
#     return tm_score


# def fold(list_seq, list_name, chain_id):
#     ofile = open("./output/fastas/e0.fasta", "w+")
#     for i in range(1, len(list_seq), 2):
#         ofile.write(">" + list_name[i] + "\n" +list_seq[i] + "\n")
#     ofile.close()

#     os.system("python3 ../run_superfold.py ./output/fastas/e0.fasta --overwrite --models 5")
#     sum_tm = 0
#     count_tm = 0
#     for i in range(0, len(list_name), 2):
#         pdb1 = 0
#         success = True
#         while success:
#             try:
#                 pdb1 = pyrosetta.toolbox.rcsb.pose_from_rcsb(chain_id)
#                 success = False
#             except:
#                 print('error')
#                 success = True
#         pdb2 = pyrosetta.pose_from_file('./output/' + list_name[i + 1] + '_model_5_ptm_seed_0_unrelaxed.pdb')
#         tm = 0
#         if (len(list_seq[i]) > len(list_seq[i + 1])):
#             tm = get_tm_score(pdb1, pdb2)
#         else:
#             tm = get_tm_score(pdb2, pdb1)
#         sum_tm += tm
#         count_tm += 1

#     return sum_tm / count_tm

# def levenshtein_distance(seq_a, seq_b) -> int:
#     """
#     :param seq_a: first sequence to compare.
#     :param seq_b: second sequence to compare.
#     :return: levenshtein distance between the two sequences.
#     Calculate the levenshtein distance between two sequences.
#     """
#     # https://en.wikipedia.org/wiki/Levenshtein_distance
#     # initialize distance matrix
#     distance_matrix = numpy.zeros(seq_a.shape[0] + 1, seq_b.shape[0] + 1)
#     for id1 in range(len(seq_a) + 1):
#         distance_matrix[id1][0] = id1
#     for id2 in range(len(seq_b) + 1):
#         distance_matrix[0][id2] = id2
#     a = 0
#     b = 0
#     c = 0
#     for id1 in range(1, len(seq_a) + 1):
#         for id2 in range(1, len(seq_b) + 1):
#             if torch.eq(seq_a[id1 - 1], seq_b[id2 - 1]):
#                 distance_matrix[id1][id2] = distance_matrix[id1 - 1][id2 - 1]
#             else:
#                 a = distance_matrix[id1][id2 - 1]
#                 b = distance_matrix[id1 - 1][id2]
#                 c = distance_matrix[id1 - 1][id2 - 1]
#                 if a <= b and a <= c:
#                     distance_matrix[id1][id2] = a + 1
#                 elif b <= a and b <= c:
#                     distance_matrix[id1][id2] = b + 1
#                 else:
#                     distance_matrix[id1][id2] = c + 1
#     levenshtein_distance = int(distance_matrix[id1][id2])
#     return levenshtein_distance

In [3]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Fixed residue count used when padding sequences and sizing the model input/output layers.
MAX_LENGTH = 600
# Path to the PDB chain CSV (despite the name this is a file, not a directory).
DATA_DIR = './data/PDB-2021AUG02.csv'

import time
import math


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    Both parts are rendered with ``%d``, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``since`` is a ``time.time()`` timestamp and ``percent`` is the fraction of
    the work already done; the remaining time is a linear extrapolation of the
    elapsed time. ``percent`` must be non-zero.
    """
    elapsed = time.time() - since
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [4]:
def load_dataset(max_length, data_dir=''):
    """Load unique (PDB id, padded sequence) pairs from a chain CSV.

    Args:
        max_length: longest sequence to keep; every kept sequence is
            right-padded with '0' characters to exactly this length.
        data_dir: path of the CSV file to read (must contain the columns
            'CHAINID' and 'SEQUENCE').

    Returns:
        A shuffled list of ``(pdb_id, residues)`` tuples where ``pdb_id`` is
        the first four characters of the chain id and ``residues`` is a tuple
        of ``max_length`` single-character strings.
    """
    print ("loading dataset...")
    data = pd.read_csv(data_dir)
    # File name without directory or extension; works for any path layout,
    # not only paths that start with './'.
    dirfilename = os.path.splitext(os.path.basename(data_dir))[0]
    print(dirfilename)
    print(data.keys())

    # A set collapses chains that share both the PDB id and the sequence.
    unique_pairs = set(
        (chain_id[0:4], sequence)
        for chain_id, sequence in zip(data['CHAINID'].tolist(), data['SEQUENCE'].tolist())
    )
    lines = [
        # Pad to the caller's max_length (not the module-level constant) so the
        # filter and the padding always agree.
        (pdb_id, tuple(sequence + '0' * (max_length - len(sequence))))
        for pdb_id, sequence in unique_pairs
        if 'X' not in sequence and len(sequence) <= max_length
    ]
    print("loaded {} lines in dataset".format(len(lines)))
    np.random.shuffle(lines)
    return lines

In [16]:
class ProteinDataset(Dataset):
    """Sequences and secondary-structure strings read from a CSV file.

    Rows are de-duplicated on 'SEQUENCE', rows whose 's4pred_truth' contains
    'X' are dropped, and both columns are right-padded with '-' to
    ``max_length``. One-hot encodings are produced per item in
    ``__getitem__`` rather than for the whole table up front, so only the
    tensors of the requested rows ever live on the device.

    Args:
        data_file: path of the CSV file (needs 'SEQUENCE' and 's4pred_truth').
        max_length: width both string columns are padded to. Longer strings
            are left unchanged by the padding.
        device: device the one-hot tensors are created on; defaults to CUDA
            when available, else CPU.
    """

    # The last symbol of each alphabet ('-') is the padding character.
    SEQUENCE_ALPHABET = 'ARNDCQEGHILKMFPSTWYV-'
    STRUCTURE_ALPHABET = 'CEH-'

    def __init__(self, data_file, max_length=600, device=None):
        print ("loading dataset...")
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.sequence_map = {c: i for i, c in enumerate(self.SEQUENCE_ALPHABET)}
        self.structure_map = {c: i for i, c in enumerate(self.STRUCTURE_ALPHABET)}

        data = pd.read_csv(data_file)
        print(data.shape)
        data = data.drop_duplicates(subset=['SEQUENCE'])
        # .copy() so the column assignments below modify an independent frame.
        data = data[~(data['s4pred_truth'].str.contains('X'))].copy()
        print(data.shape)

        data['SEQUENCE'] = data['SEQUENCE'].str.pad(width=max_length, side='right', fillchar='-')
        data['s4pred_truth'] = data['s4pred_truth'].str.pad(width=max_length, side='right', fillchar='-')
        self.data = data

    def __len__(self):
        """Number of rows kept after de-duplication and filtering."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(sequence, structure, sequence_one_hot, structure_one_hot)``.

        ``idx`` may be an int, a tensor, or a list of positions; for multiple
        positions each element of the tuple is a pandas Series.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        rows = self.data.iloc[idx]
        sample = rows['SEQUENCE']
        structure = rows['s4pred_truth']
        if isinstance(sample, str):
            sample_e = self.transform(sample, self.sequence_map)
            structure_e = self.transform(structure, self.structure_map)
        else:
            # Several rows were requested: encode each string separately.
            sample_e = sample.map(lambda a: self.transform(a, self.sequence_map))
            structure_e = structure.map(lambda a: self.transform(a, self.structure_map))
        return (sample, structure, sample_e, structure_e)

    def transform(self, a, source):
        """One-hot encode string ``a`` using the char -> index dict ``source``.

        Returns an int64 tensor of shape ``(len(a), len(source))``. A character
        missing from ``source`` raises ``KeyError``.
        """
        indexes = [source[c] for c in a]
        return F.one_hot(torch.tensor(indexes, dtype=torch.long, device=self.device), num_classes=len(source))
        

# Smoke test: build the dataset from the DSSP-annotated CSV and print the
# first five samples (padded strings plus their one-hot encodings).
# NOTE(review): the name `pdb` is also the name of Python's stdlib debugger
# module; a later `import pdb` in this notebook would replace the dataset.
pdb = ProteinDataset('./data/PDB-2021AUG02_noX_dssp_ss.csv')
for i in range(5):
    sample = pdb[i]
    print(sample)
        

loading dataset...
(505296, 8)
(97616, 8)
('MGSSHHHHHHSSGLEVLFQGPEENGAHTIANNHTDMMEVDGDVEIPSNKAVVLRGHESEVFICAWNPVSDLLASGSGDSTARIWNLSENSTSGPTQLVLRHCIREGGQDVPSNKDVTSLDWNSEGTLLATGSYDGFARIWTKDGNLASTLGQHKGPIFALKWNKKGNFILSAGVDKTTIIWDAHTGEAKQQFPFHSAPALDVDWQSNNTFASCSTDMCIHVCKLGQDRPIKTFQGHTNEVNAIKWDPTGNLLASCSDDMTLKIWSMKQDNCVHDLQAHNKEIYTIKWSPTGPGTNNPNANLMLASASFDSTVRLWDVDRGICIHTLTKHQEPVYSVAFSPDGRYLASGSFDKCVHIWNTQTGALVHSYRGTGGIFEVCWNAAGDKVGASASDGSVCVLDLRK------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------', 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCECCHHHEEEECCCCCCEEEEEECCCCCEEEEEECCCEEEEEECCCCCCCCCEEEEEECCCCCCCCCCCCCCCEEEEEECCCCCEEEEEECCCEEEEEECCCCEEEEEEECCCCEEEEEECCCCCEEEEEECCCCEEEEECCCCCEEEEECCCCCCEEEEEECCCCEEEEEECCCEEEEEECCCCCCCEEEECCCCCEEEEEECCCCCEEEEEECCCCEEEEECCCCCCCEEECCCCCCEEEEEECCCCCCCCCCCCCCEEEEEECCCCEEEEECCCCEEEEEECCCCCCEEEEEECCCCCEEEEEECCC

In [None]:
# class Lang:
#     def __init__(self, name):
#         self.name = name
#         self.char2index = {}
#         self.char2count = {}
#         self.char2word = {}
#         self.n_chars = 0

#     def addSequence(self, seq):
#         for c in list(seq):
#             self.addChar(c)

#     def addChar(self, c):
#         if c not in self.char2index:
#             self.char2index[c] = self.n_chars
#             self.char2count[c] = 1
#             self.char2word[self.n_chars] = c
#             self.n_chars += 1
#         else:
#             self.char2count[c] += 1

In [18]:
# def prepare_data(max_len=MAX_LENGTH, data_dir=DATA_DIR):
#     lines = load_dataset(max_len, data_dir)
#     lang = Lang("PDB")
#     lang.addSequence("ARNDCQEGHILKMFPSTWYV0")
#     for line in lines:
#         lang.addSequence(line[1])
#     retlines = []
#     print(lines[0])
#     for s in lines:
#         retlines.append((s[0], F.one_hot(tensorFromSequence(lang, s), num_classes=lang.n_chars).float()))
#     print((retlines[0])[1].size())
#     return (lang, lang, [[s, s] for s in retlines])

# def indexesFromSequence(lang, sequence):
#     return [lang.char2index[c] for c in list(sequence[1])]

# def tensorFromSequence(lang, sequence):
#     indexes = indexesFromSequence(lang, sequence)
#     return torch.tensor(indexes, dtype=torch.long, device=device)

# input_lang, output_lang, pairs = prepare_data()
# print(input_lang.char2word)
# def tensorsFromPair(pair):
#     input_tensor = tensorFromSequence(input_lang, pair[0])
#     target_tensor = tensorFromSequence(output_lang, pair[1])
#     return (input_tensor, target_tensor)

# def decodeTensor(lang, t):
#     oneHot = torch.argmax((t[1]).view(MAX_LENGTH, lang.n_chars), dim=1)
#     return (''.join([lang.char2word[i.item()] for i in oneHot])).replace('0', '')

loading dataset...
PDB-2021AUG02
Index(['CHAINID', 'DEPOSITION', 'RESOLUTION', 'HASH', 'CLUSTER', 'SEQUENCE'], dtype='object')
loaded 242971 lines in dataset
('6jm9', ('A', 'K', 'T', 'R', 'S', 'S', 'R', 'A', 'G', 'L', 'Q', 'F', 'P', 'V', 'G', 'R', 'V', 'H', 'R', 'L', 'L', 'R', 'K', 'G', 'N', 'Y', 'A', 'E', 'R', 'V', 'G', 'A', 'G', 'A', 'P', 'V', 'Y', 'L', 'A', 'A', 'V', 'L', 'E', 'Y', 'L', 'T', 'A', 'E', 'I', 'L', 'E', 'L', 'A', 'G', 'N', 'A', 'A', 'R', 'D', 'N', 'K', 'K', 'T', 'R', 'I', 'I', 'P', 'R', 'H', 'L', 'Q', 'L', 'A', 'V', 'R', 'N', 'D', 'E', 'E', 'L', 'N', 'K', 'L', 'L', 'G', 'R', 'V', 'T', 'I', 'A', 'Q', 'G', 'G', 'V', 'L', 'P', 'N', 'I', 'Q', 'S', 'V', 'L', 'L', 'P', 'K', 'K', 'T', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 23.64 GiB total capacity; 22.41 GiB already allocated; 1.38 MiB free; 22.62 GiB reserved in total by PyTorch)

In [None]:
# class EncoderRNN(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers=1):
#         super(EncoderRNN, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.embedding = nn.Embedding(input_size, hidden_size)
#         self.gru = nn.GRU(hidden_size, hidden_size, num_layers=num_layers)

#     def forward(self, input, hidden):
#         embedded = self.embedding(input).view(1, 1, -1)
#         output = embedded
#         if hidden.shape[0] != self.num_layers:
#             hidden = hidden.repeat(self.num_layers, 1, 1)
#         output, hidden = self.gru(output, hidden)
#         return output, hidden

#     def initHidden(self):
#         return torch.zeros(1, 1, self.hidden_size, device=device)

class EncoderRNN(nn.Module):
    """Fully connected encoder that compresses a flattened one-hot sequence.

    Despite the name there is no recurrence: the network is a stack of
    ``Linear`` + ``LeakyReLU`` stages narrowing from ``input_size`` through
    10000, 5000, 1000, 500 and 300 units down to ``hidden_size``.
    """

    def __init__(self, input_size=21*MAX_LENGTH, hidden_size=100):
        super().__init__()
        self.hidden_size = hidden_size
        widths = [input_size, 10000, 5000, 1000, 500, 300, hidden_size]
        stages = []
        # Consecutive width pairs give the (in, out) sizes of each Linear stage.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.append(nn.Linear(fan_in, fan_out))
            stages.append(nn.LeakyReLU())
        self.layers = nn.Sequential(*stages)

    def forward(self, input):
        """Map ``(..., input_size)`` activations to ``(..., hidden_size)``."""
        return self.layers(input)

In [None]:
class DecoderRNN(nn.Module):
    """Fully connected decoder that expands a latent code to a flat sequence.

    Mirrors the encoder's stage widths in reverse: ``hidden_size`` -> 300 ->
    500 -> 1000 -> 5000 -> 10000 -> ``output_size``, with ``LeakyReLU``
    between stages and a final sigmoid so every output lies in (0, 1).
    Despite the name there is no recurrence.
    """

    def __init__(self, hidden_size=100, output_size=21*MAX_LENGTH):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.layers = nn.Sequential(nn.Linear(hidden_size, 300),
                                    nn.LeakyReLU(),
                                    # 300 -> 500 stage: without it the next layer
                                    # receives 300 features but expects 500.
                                    nn.Linear(300, 500),
                                    nn.LeakyReLU(),
                                    nn.Linear(500, 1000),
                                    nn.LeakyReLU(),
                                    nn.Linear(1000, 5000),
                                    nn.LeakyReLU(),
                                    nn.Linear(5000, 10000),
                                    nn.LeakyReLU())
        self.out = nn.Linear(10000, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        """Map ``(..., hidden_size)`` codes to ``(..., output_size)`` values in (0, 1)."""
        output = self.layers(input)
        output = self.sigmoid(self.out(output))
        return output

In [None]:
"""
Created on Thu Jun  4 13:44:17 2020
@author:
    Lewis Moffat
    Bioinformatics Group - Comp. Sci. Dep., University College London (UCL)
    Github: CraftyColossus
Inference Only Version of S4PRED - Single Sequence Secondary Structure Pred
This is culled down to exclude the various DropConnect/Dropout etc. from the 
training methods so that it is more clear.
If you'd like a training version of the model please raise an issue or submit a PR.
The AWD-GRU training script model was a tweak on the official Salesforce 
AWD-LSTM (https://github.com/salesforce/awd-lstm-lm/). It needed to be adapted to 
take multiple layers of RNNs. 
"""

import torch
import torch.nn as nn
import torch.nn.functional as F




class ResidueEmbedding(nn.Embedding):
    """``nn.Embedding`` with residue-oriented argument names and defaults."""

    def __init__(self, vocab_size=21, embed_size=128, padding_idx=None):
        super(ResidueEmbedding, self).__init__(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=padding_idx,
        )

        
        
class GRUnet(nn.Module):
    """Single S4PRED ensemble member: embedding -> GRU stack -> linear -> log-softmax.

    This version of the model has all the bells & whistles (e.g. dropconnect)
    ripped out so it is slimmed down for inference.

    Args:
        lstm_hdim: hidden size of each GRU direction.
        embed_size: width of the residue embedding fed to the GRU.
        num_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        lstm: accepted for signature compatibility; not used, the recurrent
            layer is always a GRU.
        outsize: number of output classes per residue.

    With the default arguments the module layout is GRU(128, 1024, 3 layers,
    bidirectional) followed by Linear(2048, 3).
    """

    def __init__(self,lstm_hdim=1024, embed_size=128, num_layers=3,bidirectional=True,lstm=False,outsize=3):
        super().__init__()

        self.lstm_hdim = lstm_hdim
        self.embed=ResidueEmbedding(vocab_size=22, embed_size=embed_size, padding_idx=21)
        # Sizes come from the constructor arguments so the embedding, the GRU
        # and the output layer always agree with each other.
        self.lstm = nn.GRU(embed_size, lstm_hdim, num_layers=num_layers,
                           bidirectional=bidirectional, batch_first=True, dropout=0.0)
        num_directions = 2 if bidirectional else 1
        self.outlayer = nn.Linear(lstm_hdim*num_directions, outsize)
        self.finalact=F.log_softmax


    def forward(self, x):
        """Return per-residue log-probabilities for index tensor ``x``.

        Assumes a batch size of one currently but can be changed. The final
        ``squeeze()`` drops every size-1 dimension, not only the batch one.
        """
        x=self.embed(x)
        x, _ = self.lstm(x)
        x=self.outlayer(x)
        x=self.finalact(x,dim=-1)
        return x.squeeze()
        
        
class S4PRED(nn.Module):
    """Ensemble of five ``GRUnet`` members whose outputs are averaged.

    The members are stored as ``model_1`` ... ``model_5`` so that weights can
    be loaded into each one individually; weight loading itself is left to the
    caller.
    """

    def __init__(self):
        super().__init__()
        # Attribute names are kept explicit (model_1..model_5) for hot swapping.
        for member_no in range(1, 6):
            setattr(self, 'model_%d' % member_no, GRUnet())

    def forward(self, x):
        """Run every member on ``x`` and return the equally weighted sum (0.2 each)."""
        members = (self.model_1, self.model_2, self.model_3, self.model_4, self.model_5)
        member_outputs = [member(x) for member in members]
        y_out = member_outputs[0]*0.2
        for member_output in member_outputs[1:]:
            y_out = y_out+member_output*0.2
        return y_out
    
# Build the S4PRED ensemble on the compute device.
s4pred = S4PRED().to(device)
# requires_grad_() is the nn.Module API that flags every parameter; assigning
# a plain `requires_grad` attribute on the module would not touch them.
s4pred.requires_grad_(True)
scriptdir = '../s4pred'
weight_files=['/weights/weights_1.pt',
              '/weights/weights_2.pt',
              '/weights/weights_3.pt',
              '/weights/weights_4.pt',
              '/weights/weights_5.pt']

# Load one weight file per ensemble member (weights_N.pt -> model_N).
# Tensors are deserialized on the CPU; load_state_dict then copies them into
# the parameters that already live on `device`.
# NOTE: torch.load unpickles the file, so only load weight files you trust.
for _member_no, _weight_file in enumerate(weight_files, start=1):
    _member = getattr(s4pred, 'model_%d' % _member_no)
    _member.load_state_dict(torch.load(scriptdir + _weight_file, map_location='cpu'))


In [None]:
# class Model(nn.Module):
#     def __init__(self, input_size, encoding_size, hidden=[], h_act=nn.ReLU(), out_act=nn.Tanh()):
#         super(Model, self).__init__()
#         self.encoder = Encoder(input_size, encoding_size, hidden, h_act, out_act)
#         self.decoder = Decoder(encoding_size, input_size, hidden, h_act)
    
#     def forward(self, x):
#         seq_len = x.shape[0]
#         x = self.encoder(x)
#         x = self.decoder(x, seq_len)
#         return x

In [None]:
def decodeTensorValue(lang, t):
    """Differentiable ("soft") argmax over the character axis.

    Args:
        lang: object exposing ``n_chars``, the vocabulary size.
        t: pair whose second element is a tensor holding
            ``MAX_LENGTH * lang.n_chars`` scores.

    Returns:
        A float tensor of ``MAX_LENGTH`` values: for every position, the
        expected character index under ``softmax(100 * scores)``. The large
        temperature makes this close to the hard argmax when one score
        clearly dominates.
    """
    # Hard equivalent: torch.argmax((t[1]).view(MAX_LENGTH, lang.n_chars), dim=1)
    scores = (t[1]).view(MAX_LENGTH, lang.n_chars)
    n = scores.shape[-1]
    weights = nn.functional.softmax(100 * scores, dim=-1)
    # Build the index grid on the same device as the scores so the function
    # works wherever its input lives.
    indices = torch.linspace(0, 1, n, device=scores.device)
    # (n - 1) * linspace(0, 1, n) enumerates the indices 0 .. n-1.
    result = torch.sum((n - 1) * weights * indices, dim=-1)
    return result

def train(input_tensor, target_tensor, chain_id, encoder, decoder, encoder_optimizer, decoder_optimizer, ss_optimizer, criterion, iter, e):
    """Run one autoencoder optimisation step on a single sequence.

    Args:
        input_tensor: flattened one-hot sequence fed to the encoder.
        target_tensor: reconstruction target with the same shape as the
            decoder output.
        chain_id: identifier of the sequence (only forwarded to the decoding
            helper).
        encoder, decoder: the two halves of the autoencoder.
        encoder_optimizer, decoder_optimizer: optimizers stepped here.
        ss_optimizer: optimizer over the s4pred parameters; its gradients are
            cleared but it is not stepped.
        criterion: loss called as ``criterion(prediction, target)``.
        iter, e: current iteration and epoch numbers (unused here).

    Returns:
        ``(decoder_output, loss_value)`` where ``loss_value`` is a float.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    ss_optimizer.zero_grad()

    encoder_hidden = encoder(input_tensor)
    decoder_output = decoder(encoder_hidden)

    # Secondary-structure predictions for the input and its reconstruction.
    # NOTE(review): these are not part of the loss yet (and the .int() cast
    # cuts the gradient path anyway), so they are computed without building an
    # autograd graph; drop the block entirely if it is not needed.
    with torch.no_grad():
        inp = decodeTensorValue(input_lang, (chain_id, input_tensor)).int()
        out = decodeTensorValue(input_lang, (chain_id, decoder_output)).int()
        ss_inp = s4pred(inp[None, :])
        ss_out = s4pred(out[None, :])

    # Loss modules take (prediction, target): the decoder output is the
    # prediction and the supplied target tensor is the ground truth.
    loss = criterion(decoder_output, target_tensor)
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
    #ss_optimizer.step()

    return decoder_output, loss.item()

In [None]:
def _save_checkpoint(path, epoch, model, optimizer, loss):
    """Write a checkpoint to ``path`` with torch.save so torch.load can read it back."""
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, path)


def _save_loss_plot(values, xlabel, path):
    """Plot ``values`` against 1..len(values), save the figure to ``path`` and close it."""
    fig, ax = plt.subplots(1, figsize=(15,10))
    ax.plot(range(1, len(values) + 1), values)
    ax.grid()
    ax.set_title('Losses of Autoencoder')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Loss')
    fig.savefig(path)
    # Close explicitly: figures otherwise accumulate for the whole run.
    plt.close(fig)


def trainIters(encoder, decoder, n_iters, epochs, print_every=1, plot_every=1, learning_rate=1):
    """Train the autoencoder for ``epochs`` epochs of ``n_iters`` random samples.

    Every 1000 iterations the decoded output/input pair is appended to
    ./output/seqs.txt, the loss to ./output/losses.txt and a loss curve is
    saved to ./output/losses_iter.png. After every ``print_every`` epochs the
    mean loss is printed and encoder/decoder checkpoints are written to
    ./output/encoder.pt and ./output/decoder.pt; after every ``plot_every``
    epochs ./output/losses_epoch.png is refreshed.

    Args:
        encoder, decoder: modules to optimise with SGD.
        n_iters: training steps per epoch (samples drawn with replacement
            from the notebook-level ``pairs``).
        epochs: number of epochs.
        print_every: epochs between progress reports / checkpoints.
        plot_every: epochs between updates of the per-epoch loss plot.
        learning_rate: SGD learning rate for all optimizers.
    """
    start = time.time()
    with open('./output/seqs.txt', 'w+') as f:
        print("Sequences: out, in\n", file=f)
    with open('./output/losses.txt', 'w+') as f:
        print("Losses\n", file=f)
    plot_losses = []
    losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    ss_optimizer = optim.SGD(s4pred.parameters(), lr=learning_rate)
    criterion = nn.BCELoss()

    for e in range(1, epochs + 1):
        training_pairs = [random.choice(pairs) for _ in range(n_iters)]
        for step, training_pair in enumerate(training_pairs, start=1):
            input_tensor = training_pair[0][1].view(-1, MAX_LENGTH * input_lang.n_chars)
            target_tensor = training_pair[1][1].view(-1, MAX_LENGTH * input_lang.n_chars)
            chain_id = training_pair[0][0]

            output_tensor, loss = train(input_tensor, target_tensor, chain_id, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, ss_optimizer, criterion, step, e)
            print_loss_total += loss
            plot_loss_total += loss

            if step % 1000 == 0:
                with torch.no_grad():
                    print('\tIteration %d Loss: %.4f' % (step, loss))
                    with open('./output/seqs.txt', 'a') as f:
                        print(decodeTensor(input_lang, (chain_id, output_tensor)), file=f)
                        print(decodeTensor(input_lang, training_pair[1]), file=f)

                    losses.append(loss)
                    _save_loss_plot(losses, 'Iteration (every 1000)', "./output/losses_iter.png")
                    with open('./output/losses.txt', 'a') as f:
                        print('\tIteration %d Loss: %.4f' % (step, loss), file=f)

        # Report and checkpoint after the epoch has trained, so the logged
        # loss belongs to the epochs just finished and e / epochs is the true
        # fraction of work done.
        if e % print_every == 0:
            # The running total spans print_every epochs of n_iters steps each.
            print_loss_avg = print_loss_total / (n_iters * print_every)
            print_loss_total = 0
            print('%s (%d %d%%) Loss: %.4f' % (timeSince(start, e / epochs),
                                         e, e / epochs * 100, print_loss_avg))
            _save_checkpoint('./output/encoder.pt', e, encoder, encoder_optimizer, print_loss_avg)
            _save_checkpoint('./output/decoder.pt', e, decoder, decoder_optimizer, print_loss_avg)

        if e % plot_every == 0:
            plot_loss_avg = plot_loss_total / (n_iters * plot_every)
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
            _save_loss_plot(plot_losses, 'Epoch', "./output/losses_epoch.png")

In [None]:
# Wrap both halves of the autoencoder for multi-GPU use, then move them to
# the compute device.
encoder1 = nn.DataParallel(EncoderRNN())
encoder1 = encoder1.to(device)
decoder1 = nn.DataParallel(DecoderRNN())
decoder1 = decoder1.to(device)

# 50 epochs of 100000 randomly drawn training pairs each.
trainIters(encoder=encoder1, decoder=decoder1, n_iters=100000, epochs=50)

In [None]:
# Residue order used by the sequence vocabulary elsewhere in this notebook;
# the index right after the last residue (20) is the padding symbol.
SS_ALPHABET = 'ARNDCQEGHILKMFPSTWYV'
SS_PAD_INDEX = len(SS_ALPHABET)


def _encode_for_s4pred(seq, width):
    """Turn a residue string into a (1, width) index tensor for s4pred.

    Characters outside SS_ALPHABET and the positions past the end of ``seq``
    are mapped to SS_PAD_INDEX.
    """
    indexes = [SS_ALPHABET.find(c) for c in seq]
    indexes = [i if i >= 0 else SS_PAD_INDEX for i in indexes]
    indexes += [SS_PAD_INDEX] * (width - len(indexes))
    return torch.tensor([indexes], dtype=torch.long, device=device)


plot_losses = []

with open('./output/seqs.txt', 'r') as f:
    lines = f.readlines()

# The file starts with a header line and a blank line; after that the lines
# alternate between a decoded output sequence and its input sequence.
with torch.no_grad():
    # Stop at len(lines) - 1 so an unpaired trailing line is never indexed.
    for i in range(2, len(lines) - 1, 2):
        seq_out = lines[i].strip()
        seq_in = lines[i + 1].strip()
        # Both predictions must have the same length to be compared, so the
        # shorter sequence is padded up to the longer one.
        width = max(len(seq_out), len(seq_in))
        if width == 0:
            continue
        ss_inp = s4pred(_encode_for_s4pred(seq_out, width))
        ss_out = s4pred(_encode_for_s4pred(seq_in, width))
        # F.mse_loss computes the value; nn.MSELoss(...) would only construct
        # a loss module.
        pair_loss = F.mse_loss(ss_inp, ss_out).item()
        plot_losses.append(pair_loss)
        print(pair_loss)

fig, ax = plt.subplots(1, figsize=(15,10))
ax.plot(range(1, len(plot_losses) + 1), plot_losses)
ax.grid()
ax.set_title('Losses of Autoencoder')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
fig.savefig("./output/losses_test.png")
plt.close(fig)