In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pandas as pd
from fastText import load_model
from matplotlib import pylab
from sklearn.manifold import TSNE
from tensorboardX import SummaryWriter

# tensorboardX writer (created with default arguments); the training loop below
# logs the scalar training loss through it.
writer = SummaryWriter()

In [2]:
# Fasttext embeddings trained on train and val sets
# ./fasttext skipgram -input input_text_file -output output_model -dim 128 (fastText-0.1.0)
fasttext_model = load_model('word_vectors/fasttext_model.bin')
# Embedding dimensionality; matches the -dim value of the training command quoted above
num_dims = 128

In [3]:
# 'vocab' lists the frequent words appearing in the text along with their
# frequencies, one "<word> <frequency>" pair per line (minimum frequency = 6)
# Store appearing words; the context manager closes the file once it has been read
vocab_words = {}
with open('finished_files/vocab') as vocab_file:
    for line in vocab_file:
        li = line.split()
        # Lines that are not exactly two whitespace-separated fields are skipped
        if len(li) == 2:
            word, freq = li
            vocab_words[word] = freq
# Final word to id dictionary; the special tokens take the lowest ids, so <pad> is 0
word2id = {}
tokens = ['<pad>', '<unk>', '<eos>']
for token in tokens:
    word2id[token] = len(word2id)
# Retrieve words from fasttext model and keep only those which are also present in 'vocab'.
# Words that already have an id (i.e. a special token) are skipped so that ids stay
# contiguous and the reverse dictionary below has an entry for every id.
fasttext_words = fasttext_model.get_words()
for word in fasttext_words:
    if word in vocab_words and word not in word2id:
        word2id[word] = len(word2id)
vocab_size = len(word2id)
# Reverse dictionary
id2word = {idx: known_word for known_word, idx in word2id.items()}
# Embeddings
embeddings = np.zeros((vocab_size, num_dims))
# <pad> token vector contains all zeros. Rest sampled from a normal distribution
mu, sigma = 0, 0.05
for i in range(1, len(tokens)):
    embeddings[i] = np.random.normal(mu, sigma, num_dims)
# Get word vectors from fasttext model and store in embeddings matrix
for i in range(len(tokens), vocab_size):
    embeddings[i] = fasttext_model.get_word_vector(id2word[i])

# The fastText model and the raw vocab are no longer needed; drop the references
del fasttext_model, vocab_words

In [4]:
# Keep only the lowest ids (special tokens first, then words in fastText order).
# The cap is clamped to the actual vocabulary size so a vocabulary smaller than
# 10000 does not raise KeyError, and re-running this cell is a no-op.
max_vocab_size = min(10000, len(id2word))
id2word = {i: id2word[i] for i in range(max_vocab_size)}
embeddings = embeddings[:max_vocab_size]
word2id = {kept_word: i for i, kept_word in id2word.items()}

vocab_size = len(word2id)

# Number of words to visualise: ids 1..num_points (id 0 is the all-zero <pad> vector),
# never more than the vocabulary actually holds
num_points = min(500, vocab_size - 1)

# Project the selected embedding rows to 2-D with t-SNE
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
two_d_embeddings = tsne.fit_transform(embeddings[1:num_points+1, :])

def plot(embeddings, labels):
    """Scatter-plot 2-D points and write each point's label next to it.

    embeddings -- array indexed as embeddings[i, :] to give an (x, y) pair;
                  must have at least as many rows as there are labels
    labels     -- sequence of strings; labels[i] annotates row i

    Raises AssertionError when there are more labels than embedding rows.
    """
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15, 15))  # in inches
    annotation_style = dict(xytext=(5, 2), textcoords='offset points',
                            ha='right', va='bottom')
    for row in range(len(labels)):
        x, y = embeddings[row, :]
        pylab.scatter(x, y)
        pylab.annotate(labels[row], xy=(x, y), **annotation_style)
    pylab.show()

# Labels for the plotted points: the words with ids 1..num_points, i.e. the same
# rows (and order) that were passed to t-SNE above
words = [id2word[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)

In [5]:
# Hyperparameters read as globals by the batch generator and the models below
batch_size = 64  # sequences per minibatch
max_article_size = 50 #400 -- article tokens kept per example (an <eos> is appended after truncation)
max_abstract_size = 15 #100 -- abstract tokens kept per example (an <eos> is appended after truncation)
hidden_size = 512  # LSTM hidden units per direction in the encoder; the decoder cell uses 2*hidden_size

In [6]:
class Batch:
    """Plain container for one minibatch.

    `abstract` and `article` each hold a 2-tuple; both start out as
    (None, None) and are overwritten by whoever builds the batch.
    """

    def __init__(self):
        self.abstract, self.article = (None, None), (None, None)

In [7]:
class BatchGenerator:
    """Serve fixed-size minibatches of word-index tensors built from a dataframe.

    The dataframe must expose `article` and `abstract` columns. Rows whose
    article or abstract is not a string (e.g. NaN from an empty csv field) are
    skipped. The class reads the module-level `max_article_size`,
    `max_abstract_size` and `word2id`, and allocates its tensors with
    `torch.cuda`, so a CUDA device is required to build a batch.
    """

    def __init__(self, batch_size, dataframe):
        self.batch_size = batch_size
        # train, valid, or test dataframe imported from csv
        self.df = dataframe
        # Single pass over the dataframe; exhausted after one epoch
        self.generator = self.row_generator()

    def row_generator(self):
        """Yield the dataframe rows one by one as namedtuples (index excluded)."""
        for row in self.df.itertuples(index=False):
            yield row

    def _encode(self, text, max_tokens):
        """Whitespace-tokenise `text`, keep `max_tokens` tokens, append <eos>, map to ids.

        Tokens missing from `word2id` are mapped to the <unk> id. Returns a
        (CPU) torch.LongTensor of word ids.
        """
        tokens = text.split()[:max_tokens]
        tokens.append('<eos>')
        unk_id = word2id['<unk>']
        return torch.LongTensor([word2id.get(token, unk_id) for token in tokens])

    def build_batch(self, rows):
        """Turn a list of rows into a `Batch` of padded, length-sorted tensors.

        `rows` is extended in place up to `batch_size` when it is shorter.
        The returned batch holds, for articles and abstracts respectively, a
        Variable of shape (max_len, batch_size) zero-padded with the <pad> id
        and a LongTensor of sequence lengths (including <eos>). Columns are
        ordered by descending article length.

        Raises ValueError if the batch must be padded but the dataframe has no
        usable row to pad it with.
        """
        # If number of rows less than batch size, get extra rows from the beginning
        # of the dataframe, wrapping around as often as needed. Letting StopIteration
        # escape here would make callers believe the data was exhausted and silently
        # drop this batch.
        while len(rows) < self.batch_size:
            rows_before = len(rows)
            temp_generator = self.row_generator()
            try:
                while len(rows) < self.batch_size:
                    rows.append(self.get_row(temp_generator))
            except StopIteration:
                if len(rows) == rows_before:
                    raise ValueError('dataframe has no usable rows to pad the batch with')

        # Get lengths of all the sequences in the batch upto max number of tokens
        # + 1 is for the <eos> token
        abstract_lengths = torch.cuda.LongTensor(
            [len(row.abstract.split()[:max_abstract_size]) for row in rows]) + 1
        article_lengths = torch.cuda.LongTensor(
            [len(row.article.split()[:max_article_size]) for row in rows]) + 1
        abs_len = torch.max(abstract_lengths)
        art_len = torch.max(article_lengths)

        # Variables containing abstracts and articles of the batch
        abstracts = torch.cuda.LongTensor(abs_len, self.batch_size).fill_(0) # zero padding
        articles = torch.cuda.LongTensor(art_len, self.batch_size).fill_(0) # zero padding

        # Sort rows in descending order of sequence (article) lengths
        # (the encoder packs the articles, which needs this ordering)
        article_lengths, indices = torch.sort(article_lengths, descending=True)
        rows = [rows[i] for i in indices]
        abstract_lengths = torch.cuda.LongTensor([abstract_lengths[i] for i in indices])

        for i in range(self.batch_size):
            # Store each encoded sequence as a column; the tail keeps its zero padding
            token_list = self._encode(rows[i].abstract, max_abstract_size)
            abstracts[:,i][:len(token_list)] = token_list

            # Same for articles
            token_list = self._encode(rows[i].article, max_article_size)
            articles[:,i][:len(token_list)] = token_list

        batch = Batch()
        batch.article = (Variable(articles), article_lengths)
        batch.abstract = (Variable(abstracts), abstract_lengths)
        return batch

    def get_row(self, generator):
        """Return the next row of `generator` whose article and abstract are both strings.

        Raises StopIteration when the generator runs out before such a row is found.
        """
        row = next(generator)
        # Both fields are tokenised with .split() later, so both must be str
        while not (isinstance(row.article, str) and isinstance(row.abstract, str)):
            row = next(generator)
        return row

    def get_batch(self):
        """Return the next batch, or raise StopIteration once the dataframe is exhausted.

        A final partial batch is completed by `build_batch`.
        """
        rows = []
        for _ in range(self.batch_size):
            try:
                rows.append(self.get_row(self.generator))
            except StopIteration:
                break
        if not rows:
            raise StopIteration
        return self.build_batch(rows)

In [8]:
class Encoder(nn.Module):
    """Single-layer bidirectional LSTM encoder over frozen pre-trained embeddings.

    Reads the module-level `vocab_size`, `num_dims`, `hidden_size` and
    `embeddings`. After each forward pass the per-step outputs and their
    lengths are kept on the instance (`hidden_states`, `sequence_lengths`).
    """

    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size

        # Filled in by forward(): padded LSTM outputs and their sequence lengths
        self.hidden_states = None
        self.sequence_lengths = None

        # Embedding table initialised from the pre-trained matrix and excluded from training
        self.embed = nn.Embedding(vocab_size, num_dims).cuda()
        pretrained = torch.from_numpy(embeddings)
        self.embed.weight.data.copy_(pretrained)
        self.embed.weight.requires_grad = False

        # Bidirectional recurrent layer
        self.lstm = nn.LSTM(num_dims, hidden_size, num_layers=1, bidirectional=True).cuda()

        # Maps the concatenated final hidden state to vocabulary-sized scores
        self.linear_transform = nn.Linear(2*hidden_size, vocab_size)

    def forward(self, articles, article_lengths):
        """Encode a batch of articles.

        articles        -- word ids, (T,B), sorted by descending length
        article_lengths -- length of every sequence in the batch

        Returns ((h_n, c_n), output): the final hidden and cell states with both
        directions concatenated, each (B,2H), and the linear projection of h_n.
        """
        embedded = self.embed(articles)  # (T,B,N)
        # Packing relies on the batch being sorted by descending length
        packed = pack_padded_sequence(embedded, list(article_lengths))
        packed_output, (h_last, c_last) = self.lstm(packed)
        # hidden_states: (T,B,2H)
        self.hidden_states, self.sequence_lengths = pad_packed_sequence(packed_output)

        # Index 0 / 1 along the first axis are the two directions of the single layer
        h_n = torch.cat((h_last[0], h_last[1]), dim=1)  # (B,2H)
        c_n = torch.cat((c_last[0], c_last[1]), dim=1)  # (B,2H)

        return (h_n, c_n), self.linear_transform(h_n)
    
# Build the encoder and move the whole module to the GPU; this also covers
# linear_transform, which Encoder.__init__ does not move itself
encoder = Encoder(batch_size)
encoder.cuda()

Encoder(
  (embed): Embedding(10000, 128)
  (lstm): LSTM(128, 512, bidirectional=True)
  (linear_transform): Linear(in_features=1024, out_features=10000, bias=True)
)

In [9]:
class Decoder(nn.Module):
    """LSTM-cell decoder with dot-product attention over the encoder outputs.

    Reads the module-level `vocab_size`, `num_dims`, `hidden_size`,
    `embeddings`, and the global `encoder` (its `hidden_states` from the most
    recent encoder forward pass). `self.hidden` must be set to an (h, c) pair
    of (B,2H) tensors before the first call to forward().
    """

    def __init__(self):
        super(Decoder, self).__init__()

        # Lookup table that stores word embeddings (pre-trained, frozen)
        self.embed = nn.Embedding(vocab_size, num_dims).cuda()
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False

        # (h, c) state of the cell; assigned by the caller before decoding starts
        self.hidden = None
        self.lstm_cell = nn.LSTMCell(num_dims, 2*hidden_size).cuda()

        # Linear transformations
        self.output_lt = nn.Linear(2*hidden_size, vocab_size)
        self.context_lt = nn.Linear(2*hidden_size, 2*hidden_size)

    @staticmethod
    def _dot_scores(encoder_states, h_t):
        """Dot product of h_t (B,2H) with every encoder state (T,B,2H) -> (T,B)."""
        # Broadcasting h_t over the time axis avoids building a BxB matrix per
        # time step only to keep its diagonal
        return torch.sum(encoder_states * h_t.unsqueeze(0), dim=2)

    def attention(self, h_t, batch_size):
        """Return the attention context (B,2H) for decoder state h_t (B,2H).

        NOTE(review): padded time steps are not masked before the softmax, so
        they receive a share of the attention weight. Masking was left disabled
        in the original code; enabling it changes the model's outputs.
        """
        score = self.score_function(h_t, batch_size)
        normalise = nn.Softmax(dim=0)  # normalise over the time axis
        a_t = normalise(score).unsqueeze(2) # (T,B,1)
        c_t = encoder.hidden_states * a_t # c_t (T,B,2H)
        context = torch.sum(c_t, dim=0) # (B,2H)
        return context

    def score_function(self, h_t, batch_size):
        """Attention scores (T,B) between h_t and the encoder outputs.

        `batch_size` is kept for interface compatibility; the batch dimension
        is taken from the tensors themselves.
        """
        return self._dot_scores(encoder.hidden_states, h_t)

    def forward(self, input, batch_size):
        """Advance the decoder by one step and return vocabulary scores (B,V).

        input      -- LongTensor of word ids, shape (B,) or (B,1)
        batch_size -- number of sequences in the batch

        Updates `self.hidden` in place with the new (h, c) state.
        """
        # Callers feed back torch.topk(..., 1) output, which is (B,1); flatten so
        # the embedding is (B,N), the 2-D input the LSTM cell is specified for
        token_ids = input.contiguous().view(-1)
        embedded = self.embed(token_ids) #(B,N)

        # Attention mechanism: the transformed context is added to the hidden state
        context = self.attention(self.hidden[0], batch_size) #(B,2H)
        h_0 = self.context_lt(context) + self.hidden[0] #(B,2H)
        self.hidden = (h_0, self.hidden[1])

        self.hidden = self.lstm_cell(embedded, self.hidden)

        output = self.output_lt(self.hidden[0]) #(B,V)
        return output
    
# Build the decoder and move the whole module to the GPU; this also covers the
# two Linear layers, which Decoder.__init__ does not move itself
decoder = Decoder()
decoder.cuda()

Decoder(
  (embed): Embedding(10000, 128)
  (lstm_cell): LSTMCell(128, 1024)
  (output_lt): Linear(in_features=1024, out_features=10000, bias=True)
  (context_lt): Linear(in_features=1024, out_features=1024, bias=True)
)

# Show every decoder parameter that will be trained (frozen ones are skipped)
for name, param in decoder.named_parameters():
    if not param.requires_grad:
        continue
    print (name, param.data)

In [10]:
learning_rate = 0.5 #3.0, 3.5

# Only parameters with requires_grad set are handed to the optimizers
# (the embedding tables are frozen)
encoder_parameters = (p for p in encoder.parameters() if p.requires_grad)
decoder_parameters = (p for p in decoder.parameters() if p.requires_grad)
# One plain SGD optimizer per network
encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
# Loss for batches of variable-length sequences:
#  - size_average=False makes the loss a sum over the minibatch instead of a mean;
#    the caller divides by the number of non-padding tokens itself.
#  - ignore_index=0 excludes targets equal to the padding id from the loss and
#    from the gradient.
loss_function = nn.NLLLoss(ignore_index=0, size_average=False).cuda() # 0 is the index of <pad>

def train_model(batch):
    """Run one optimisation step on `batch` and return its per-token loss.

    batch.article / batch.abstract are (tensor, lengths) pairs: the tensors hold
    word indices with shape (max_sequence_length, B), the lengths hold the
    length of every sequence in the batch.

    Uses the global encoder, decoder, their optimizers, loss_function, softmax
    and batch_size. The returned value is loss.data[0].
    """
    # Start from clean gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    articles, article_lengths = batch.article
    abstracts, abstract_lengths = batch.abstract

    # The encoder's final state seeds the decoder
    hiddenT, scores = encoder(articles, article_lengths)
    decoder.hidden = hiddenT

    # First decoder input: the word the encoder's last state predicts
    log_probs = softmax(scores, batch_size)
    _, next_word = torch.topk(log_probs, 1, dim=1)

    # Decode step by step, always feeding back the decoder's own best guess
    summed_loss = 0
    for t in range(torch.max(abstract_lengths)):
        log_probs = softmax(decoder(next_word, batch_size), batch_size)
        _, next_word = torch.topk(log_probs, 1, dim=1)
        summed_loss += loss_function(log_probs, abstracts[t])

    # Summed loss divided by the total of the abstract lengths
    loss = summed_loss/torch.sum(abstract_lengths)
    loss.backward()

    # Gradient clipping is currently disabled:
    #nn.utils.clip_grad_norm(encoder.parameters(), 0.5)
    #nn.utils.clip_grad_norm(decoder.parameters(), 0.5)

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Reset hidden_list for the next batch of inputs
    decoder.hidden_list = []

    return loss.data[0]

In [11]:
def validation_loss(df):
    """Average of calc_loss over `df`, evaluated one example per batch."""
    batch_size = 1
    batches = BatchGenerator(batch_size, df)
    total = 0
    n_batches = 0
    while True:
        try:
            batch = batches.get_batch()
        except StopIteration:
            # Dataframe exhausted
            break
        n_batches += 1
        total += calc_loss(batch, batch_size)
    return total/n_batches

def calc_loss(batch, batch_size):
    """Compute the per-token loss of `batch` without updating any weights.

    Mirrors the forward pass of train_model: the encoder's final state
    initialises the decoder, the first decoder input is the word predicted from
    the encoder's last state, and every later input is the decoder's own most
    likely word.

    batch      -- object whose .article and .abstract are (Variable, lengths) pairs
    batch_size -- number of sequences in `batch`

    Returns loss.data[0], the same kind of value train_model returns.
    """
    articles, article_lengths = batch.article
    abstracts, abstract_lengths = batch.abstract

    # Evaluation only: mark the inputs volatile so no graph is kept
    articles.volatile = True
    abstracts.volatile = True

    hiddenT, output = encoder(articles, article_lengths)
    # Initialize decoder hidden state
    decoder.hidden = hiddenT

    # Scores are (B,V), so always normalise over the vocabulary axis. The shared
    # softmax() helper is not used here because it picks the batch axis when
    # batch_size is 1.
    log_softmax = nn.LogSoftmax(dim=1)
    output = log_softmax(output)
    _, input = torch.topk(output, 1, dim=1)

    loss = 0
    for t in range(torch.max(abstract_lengths)):
        # topk yields (B,1); the decoder expects ids of shape (B,)
        output = log_softmax(decoder(input.view(-1), batch_size))
        _, input = torch.topk(output, 1, dim=1)
        loss += loss_function(output, abstracts[t])
    loss = loss/torch.sum(abstract_lengths)
    return loss.data[0]

In [12]:
def softmax(output, batch_size):
    """Return log-probabilities (log-softmax) of `output` over the vocabulary axis.

    output     -- scores; (B,V) for a batch, or (V,) for a single un-batched vector
    batch_size -- number of sequences the scores belong to

    For batch_size > 1 the normalisation runs over dim 1. For batch_size == 1 it
    runs over the last dimension, so both a (V,) vector and a (1,V) matrix are
    normalised over the vocabulary rather than over a batch axis of length one
    (which would yield all zeros).
    """
    vocab_dim = 1 if batch_size > 1 else output.dim() - 1
    return nn.LogSoftmax(dim=vocab_dim)(output)

In [13]:
cudnn.benchmark = True
# NOTE(review): 'fasttest' looks like a typo; nothing in this notebook reads the
# attribute, so the assignment probably has no effect -- confirm and remove.
cudnn.fasttest = True
epochs = 7000 #7000

encoder.train()
decoder.train()

train_df = pd.read_csv('datasets/train.csv')
val_df = pd.read_csv('datasets/val.csv')
iteration = 1

try:
    for epoch in range(epochs):

        # Halve the learning rate every 1500 epochs. This also fires at epoch 0,
        # so training starts at half the value configured above.
        if epoch % 1500 == 0: #500, 275
            learning_rate = learning_rate / 2 #2
            # Filter parameters that do not require gradients
            encoder_parameters = filter(lambda p: p.requires_grad, encoder.parameters())
            decoder_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
            # Optimizers are rebuilt with the new rate; train_model picks up the
            # rebound globals
            encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
            decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
            print('')
            print('learning rate: %f' % learning_rate)
            print('')

        # Fresh single-pass generator over the (truncated) training set for this epoch
        generator = BatchGenerator(batch_size, train_df[:64]) #64

        while True:
            try:
                batch = generator.get_batch()
            except StopIteration:
                break
            loss = train_model(batch)

            if iteration % 2 == 0:
                print('Average minibatch loss at step %d: %.3f' % (iteration, loss))
                writer.add_scalar('train_loss', loss, iteration)

            # Periodic validation is disabled. To re-enable it, every few
            # iterations: switch both models to eval(), compute
            # validation_loss(val_df[:8]), log it as 'valid_loss', then switch
            # back to train().
            iteration += 1
finally:
    # Export once, also when training is interrupted, instead of rewriting the
    # whole JSON file every second step
    writer.export_scalars_to_json("./all_scalars.json")


learning rate: 0.250000

Average minibatch loss at step 2: 9.199
Average minibatch loss at step 4: 9.188
Average minibatch loss at step 6: 9.175
Average minibatch loss at step 8: 9.162
Average minibatch loss at step 10: 9.149
Average minibatch loss at step 12: 9.135
Average minibatch loss at step 14: 9.121
Average minibatch loss at step 16: 9.106
Average minibatch loss at step 18: 9.091
Average minibatch loss at step 20: 9.074
Average minibatch loss at step 22: 9.056
Average minibatch loss at step 24: 9.036
Average minibatch loss at step 26: 9.013
Average minibatch loss at step 28: 8.987
Average minibatch loss at step 30: 8.956
Average minibatch loss at step 32: 8.918
Average minibatch loss at step 34: 8.868
Average minibatch loss at step 36: 8.800
Average minibatch loss at step 38: 8.697
Average minibatch loss at step 40: 8.524
Average minibatch loss at step 42: 8.215
Average minibatch loss at step 44: 7.827
Average minibatch loss at step 46: 7.676
Average minibatch loss at step 48: 

Average minibatch loss at step 394: 5.160
Average minibatch loss at step 396: 4.645
Average minibatch loss at step 398: 4.744
Average minibatch loss at step 400: 4.859
Average minibatch loss at step 402: 4.640
Average minibatch loss at step 404: 4.758
Average minibatch loss at step 406: 4.665
Average minibatch loss at step 408: 4.836
Average minibatch loss at step 410: 4.702
Average minibatch loss at step 412: 4.802
Average minibatch loss at step 414: 4.468
Average minibatch loss at step 416: 4.470
Average minibatch loss at step 418: 4.954
Average minibatch loss at step 420: 4.490
Average minibatch loss at step 422: 4.441
Average minibatch loss at step 424: 4.749
Average minibatch loss at step 426: 4.939
Average minibatch loss at step 428: 4.448
Average minibatch loss at step 430: 4.611
Average minibatch loss at step 432: 4.967
Average minibatch loss at step 434: 4.815
Average minibatch loss at step 436: 4.550
Average minibatch loss at step 438: 4.521
Average minibatch loss at step 440

Average minibatch loss at step 786: 4.068
Average minibatch loss at step 788: 4.011
Average minibatch loss at step 790: 4.173
Average minibatch loss at step 792: 4.049
Average minibatch loss at step 794: 4.028
Average minibatch loss at step 796: 4.039
Average minibatch loss at step 798: 4.005
Average minibatch loss at step 800: 4.068
Average minibatch loss at step 802: 4.122
Average minibatch loss at step 804: 4.239
Average minibatch loss at step 806: 3.976
Average minibatch loss at step 808: 3.973
Average minibatch loss at step 810: 3.941
Average minibatch loss at step 812: 3.929
Average minibatch loss at step 814: 3.961
Average minibatch loss at step 816: 3.980
Average minibatch loss at step 818: 3.989
Average minibatch loss at step 820: 4.302
Average minibatch loss at step 822: 4.221
Average minibatch loss at step 824: 4.048
Average minibatch loss at step 826: 3.899
Average minibatch loss at step 828: 3.930
Average minibatch loss at step 830: 4.062
Average minibatch loss at step 832

Average minibatch loss at step 1174: 3.922
Average minibatch loss at step 1176: 3.902
Average minibatch loss at step 1178: 3.908
Average minibatch loss at step 1180: 3.891
Average minibatch loss at step 1182: 3.912
Average minibatch loss at step 1184: 3.886
Average minibatch loss at step 1186: 3.889
Average minibatch loss at step 1188: 3.912
Average minibatch loss at step 1190: 3.884
Average minibatch loss at step 1192: 3.890
Average minibatch loss at step 1194: 3.871
Average minibatch loss at step 1196: 3.879
Average minibatch loss at step 1198: 3.849
Average minibatch loss at step 1200: 3.850
Average minibatch loss at step 1202: 3.881
Average minibatch loss at step 1204: 3.862
Average minibatch loss at step 1206: 3.880
Average minibatch loss at step 1208: 3.889
Average minibatch loss at step 1210: 3.849
Average minibatch loss at step 1212: 3.849
Average minibatch loss at step 1214: 3.832
Average minibatch loss at step 1216: 3.843
Average minibatch loss at step 1218: 3.822
Average min

Average minibatch loss at step 1554: 3.329
Average minibatch loss at step 1556: 3.327
Average minibatch loss at step 1558: 3.325
Average minibatch loss at step 1560: 3.323
Average minibatch loss at step 1562: 3.322
Average minibatch loss at step 1564: 3.320
Average minibatch loss at step 1566: 3.318
Average minibatch loss at step 1568: 3.317
Average minibatch loss at step 1570: 3.316
Average minibatch loss at step 1572: 3.314
Average minibatch loss at step 1574: 3.312
Average minibatch loss at step 1576: 3.311
Average minibatch loss at step 1578: 3.309
Average minibatch loss at step 1580: 3.308
Average minibatch loss at step 1582: 3.306
Average minibatch loss at step 1584: 3.305
Average minibatch loss at step 1586: 3.303
Average minibatch loss at step 1588: 3.301
Average minibatch loss at step 1590: 3.300
Average minibatch loss at step 1592: 3.298
Average minibatch loss at step 1594: 3.296
Average minibatch loss at step 1596: 3.295
Average minibatch loss at step 1598: 3.293
Average min

Average minibatch loss at step 1936: 2.996
Average minibatch loss at step 1938: 2.995
Average minibatch loss at step 1940: 2.994
Average minibatch loss at step 1942: 2.992
Average minibatch loss at step 1944: 2.992
Average minibatch loss at step 1946: 2.994
Average minibatch loss at step 1948: 2.991
Average minibatch loss at step 1950: 2.984
Average minibatch loss at step 1952: 2.984
Average minibatch loss at step 1954: 2.983
Average minibatch loss at step 1956: 2.983
Average minibatch loss at step 1958: 2.979
Average minibatch loss at step 1960: 2.975
Average minibatch loss at step 1962: 2.974
Average minibatch loss at step 1964: 2.972
Average minibatch loss at step 1966: 2.970
Average minibatch loss at step 1968: 2.966
Average minibatch loss at step 1970: 2.965
Average minibatch loss at step 1972: 2.962
Average minibatch loss at step 1974: 2.961
Average minibatch loss at step 1976: 2.960
Average minibatch loss at step 1978: 2.956
Average minibatch loss at step 1980: 2.953
Average min

Average minibatch loss at step 2318: 2.596
Average minibatch loss at step 2320: 2.586
Average minibatch loss at step 2322: 2.582
Average minibatch loss at step 2324: 2.574
Average minibatch loss at step 2326: 2.571
Average minibatch loss at step 2328: 2.567
Average minibatch loss at step 2330: 2.566
Average minibatch loss at step 2332: 2.561
Average minibatch loss at step 2334: 2.560
Average minibatch loss at step 2336: 2.558
Average minibatch loss at step 2338: 2.552
Average minibatch loss at step 2340: 2.552
Average minibatch loss at step 2342: 2.550
Average minibatch loss at step 2344: 2.547
Average minibatch loss at step 2346: 2.543
Average minibatch loss at step 2348: 2.541
Average minibatch loss at step 2350: 2.542
Average minibatch loss at step 2352: 2.542
Average minibatch loss at step 2354: 2.542
Average minibatch loss at step 2356: 2.540
Average minibatch loss at step 2358: 2.542
Average minibatch loss at step 2360: 2.569
Average minibatch loss at step 2362: 2.601
Average min

Average minibatch loss at step 2700: 2.076
Average minibatch loss at step 2702: 2.101
Average minibatch loss at step 2704: 2.089
Average minibatch loss at step 2706: 2.044
Average minibatch loss at step 2708: 2.041
Average minibatch loss at step 2710: 2.070
Average minibatch loss at step 2712: 2.108
Average minibatch loss at step 2714: 2.131
Average minibatch loss at step 2716: 2.100
Average minibatch loss at step 2718: 2.111
Average minibatch loss at step 2720: 2.176
Average minibatch loss at step 2722: 2.369
Average minibatch loss at step 2724: 3.412
Average minibatch loss at step 2726: 2.890
Average minibatch loss at step 2728: 4.126
Average minibatch loss at step 2730: 2.284
Average minibatch loss at step 2732: 2.086
Average minibatch loss at step 2734: 2.043
Average minibatch loss at step 2736: 2.027
Average minibatch loss at step 2738: 2.021
Average minibatch loss at step 2740: 2.030
Average minibatch loss at step 2742: 2.062
Average minibatch loss at step 2744: 2.088
Average min

Average minibatch loss at step 3080: 1.421
Average minibatch loss at step 3082: 1.419
Average minibatch loss at step 3084: 1.418
Average minibatch loss at step 3086: 1.417
Average minibatch loss at step 3088: 1.416
Average minibatch loss at step 3090: 1.416
Average minibatch loss at step 3092: 1.417
Average minibatch loss at step 3094: 1.422
Average minibatch loss at step 3096: 1.421
Average minibatch loss at step 3098: 1.414
Average minibatch loss at step 3100: 1.408
Average minibatch loss at step 3102: 1.404
Average minibatch loss at step 3104: 1.398
Average minibatch loss at step 3106: 1.393
Average minibatch loss at step 3108: 1.393
Average minibatch loss at step 3110: 1.397
Average minibatch loss at step 3112: 1.397
Average minibatch loss at step 3114: 1.389
Average minibatch loss at step 3116: 1.382
Average minibatch loss at step 3118: 1.378
Average minibatch loss at step 3120: 1.375
Average minibatch loss at step 3122: 1.374
Average minibatch loss at step 3124: 1.376
Average min

KeyboardInterrupt: 

# Persist only the weights (state_dict) of each network, to files named
# 'encoder' and 'decoder' relative to the working directory
torch.save(encoder.state_dict(), 'encoder')
torch.save(decoder.state_dict(), 'decoder')