In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.data import Field, BucketIterator
from torchtext import data

import spacy
import numpy as np
import os
import random
import pandas as pd
import time
import math

from sklearn.model_selection import train_test_split

In [2]:
num = 1000  # maximum number of story/summary pairs to load


def _read_first_n(directory, limit):
    """Return the contents of the first `limit` files of `directory` as a list of strings.

    File names are sorted before slicing because os.listdir makes no ordering
    promise; sorting makes reruns deterministic and, assuming the story and
    summary directories use matching file names (TODO confirm), keeps item i of
    one list paired with item i of the other.
    """
    texts = []
    for name in sorted(os.listdir(directory))[:limit]:
        # The context manager closes every handle as soon as it has been read.
        with open(os.path.join(directory, name)) as handle:
            texts.append(handle.read())
    return texts


stories = _read_first_n('./cnn/cnn_story', num)
summaries = _read_first_n('./cnn/cnn_summary', num)

assert len(stories) == len(summaries), "story and summary counts differ"
DATA = {"story": stories, "summary": summaries}

In [3]:
df = pd.DataFrame(DATA, columns=["story", "summary"])

# A fixed seed keeps the split membership (and therefore the JSON files written
# below) identical on every Restart & Run All.
SPLIT_SEED = 42

# 10% held out for test, then 10% of the remainder held out for validation.
train, test = train_test_split(df, test_size=0.1, random_state=SPLIT_SEED)
train, validation = train_test_split(train, test_size=0.1, random_state=SPLIT_SEED)

# One JSON object per line, which is the layout the 'json' TabularDataset format reads.
train.to_json("train_data.json", orient="records", lines=True)
test.to_json("test_data.json", orient="records", lines=True)
validation.to_json("validation_data.json", orient="records", lines=True)

In [4]:
# Load the English pipeline by its package name. The bare 'en' shortcut link
# only exists in spaCy 2.x and raises in spaCy 3, whereas the package name
# resolves in both as long as the model is installed.
spacy_en = spacy.load('en_core_web_sm')

def tokenize(text):
    """Split `text` into a list of token strings using only spaCy's tokenizer."""
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
# Stories and summaries are preprocessed identically (same tokenizer, lowercasing
# and <sos>/<eos> markers) but get separate Field objects, so each side ends up
# with its own vocabulary.
_shared_field_options = dict(
    tokenize=tokenize,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)
story_field = Field(**_shared_field_options)
summary_field = Field(**_shared_field_options)

# JSON key -> (attribute name on each Example / Batch, Field that processes it).
fields = {
    'story': ('story', story_field),
    'summary': ('summary', summary_field),
}



In [6]:
# Read the three JSON-lines files written earlier into torchtext datasets, using
# the `fields` mapping to decide which JSON keys are kept and how they are processed.
train_data, validation_data, test_data = data.TabularDataset.splits(
    path='',
    format='json',
    fields=fields,
    train='train_data.json',
    validation='validation_data.json',
    test='test_data.json',
)




In [7]:
BATCH_SIZE = 128

# BucketIterator batches examples of similar length together, so it needs a
# sort_key; TabularDataset does not define one, and without it the iterator
# falls back to comparing Example objects directly, which raises TypeError.
# Batches are also created on the same device the model will live on, otherwise
# they stay on the CPU and clash with a CUDA model.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.story),
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)



In [8]:
# Build each vocabulary from the training split only, keeping tokens that occur
# at least twice.
for text_field in (story_field, summary_field):
    text_field.build_vocab(train_data, min_freq=2)


In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [10]:
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder.

    Args:
        input_dim: size of the source vocabulary (rows of the embedding table).
        embedding_dim: width of the token embeddings.
        encoder_hidden_dim: hidden size of each GRU direction.
        decoder_hidden_dim: width of the context vector handed to the decoder.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout_p):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        # Projects the concatenated final states of both directions to the decoder's hidden size.
        self.encoder_hidden_to_context = nn.Linear(2 * encoder_hidden_dim, decoder_hidden_dim)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, inp):
        """Encode a batch of token ids.

        Args:
            inp: token ids, [source_len, batch_size].

        Returns:
            (output, context): the per-position GRU outputs,
            [source_len, batch_size, 2 * encoder_hidden_dim], and the tanh-squashed
            projection of the two final hidden states, [batch_size, decoder_hidden_dim].
        """
        embedded = self.embedding(inp)
        embedded = self.dropout(embedded)

        output, hidden = self.rnn(embedded)
        # hidden --> [2, batch_size, encoder_hidden_dim]: index 0 and index 1 are
        # the final states of the two directions of the single layer.
        first_state, second_state = hidden[0, :, :], hidden[1, :, :]

        joined = torch.cat((first_state, second_state), dim=1)
        context = torch.tanh(self.encoder_hidden_to_context(joined))
        return output, context

In [11]:
class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Args:
        encoder_hidden: hidden size of one encoder direction (the encoder outputs
            are expected to be 2 * encoder_hidden wide).
        decoder_hidden: hidden size of the decoder state.
    """

    def __init__(self, encoder_hidden, decoder_hidden):
        super().__init__()

        self.attn = nn.Linear(2*encoder_hidden + decoder_hidden, decoder_hidden)
        self.v = nn.Linear(decoder_hidden, 1)

    def forward(self, decoder_hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            decoder_hidden: current decoder state, [batch_size, decoder_hidden].
            encoder_outputs: [source_len, batch_size, encoder_hidden * 2].

        Returns:
            Tensor [batch_size, source_len] whose rows sum to 1.
        """
        source_len = encoder_outputs.shape[0]

        # Copy the decoder state once per source position so it can be
        # concatenated with every encoder output: [batch_size, source_len, decoder_hidden].
        decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, source_len, 1)
        # Batch-first layout to line up with the repeated decoder state.
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        energy = torch.tanh(self.attn(torch.cat((encoder_outputs, decoder_hidden), dim = 2)))
        # energy --> [batch_size, source_len, decoder_hidden]

        weights = self.v(energy).squeeze(2)
        # weights --> [batch_size, source_len]

        return F.softmax(weights, dim = 1)
        

In [12]:
class Decoder(nn.Module):
    """One-step GRU decoder that attends over the encoder outputs.

    Args:
        output_size: size of the target vocabulary.
        decoder_emb_size: width of the target token embeddings.
        decoder_hidden_size: hidden size of the decoder GRU.
        encoder_hidden_size: hidden size of one encoder direction.
        dropout_p: dropout probability applied to the embeddings.
        attention: module called as attention(hidden, encoder_outputs) that
            returns weights of shape [batch_size, src_len].
    """

    def __init__(self, output_size, decoder_emb_size, decoder_hidden_size, encoder_hidden_size, dropout_p, attention):
        super().__init__()

        self.attention = attention
        self.output_size = output_size

        self.embedding = nn.Embedding(output_size, decoder_emb_size)
        self.rnn = nn.GRU(encoder_hidden_size*2 + decoder_emb_size, decoder_hidden_size)
        self.fc_out = nn.Linear(encoder_hidden_size*2 + decoder_hidden_size + decoder_emb_size, output_size)
        # Dropout holds no parameters, so the attribute name does not affect saved state dicts.
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, inp, hidden, encoder_outputs):
        """Decode a single time step.

        Args:
            inp: previous target token ids, [batch_size].
            hidden: previous decoder state, [batch_size, decoder_hidden_size].
            encoder_outputs: [src_len, batch_size, encoder_hidden_size * 2].

        Returns:
            (prediction, hidden): vocabulary logits [batch_size, output_size] and
            the new decoder state [batch_size, decoder_hidden_size].
        """
        # Tensor methods are not in-place: every reshaped view must be reassigned.
        inp = inp.unsqueeze(0)
        # inp --> [1, batch_size]

        embedded = self.dropout(self.embedding(inp))
        # embedded --> [1, batch_size, decoder_emb_size]

        weights = self.attention(hidden, encoder_outputs)
        weights = weights.unsqueeze(1)
        # weights --> [batch_size, 1, src_len]

        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs --> [batch_size, src_len, encoder_hidden_size * 2]

        weighted = torch.bmm(weights, encoder_outputs)
        # weighted --> [batch_size, 1, encoder_hidden_size * 2]
        weighted = weighted.permute(1, 0, 2)
        # weighted --> [1, batch_size, encoder_hidden_size * 2]

        rnn_input = torch.cat((embedded, weighted), dim = 2)
        # rnn_input --> [1, batch_size, decoder_emb_size + 2 * encoder_hidden_size]

        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        output = output.squeeze(0)
        embedded = embedded.squeeze(0)
        weighted = weighted.squeeze(0)

        # The three 2-D pieces add up to fc_out's input width:
        # decoder_hidden_size + 2 * encoder_hidden_size + decoder_emb_size.
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))

        return prediction, hidden.squeeze(0)
        

In [13]:
class Seq2Seq(nn.Module):
    """Ties an encoder and an attention decoder together and unrolls the decoder.

    Args:
        encoder: module returning (encoder_outputs, initial_decoder_hidden).
        decoder: module with an `output_size` attribute, called as
            decoder(inp, hidden, encoder_outputs) -> (logits, hidden).
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.device = device
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inp, expected_out, teacher_forcing_ratio = 0.5):
        """Run the model over a batch.

        Args:
            inp: source token ids, [source_len, batch_size].
            expected_out: target token ids, [target_len, batch_size].
            teacher_forcing_ratio: probability of feeding the ground-truth token
                (rather than the model's own prediction) as the next decoder
                input; 0 disables teacher forcing, 1 always uses it.

        Returns:
            Logits [target_len, batch_size, target_vocab_size]; position 0 is
            left at zero because decoding starts from the first target token.
        """
        # Both inputs are sequence-first, so the batch dimension is axis 1.
        batch_size = inp.shape[1]
        target_len = expected_out.shape[0]
        target_vocab_size = self.decoder.output_size

        # Time-major buffer so that outputs[i] receives one [batch_size, vocab] step.
        outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=self.device)

        encoder_outputs, hidden = self.encoder(inp)

        # The first decoder input is the first row of the target.
        inp = expected_out[0, :]
        for i in range(1, target_len):
            output, hidden = self.decoder(inp, hidden, encoder_outputs)
            outputs[i] = output

            # random.random() is in [0, 1), so `<` makes the ratio the probability
            # of teacher forcing: never at 0, always at 1.
            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)

            inp = expected_out[i] if teacher_force else top1

        return outputs

In [14]:
# Vocabulary sizes determine the width of the embedding tables and of the output projection.
INPUT_DIM, OUTPUT_DIM = len(story_field.vocab), len(summary_field.vocab)

# Encoder and decoder use the same embedding width, hidden width and dropout rate.
ENC_EMB_DIM = DEC_EMB_DIM = 256
ENC_HID_DIM = DEC_HID_DIM = 512
ENC_DROPOUT = DEC_DROPOUT = 0.5

# The attention module is created first and handed to the decoder, which owns it.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
encoder = Encoder(
    INPUT_DIM,
    ENC_EMB_DIM,
    ENC_HID_DIM,
    DEC_HID_DIM,
    ENC_DROPOUT,
)
decoder = Decoder(
    OUTPUT_DIM,
    DEC_EMB_DIM,
    DEC_HID_DIM,
    ENC_HID_DIM,
    DEC_DROPOUT,
    attn,
)

seq2seq = Seq2Seq(encoder, decoder, device)
model = seq2seq.to(device)

In [15]:
def init_weights(m):
    """Re-initialise every parameter reachable from module `m` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    all remaining parameters (e.g. biases) are set to zero.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)
            
# Module.apply calls init_weights on every submodule and on the model itself, then
# returns the model; as the last expression of the cell, its repr is what gets displayed.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(17866, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (encoder_hidden_to_context): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=True)
    )
    (embedding): Embedding(3717, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=3717, bias=True)
    (droput): Dropout(p=0.5, inplace=False)
  )
)

In [18]:
# Adam over every model parameter; no hyperparameters are passed, so the library defaults apply.
optimizer = optim.Adam(model.parameters())

In [19]:
# Index of the padding token in the summary vocabulary.
TRG_PAD_IDX = summary_field.vocab.stoi[summary_field.pad_token]

# Target positions equal to the padding index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [21]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as model(story, summary); must return logits shaped
            [target_len, batch_size, vocab_size].
        iterator: sized iterable of batches exposing `.story` and `.summary`.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (logits [N, vocab_size], targets [N]).
        clip: maximum gradient norm passed to clip_grad_norm_.

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.train()
    epoch_loss = 0

    for batch in iterator:
        story = batch.story
        summary = batch.summary

        optimizer.zero_grad()
        output = model(story, summary)

        output_dim = output.shape[-1]

        # Position 0 of both tensors belongs to the first target token, which is
        # given rather than predicted; drop it from logits AND targets so the two
        # stay the same length.
        output = output[1:].reshape(-1, output_dim)
        target = summary[1:].reshape(-1)

        loss = criterion(output, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [22]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    Args:
        model: called as model(story, summary, 0); must return logits shaped
            [target_len, batch_size, vocab_size].
        iterator: sized iterable of batches exposing `.story` and `.summary`.
        criterion: loss taking (logits [N, vocab_size], targets [N]).

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for batch in iterator:

            # The batch attributes carry the names given to the fields: story / summary.
            story = batch.story
            summary = batch.summary

            output = model(story, summary, 0) #turn off teacher forcing

            #summary = [target len, batch size]
            #output = [target len, batch size, output dim]

            output_dim = output.shape[-1]

            # Drop position 0 (the given first token) from both logits and targets.
            output = output[1:].reshape(-1, output_dim)
            target = summary[1:].reshape(-1)

            #target = [(target len - 1) * batch size]
            #output = [(target len - 1) * batch size, output dim]

            loss = criterion(output, target)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [23]:
def epoch_time(start_time, end_time):
    """Break the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds), both truncated to int.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = duration - (whole_minutes * 60)
    return whole_minutes, int(leftover_seconds)

In [None]:
N_EPOCHS = 1
CLIP = 1  # maximum gradient norm handed to train()
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    validation_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves on the best seen so far.
    if validation_loss < best_valid_loss:
        best_valid_loss = validation_loss
        torch.save(model.state_dict(), 'seq2seq_attention.pt')

    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {validation_loss:.3f} |  Val. PPL: {math.exp(validation_loss):7.3f}')

