<a href="https://colab.research.google.com/github/yixish/NLPLearning/blob/master/%E2%80%9CSeq2Seq(Attention).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as Data
import numpy as np

import random
import math
import time

Set the random seeds for reproducability.

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import pandas as pd 
dir = '/content/gdrive/My Drive/dataset/'
df = pd.read_csv(dir+"Sentiment_Extraction103/train.csv")

Load the German and English spaCy models.

In [23]:
texts = df['text'].values
selected_texts = df['selected_text'].values
texts = texts[:512]
selected_texts = selected_texts[:512]
pairs = []
for i in range(512):
    pairs.append([texts[i],selected_texts[i]])

In [24]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
vocab = set()
vocab.add('<sos>')
vocab.add('<eos>')
vocab.add('<pad>')
vocab.add('<unk>')

def build_vocab(vocab,texts):
    max_l = 0
    for text in texts:
        words = nltk.word_tokenize(text)
        if max_l < len(words):
            max_l = len(words)
        for word in words:
            vocab.add(word)
    print(max_l)
build_vocab(vocab,texts)
build_vocab(vocab,selected_texts)
print("vocal size : {}".format(len(vocab)))

35
32
vocal size : 2452


In [26]:
word2idx = { word:i for i,word in enumerate(list(vocab))}
idx2word = { i:word for i,word in enumerate(list(vocab))}                         

In [27]:
n_step = 40
batch_size = 128
n_hidden = 128
emb_dim = 100
vocab_size = len(vocab)

def make_data(seq_data):
    enc_input_all, dec_input_all, dec_output_all = [], [], []

    def word_2_idx(word):
        if word in vocab:
            return word2idx[word]
        else: 
            return word2idx['<unk>']

    for seq in seq_data:

        enc_input = [word_2_idx(n) for n in  nltk.word_tokenize(seq[0])]
        enc_input.append(word2idx['<eos>'])
        dec_input = []
        dec_input.append(word2idx['<sos>'])
        dec_input.extend([word_2_idx(n) for n in nltk.word_tokenize(seq[1])])

        dec_output = [word_2_idx(n) for n in nltk.word_tokenize(seq[1])] 
        dec_output.append(word2idx['<eos>'])

        for i in range(n_step - len(enc_input)):
            enc_input.append(word2idx['<pad>'])
        for i in range(n_step - len(dec_input)):
            dec_input.append(word2idx['<pad>'])
        for i in range(n_step - len(dec_output)):
            dec_output.append(word2idx['<pad>'])

        # enc_input_all.append(np.eye(vocab_size)[enc_input])
        # dec_input_all.append(np.eye(vocab_size)[dec_input])
        enc_input_all.append(enc_input)
        dec_input_all.append(dec_input)
        dec_output_all.append(dec_output) 


    # make tensor
    return torch.LongTensor(enc_input_all), torch.LongTensor(dec_input_all), torch.LongTensor(dec_output_all)

enc_input_all, dec_input_all, dec_output_all = make_data(pairs)

In [28]:
class TranslateDataSet(Data.Dataset):
    def __init__(self, enc_input_all, dec_input_all, dec_output_all):
        self.enc_input_all = enc_input_all
        self.dec_input_all = dec_input_all
        self.dec_output_all = dec_output_all
    
    def __len__(self): # return dataset size
        return len(self.enc_input_all)
    
    def __getitem__(self, idx):
        return self.enc_input_all[idx], self.dec_input_all[idx], self.dec_output_all[idx]

loader = Data.DataLoader(TranslateDataSet(enc_input_all, dec_input_all, dec_output_all), batch_size, True)

In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

Create the iterators.

In [39]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src): 
        '''
        src = [src_len, batch_size]
        '''
        src = src.transpose(0, 1) # src = [batch_size, src_len]

        embedded = self.dropout(self.embedding(src)).transpose(0, 1) # embedded = [src_len, batch_size, emb_dim]

        # enc_output = [src_len, batch_size, hid_dim * num_directions]
        # enc_hidden = [n_layers * num_directions, batch_size, hid_dim]
        enc_output, enc_hidden = self.rnn(embedded) # if h_0 is not give, it will be set 0 acquiescently

        # enc_hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        # enc_output are always from the last layer
        
        # enc_hidden [-2, :, : ] is the last of the forwards RNN 
        # enc_hidden [-1, :, : ] is the last of the backwards RNN
        
        # initial decoder hidden is final hidden state of the forwards and backwards 
        # encoder RNNs fed through a linear layer
        # s = [batch_size, dec_hid_dim]
        s = torch.tanh(self.fc(torch.cat((enc_hidden[-2,:,:], enc_hidden[-1,:,:]), dim = 1)))
        
        return enc_output, s

In [40]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim, bias=False)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, s, enc_output):
        
        # s = [batch_size, dec_hid_dim]
        # enc_output = [src_len, batch_size, enc_hid_dim * 2]
        
        batch_size = enc_output.shape[1]
        src_len = enc_output.shape[0]
        
        # repeat decoder hidden state src_len times
        # s = [batch_size, src_len, dec_hid_dim]
        # enc_output = [batch_size, src_len, enc_hid_dim * 2]
        s = s.unsqueeze(1).repeat(1, src_len, 1)
        enc_output = enc_output.transpose(0, 1)
        
        # energy = [batch_size, src_len, dec_hid_dim]
        energy = torch.tanh(self.attn(torch.cat((s, enc_output), dim = 2)))
        
        # attention = [batch_size, src_len]
        attention = self.v(energy).squeeze(2)
        
        return F.softmax(attention, dim=1)

In [41]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, dec_input, s, enc_output):
             
        # dec_input = [batch_size]
        # s = [batch_size, dec_hid_dim]
        # enc_output = [src_len, batch_size, enc_hid_dim * 2]
        
        dec_input = dec_input.unsqueeze(1) # dec_input = [batch_size, 1]
        
        embedded = self.dropout(self.embedding(dec_input)).transpose(0, 1) # embedded = [1, batch_size, emb_dim]
        
        # a = [batch_size, 1, src_len]  
        a = self.attention(s, enc_output).unsqueeze(1)
        
        # enc_output = [batch_size, src_len, enc_hid_dim * 2]
        enc_output = enc_output.transpose(0, 1)

        # c = [1, batch_size, enc_hid_dim * 2]
        c = torch.bmm(a, enc_output).transpose(0, 1)

        # rnn_input = [1, batch_size, (enc_hid_dim * 2) + emb_dim]
        rnn_input = torch.cat((embedded, c), dim = 2)
            
        # dec_output = [src_len(=1), batch_size, dec_hid_dim]
        # dec_hidden = [n_layers * num_directions, batch_size, dec_hid_dim]
        dec_output, dec_hidden = self.rnn(rnn_input, s.unsqueeze(0))
        
        # embedded = [batch_size, emb_dim]
        # dec_output = [batch_size, dec_hid_dim]
        # c = [batch_size, enc_hid_dim * 2]
        embedded = embedded.squeeze(0)
        dec_output = dec_output.squeeze(0)
        c = c.squeeze(0)
        
        # pred = [batch_size, output_dim]
        pred = self.fc_out(torch.cat((dec_output, c, embedded), dim = 1))
        
        return pred, dec_hidden.squeeze(0)

In [42]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        # src = [src_len, batch_size]
        # trg = [trg_len, batch_size]
        # teacher_forcing_ratio is probability to use teacher forcing

        src = src.transpose(0,1)
        trg = trg.transpose(0,1)
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)


        # enc_output is all hidden states of the input sequence, back and forwards
        # s is the final forward and backward hidden states, passed through a linear layer
        enc_output, s = self.encoder(src)
                
        # first input to the decoder is the <sos> tokens
        dec_input = trg[0,:]
        
        for t in range(1, trg_len):
            
            # insert dec_input token embedding, previous hidden state and all encoder hidden states
            # receive output tensor (predictions) and new hidden state
            dec_output, s = self.decoder(dec_input, s, enc_output)
            
            # place predictions in a tensor holding predictions for each token
            outputs[t] = dec_output
            
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            # get the highest predicted token from our predictions
            top1 = dec_output.argmax(1) 
            
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            dec_input = trg[t] if teacher_force else top1

        return outputs

In [43]:
# INPUT_DIM = len(SRC.vocab)
# OUTPUT_DIM = len(TRG.vocab)
INPUT_DIM = vocab_size
OUTPUT_DIM  = vocab_size
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
ENC_HID_DIM = 256
DEC_HID_DIM = 256
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [44]:
for epoch in range(500):
  for enc_input_batch, dec_input_batch, dec_output_batch in loader:
      # make hidden shape [num_layers * num_directions, batch_size, n_hidden]
    #   h_0 = torch.zeros(1, batch_size, n_hidden).to(device)

      (enc_input_batch, dec_intput_batch, dec_output_batch) = (enc_input_batch.to(device), dec_input_batch.to(device), dec_output_batch.to(device))
      # enc_input_batch : [batch_size, n_step+1, n_class]
      # dec_intput_batch : [batch_size, n_step+1, n_class]
      # dec_output_batch : [batch_size, n_step+1], not one-hot


      pred = model(enc_input_batch,dec_intput_batch,0)
      # pred : [n_step+1, batch_size, n_class]
      pred = pred.transpose(0, 1) # [batch_size, n_step+1(=6), n_class]
      loss = 0
      for i in range(len(dec_output_batch)):
          # pred[i] : [n_step+1, n_class]
          # dec_output_batch[i] : [n_step+1]
          loss += criterion(pred[i], dec_output_batch[i])
      if (epoch + 1) % 10 == 0:
          print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
          
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

Epoch: 0010 cost = 198.478806
Epoch: 0010 cost = 205.915100
Epoch: 0010 cost = 202.548828
Epoch: 0010 cost = 187.783661
Epoch: 0020 cost = 195.139252
Epoch: 0020 cost = 205.033325
Epoch: 0020 cost = 167.998169
Epoch: 0020 cost = 188.339142
Epoch: 0030 cost = 177.108459
Epoch: 0030 cost = 179.107635
Epoch: 0030 cost = 173.080536
Epoch: 0030 cost = 157.075287
Epoch: 0040 cost = 143.527313
Epoch: 0040 cost = 165.433548
Epoch: 0040 cost = 154.091919
Epoch: 0040 cost = 166.976456
Epoch: 0050 cost = 127.797264
Epoch: 0050 cost = 143.636932
Epoch: 0050 cost = 151.816650
Epoch: 0050 cost = 152.664948
Epoch: 0060 cost = 141.294754
Epoch: 0060 cost = 126.696762
Epoch: 0060 cost = 133.156570
Epoch: 0060 cost = 123.331192
Epoch: 0070 cost = 137.483170
Epoch: 0070 cost = 113.408058
Epoch: 0070 cost = 123.395325
Epoch: 0070 cost = 111.496490
Epoch: 0080 cost = 113.469193
Epoch: 0080 cost = 105.890053
Epoch: 0080 cost = 105.625763
Epoch: 0080 cost = 114.709244
Epoch: 0090 cost = 95.989594
Epoch: 0090

In [47]:
# Test
def translate(word):
    enc_input, dec_input, _ = make_data([[word, '']])
    enc_input, dec_input = enc_input.to(device), dec_input.to(device)
    # make hidden shape [num_layers * num_directions, batch_size, n_hidden]
    # hidden = torch.zeros(1, 1, n_hidden).to(device)
    # output = model(enc_input, hidden, dec_input)
    output = model(enc_input, dec_input, 0)

    # output : [n_step+1, batch_size, n_class]

    predict = output.data.max(2, keepdim=True)[1] # select n_class dimension

    decoded = [idx2word[i.item()] for i in predict]
    translated = ' '.join(decoded[:decoded.index('<pad>')])
    translated = translated.replace('<eos>','')
    return translated

In [48]:
# pairs[:5]
for sent in pairs[:10]:
    print("{} => {}".format(sent[1], translate(sent[0])))

love => ~ 
Starbucks I`m lovin` it => ~ I`m lovin` it 
.yummmmy! => ~ ! 
Hello, I see your online, can u talk to me pleeez!  From a fellow BAMF. lol => ~ , I see your online , can u talk to me pleeez ! From a fellow BAMF . lol 
fun => ~ 
it did, i didnt really watch it haha => ~ did , i didnt really watch it haha 
Wish => ~ 
Check this video out -- Bylaurenluke ~ Make up Launch~ They are here available now   http://tinyurl.com/cudamo => ~ this video out -- Bylaurenluke ~ Make up Launch~ They are here available now http : //tinyurl.com/cudamo 
Re-direct that energy into creating men`s jewelry.    And frequent walks to => ~ that energy into creating men`s jewelry . And frequent walks 
what`s wrong with dressing in fifties fashion? => ~ wrong with dressing in fifties fashion ? 
