In [None]:
import os

# Restrict this process to GPU 0. This is set here, before torch is imported
# in a later cell, so the restriction is in place when CUDA is first used.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
import pickle

def _load_pickle(path):
    """Load one pickled dataset split and close the file handle afterwards.

    NOTE(security): pickle.load can execute arbitrary code embedded in the
    file; only use this on dataset files you created or otherwise trust.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# Train / validation / test splits.
tr = _load_pickle('Dataset/tr.pkl')
vl = _load_pickle('Dataset/vl.pkl')
ts = _load_pickle('Dataset/ts.pkl')

print(len(tr), len(vl), len(ts))

In [None]:
# Vocabulary size. The embedding table and the output layer are built with
# VOCAB+2 entries: id VOCAB is fed to the decoder as the START token, and id
# VOCAB+1 is the token that is masked out of the loss weights and at which
# sequences are cut before BLEU scoring.
VOCAB = 10
# Number of greedy decoding steps the model performs at inference time.
LEN = 15

In [None]:
import torch as T
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
from tqdm import tqdm_notebook as tqdm

class DS(Dataset):
    """Map-style dataset over a sequence of (inp, dec_inp, dec_out) triples.

    Each item is returned as numpy arrays together with a per-position loss
    weight that is 0 wherever the decoder input equals id VOCAB+1 and 1
    everywhere else.
    """

    def __init__(self, dat):
        super().__init__()
        self.dat = dat

    def __len__(self):
        return len(self.dat)

    def __getitem__(self, idx):
        # Every sample must unpack into exactly three sequences.
        src, tgt_in, tgt_out = (np.asarray(seq) for seq in self.dat[idx])

        # Boolean mask of the positions holding id VOCAB+1, flipped into a 0/1 float weight.
        is_masked = tgt_in == (VOCAB+1)
        wgt = 1.0 - is_masked

        return src, tgt_in, tgt_out, wgt

# Only the training loader shuffles; validation and test use a larger batch.
ld_tr = DataLoader(DS(tr), shuffle=True, batch_size=50)
ld_vl, ld_ts = (DataLoader(DS(split), batch_size=100) for split in (vl, ts))

# Peek at a single training batch: print the tensor shapes, then the first sample.
# inp / dec_inp / dec_out / wgt stay bound afterwards and are reused by the
# model smoke test in a later cell.
for inp, dec_inp, dec_out, wgt in ld_tr:
    batch = (inp, dec_inp, dec_out, wgt)
    print(*(t.shape for t in batch))
    print(*(t[0] for t in batch))
    break

In [None]:
import math

class Model(nn.Module):
    """GRU encoder-decoder with a bilinear attention over the encoder outputs.

    The encoder is a bidirectional GRU (HID units per direction); the decoder
    is a unidirectional GRU with 2*HID units so that its state can be
    initialised from the concatenated encoder states. Encoder and decoder
    share one embedding table with VOCAB+2 entries.

    Args:
        EMB: embedding dimension.
        HID: encoder hidden size per direction (the decoder uses 2*HID).
        DP: dropout probability applied to encoder and decoder outputs.
    """

    def __init__(self, EMB=8, HID=64, DP=0.5):
        super().__init__()

        self.EMB = EMB
        self.HID = HID
        self.DP = nn.Dropout(DP)

        self.emb = nn.Embedding(VOCAB+2, self.EMB)

        self.enc = nn.GRU(self.EMB, self.HID,
                          batch_first=True, bidirectional=True)
        self.dec = nn.GRU(self.EMB, self.HID*2,
                          batch_first=True)
        # Bilinear attention matrix; allocated uninitialised and filled in init().
        self.att = nn.Parameter(T.empty(self.HID*2, self.HID*2))

        # Input is [decoder output ; attention context], each of size 2*HID.
        self.fc = nn.Linear(self.HID*2*2, VOCAB+2)

        self.init()

    def init(self):
        """Fill the attention matrix uniformly in [-1/sqrt(2*HID), 1/sqrt(2*HID)]."""
        stdv = 1/math.sqrt(self.HID*2)

        # Bounds must be given as (low, high); the reversed order is rejected
        # by PyTorch's uniform_ range check.
        nn.init.uniform_(self.att, -stdv, stdv)

    def run_dec(self, dec_inp, out_enc, h):
        """Run the decoder over `dec_inp` and score every step against the encoder.

        Args:
            dec_inp: decoder input token ids, batch first.
            out_enc: encoder outputs, last dimension 2*HID.
            h: initial decoder hidden state.

        Returns:
            (out, h): logits over the VOCAB+2 ids for every decoder step, and
            the final decoder hidden state.
        """
        dec_inp = self.emb(dec_inp)
        out_dec, h = self.dec(dec_inp, h)
        out_dec = self.DP(out_dec)

        # score = out_dec @ att @ out_enc^T, normalised over the encoder positions.
        scores = T.bmm(T.matmul(out_dec, self.att), out_enc.transpose(1, 2))
        att_wgt = nn.functional.softmax(scores, dim=2)
        att_cxt = T.bmm(att_wgt, out_enc)

        out = T.cat([out_dec, att_cxt], dim=2)
        out = self.fc(out)

        return out, h

    def forward(self, inp,
                is_tr=False, dec_inp=None):
        """Encode `inp`, then decode with teacher forcing or greedily.

        Args:
            inp: encoder input token ids, batch first.
            is_tr: when true, decode `dec_inp` in one pass (teacher forcing).
            dec_inp: decoder input token ids; required when `is_tr` is true.

        Returns:
            Logits over the VOCAB+2 ids: one row per position of `dec_inp`
            when `is_tr` is true, otherwise one row per greedy step (LEN steps).

        Raises:
            ValueError: if `is_tr` is true but `dec_inp` is None.
        """
        if is_tr and dec_inp is None:
            raise ValueError('dec_inp is required when is_tr=True')

        batch = inp.shape[0]
        device = inp.device

        emb = self.emb(inp)
        out_enc, h = self.enc(emb)
        out_enc = self.DP(out_enc)
        # h is (2, batch, HID): index 0 is the forward direction, index 1 the
        # backward one. A plain view to (1, batch, 2*HID) would reinterpret the
        # memory and mix states of different batch elements, so the two
        # directions are concatenated per sample along the feature axis.
        h = T.cat([h[0], h[1]], dim=1).unsqueeze(0)

        if is_tr:
            out, _ = self.run_dec(dec_inp, out_enc, h)

            return out

        # Greedy decoding: start every sequence from the START id (== VOCAB),
        # created on the same device as the input instead of a hard-coded GPU.
        dec_inp = T.full((batch, 1), VOCAB, dtype=T.long, device=device)

        outs = []
        for _ in range(LEN):
            out, h = self.run_dec(dec_inp, out_enc, h)
            # Feed the most likely token back in as the next decoder input.
            dec_inp = T.argmax(out, dim=2, keepdim=False)

            outs.append(out)

        return T.cat(outs, dim=1)

# Build the network on the GPU and show its layers.
model = Model()
model = model.cuda()
print(model)

# Smoke test on the batch left over from the DataLoader cell:
# first the teacher-forced path, then the greedy-decoding path.
src_gpu = inp.long().cuda()

out = model(src_gpu, is_tr=True, dec_inp=dec_inp.long().cuda())
print(out.shape)

out = model(src_gpu)
print(out.shape)

In [None]:
from nltk.translate.bleu_score import sentence_bleu as BLEU

# Sanity check of the BLEU helper: a single reference against a candidate
# that is missing one token.
ref, cnd = [[1, 2, 3, 4, 5, 6]], [1, 3, 4, 5, 6]
bleu = BLEU(ref, cnd)

bleu_pct = bleu*100
print('BLEU: %.4f%%'%(bleu_pct))

def get_bleu(out, dec_out):
    """Sentence-level BLEU of one predicted id sequence against one reference.

    Both sequences are cut just before the first occurrence of id VOCAB+1
    (kept whole when that id is absent) and then scored with BLEU, using the
    truncated `dec_out` as the only reference.

    Args:
        out: predicted token ids (anything exposing `.tolist()`).
        dec_out: reference token ids (anything exposing `.tolist()`).

    Returns:
        The BLEU score returned by the scorer.
    """
    stop_id = VOCAB+1

    def _cut(ids):
        # Keep everything before the first stop id, or the whole list.
        return ids[:ids.index(stop_id)] if stop_id in ids else ids

    hyp_ids = out.tolist()
    ref_ids = dec_out.tolist()

    return BLEU([_cut(ref_ids)], _cut(hyp_ids))

In [None]:
# Number of passes over the training loader.
EPOCHS = 40
# Initial learning rate handed to Adam.
LR = 0.0002
# Multiplicative learning-rate decay, applied to every param group after each epoch.
DECAY = 0.97

# reduction='none' keeps one loss value per target position so the training
# loop can weight them with the mask before averaging.
loss_seq2seq = nn.CrossEntropyLoss(reduction='none').cuda()
optim = T.optim.Adam(model.parameters(), lr=LR)

In [None]:
# Per-epoch history: [mean training loss, mean validation BLEU].
rec = []

# Create the checkpoint directory up front so the first T.save cannot fail
# after a whole epoch of training.
os.makedirs('Model', exist_ok=True)

for e in tqdm(range(EPOCHS)):
    ls_ep = 0

    # ---- training ----
    model.train()
    with tqdm(ld_tr) as TQ:
        for inp, dec_inp, dec_out, wgt in TQ:
            out = model(inp.long().cuda(), is_tr=True, dec_inp=dec_inp.long().cuda())

            # Flatten (batch, step, ...) to (batch*step, ...) so the loss is
            # computed per target position.
            out = out.view((out.shape[0]*out.shape[1], out.shape[2]))
            dec_out = dec_out.view((dec_out.shape[0]*dec_out.shape[1], ))
            wgt = wgt.view((wgt.shape[0]*wgt.shape[1], )).float().cuda()

            # Weighted mean: positions with weight 0 do not contribute.
            ls_bh = loss_seq2seq(out, dec_out.long().cuda())
            ls_bh = (ls_bh*wgt).sum() / wgt.sum()

            optim.zero_grad()
            ls_bh.backward()
            optim.step()

            # Plain Python float for logging and bookkeeping.
            ls_bh = ls_bh.item()
            ls_ep += ls_bh

            TQ.set_postfix(ls_bh='%.3f'%(ls_bh))

        ls_ep /= len(TQ)
        print('Ep %d: %.4f' % (e+1, ls_ep))

    # Exponential learning-rate decay, once per epoch.
    for pg in optim.param_groups:
        pg['lr'] *= DECAY

    T.save(model.state_dict(), 'Model/model_seq2seq_%d.pt'%(e+1))

    # ---- validation ----
    bleu_sum = 0.0
    n_seq = 0

    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph through all greedy decoding steps.
    with T.no_grad(), tqdm(ld_vl) as TQ:
        for inp, _, dec_out, _ in TQ:
            out = model(inp.long().cuda())

            out = np.argmax(out.cpu().numpy(), axis=2)
            dec_out = dec_out.numpy()

            bleus = [get_bleu(out[i], dec_out[i]) for i in range(out.shape[0])]

            bleu_bh = np.average(bleus)
            # Accumulate per sequence so a smaller last batch is not over-weighted.
            bleu_sum += sum(bleus)
            n_seq += len(bleus)

            TQ.set_postfix(bleu_bh='%.3f%%'%(bleu_bh*100))

        bleu_ep = bleu_sum / n_seq
        print('Valid: %.4f%%' % (bleu_ep*100))

    rec.append([ls_ep, bleu_ep])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# One figure per recorded metric: training loss as-is, validation BLEU in percent.
series = (
    ('loss_train', [row[0] for row in rec]),
    ('bleu_valid', [row[1]*100 for row in rec]),
)

for label, values in series:
    plt.figure()
    plt.plot(values, label=label)
    plt.legend()
    plt.show()

In [None]:
# Evaluate the checkpoint written after the final training epoch; using EPOCHS
# keeps this in step with the training cell instead of a hard-coded number.
model.load_state_dict(T.load('Model/model_seq2seq_%d.pt'%(EPOCHS)))

bleu_sum = 0.0
n_seq = 0

model.eval()
# Evaluation needs no gradients; disabling autograd avoids building a graph
# through all greedy decoding steps.
with T.no_grad(), tqdm(ld_ts) as TQ:
    for inp, _, dec_out, _ in TQ:
        out = model(inp.long().cuda())

        out = np.argmax(out.cpu().numpy(), axis=2)
        dec_out = dec_out.numpy()

        bleus = [get_bleu(out[i], dec_out[i]) for i in range(out.shape[0])]

        bleu_bh = np.average(bleus)
        # Accumulate per sequence so a smaller last batch is not over-weighted.
        bleu_sum += sum(bleus)
        n_seq += len(bleus)

        TQ.set_postfix(bleu_bh='%.3f%%'%(bleu_bh*100))

    bleu_ep = bleu_sum / n_seq
    print('Test: %.4f%%' % (bleu_ep*100))