In [None]:
import os

# Expose only the GPU with index 1 to CUDA. This is set before torch is
# imported further down in the notebook.
os.environ.update(CUDA_VISIBLE_DEVICES='1')

In [None]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read (the bare `open(...)` calls used before left
    three handles open for the lifetime of the kernel).
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only point this at dataset files you trust.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Train / validation / test splits.
tr = _load_pickle('Dataset/tr.pkl')
vl = _load_pickle('Dataset/vl.pkl')
ts = _load_pickle('Dataset/ts.pkl')

print(len(tr), len(vl), len(ts))

In [None]:
import torch as T
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np

import spacy

# Load the spaCy pipeline 'en_core_web_lg'. DS_Imdb below calls it once to
# obtain the vector it uses for padding rows.
NLP = spacy.load('en_core_web_lg')

In [None]:
# Number of rows in every padded input matrix returned by DS_Imdb.
LEN_MAX = 404

class DS_Imdb(Dataset):
    """Dataset of (sentence, label) pairs turned into fixed-size matrices.

    `dat` is indexed as `dat[idx] -> (sent, lbl)`. `sent` must support
    `len()` and integer indexing, and each item must expose a `.vector`
    that fits a row of width 300 (presumably a spaCy Doc -- confirm against
    the code that produced the pickles).
    """

    def __init__(self, dat):
        self.dat = dat
        # Vector of the first token the pipeline yields for '。'; used to fill
        # the rows after the end of the sentence.
        self.END = NLP('。')[0].vector

    def __len__(self):
        return len(self.dat)

    def __getitem__(self, idx):
        """Return `(l, inp, lbl)` for sample `idx`.

        l   -- number of real token rows in `inp` (at most LEN_MAX)
        inp -- float64 array of shape (LEN_MAX, 300): token vectors followed
               by copies of `self.END`
        lbl -- the label stored with the sentence, returned unchanged
        """
        sent, lbl = self.dat[idx]
        # Sentences longer than LEN_MAX used to raise IndexError when writing
        # row LEN_MAX; they are now truncated so every sample fits the matrix.
        l = min(len(sent), LEN_MAX)

        inp = np.zeros((LEN_MAX, 300))
        for i in range(l):
            inp[i] = sent[i].vector
        # Broadcast the padding vector over all remaining rows at once.
        inp[l:] = self.END

        return l, inp, lbl

# Training batches are reshuffled on every pass; validation and test batches
# keep the dataset order.
ld_tr = DataLoader(dataset=DS_Imdb(tr), batch_size=32, shuffle=True)
ld_vl = DataLoader(dataset=DS_Imdb(vl), batch_size=64)
ld_ts = DataLoader(dataset=DS_Imdb(ts), batch_size=64)

# Pull one training batch and show the shapes of its three fields.
# `l`, `inp` and `ans` stay defined and are reused by the model smoke test.
l, inp, ans = next(iter(ld_tr))
print(l.shape, inp.shape, ans.shape)

In [None]:
class Model_LSTM(nn.Module):
    """LSTM classifier that reads a sentence in chunks and jumps between them.

    hid_size -- hidden size of the 2-layer LSTM
    mode     -- stored on the instance; not read anywhere in this class
    R        -- maximum number of tokens read per step
    K        -- largest jump; `self.K` is K+1 because choice 0 is also valid
    N        -- maximum number of read/jump steps per rollout
    """

    def __init__(self, hid_size=256, mode='jump', R=20, K=40, N=5):
        super().__init__()

        self.hid_size, self.mode = hid_size, mode
        self.R, self.N = R, N
        self.K = K + 1  # jump choices 0..K

        self.lstm = nn.LSTM(300, self.hid_size, num_layers=2,
                            batch_first=True, dropout=0.2)
        # Classification head: final LSTM output -> 2 logits.
        self.fc_out = nn.Sequential(
            nn.Linear(self.hid_size, 100),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(100, 2),
        )
        # Jump head: LSTM output -> one score per jump choice.
        self.fc_jump = nn.Sequential(nn.Linear(self.hid_size, self.K))

    def rollout(self, l, inp, M, policy='sample'):
        """Run `M` independent read/jump passes over one sentence.

        l      -- number of valid tokens in `inp`
        inp    -- tensor indexed as (1, tokens, features); the jump choice is
                  taken from row 0 only, so a batch of one is expected
        policy -- 'sample' draws the jump with np.random.choice from the
                  predicted distribution; anything else takes the argmax

        Returns `(pbs, outs)`: `pbs[m]` is the list of log-probabilities of
        the jumps chosen in pass m, `outs[m]` is the LSTM output at the last
        position read in pass m.
        """
        pbs, outs = [], []

        for _rollout in range(M):
            logps = []
            start, state = 0, None  # state=None lets the LSTM start from zeros

            for _step in range(self.N):
                stop = min(start + self.R, l)

                out, state = self.lstm(inp[:, start:stop], state)
                out = out[:, -1]

                jp = nn.functional.log_softmax(self.fc_jump(out), dim=1)
                scores = jp.detach()
                if policy == 'sample':
                    pp = np.random.choice(self.K, p=T.exp(scores).cpu().numpy()[0])
                else:
                    pp = np.argmax(scores.cpu().numpy()[0])

                logps.append(jp[:, pp])

                # A jump of 1 continues at the next unread token. Choice 0, or
                # a target at/after the end of the sentence, ends the pass.
                start = stop + pp - 1
                if pp == 0 or start >= l:
                    break

            pbs.append(logps)
            outs.append(out)

        return pbs, outs

    def forward(self, l, inp, is_pg=True, M=16):
        """Classify a padded batch, one sentence at a time.

        l     -- CPU tensor of sentence lengths (converted with `.numpy()`)
        inp   -- tensor indexed as (batch, tokens, features)
        is_pg -- when equal to False, one greedy rollout per sentence;
                 otherwise `M` sampled rollouts per sentence
        M     -- number of sampled rollouts per sentence

        Returns `(pbs, logits)`: `pbs` holds one list of jump
        log-probabilities per rollout, sentence by sentence, and `logits`
        has one row of 2 scores per rollout in the same order.
        """
        lengths = l.numpy()

        pbs, feats = [], []
        for i in range(inp.shape[0]):
            n_tok = lengths[i]
            sample = inp[i:i+1, :n_tok]  # drop the padding rows

            # Kept as an equality test so non-bool flags behave as before.
            if is_pg == False:
                pb, out = self.rollout(n_tok, sample, 1, policy='greedy')
            else:
                pb, out = self.rollout(n_tok, sample, M, policy='sample')

            pbs.extend(pb)
            feats.append(T.cat(out, dim=0))

        return pbs, self.fc_out(T.cat(feats, dim=0))

model = Model_LSTM().cuda()

# Smoke test on the batch left over from the loader cell: first the greedy
# path, then the sampled path. `pb` and `out` end up holding the sampled
# results, which the next cell prints.
for use_pg in (False, True):
    pb, out = model(l, inp.float().cuda(), is_pg=use_pg)
    print(len(pb), out.shape)

In [None]:
# First rollout's jump log-probabilities, then the first row of logits.
print(pb[0], out[0], sep='\n')

In [None]:
class Loss_PG(nn.Module):
    """Policy-gradient loss with a running-mean reward baseline.

    bl -- baseline: mean over all previous calls of the per-call average reward
    nb -- number of calls folded into `bl` so far
    """

    def __init__(self):
        super().__init__()

        self.bl, self.nb = 0, 0

    def forward(self, pb, out, ans):
        """Return the mean of `-log_prob * (reward - baseline)` over all jumps.

        pb  -- one list of jump log-probability tensors per row of `out`
        out -- class scores, one row per rollout; the argmax is the prediction
        ans -- CPU tensor of target labels, one per row of `out`

        The reward is +1 for a correct prediction and -1 otherwise. The
        baseline is updated after the loss has been computed.
        """
        target = ans.numpy()
        pred = out.detach().cpu().numpy().argmax(axis=1)

        rwd = [1 if hit else -1 for hit in (pred == target)]

        # One term per jump decision, in rollout order.
        terms = [-p * (rwd[i] - self.bl)
                 for i in range(len(target))
                 for p in pb[i]]
        ls = sum(terms) / len(terms)

        self.bl = (self.bl*self.nb + np.average(rwd))/(self.nb+1)
        self.nb += 1

        return ls

# Upper bound on the number of training epochs.
EPOCHS = 200

# Policy-gradient loss for the jump decisions and cross-entropy for the
# class logits; the training loop adds the two.
loss_pg = Loss_PG().cuda()
loss_ce = nn.CrossEntropyLoss().cuda()

optim = T.optim.Adam(params=model.parameters(), lr=8e-4)

In [None]:
class EarlyStop:
    """Signal a stop after `threshold` consecutive scores below the best one.

    acc_max -- best score seen so far (starts at 0)
    cnt     -- current run of scores strictly below `acc_max`
    """

    def __init__(self, threshold=10):
        self.threshold = threshold

        self.acc_max, self.cnt = 0, 0

    def add(self, acc):
        """Record `acc` and return True once the patience is used up.

        A score equal to or above the best one resets the counter and
        becomes the new best.
        """
        if acc >= self.acc_max:
            self.acc_max = acc
            self.cnt = 0
        else:
            self.cnt += 1

        return self.cnt >= self.threshold

ES = EarlyStop()

In [None]:
from tqdm import tqdm_notebook as tqdm

In [None]:
# Sampled rollouts per training sentence. The model returns one row of logits
# per rollout, so every label is repeated this many times to line up with them.
N_ROLLOUTS = 16

# Best validation accuracy seen in this run; drives the "best" checkpoint.
acc_best = -1.0

for e in tqdm(range(EPOCHS)):
    ls_ep = 0.0

    # ---- training pass ----
    model.train()
    with tqdm(ld_tr) as TQ:
        for l, inp, ans in TQ:
            pb, out = model(l, inp.float().cuda(), is_pg=True, M=N_ROLLOUTS)

            # [a0, a1, ...] -> [a0]*N_ROLLOUTS + [a1]*N_ROLLOUTS + ...
            ans = T.from_numpy(np.repeat(ans.numpy(), N_ROLLOUTS))

            ls_bh = loss_pg(pb, out, ans)+loss_ce(out, ans.cuda())

            optim.zero_grad()
            ls_bh.backward()
            optim.step()

            # .item() gives a plain Python float. Formatting the 1-element
            # array used before relies on an array-to-scalar conversion that
            # NumPy has deprecated.
            ls_bh = ls_bh.item()
            TQ.set_postfix(ls_bh='%.3f'%(ls_bh))
            ls_ep += ls_bh

        ls_ep /= len(TQ)
        print('Ep %d: %.4f' % (e+1, ls_ep))

        T.save(model.state_dict(), 'Model/lstm-jump_%d.pt' % (e+1))

    # ---- validation pass ----
    n_hit, n_all = 0, 0

    model.eval()
    # No gradients are needed for evaluation; skipping the graph saves memory.
    with T.no_grad():
        with tqdm(ld_vl) as TQ:
            for l, inp, ans in TQ:
                _, out = model(l, inp.float().cuda(), is_pg=False)

                out = np.argmax(out.cpu().numpy(), axis=1)
                ans = ans.numpy()

                hit = (out==ans)
                acc_bh = np.average(hit)
                TQ.set_postfix(acc_bh='%.3f'%(acc_bh))
                n_hit += int(hit.sum())
                n_all += len(ans)

    # Accuracy over all samples. Averaging the per-batch accuracies would give
    # a short final batch the same weight as a full one.
    acc_ep = n_hit/n_all
    print('%.4f'%(acc_ep))

    # Keep a copy of the best weights under the un-suffixed name that the
    # evaluation cell loads.
    if acc_ep > acc_best:
        acc_best = acc_ep
        T.save(model.state_dict(), 'Model/lstm-jump.pt')

    if ES.add(acc_ep):
        print('Finish training in ep=%d'%(e+1))

        break

In [None]:
# Restore the weights to evaluate from 'Model/lstm-jump.pt'.
# NOTE(review): the training cell writes per-epoch checkpoints named
# 'Model/lstm-jump_<epoch>.pt'; confirm that the un-suffixed file exists
# before running this cell.
model.load_state_dict(T.load('Model/lstm-jump.pt'))

In [None]:
# Test-set accuracy of the currently loaded weights, using greedy jumps.
n_hit, n_all = 0, 0

model.eval()
# No gradients are needed for evaluation; skipping the graph saves memory.
with T.no_grad():
    with tqdm(ld_ts) as TQ:
        for l, inp, ans in TQ:
            _, out = model(l, inp.float().cuda(), is_pg=False)

            out = np.argmax(out.cpu().numpy(), axis=1)
            ans = ans.numpy()

            hit = (out==ans)
            acc_bh = np.average(hit)
            TQ.set_postfix(acc_bh='%.3f'%(acc_bh))
            n_hit += int(hit.sum())
            n_all += len(ans)

# Accuracy over all samples. Averaging the per-batch accuracies would give a
# short final batch the same weight as a full one.
acc_ep = n_hit/n_all
print('%.4f'%(acc_ep))