<a href="https://colab.research.google.com/github/tomoya-ichikawa/NMT-Transformer/blob/main/transformer_ver2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import(Dataset, DataLoader, TensorDataset)
import torch.nn as nn
import math
from torch.autograd import Variable
import tqdm
import re
import collections
import itertools
import numpy as np
import copy
import torch.nn.functional as F


In [15]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:

def normalizeString(s,l):
    if s == l[0]:
      s = s.lower().strip()
   
  
    s = re.sub(r"([.!?])", r" \1", s)
    #s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def build_vocab(tokens):
  counts = collections.Counter(tokens)
  sorted_counts = sorted(counts.items(), key=lambda c: c[1], reverse=True)
  word_list = ["<PAD>","<SOS>", "<EOS>", "<UNK>"]+[x[0] for x in sorted_counts]
  word_dict = dict((w, i) for i, w in enumerate(word_list))
  return word_list, word_dict

def word2tensor(words, word_dict, max_len, trgg=False, padding=0):
  if trgg:
    words = ["<SOS>"] + words
  words = [word_dict.get(w, 3) for w in words]
  seq_len = len(words)
  if seq_len < max_len:
    words = words + [padding]*(max_len-seq_len)
  return torch.tensor(words, dtype=torch.int64), seq_len
def word2ltensor(words, word_dict, max_len, trgg=False, padding=0):
  if trgg:
    words =  words +["<EOS>"]
  words = [word_dict.get(w, 3) for w in words]
  seq_len = len(words)
  if seq_len < max_len:
    words = words + [padding]*(max_len-seq_len)
  return torch.tensor(words, dtype=torch.int64), seq_len


In [6]:
class MyDataset(torch.utils.data.Dataset):

  def __init__(self, path, max_len=17, test=False):
    lines = open(path, encoding='utf-8').read().strip().split('\n')
    pairs = [[normalizeString(s,l) for s in l.split('\t')] for l in lines]

    src = [p[0] for p in pairs]
    src = [l.split(" ") for l in src]
    trg = [p[1] for p in pairs]
    trg = [l.split(" ") for l in trg]
    if test:
      self.src_word_list, self.src_word_dict = train_data.src_word_list, train_data.src_word_dict
      self.trg_word_list, self.trg_word_dict = train_data.trg_word_list, train_data.trg_word_dict
    #リストを要素として持つリスト（2次元リスト）を平坦化する
    else:
      self.src_word_list,self.src_word_dict = build_vocab(itertools.chain.from_iterable(src))
      self.trg_word_list,self.trg_word_dict = build_vocab(itertools.chain.from_iterable(trg))

    self.src_data = [word2tensor(words, self.src_word_dict, max_len) for words in src]
    self.trg_data = [word2tensor(words, self.trg_word_dict, max_len, trgg=True) for words in trg]
    self.trg_label = [word2ltensor(words, self.trg_word_dict, max_len, trgg=True, padding = -100) for words in trg]

  def __len__(self):
    return len(self.trg_data)
  def __getitem__(self, idx):
    src, lsrc = self.src_data[idx]
    trg, ltrg = self.trg_data[idx]
    label, llabel = self.trg_label[idx]
    return src, lsrc, trg, ltrg,label, llabel

In [17]:
batch_size = 256
max_len =30
path = "drive/MyDrive/Colab-Notebooks/eng-ja.txt"
path2 = "drive/MyDrive/Colab-Notebooks/eng-ja2.txt"
train_data = MyDataset(path, max_len=max_len)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
test_data = MyDataset(path2, max_len=max_len, test=True)
test_loader = DataLoader(test_data)

  cpuset_checked))


In [18]:
class Embedder(nn.Module):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, d_model, padding_idx=0)
  def forward(self, x):
    return self.embed(x)

In [19]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len = 80, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_seq_len, d_model)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        pe = pe.to(device)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                math.sin(pos/ (10000 ** ((2 * i)/d_model)))
                pe[pos, i+1] = \
                math.cos(pos/(10000**((2*(i+1))/d_model)))
        
        self.pe = pe.unsqueeze(0)
        self.pe.requires_grad = False
        
    def forward(self, x):
        seq_len = x.size(1)
        
        ret = math.sqrt(self.d_model)*x + self.pe[:,:seq_len,:]
        return self.dropout(x)

In [20]:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [21]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.h_attention = get_clones(Attention(d_model, self.d_k), heads)
        self.out = nn.Linear(d_model, d_model)
    
    def forward(self, q, k, v, mask=None, trg_masked=False):
        for i in range(self.h):
            if i==0:
                concat = self.h_attention[0](q, k, v, mask, trg_masked)
            else:
                h_atten = self.h_attention[i](q, k, v, mask, trg_masked)
                concat = torch.cat((concat, h_atten), 2)
    
        output = self.out(concat)
        return output

In [22]:
class Attention(nn.Module):
    def __init__(self, d_model, d_k, dropout=0.1):
        super().__init__()
        self.q_linear = nn.Linear(d_model, d_k)
        self.v_linear = nn.Linear(d_model, d_k)
        self.k_linear = nn.Linear(d_model, d_k)
        self.nopeak_mask = torch.tensor(np.triu(np.ones((max_len, max_len)), k=1).astype('uint8')).unsqueeze(0).to(device)
        self.d_k = d_k
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, q, k, v, pad_mask=None,trg_masked=False):
        k = self.k_linear(k)
        q = self.q_linear(q)
        v = self.v_linear(v)

        weights = torch.matmul(q, k.transpose(1,2)) / math.sqrt(self.d_k)
        if pad_mask is not None:
            pad_mask = pad_mask.unsqueeze(1)
            weights = weights.masked_fill(pad_mask == 0, -1e9)
        
        if trg_masked:
            weights = weights.masked_fill(self.nopeak_mask == 1, -1e9)
            
        normlized_weights =  F.softmax(weights, dim=-1)
        normlized_weights = self.dropout(normlized_weights)
        output = torch.matmul(normlized_weights, v)

        return output

In [23]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=1024, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    
    def forward(self, x):
        x = self.linear_1(x)
        x = self.dropout(F.relu(x))
        x = self.linear_2(x)
        return x

In [24]:
class TransformerEncLayer(nn.Module):
    def __init__(self, d_model, head, dropout=0.1):
        super().__init__()
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.multi_attn = MultiHeadAttention(head, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):

        x_attn = self.multi_attn(x, x, x, mask)
        x = x + self.dropout_1(x_attn)
        x_norm = self.norm_1(x)

        output = x_norm + self.dropout_2(self.ff(x_norm))
        output = self.norm_2(output)
        return output

In [25]:
class TransformerDecLayer(nn.Module):
    def __init__(self, d_model, head, dropout=0.1):
        super().__init__()
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.norm_3 = nn.LayerNorm(d_model)
        self.multi_attn = MultiHeadAttention(head, d_model)
        self.src_trg_attn = MultiHeadAttention(head, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
    
    def forward(self, y, memory, src_mask=None, trg_mask=None, masked=False):
        
        y_attn = self.multi_attn(y, y, y, trg_mask, trg_masked=masked)
        y = y + self.dropout_1(y_attn)
        y_norm = self.norm_1(y)

        y_attn = self.src_trg_attn(y_norm, memory, memory, src_mask)
        y2 = y_norm + self.dropout_2(y_attn)
        y_norm2 = self.norm_2(y2)

        output = y_norm2 + self.dropout_3(self.ff(y_norm2))
        output = self.norm_3(output)
        return output

In [26]:
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, d_model, head, num_layer):
        super().__init__()
        self.embedding = Embedder(vocab_size, d_model)
        self.pos_enc = PositionalEncoder(d_model, max_len)
        self.num_layer = num_layer
        self.transformerlayer = get_clones(TransformerEncLayer(d_model, head), num_layer)

    def forward(self, x, mask=None):
        x = self.embedding(x)
        x = self.pos_enc(x)

        for i in range(self.num_layer):
            x = self.transformerlayer[i](x, mask)
        return x

In [27]:
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model, head, num_layer):
        super().__init__()
        self.embedding = Embedder(vocab_size, d_model)
        self.pos_enc = PositionalEncoder(d_model, max_len)
        self.num_layer = num_layer
        self.transformerlayer = get_clones(TransformerDecLayer(d_model, head), num_layer)

    def forward(self, y, memory, src_mask=None, trg_mask=None, masked=False):
        
        y = self.embedding(y)
        y = self.pos_enc(y)

        for i in range(self.num_layer):
            y = self.transformerlayer[i](y, memory, src_mask, trg_mask, masked)

        return y
        

In [28]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model, head, num_layer):
        super().__init__()
        self.encoder = TransformerEncoder(src_vocab_size, d_model, head, num_layer)
        self.decoder = TransformerDecoder(trg_vocab_size, d_model, head, num_layer)
        self.linear = nn.Linear(d_model, trg_vocab_size)
    
    def forward(self, src, trg, src_mask=None, trg_mask=None, masked=False):
     
        memory = self.encoder(src, src_mask)
        out = self.decoder(trg, memory, src_mask, trg_mask, masked)
        out = self.linear(out)
        return out

In [29]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target, device="cpu"):
        pred = pred.log_softmax(dim=self.dim).to(device)
        
        with torch.no_grad():
            # true_dist = pred.data.clone()
            true_dist = torch.zeros_like(pred).to(device)
            
            true_dist.fill_(self.smoothing / (self.cls - 1))
           
            
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
            
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [39]:
d_model = 512
heads = 8
N = 6
warmup_steps = 4000
device = "cuda:0"
step_num = 0
src_vocab_size = len(train_data.src_word_list)
trg_vocab_size = len(train_data.trg_word_list)

model = Transformer(src_vocab_size, trg_vocab_size, d_model, heads, N)

for p in model.parameters():
    if p.dim() > 1:
       nn.init.xavier_uniform_(p)
model.to(device)

optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
#optimizer = ScheduledOptim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),2.0, d_model, n_warmup_steps)
#loss_f = LabelSmoothingLoss(trg_vocab_size, smoothing=0.1)

loss_f = nn.CrossEntropyLoss()

In [31]:
def to2D(x):
  shapes = x.shape
  return x.reshape(shapes[0]*shapes[1], -1)

In [40]:
from statistics import mean

model.train()
pad_idx = 0
for epoc in range(40):
    losses = []
    for x, lx, y, ly, label, llabel in train_loader:
        step_num +=1
        src_mask = (x != pad_idx)
        trg_mask = (y != pad_idx)
        x, y, label = x.to(device), y.to(device), label.to(device)
        src_mask, trg_mask = src_mask.to(device), trg_mask.to(device)
        
        output = model(x, y, src_mask, trg_mask, masked=True)
        loss = loss_f(to2D(output), to2D(label).squeeze())
        optim.zero_grad()
        loss.backward()
        optim.step()
        losses.append(loss.item())
        

    print(epoc, mean(losses))

  cpuset_checked))


0 5.486537171869862
1 4.126660479574787
2 3.664931886050166
3 3.3192857662025763
4 2.957549108534443
5 2.6571999496343186
6 2.4359048957727394
7 2.2563987349977297
8 2.11081487001205
9 1.978108137845993
10 1.8661786840886485
11 1.7637063228354162
12 1.6703545475492672
13 1.5887905055162859
14 1.5105626315486675
15 1.4374683091835099
16 1.3720791826442795
17 1.3107740854730412
18 1.2520836281533143
19 1.1967053279584767
20 1.146057527284233
21 1.0974252111449534
22 1.0506788598639625
23 1.0068632419012031
24 0.9640755580396069
25 0.9264607581557059
26 0.8881586534636361
27 0.8532327991359088
28 0.8163929678955857
29 0.7837950663298977
30 0.753468353833471
31 0.723215387487898
32 0.6940340603492698
33 0.668218836796527
34 0.6390716494346151
35 0.6130103234733854
36 0.5907072788294481
37 0.5669119802056527
38 0.5434364706886058
39 0.5241334439844502


In [42]:
def translate(input_str, input_len, model, s_mask, max_len=20, device = "cpu"):
  sos_inputs = torch.tensor([[1]], dtype=torch.int64)
  input_str = input_str.to(device)
  sos_inputs = sos_inputs.to(device)
  s_mask = s_mask.to(device)
  z = sos_inputs
  results = []
  
  for i in range(max_len):
    output = model(input_str, z, s_mask)
    output = output[:, i, :]
    pred = output.detach().view(-1).max(0)[1]
    if pred.item()==2:
      break
    results.append(pred.item())

    z = torch.cat((z,pred.unsqueeze(0).unsqueeze(0)), 1)

  return " ".join(train_data.trg_word_list[i] for i in results)

In [None]:
model.eval()
ptrgs=[]
with torch.no_grad():
  for x, lx, y, ly, label, llabel in test_loader:
    src_mask = (x != pad_idx)
    ptrg = translate(x, lx, model,src_mask, device="cuda:0")
    y = y[:,1:ly]
    w = " ".join(train_data.trg_word_list[i] for i in y.squeeze())
    print(ptrg)
    ptrgs.append((ptrg, w))

In [44]:
for pred, refer in ptrgs:
    print("翻訳文："+pred)
    print("参照訳："+refer)
    print("")

翻訳文：at fact that they didn 't understand it  .
参照訳：they finally acknowledged it as true  .

翻訳文：he was not good at swimming  .
参照訳：he didn 't care for swimming  .

翻訳文：he is less kind than her sister  .
参照訳：he is no less kind than his sister  .

翻訳文：you must come back before  .
参照訳：you must be back before ten  .

翻訳文：i hope of success
参照訳：break a leg  .

翻訳文：she is our house  .
参照訳：she lives next door to us  .

翻訳文：i am looking forward to you  .
参照訳：i 'm about to tell you the answer  .

翻訳文：i know nothing for living  .
参照訳：i 'm a person who lives for the moment  .

翻訳文：i 'd like a game  .
参照訳：we have this game on ice  .

翻訳文：do such a reason you have told  .
参照訳：will you give me your reasons for doing this  ?

翻訳文：she likes the teacher  .
参照訳：she likes the teacher  .

翻訳文：that 's business  .
参照訳：it 's business  .

翻訳文：these shoes will be old  .
参照訳：these shoes will last you two years  .

翻訳文：he left the place in his own place  .
参照訳：he made his son attend the meeting in his place  .

翻