In [69]:
import os
import sys
from datatools.analyzer import *

from datatools.maneger import DataManager
import pandas as pd

import csv
import time

import random
random.seed(0)

from collections import Counter
from torchtext.vocab import Vocab

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import (
    TransformerEncoder, TransformerDecoder,
    TransformerEncoderLayer, TransformerDecoderLayer
)
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim
import torch.nn.utils.rnn as rnn

In [81]:
# lim = 100000
lim=5000
vocab_path = "../models/vocab/"
# vocab_name = "vocab_transformer_src_mini_lim={0}.pickle".format(lim)
vocab_name = "vocab_CModel_src_mini_lim={0}.pickle".format(lim)
vocabM = DataManager(vocab_path)
vocab_src = vocabM.load_data(vocab_name)
# vocab_name = "vocab_transformer_tgt_mini_lim={0}.pickle".format(lim)
vocab_name = "vocab_CModel_tgt_mini_lim={0}.pickle".format(lim)
vocab_tgt = vocabM.load_data(vocab_name)

PAD_IDX = vocab_src['<pad>']
START_IDX = vocab_src['<fos>']
END_IDX = vocab_src['<eos>']



tokenizer_src = mecab_tokenize
tokenizer_tgt = mecab_tokenize

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

success load : ../models/vocab/vocab_CModel_src_mini_lim=5000.pickle
success load : ../models/vocab/vocab_CModel_src_mini_lim=5000.pickle
success load : ../models/vocab/vocab_CModel_tgt_mini_lim=5000.pickle
success load : ../models/vocab/vocab_CModel_tgt_mini_lim=5000.pickle


In [82]:
PAD_IDX

1

In [83]:
import math
class TokenEmbedding(nn.Module):
    
    def __init__(self, vocab_size, embedding_size):
        
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=PAD_IDX)
        self.embedding_size = embedding_size
        
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.embedding_size)
    
class PositionalEncoding(nn.Module):
    
    def __init__(self, embedding_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        
        den = torch.exp(-torch.arange(0, embedding_size, 2) * math.log(10000) / embedding_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        embedding_pos = torch.zeros((maxlen, embedding_size))
        embedding_pos[:, 0::2] = torch.sin(pos * den)
        embedding_pos[:, 1::2] = torch.cos(pos * den)
        embedding_pos = embedding_pos.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('embedding_pos', embedding_pos)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.embedding_pos[: token_embedding.size(0), :])

def create_mask(src, tgt, PAD_IDX):
    
    seq_len_src = src.shape[0]
    seq_len_tgt = tgt.shape[0]

    mask_tgt = generate_square_subsequent_mask(seq_len_tgt, PAD_IDX)
    mask_src = torch.zeros((seq_len_src, seq_len_src), device=device).type(torch.bool)

    padding_mask_src = (src == PAD_IDX).transpose(0, 1)
    padding_mask_tgt = (tgt == PAD_IDX).transpose(0, 1)
    
    return mask_src, mask_tgt, padding_mask_src, padding_mask_tgt

def generate_square_subsequent_mask(seq_len, PAD_IDX):
    mask = (torch.triu(torch.ones((seq_len, seq_len), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == PAD_IDX, float(0.0))
    return mask


class Seq2SeqTransformer(nn.Module):
    
    def __init__(
        self, num_encoder_layers: int, num_decoder_layers: int,
        embedding_size: int, vocab_size_src: int, vocab_size_tgt: int,
        dim_feedforward:int = 512, dropout:float = 0.1, nhead:int = 8
        ):
        
        super(Seq2SeqTransformer, self).__init__()

        self.token_embedding_src = TokenEmbedding(vocab_size_src, embedding_size)
        self.positional_encoding = PositionalEncoding(embedding_size, dropout=dropout)
        encoder_layer = TransformerEncoderLayer(
            d_model=embedding_size, nhead=nhead, dim_feedforward=dim_feedforward
        )
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        
        self.token_embedding_tgt = TokenEmbedding(vocab_size_tgt, embedding_size)
        decoder_layer = TransformerDecoderLayer(
            d_model=embedding_size, nhead=nhead, dim_feedforward=dim_feedforward
        )
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        
        self.output = nn.Linear(embedding_size, vocab_size_tgt)

    def forward(
        self, src: Tensor, tgt: Tensor,
        mask_src: Tensor, mask_tgt: Tensor,
        padding_mask_src: Tensor, padding_mask_tgt: Tensor,
        memory_key_padding_mask: Tensor
        ):
        
        embedding_src = self.positional_encoding(self.token_embedding_src(src))
        memory = self.transformer_encoder(embedding_src, mask_src, padding_mask_src)
        embedding_tgt = self.positional_encoding(self.token_embedding_tgt(tgt))
        outs = self.transformer_decoder(
            embedding_tgt, memory, mask_tgt, None,
            padding_mask_tgt, memory_key_padding_mask
        )
        return self.output(outs)

    def encode(self, src: Tensor, mask_src: Tensor):
        return self.transformer_encoder(self.positional_encoding(self.token_embedding_src(src)), mask_src)

    def decode(self, tgt: Tensor, memory: Tensor, mask_tgt: Tensor):
        return self.transformer_decoder(self.positional_encoding(self.token_embedding_tgt(tgt)), memory, mask_tgt)

In [84]:
def clean_text_plain(text):
    text_ = neologdn.normalize(text)
    # text_ = re.sub(r'\([^\)]*\)', "", text_)
    # text_ = re.sub(r'\([^\)]*\)', "", text_)
    text_ = re.sub(r'\d+', "0", text_)
    if "……" in text_:
        text_ = text_.replace("……", "…")
    return text_

In [85]:
def convert_text_to_indexes(text, vocab, tokenizer, mode="src"):
    if mode=="src":
        sit = text[0]
        segments = [vocab['<sep>']]
        for s in sit:
            segments += [vocab[token] for token in tokenizer(s.strip("\n"))] + [vocab['<sep>']]
        # 最後消す
        segments[-1] = vocab['<cxt>']
        utt = text[1]
        for u in utt:
            segments += [vocab[token] for token in tokenizer(u.strip("\n"))] + [vocab['<cxt>']]
        return segments
    # 
    elif mode=="tgt":
        return [vocab['<fos>']] + [
            vocab[token] for token in tokenizer(text.strip("\n"))
        ] + [vocab['<eos>']]
    else:
        return []

def convert_text_to_indexes_ntt(text, vocab, tokenizer, mode="src"):
    if mode=="src":
        segments = [vocab['<sep>']]
        for u in text:
            segments += [vocab[token] for token in tokenizer(u.strip("\n"))] + [vocab['<sep>']]
        segments[-1] = vocab['<cxt>']
        return segments
    # 
    elif mode=="tgt":
        return [vocab['<fos>']] + [
            vocab[token] for token in tokenizer(text.strip("\n"))
        ] + [vocab['<eos>']]
    else:
        return []

In [86]:
model_path = "../models/transformer/"
model_name = "transformer_lim={0}_best.pickle".format(lim)
model_name = "transformer_lim={0}_.pickle".format(lim)
model_name = "CModel_lim={0}.pickle".format(lim)
modelM = DataManager(model_path)

model = modelM.load_data(model_name)

success load : ../models/transformer/CModel_lim=5000.pickle
success load : ../models/transformer/CModel_lim=5000.pickle


In [87]:
# def convert_text_to_indexes(text, vocab, tokenizer):
#     return [vocab['<fos>']] + [
#         vocab[token] for token in tokenizer(text.strip("\n"))
#     ] + [vocab['<eos>']]

def translate(
    model, text, vocab_src, vocab_tgt, tokenizer_src, seq_len_tgt, PAD_IDX, 
    START_IDX, END_IDX
):
    
    model.eval()
    tokens = convert_text_to_indexes_ntt(text=text, vocab=vocab_src, tokenizer=tokenizer_src)
    num_tokens = len(tokens)
    src = torch.LongTensor(tokens).reshape(num_tokens, 1)
    mask_src = (torch.zeros(num_tokens, num_tokens)).type(torch.bool) 
    # print(mask_src)
    predicts = beam_search_decode(
        model=model, src=src,
        mask_src=mask_src, seq_len_tgt=seq_len_tgt, PAD_IDX=PAD_IDX,
        START_IDX=START_IDX, END_IDX=END_IDX
    ).flatten()
    
    return ' '.join([vocab_tgt.itos[token] for token in predicts]).replace("<fos>", "").replace("<eos>", "")


def greedy_decode(model, src, mask_src, seq_len_tgt, PAD_IDX, START_IDX, END_IDX):
    
    src = src.to(device)
    mask_src = mask_src.to(device)

    memory = model.encode(src, mask_src)
    # print(memory.shape)
    memory = model.transformer_encoder(model.positional_encoding(model.token_embedding_src(src)), mask_src)
    ys = torch.ones(1, 1).fill_(START_IDX).type(torch.long).to(device)
    # print(ys)
    
    for i in range(seq_len_tgt - 1):
        memory = memory.to(device)
        memory_mask = torch.zeros(ys.shape[0], memory.shape[0]).to(device).type(torch.bool)
        mask_tgt = (generate_square_subsequent_mask(ys.size(0), PAD_IDX).type(torch.bool)).to(device)

        output = model.decode(ys, memory, mask_tgt)
        output = output.transpose(0, 1)
        output = model.output(output[:, -1])
        print(output.shape)
        _, next_word = torch.max(output, dim = 1)
        next_word = next_word.item()
        # print(next_word)

        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == END_IDX:
            break
            
    return ys

In [88]:
from heapq import heappush, heappop
import copy

class BeamSearchNode(object):
    def __init__(self, h, prev_node, wid, logp, length):
        self.h = h
        self.prev_node = prev_node
        self.wid = wid
        self.logp = logp
        self.length = length

    def eval(self):
        return self.logp / float(self.length - 1 + 1e-6)

def beam_search_decode(model, src, mask_src, seq_len_tgt, PAD_IDX, START_IDX, END_IDX, width=3, n_best=5):
    n_best_list = []

    src = src.to(device)
    mask_src = mask_src.to(device)

    memory = model.encode(src, mask_src)
    # print(memory.shape)
    memory = model.transformer_encoder(model.positional_encoding(model.token_embedding_src(src)), mask_src)
    ys = torch.ones(1, 1).fill_(START_IDX).type(torch.long).to(device)

    # 開始
    node = BeamSearchNode(h=None, prev_node=copy.deepcopy(ys), wid=copy.deepcopy(ys), logp=0, length=1)
    # ビームサーチのノード全体
    nodes = []
    # EOSまでたどり着いたノード全体
    end_nodes = []

    heappush(nodes, (-node.eval(), id(node), node))
    n_dec_steps = 0
    while True:
        # 発話長
        if n_dec_steps > 20:
            break
        
        score, _, n = heappop(nodes)

        # <end>ならば終了
        if n.wid.item() == END_IDX and n.prev_node is not None:
            end_nodes.append((score, id(n), n))
            if len(end_nodes) >= n_best:
                    break
            else:
                continue
        
        mask_tgt = (generate_square_subsequent_mask(n.prev_node.size(0), PAD_IDX).type(torch.bool)).to(device)
        output = model.decode(n.prev_node, memory, mask_tgt)
        # print(output.shape)
        output = output.transpose(0, 1)
        # print(output.shape)
        output = model.output(output[:, -1])
        # print(output.shape)
        output = torch.log_softmax(output, dim=1)
        topk_log_prob, topk_indexes = torch.topk(output, width)
        # print(torch.max(output, dim = 1))
        # print(topk_log_prob ,topk_indexes)
        for new_k in range(width):
            next_word = topk_indexes[0][new_k]
            logp = topk_log_prob[0][new_k].item()
            ys_ = torch.cat([copy.deepcopy(n.prev_node), torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            # print(ys_)
            node = BeamSearchNode(h=None, prev_node=copy.deepcopy(ys_), wid=next_word, logp=n.logp+logp, length=n.length+1)
            heappush(nodes, (-node.eval(), id(node), node))

        n_dec_steps += 1
        # break
    if len(end_nodes) == 0:
        print("can't reach end stage")
        end_nodes = [heappop(nodes) for _ in range(width)]
    
    print(end_nodes[2][2].prev_node)

In [90]:
seq_len_tgt = 20

# text = clean_text_plain('おい、大丈夫か！')
text = ["彼がこちらに向かいながら声をかけてくる。", "おい、大丈夫か！"]
print(text)

translate(
    model=model, text=text, vocab_src=vocab_src, vocab_tgt=vocab_tgt,
    tokenizer_src=tokenizer_src, seq_len_tgt=seq_len_tgt, PAD_IDX=PAD_IDX,
    START_IDX=START_IDX, END_IDX=END_IDX
)

['彼がこちらに向かいながら声をかけてくる。', 'おい、大丈夫か！']
can't reach end stage
tensor([[ 2],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [23],
        [35]], device='cuda:0')


AttributeError: 'NoneType' object has no attribute 'flatten'

In [None]:
vocab_src.itos[51]

'なっ'