# Chapter11 系列変換

In [None]:
%tensorflow_version 2.x

In [None]:
!pip install janome nltk tensorflow==2.4.0

## 系列変換モデルの実装

### データセットの準備

In [None]:
!mkdir data
!mkdir models
!wget http://www.manythings.org/anki/jpn-eng.zip -P data/
!unzip data/jpn-eng.zip -d data/

In [None]:
from collections import defaultdict

import numpy as np
import tensorflow as tf
from janome.tokenizer import Tokenizer
from nltk.translate.bleu_score import corpus_bleu
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Embedding, GRU, Dot, Activation, Concatenate
from tensorflow.keras.models import Model, model_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hyperparameters and file locations shared by the cells below.
batch_size = 32  # mini-batch size passed to model.fit
epochs = 20  # upper bound on training epochs passed to model.fit
model_path = 'models/mode.h5'  # weight file written by ModelCheckpoint
enc_arch = 'models/encoder.json'  # encoder architecture written by save_as_json
dec_arch = 'models/decoder.json'  # decoder architecture written by save_as_json
data_path = 'data/jpn.txt'  # tab-separated corpus read by load_dataset
num_words = 10000  # vocabulary limit; also used as the models' embedding/output size
num_data = 20000  # number of leading sentence pairs kept from the corpus

In [None]:
def load_dataset(filename):
    """Read a tab-separated parallel corpus into two aligned lists.

    Each usable line carries an English sentence and its Japanese
    translation in the first two tab-separated columns; any further
    columns are ignored. Blank lines and lines without a second column
    are skipped instead of aborting the whole load.

    Args:
        filename: Path to the corpus file, expected to be UTF-8 encoded.

    Returns:
        A tuple ``(en_texts, ja_texts)`` of equally long lists of strings.
    """
    en_texts = []
    ja_texts = []
    # The corpus contains Japanese text, so decode it explicitly as UTF-8
    # rather than relying on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            columns = line.strip().split('\t')
            if len(columns) < 2:
                # Not a sentence pair (blank or truncated line).
                continue
            en_text, ja_text = columns[:2]
            en_texts.append(en_text)
            ja_texts.append(ja_text)
    return en_texts, ja_texts

In [None]:
# Load every sentence pair, then keep only the first num_data pairs.
en_texts, ja_texts = load_dataset(data_path)
en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

In [None]:
# Shared Janome tokenizer, created once with wakati=True and reused by tokenize().
t = Tokenizer(wakati=True)


def tokenize(text):
    """Run the shared Janome tokenizer ``t`` on ``text`` and return its result."""
    tokens = t.tokenize(text)
    return tokens

def build_vocabulary(texts, num_words=None):
    """Fit a Keras text tokenizer on ``texts`` and return it.

    The tokenizer is created with an empty ``filters`` string and uses
    ``<UNK>`` as its out-of-vocabulary token.

    Args:
        texts: Texts to fit the vocabulary on.
        num_words: Forwarded to the Keras tokenizer as its ``num_words``
            limit; ``None`` means no limit.

    Returns:
        The fitted ``tf.keras.preprocessing.text.Tokenizer``.
    """
    vocab = tf.keras.preprocessing.text.Tokenizer(
        filters='',
        oov_token='<UNK>',
        num_words=num_words,
    )
    vocab.fit_on_texts(texts)
    return vocab

#<start> <end>記号の付与
def preprocess_dataset(texts):
    """Wrap every text in ``<start>`` / ``<end>`` markers.

    Args:
        texts: Iterable of sentences.

    Returns:
        A new list with one wrapped string per input text.
    """
    template = '<start> {} <end>'
    return list(map(template.format, texts))

def preprocess_ja(texts):
    """Re-join each text with single spaces between the tokens from ``tokenize``.

    Args:
        texts: Iterable of sentences to tokenize.

    Returns:
        A list with one space-separated string per input text.
    """
    spaced = []
    for sentence in texts:
        spaced.append(' '.join(tokenize(sentence)))
    return spaced

def create_dataset(en_texts, ja_texts, en_vocab, ja_vocab):
    """Convert parallel texts into padded ID arrays for training.

    Both sides are mapped to ID sequences with their vocabulary and
    padded at the end (``padding='post'``). The Japanese array is then
    split into a decoder input (last column dropped) and a target
    (first column dropped), i.e. the target is the input shifted by one.

    Args:
        en_texts: Source-side texts.
        ja_texts: Target-side texts.
        en_vocab: Fitted tokenizer for ``en_texts``.
        ja_vocab: Fitted tokenizer for ``ja_texts``.

    Returns:
        ``([encoder_inputs, decoder_inputs], decoder_targets)``.
    """
    encoder_inputs = pad_sequences(
        en_vocab.texts_to_sequences(en_texts), padding='post')
    ja_padded = pad_sequences(
        ja_vocab.texts_to_sequences(ja_texts), padding='post')
    decoder_inputs = ja_padded[:, :-1]
    decoder_targets = ja_padded[:, 1:]
    return [encoder_inputs, decoder_inputs], decoder_targets

In [None]:
# Tokenize the Japanese side into space-separated tokens, then wrap both
# languages in <start>/<end> markers.
ja_texts = preprocess_ja(ja_texts)
ja_texts = preprocess_dataset(ja_texts)
en_texts = preprocess_dataset(en_texts)
# Hold out 20% of the pairs; the test split is kept as text here.
x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts, test_size=0.2, random_state=42)
# The vocabularies are fitted on the training split only.
en_vocab = build_vocabulary(x_train, num_words)
ja_vocab = build_vocabulary(y_train, num_words)
# Replace the training texts with padded ID arrays (model inputs and targets).
x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

### モデルの定義

In [None]:
class BaseModel:
    """Methods shared by the encoder, decoder and seq2seq wrappers."""

    def build(self):
        """Return the Keras model for this wrapper; subclasses must override."""
        raise NotImplementedError()

    def save_as_json(self, filepath):
        """Build the model and write its architecture to ``filepath`` as JSON.

        Args:
            filepath: Destination path of the JSON file.
        """
        built = self.build()
        with open(filepath, 'w') as out:
            out.write(built.to_json())

    @classmethod
    def load(cls, architecture_file, weight_file, by_name=True):
        """Recreate a model from a JSON architecture file and load its weights.

        Args:
            architecture_file: Path to a JSON file as written by ``save_as_json``.
            weight_file: Path to the weight file passed to ``load_weights``.
            by_name: Forwarded to ``load_weights``.

        Returns:
            The reconstructed Keras model with weights loaded.
        """
        with open(architecture_file) as src:
            architecture = src.read()
        model = model_from_json(architecture)
        model.load_weights(weight_file, by_name=by_name)
        return model

class Encoder(BaseModel):
    """Encoder: token IDs -> embedding -> GRU output and final state."""

    def __init__(self, input_dim, emb_dim=300, hid_dim=256, return_sequences=False):
        """Create the encoder's input tensor and layers.

        Args:
            input_dim: Vocabulary size of the embedding layer.
            emb_dim: Size of the embedding vectors.
            hid_dim: Number of GRU units.
            return_sequences: Forwarded to the GRU layer.
        """
        self.input = Input(shape=(None,), name='encoder_input')
        # Maps token IDs to dense vectors; mask_zero=True marks ID 0 as padding.
        self.embedding = Embedding(
            input_dim=input_dim,
            output_dim=emb_dim,
            mask_zero=True,
            name='encoder_embedding',
        )
        # Recurrent layer; return_state=True also yields the final state.
        self.gru = GRU(
            hid_dim,
            return_sequences=return_sequences,
            return_state=True,
            name='encoder_gru',
        )

    def __call__(self):
        """Wire the layers together and return ``(output, state)`` tensors."""
        embedded = self.embedding(self.input)
        gru_output, final_state = self.gru(embedded)
        return gru_output, final_state

    def build(self):
        """Return a Keras model mapping the input to ``[output, state]``."""
        gru_output, final_state = self()
        return Model(inputs=self.input, outputs=[gru_output, final_state])

class Decoder(BaseModel):
    """Decoder: token IDs + initial state -> per-step token probabilities."""

    def __init__(self, output_dim, emb_dim=300, hid_dim=256):
        """Create the decoder's input tensors and layers.

        Args:
            output_dim: Vocabulary size; used for both the embedding input
                dimension and the softmax output layer.
            emb_dim: Size of the embedding vectors.
            hid_dim: Number of GRU units.
        """
        self.input = Input(shape=(None,), name='decoder_input')
        # Maps token IDs to dense vectors; mask_zero=True marks ID 0 as padding.
        self.embedding = Embedding(
            input_dim=output_dim,
            output_dim=emb_dim,
            mask_zero=True,
            name='decoder_embedding',
        )
        self.gru = GRU(
            hid_dim,
            return_sequences=True,
            return_state=True,
            name='decoder_gru',
        )
        # Output layer: softmax over the vocabulary.
        self.dense = Dense(output_dim, activation='softmax', name='decoder_output')

        # Extra input used by build() to feed the GRU state at inference time.
        self.state_input = Input(shape=(hid_dim,), name='decoder_state_in')

    def __call__(self, states, enc_output=None):
        """Wire the layers together and return ``(outputs, state)`` tensors.

        Args:
            states: Initial state for the GRU.
            enc_output: Accepted but not used by this implementation.
        """
        embedded = self.embedding(self.input)
        gru_outputs, new_state = self.gru(embedded, initial_state=states)
        token_probs = self.dense(gru_outputs)
        return token_probs, new_state

    def build(self):
        """Return a standalone model taking ``[decoder_input, state_input]``."""
        token_probs, new_state = self(states=self.state_input)
        return Model(
            inputs=[self.input, self.state_input],
            outputs=[token_probs, new_state],
        )

class Seq2seq(BaseModel):
    """Training model that chains an encoder wrapper into a decoder wrapper."""

    def __init__(self, encoder, decoder):
        """Store the encoder and decoder wrappers to be connected by build()."""
        self.encoder = encoder
        self.decoder = decoder

    def build(self):
        """Return a model mapping ``[encoder input, decoder input]`` to decoder outputs.

        The encoder's state is used as the decoder's initial state; the
        decoder's returned state is discarded.
        """
        enc_output, enc_state = self.encoder()
        token_probs, _ = self.decoder(states=enc_state, enc_output=enc_output)
        model_inputs = [self.encoder.input, self.decoder.input]
        return Model(model_inputs, token_probs)

In [None]:
# Both wrappers use num_words as their vocabulary size.
encoder = Encoder(num_words)
decoder = Decoder(num_words)
# Connect encoder and decoder into the model used for training.
seq2seq = Seq2seq(encoder, decoder)
model = seq2seq.build()
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [None]:
# Early stopping with patience=3, plus a checkpoint that writes only the
# best weights (no architecture) to model_path.
callbacks = [
    EarlyStopping(patience=3),
    ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
]
# Train, holding out 10% of the training data for validation.
model.fit(x=x_train,
          y=y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks,
          validation_split=0.1)

# Save the standalone encoder/decoder architectures as JSON files.
encoder.save_as_json(enc_arch)
decoder.save_as_json(dec_arch)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


### 予測用クラスの実装

In [None]:
class InferenceAPI:
    """Greedy decoding with an encoder/decoder model pair.

    Args:
        encoder_model: Object whose ``predict`` maps a batch of source ID
            sequences to ``(output, state)``.
        decoder_model: Object whose ``predict`` maps ``[target_ids, state]``
            to ``(token_probabilities, new_state)``.
        en_vocab: Fitted source-side tokenizer (``texts_to_sequences``).
        ja_vocab: Fitted target-side tokenizer; its ``word_index`` must
            contain the ``<start>`` and ``<end>`` tokens.
    """

    def __init__(self, encoder_model, decoder_model, en_vocab, ja_vocab):
        self.encoder_model = encoder_model
        self.decoder_model = decoder_model
        self.en_vocab = en_vocab
        self.ja_vocab = ja_vocab

    def predict(self, text):
        """Translate ``text`` and return the predicted tokens.

        Args:
            text: Source sentence in the same form the source vocabulary
                was fitted on.

        Returns:
            A list of target-side token strings; empty when the decoder
            emits ``<end>`` immediately.
        """
        output, state = self._compute_encoder_output(text)
        sequence = self._generate_sequence(output, state)
        decoded = self._decode(sequence)
        return decoded

    def _compute_encoder_output(self, text):
        """Encode ``text`` and return the encoder's ``(output, state)``."""
        # Hand the model an explicit (1, length) array: a bare nested list
        # is ambiguous input for Keras' predict.
        x = np.array(self.en_vocab.texts_to_sequences([text]))
        output, state = self.encoder_model.predict(x)
        return output, state

    def _compute_decoder_output(self, target_seq, state, enc_output=None):
        """Run one decoder step; ``enc_output`` is accepted but not used."""
        output, state = self.decoder_model.predict([target_seq, state])
        return output, state

    def _generate_sequence(self, enc_output, state, max_seq_len=50):
        """Greedily generate target token IDs.

        Starts from ``<start>`` and feeds each arg-max token back into the
        decoder until ``<end>`` is produced or ``max_seq_len`` tokens have
        been generated.

        Returns:
            A list of ints, excluding the ``<start>`` and ``<end>`` IDs.
        """
        # Look the marker IDs up once instead of on every step.
        start_id = self.ja_vocab.word_index['<start>']
        end_id = self.ja_vocab.word_index['<end>']
        # Shape (1, 1): one sequence, one time step.
        target_seq = np.array([[start_id]])
        sequence = []
        for _ in range(max_seq_len):
            output, state = self._compute_decoder_output(target_seq, state, enc_output)
            sampled_token_index = int(np.argmax(output[0, 0]))
            if sampled_token_index == end_id:
                break
            sequence.append(sampled_token_index)
            target_seq = np.array([[sampled_token_index]])
        return sequence

    def _decode(self, sequence):
        """Convert token IDs back to a list of token strings."""
        text = self.ja_vocab.sequences_to_texts([sequence])[0]
        # ''.split(' ') would yield [''], i.e. a bogus one-token result for
        # an empty prediction.
        if not text:
            return []
        return text.split(' ')

In [None]:
def evaluate_bleu(X, y, api):
    """Compute the corpus-level BLEU score of ``api`` on parallel data.

    Targets that share the same source are grouped so each distinct source
    is predicted once and scored against all of its references.

    Args:
        X: Source sentences.
        y: Reference translations aligned with ``X``.
        api: Object whose ``predict(source)`` returns a hypothesis.

    Returns:
        The value returned by ``corpus_bleu``.
    """
    refs_by_source = defaultdict(list)
    for source, target in zip(X, y):
        refs_by_source[source].append(target)
    # Dict order is insertion order, so references and hypotheses stay aligned.
    references = list(refs_by_source.values())
    hypothesis = [api.predict(source) for source in refs_by_source]
    return corpus_bleu(references, hypothesis)