[MaLSTM](https://dl.acm.org/citation.cfm?id=3016291)を使った短いテキストの距離学習。


In [16]:
import datetime
from pathlib import Path
import re
from time import time
import itertools
import os

from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

import keras
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Lambda
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint, TensorBoard, CSVLogger

今回はGoogleNewsのword2vecの学習済モデルを使うためダウンロードし解凍しておく。
```bash
wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
gunzip GoogleNews-vectors-negative300.bin.gz
```

In [2]:
# Download the NLTK 'stopwords' package, then keep the English list as a set.
nltk.download('stopwords')
stops = set(stopwords.words('english'))

# Directory and file locations.
DATADIR, MODELDIR = Path('input'), Path('models')
TRAIN_CSV, TEST_CSV = (DATADIR / name for name in ('train.csv', 'test.csv'))
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin'

# Load the training and the test table.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## テキストの埋め込み

In [3]:
# Ordered (pattern, replacement) pairs. Order matters: every rule is applied to
# the output of the rules before it.
_CLEANING_RULES = (
    # NOTE: inside the class, `+-=` is a range ('+' .. '='), so besides the
    # listed characters ':', ';' and '<' also survive this rule.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): `\0` is the octal escape for NUL, so this only matches
    # NUL + 's'. NUL is already replaced by the first rule, which makes this
    # rule a no-op; kept unchanged because the intended pattern is unclear.
    (r"\0s", "0"),
    # Keep the surrounding spaces so "911" stays a separate token instead of
    # being glued to the neighbouring words.
    (r" 9 11 ", " 911 "),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def text_to_word_list(text):
    """Pre-process a text and convert it to a list of words.

    The input is converted with ``str()``, lower-cased, run through the
    ordered regex substitutions in ``_CLEANING_RULES`` and finally split on
    whitespace.

    Args:
        text: the text to tokenize; any object is accepted because it is
            passed through ``str()`` first.

    Returns:
        list[str]: the cleaned tokens (an empty list for an empty string).
    """
    text = str(text).lower()

    for pattern, replacement in _CLEANING_RULES:
        text = re.sub(pattern, replacement, text)

    return text.split()

In [4]:
# Word -> integer id lookup; starts out empty.
vocabulary = {}
# Id -> word list. Slot 0 holds '<unk>', which is never used as a word; it is
# only a placeholder for the [0, 0, ....0] embedding.
inverse_vocabulary = ['<unk>']
# Load the pretrained word2vec vectors from the binary file.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [5]:
questions_cols = ['question1', 'question2']

# Replace every question string, in both the training and the test frame, by a
# list of integer word ids, growing `vocabulary` / `inverse_vocabulary` on the way.
for dataset in [train, test]:
    for index, row in dataset.iterrows():

        # Encode both questions of the row
        for question in questions_cols:

            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):

                # Skip stop words that have no pretrained vector.
                # `word in word2vec` replaces the `.vocab` attribute lookup,
                # which was removed from KeyedVectors in gensim 4.
                if word in stops and word not in word2vec:
                    continue

                # First occurrence: assign the next free id (id 0 is reserved)
                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    inverse_vocabulary.append(word)
                q2n.append(vocabulary[word])

            # Store the id list in place of the question text.
            # `DataFrame.set_value` was removed in pandas 1.0; `.at` is the
            # label-based single-cell setter.
            dataset.at[index, question] = q2n

embedding_dim = 300
# Random initialisation; rows of words with a pretrained vector are overwritten below
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrix
for word, index in vocabulary.items():
    if word in word2vec:
        # Indexing works on old and new gensim; `word_vec` is deprecated in gensim 4
        embeddings[index] = word2vec[word]

# The pretrained vectors are no longer needed once the matrix is built
del word2vec



## Train/Valデータ分割

In [7]:
# Length of the longest encoded question over both frames; used as the
# padding length below.
max_seq_length = max(
    frame[column].map(len).max()
    for frame in (train, test)
    for column in ('question1', 'question2')
)

# Hold out a fixed number of rows for validation
validation_size = 40000
training_size = len(train) - validation_size

X = train[questions_cols]
Y = train['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size)

# One entry per side of the question pair
X_train = {'left': X_train['question1'], 'right': X_train['question2']}
X_validation = {'left': X_validation['question1'], 'right': X_validation['question2']}
X_test = {'left': test['question1'], 'right': test['question2']}

# Labels as numpy arrays
Y_train, Y_validation = Y_train.values, Y_validation.values

# Zero padding (training and validation inputs only)
for dataset in (X_train, X_validation):
    for side in ('left', 'right'):
        dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

# Sanity checks on the padded training data
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

## MaLSTMモデルの構築
- 1対(左右2つ)のインプットをそれぞれEmbeddingする
- Embeddingされたベクトルが、左右で共有するLSTMへの入力となる
- LSTMは50次元のベクトルを出力し、左右のベクトル間のマンハッタン距離に負号を付けて指数をとった値 $\exp(-\lVert h_{left} - h_{right} \rVert_1)$ を類似度とする
- 類似度は0〜1の範囲になるので、正解ラベルとのMSEが小さくなるように学習する

In [30]:
def MaLSTM(input_size: tuple, embedding_matrix: np.ndarray, n_hidden: int = 50) -> keras.models.Model:
    """Build and compile the siamese Manhattan-LSTM (MaLSTM) model.

    Both inputs pass through one shared, non-trainable embedding layer and one
    shared LSTM. The model output is ``exp(-sum(|left - right|))`` computed on
    the two LSTM outputs. The model is compiled with mean squared error loss,
    an Adadelta optimizer (``clipnorm=1.25``) and the accuracy metric.

    Args:
        input_size (tuple): shape passed to ``Input``, e.g. ``(max_seq_length,)``.
        embedding_matrix (np.ndarray): 2-D array of embedding weights with
            shape (number of rows, embedding dimension).
        n_hidden (int): number of units of the shared LSTM.
    Returns:
        model (keras.models.Model): built and compiled keras model object.
    """

    def exponent_neg_manhattan_distance(left, right):
        """Similarity estimate of the two LSTM outputs: exp(-L1 distance)."""
        return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))

    # Take both embedding sizes from the matrix instead of hard-coding them
    vocab_size, embedding_dim = embedding_matrix.shape

    left_input = Input(shape=input_size)
    right_input = Input(shape=input_size)

    # The weights come from the `embedding_matrix` argument (not from a
    # module-level variable) and are kept frozen during training.
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                                weights=[embedding_matrix], input_length=input_size,
                                trainable=False)
    encoded_left = embedding_layer(left_input)
    encoded_right = embedding_layer(right_input)

    # Since this is a siamese network, both sides share the same LSTM
    shared_lstm = LSTM(units=n_hidden)
    left_output = shared_lstm(encoded_left)
    right_output = shared_lstm(encoded_right)

    # Calculates the distance as defined by the MaLSTM model
    malstm_distance = Lambda(
        function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
        output_shape=lambda x: (x[0][0], 1),
    )([left_output, right_output])

    model = Model([left_input, right_input], [malstm_distance])
    optimizer = Adadelta(clipnorm=1.25)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

    return model


def get_callbacks(save_root):
    """Create the callbacks passed to ``model.fit``.

    Creates ``save_root`` when it does not exist yet, then returns a list
    holding, in this order, a TensorBoard callback, a ModelCheckpoint callback
    and a CSVLogger callback whose output paths all live below ``save_root``.
    """
    if not os.path.exists(save_root):
        os.makedirs(save_root)

    def under_root(name):
        # Path of `name` inside the output directory
        return os.path.join(save_root, name)

    return [
        # TensorBoard
        TensorBoard(log_dir=under_root('tensorboard')),
        # Automatic save at every epoch
        ModelCheckpoint(filepath=under_root('weights.{epoch:02d}-{val_loss:.4f}-{val_acc:.4f}.hdf5'),
                        verbose=1),
        # Training results written as CSV
        CSVLogger(filename=under_root('log.csv')),
    ]

## 学習

In [None]:
# Training hyper-parameters and callbacks (outputs go to the current directory)
batch_size = 64
n_epoch = 25
callbacks = get_callbacks('.')

model = MaLSTM(input_size=(max_seq_length,), embedding_matrix=embeddings)

# Fit on the left/right training inputs and evaluate on the validation split
malstm_trained = model.fit(
    [X_train['left'], X_train['right']],
    Y_train,
    batch_size=batch_size,
    epochs=n_epoch,
    validation_data=([X_validation['left'], X_validation['right']], Y_validation),
    callbacks=callbacks,
)

Instructions for updating:
Use tf.cast instead.
Train on 364290 samples, validate on 40000 samples
Epoch 1/25

Epoch 00001: saving model to ./weights.01-0.1585-0.7732.hdf5
Epoch 2/25
 73344/364290 [=====>........................] - ETA: 15:49 - loss: 0.1557 - acc: 0.7811

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00003: saving model to ./weights.03-0.1447-0.7991.hdf5
Epoch 4/25

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





## 結果の可視化

In [None]:
# One figure per metric (accuracy first, then loss); each figure shows the
# training curve followed by the validation curve.
for train_key, val_key, title, ylabel, legend_loc in (
        ('acc', 'val_acc', 'Model Accuracy', 'Accuracy', 'upper left'),
        ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
):
    plt.plot(malstm_trained.history[train_key])
    plt.plot(malstm_trained.history[val_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_loc)
    plt.show()