In [1]:
from __future__ import print_function
import os
import numpy as np
from config import CURRENT_DIR
# Absolute path of the "data" directory that sits next to CURRENT_DIR's parent.
DATA_PATH = os.path.abspath(os.path.join(CURRENT_DIR, os.pardir, "data"))

In [2]:
# Toy sentiment corpus: the first five phrases are praise, the last five are criticism.
texts = [
    "Well done!", "Good work", "Great effort", "nice work", "Excellent!",
    "Weak", "Poor effort!", "not good", "poor work", "Could have done better.",
]

# One class label per phrase, in the same order as `texts` (1 = positive, 0 = negative).
labels = np.array([1] * 5 + [0] * 5)

# Length every encoded phrase is padded to before it is fed to the model.
MAX_SEQUENCE_LENGTH = 1000
# Number of rows in the embedding table (upper bound on distinct word ids).
vocab_size = 50

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [4]:
# Fit a Keras Tokenizer on the toy corpus, then encode every phrase in `texts`
# as a sequence of integer word ids (one list per phrase).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [5]:
# Pad every encoded phrase to MAX_SEQUENCE_LENGTH so the batch is a rectangular array.
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# The toy classifier in this notebook ends in a single sigmoid unit trained with
# binary_crossentropy, so the targets must be one 0/1 value per sample with shape
# (n_samples, 1). One-hot encoding the labels would produce shape (n_samples, 2),
# which does not match the model output.
y_train = labels.reshape(-1, 1).astype("float32")

In [6]:
# Minimal sentiment classifier for the toy corpus:
#   Embedding: maps each of the MAX_SEQUENCE_LENGTH word ids to an 8-dimensional vector
#   Flatten:   concatenates the per-position vectors into one feature vector
#   Dense:     single sigmoid unit giving the probability of the positive class
model = Sequential([
    Embedding(vocab_size, 8, input_length=MAX_SEQUENCE_LENGTH),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" line to the output).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 8)           400       
_________________________________________________________________
flatten (Flatten)            (None, 8000)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 8001      
Total params: 8,401
Trainable params: 8,401
Non-trainable params: 0
_________________________________________________________________
None


# Sample from DL4US

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Raw (marker-wrapped) sentences of every corpus loaded so far, one list per load_data() call.
all_texts = []


def load_data(file_path):
    """Read a one-sentence-per-line corpus and convert it to integer sequences.

    Each line is stripped of surrounding whitespace and wrapped in the
    sentence markers ``<s>`` and ``</s>``. A fresh Tokenizer is fitted on the
    wrapped sentences; ``filters=""`` switches off the tokenizer's character
    filtering so the markers are kept as tokens.

    Side effect: the list of wrapped sentences is appended to the module-level
    ``all_texts`` list.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A tuple ``(sequences, tokenizer)`` where ``sequences`` is the result of
        ``tokenizer.texts_to_sequences`` on the wrapped sentences and
        ``tokenizer`` is the fitted Tokenizer.
    """
    # Use a context manager so the file handle is closed even if reading fails.
    with open(file_path, encoding='utf-8') as corpus_file:
        whole_texts = ["<s> " + line.strip() + " </s>" for line in corpus_file]
    all_texts.append(whole_texts)

    tokenizer = Tokenizer(filters="")
    tokenizer.fit_on_texts(whole_texts)

    return tokenizer.texts_to_sequences(whole_texts), tokenizer

# Load both sides of the parallel corpus and convert them to integer sequences,
# each side with its own Tokenizer.
x_train, tokenizer_en = load_data(os.path.join(DATA_PATH, "train.en"))
y_train, tokenizer_ja = load_data(os.path.join(DATA_PATH, "train.ja"))

# One extra slot on top of the number of known words (id 0 is used for padding below).
en_vocab_size = 1 + len(tokenizer_en.word_index)
ja_vocab_size = 1 + len(tokenizer_ja.word_index)

# Hold out 2% of the sentence pairs; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.02, random_state=42
)

# Padding: append zeros so that all sequences on one side share the same length.
x_train = pad_sequences(x_train, padding='post')
y_train = pad_sequences(y_train, padding='post')

# Padded sequence lengths; they fix the input shapes of the encoder and decoder.
seqX_len = x_train.shape[1]
seqY_len = y_train.shape[1]

print("english vocabulary size {}".format(en_vocab_size))
print("japanese vocabulary size {}".format(ja_vocab_size))
for padded_length in (seqX_len, seqY_len):
    print(padded_length)

english vocabulary size 6637
japanese vocabulary size 8777
18
18


In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM
# from tensorflow.keras.layers import CuDNNLSTM as LSTM

# Size of the word embeddings and of the LSTM hidden state.
emb_dim = 256
hid_dim = 256

## Encoder
# Input layer (its return value is the tensor that the following layers consume).
encoder_inputs = Input(shape=(seqX_len,))

# Layers are wired functionally: the tensor returned by one layer is passed
# as the argument of the next layer to connect.
# Embedding layer; mask_zero=True marks id 0 (the padding value) as masked.
encoder_embedding = Embedding(en_vocab_size, emb_dim, mask_zero=True)
encoder_embedded = encoder_embedding(encoder_inputs)  # shape: (seqX_len,) -> (seqX_len, emb_dim)

# LSTM layer; return_state=True makes it return its final states in addition to its output.
encoder_lstm = LSTM(hid_dim, return_state=True)
# The output itself is discarded; only the returned states are kept to seed the decoder.
_, *encoder_states = encoder_lstm(encoder_embedded)  # shape: (seqX_len, emb_dim) -> (hid_dim,)
# Further notes on the output of this LSTM layer follow below.

In [10]:
## Decoder
# Input layer (its return value is the tensor that the following layers consume).
decoder_inputs = Input(shape=(seqY_len,))

# Each layer below is kept in its own variable so that it can be referenced again later.
decoder_embedding = Embedding(ja_vocab_size, emb_dim)
decoder_lstm = LSTM(hid_dim, return_sequences=True, return_state=True)
decoder_dense = Dense(ja_vocab_size, activation='softmax')

# Input -> Embedding. shape: (seqY_len,) -> (seqY_len, emb_dim)
decoder_embedded = decoder_embedding(decoder_inputs)
# Embedding -> LSTM, starting from the encoder's final states.
# shape: (seqY_len, emb_dim) -> (seqY_len, hid_dim)
decoder_hidden_seq, _, _ = decoder_lstm(decoder_embedded, initial_state=encoder_states)
# LSTM -> Dense. shape: (seqY_len, hid_dim) -> (seqY_len, ja_vocab_size)
decoder_outputs = decoder_dense(decoder_hidden_seq)

# Build the model: the inputs are the encoder and decoder inputs, the output is the decoder's only.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
# sparse_categorical_crossentropy is categorical_crossentropy that takes integer
# class ids as targets instead of one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [11]:
import numpy as np

# Decoder targets: the decoder input shifted one step to the left, with a
# zero appended at the end so the length stays the same.
train_target = np.pad(y_train[:, 1:], ((0, 0), (0, 1)), mode='constant', constant_values=0)

# The targets get a trailing axis of size 1 before being passed to fit();
# 20% of the training pairs are used for validation.
model.fit(
    [x_train, y_train],
    train_target[..., np.newaxis],
    batch_size=128,
    epochs=15,
    verbose=2,
    validation_split=0.2,
)

Train on 39200 samples, validate on 9800 samples
Epoch 1/15
39200/39200 - 28s - loss: 2.9544 - val_loss: 2.3245
Epoch 2/15
39200/39200 - 23s - loss: 2.1073 - val_loss: 1.9715
Epoch 3/15
39200/39200 - 22s - loss: 1.8434 - val_loss: 1.7980
Epoch 4/15
39200/39200 - 22s - loss: 1.6830 - val_loss: 1.6795
Epoch 5/15
39200/39200 - 23s - loss: 1.5562 - val_loss: 1.5857
Epoch 6/15
39200/39200 - 23s - loss: 1.4469 - val_loss: 1.5072
Epoch 7/15
39200/39200 - 23s - loss: 1.3535 - val_loss: 1.4471
Epoch 8/15
39200/39200 - 23s - loss: 1.2712 - val_loss: 1.3945
Epoch 9/15
39200/39200 - 23s - loss: 1.1991 - val_loss: 1.3570
Epoch 10/15
39200/39200 - 23s - loss: 1.1343 - val_loss: 1.3295
Epoch 11/15
39200/39200 - 23s - loss: 1.0765 - val_loss: 1.2984
Epoch 12/15
39200/39200 - 22s - loss: 1.0238 - val_loss: 1.2808
Epoch 13/15
39200/39200 - 23s - loss: 0.9757 - val_loss: 1.2673
Epoch 14/15
39200/39200 - 23s - loss: 0.9317 - val_loss: 1.2596
Epoch 15/15
39200/39200 - 22s - loss: 0.8902 - val_loss: 1.2542


<tensorflow.python.keras.callbacks.History at 0x7f5e3aa7ea90>