In [None]:
!pip install gensim

In [None]:
from keras.datasets import imdb

# Load the IMDB reviews as integer-encoded sequences plus their labels.
# No vocabulary or length limits are applied (num_words=None, skip_top=0,
# maxlen=None). start_char=1 / oov_char=2 / index_from=3 are the marker ids
# and index offset that the decoding helpers further down rely on.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    path="imdb.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=27,
    start_char=1,
    oov_char=2,
    index_from=3,
)

In [None]:
# Data preparation
def delete(arr):
    """Return a copy of ``arr`` with the ids 1 and 2 removed from every sequence.

    Args:
        arr: Iterable of iterables of token ids.

    Returns:
        A list of lists holding the remaining ids in their original order.
        Sequences that contain only 1s and 2s become empty lists.
    """
    return [[token for token in sequence if token not in (1, 2)]
            for sequence in arr]

def decode_review(data):
    """Convert sequences of token ids back into space-separated review text.

    The word index is fetched via ``imdb.get_word_index()`` and inverted so
    that ids map to words. Each id is shifted down by 3 before the lookup;
    ids with no matching word are rendered as ``'?'``.

    Args:
        data: Iterable of iterables of integer token ids.

    Returns:
        A list with one decoded string per input sequence.
    """
    index_to_word = {idx: word for word, idx in imdb.get_word_index().items()}
    return [
        ' '.join(index_to_word.get(token - 3, '?') for token in review)
        for review in data
    ]

# Strip the marker ids (1 and 2) from both splits ...
X_train = delete(x_train)
X_test = delete(x_test)
# ... then turn the cleaned id sequences back into text.
seq_train = decode_review(X_train)
seq_test = decode_review(X_test)

In [None]:
import json

# Read the decoded reviews and the labels back from JSON files under data/.
# NOTE: these assignments replace any seq_train / seq_test / y_train / y_test
# values already present in the kernel.
with open('data/seq_train.json', 'r') as f:
    seq_train = json.load(f)
with open('data/seq_test.json', 'r') as f:
    seq_test = json.load(f)
with open('data/y_train.json', 'r') as f:
    y_train = json.load(f)
# Test labels must come from the test file; reading y_train.json here would
# silently evaluate the test reviews against training labels.
with open('data/y_test.json', 'r') as f:
    y_test = json.load(f)

In [None]:
# Load the word2vec model
import gensim

# Binary word2vec-format vectors read from the gzipped file under data/.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [None]:
import numpy as np
# Number of token positions per review in the embedding arrays built below.
length = 100
# Number of reviews (and labels) taken from the start of each split.
size = 10000

# Embed reviews with word2vec
def embedding(seq, model, length, size):
    """Turn the first ``size`` reviews into a dense array of word vectors.

    Each review is split on single spaces and only its first ``length`` tokens
    are looked up in ``model``. Tokens the model does not know, and positions
    past the end of a short review, are left as zero vectors.

    Args:
        seq: Indexable collection of review strings with at least ``size`` items.
        model: Word-vector lookup supporting ``model[word]`` and raising
            ``KeyError`` for unknown words (e.g. gensim ``KeyedVectors``).
        length: Number of token positions kept per review.
        size: Number of reviews to embed.

    Returns:
        ``numpy.ndarray`` of shape ``(size, length, dim)``, where ``dim`` is
        ``model.vector_size`` when the model exposes it and 300 otherwise.

    Raises:
        IndexError: If ``seq`` holds fewer than ``size`` reviews.
    """
    # Use the model's own vector width when available; 300 is the fallback.
    dim = getattr(model, 'vector_size', 300)
    res = np.zeros((size, length, dim))
    for i in range(size):
        # Truncate up front instead of relying on an out-of-range write
        # being swallowed for every token beyond ``length``.
        for j, word in enumerate(seq[i].split(' ')[:length]):
            try:
                res[i, j, :] = model[word]
            except KeyError:
                # Out-of-vocabulary token: keep the zero vector. Any other
                # error (wrong model object, shape mismatch) now surfaces.
                continue
    return res

def label(y, size):
    """Copy the first ``size`` entries of ``y`` into a 1-D float array.

    Args:
        y: Indexable collection of numeric labels with at least ``size`` items.
        size: Number of labels to take.

    Returns:
        ``numpy.ndarray`` of shape ``(size,)`` and dtype float64.
    """
    return np.array([y[idx] for idx in range(size)], dtype=np.float64)

# Training split: embedded reviews and matching labels.
embed_train = embedding(seq_train, model, length, size)
label_train = label(y_train, size)
# Test split: same treatment.
embed_test = embedding(seq_test, model, length, size)
label_test = label(y_test, size)

In [None]:
# Read the embedding and label arrays from .npy files under data/.
# These assignments replace any arrays of the same name already in the kernel.
embed_train = np.load('data/embed_train.npy')
label_train = np.load('data/label_train.npy')
embed_test = np.load('data/embed_test.npy')
label_test = np.load('data/label_test.npy')

In [None]:
# Build and train the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Input shape of one sample: (time steps, embedding width).
timesteps = 100
data_dim = 300

# NOTE: this rebinds `model`, which earlier held the word2vec vectors. The
# embedding cell cannot be re-run after this one without reloading them.
model = Sequential()
model.add(LSTM(128, input_shape=(timesteps, data_dim)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit, paired with binary cross-entropy below.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and roll back to the best
# weights seen; otherwise the model keeps the weights of the last (worse) epoch.
callback = EarlyStopping(monitor='val_loss', patience=5,
                         restore_best_weights=True)
# The last 10% of the training data is held out for validation.
model.fit(embed_train, label_train, epochs=100,
          validation_split=0.1, verbose=1, callbacks=[callback])

In [None]:
# Write the trained model to data/LSTM_model.h5.
model.save('data/LSTM_model.h5')

In [None]:
from tensorflow.keras.models import load_model
# Reload the saved model from disk under a new name and print its layer summary.
n_model = load_model('data/LSTM_model.h5')
n_model.summary()