## Import

In [10]:
%env KERAS_BACKEND=tensorflow
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Reshape, MaxPooling2D, Flatten, Conv2D, Reshape
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
import tensorflow as tf
from keras import regularizers
from keras.models import model_from_json

env: KERAS_BACKEND=tensorflow


## 讀入資料庫

In [2]:
N_W = 5487
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=N_W,
                                                     skip_top=0,
                                                     oov_char=2,
                                                     index_from=3)

## 算sequence的平均長度和變異數

In [11]:
leng = [len(i) for i in x_train] + [len(i) for i in x_test]
M = np.mean(leng)
var = np.sqrt(np.var(leng))
print('平均長度:{}, 標準差:{}'.format(M, var))

平均長度:234.75892, 標準差:172.91149458735703


## 設定最大長度

In [4]:
MAX_LEN = int(np.round(M + 2*var))

## 處理字串長度

In [5]:
x_train_maxlen = sequence.pad_sequences(
    sequences = x_train, 
    maxlen=MAX_LEN,
    padding = 'pre',
    truncating = 'pre'
)
x_test_maxlen = sequence.pad_sequences(
    sequences = x_test, 
    maxlen=MAX_LEN,
    padding = 'pre',
    truncating = 'pre'
)

In [6]:
len(x_train_maxlen[0]), len(x_test_maxlen[0])

(581, 581)

## 架LSTM

In [7]:
N = 32
UNIT = 16

model = Sequential()
model.add(
    Embedding(
        input_dim = N_W,
        output_dim = N,
        input_length = MAX_LEN
    ))
model.add(
    Bidirectional(
        LSTM(
            units = UNIT,
            dropout = 0.01,
            recurrent_dropout = 0.05,
            return_sequences = True,
            input_shape = (MAX_LEN,N,),
        )))
model.add(Reshape((MAX_LEN, UNIT*2, 1,)))
model.add(
        Conv2D(
            input_shape=(MAX_LEN, UNIT*2, 1, ),
            data_format="channels_last",
            filters = 10,
            kernel_size = (MAX_LEN // 20, (UNIT * 2) // 8),
            strides = 1,
            padding = 'valid',
            activation = 'relu',
        ))
model.add(
    MaxPooling2D(
        pool_size = (2,2),
        strides = 2,
        data_format='channels_last',
    ))
model.add(
        Conv2D(
            data_format="channels_last",
            filters = 5,
            kernel_size = (MAX_LEN // 40, (UNIT * 2) // 8),
            strides = 2,
            padding = 'valid',
            activation = 'relu', 
        ))
model.add(
    MaxPooling2D(
        pool_size = (2,2),
        strides = 2,
        data_format='channels_last',
    ))
model.add(
        Conv2D(
            data_format="channels_last",
            filters = 2,
            kernel_size = (MAX_LEN // 80, (UNIT * 2) // 16),
            strides = 1,
            padding = 'valid',
            activation = 'relu', 
        ))
model.add(
    MaxPooling2D(
        pool_size = (2,2),
        strides = 2,
        data_format='channels_last',
    ))
model.add(Flatten())
model.add(
    Dense(
        8,
        activation='relu',
    ))

model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 581, 32)           175584    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 581, 32)           6272      
_________________________________________________________________
reshape_1 (Reshape)          (None, 581, 32, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 553, 29, 10)       1170      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 276, 14, 10)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 132, 6, 5)         2805      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 66, 3, 5)          0         
__________

In [8]:
early_stopping = EarlyStopping(monitor='val_acc', patience=5, verbose=2)
for i in range(100):
    model.fit(x_train_maxlen, y_train, batch_size=256, epochs=1,
              validation_data=[x_test_maxlen, y_test], callbacks=[early_stopping])
    model_json = model.to_json()
    open('lstm_model/imdb_model_arch'+ str(i)+'.json','w').write(model_json)
    model.save_weights('lstm_model/imdb_model_weights'+str(i)+'.h5')

Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
 1536/25000 [>.............................] - ETA: 7:45 - loss: 0.1425 - acc: 0.9557

KeyboardInterrupt: 

## 分數

In [13]:
model = model_from_json(open('lstm_model/imdb_model_arch4.json').read())
model.load_weights('lstm_model/imdb_model_weights4.h5')

In [15]:
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['acc'])
score = model.evaluate(x_test_maxlen, y_test, batch_size=512)
print('Loss of testing data = {}'.format(score[0]))
print('Acc of testing data = {}'.format(score[1]))

Loss of testing data = 0.3248444489479065
Acc of testing data = 0.8639999995231629
