In [1]:
from keras.preprocessing import sequence
from keras.datasets import imdb
from keras import layers, models

Using TensorFlow backend.


In [2]:
class Data:
    """IMDB review dataset with every sequence forced to a fixed length.

    After construction the instance exposes ``x_train`` / ``x_test`` (the
    results of ``sequence.pad_sequences``) and ``y_train`` / ``y_test`` (the
    labels exactly as returned by ``imdb.load_data``).
    """

    def __init__(self, max_features=20000, maxlen=80):
        """Load both IMDB splits and pad/truncate their inputs to ``maxlen``.

        Args:
            max_features: Passed to ``imdb.load_data`` as ``num_words``.
            maxlen: Passed to ``sequence.pad_sequences`` as ``maxlen``.
        """
        train_split, test_split = imdb.load_data(num_words=max_features)
        train_inputs, train_labels = train_split
        test_inputs, test_labels = test_split

        # Reviews longer than maxlen are cut down to it; shorter ones get
        # padding (0) added at the front.
        self.x_train = sequence.pad_sequences(train_inputs, maxlen=maxlen)
        self.x_test = sequence.pad_sequences(test_inputs, maxlen=maxlen)

        self.y_train = train_labels
        self.y_test = test_labels


# Build the dataset once, then report the split sizes and preview five samples.
data = Data()
train_count, test_count = len(data.y_train), len(data.y_test)
print('train:', train_count, ', test:', test_count)

# Same five consecutive samples are shown for both labels and features.
preview = slice(20000, 20005)
print('label:', data.y_train[preview])
print('features:', data.x_train[preview])

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
train: 25000 , test: 25000
label: [0 0 0 0 1]
features: [[   10    10    17     6    22  1985   255    11   631    62    28    77
    128    48     2   109    66    16     6   668  1985    19    41   861
      2     6    87   227     7  3588    21    17   230    17  3469  2391
      5   541  1554   140    14    31     9   254     8  1559 16128     2
      9   345  2516    42     2  5758   469     4    22    63   944     6
   1257  1166     7  2172   599  6203     2     2    17     2     5  8548
   2234  5252    17     2    26    52   696  1850]
 [  820   910  1030     8  5817   183    56    11  2716     7    68 12478
   4697    10    10   910    70  4146     4   118   927     4   118  1180
      5   907    21   131    36  5817   183    56    14    20     9   595
      4   619   155     9    15    13  1781   910    11    68  7671   127
     24    60   124    54     6    20     9    52    42    78    10    10
     12  

#### 정수 인코딩: 각 단어를 정수 인덱스로 표현합니다. 예를 들어 어휘 크기를 2000개로 제한하면 X 데이터의 각 값은 0 이상 2000 미만의 정수가 됩니다.

In [3]:
class RNN_LSTM1(models.Model):
    """Binary classifier: Embedding -> LSTM (optionally bidirectional) -> sigmoid.

    The network is assembled with the functional API inside ``__init__`` and
    compiled immediately, so an instance is ready for ``fit`` / ``summary``.
    """

    def __init__(self, max_features, maxlen, bidirectional,
                 embedding_dim=128, lstm_units=128,
                 dropout=0.2, recurrent_dropout=0.2):
        """Build and compile the model.

        Args:
            max_features: Input dimension of the embedding (vocabulary size).
            maxlen: Length of each input sequence.
            bidirectional: If true, wrap the LSTM in ``layers.Bidirectional``.
            embedding_dim: Size of each embedding vector.
            lstm_units: Number of units in the LSTM (per direction).
            dropout: ``dropout`` argument of the LSTM.
            recurrent_dropout: ``recurrent_dropout`` argument of the LSTM.
        """
        # Input: integer token ids. Shape: [batch, maxlen]
        x = layers.Input((maxlen,))
        # Embedding. Shape: [batch, maxlen, embedding_dim]
        e = layers.Embedding(max_features, embedding_dim)(x)

        # return_sequences is left at its default, so only the final state is
        # emitted and the time axis is dropped.
        lstm = layers.LSTM(lstm_units, dropout=dropout,
                           recurrent_dropout=recurrent_dropout)
        if bidirectional:
            # Forward and backward outputs are concatenated (the default merge
            # mode). Shape: [batch, lstm_units * 2]
            h = layers.Bidirectional(lstm)(e)
        else:
            # Shape: [batch, lstm_units]
            h = lstm(e)

        # Output: a single sigmoid unit. Shape: [batch, 1]
        y = layers.Dense(1, activation='sigmoid')(h)
        super().__init__(x, y)

        # try using different optimizers and different optimizer configs
        self.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# Build one model per LSTM direction setting and print each layer summary.
shared_kwargs = {'max_features': 20000, 'maxlen': 80}

# Unidirectional LSTM.
model1 = RNN_LSTM1(bidirectional=False, **shared_kwargs)
model1.summary()

# Bidirectional LSTM.
model2 = RNN_LSTM1(bidirectional=True, **shared_kwargs)
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 80, 128)           2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 80)                0         
_________________________________________________________________
