# Convolutional neural networks

A convolutional neural network is designed to identify indicative local predictors in a large structure, and to combine them to produce a fixed size vector representation of the structure, capturing the local aspects that are most informative for the prediction task at hand. I.e., the convolutional architecture will identify ngrams that are predictive for the task at hand, without the need to pre-specify an embedding vector for each possible ngram. The convolutional architecture also allows sharing predictive behavior between ngrams that share similar components, even if the exact ngram encountered at test time was never seen during training.

The convolutional architecture could be expanded into a hierarchy of convolution layers,
each one effectively looking at a longer range of ngrams in the sentence. This also allows the model to be sensitive to some non-contiguous ngrams.

In [1]:
import os
import numpy as np

import zipfile

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Where the labelled reviews live, relative to this notebook
BASE_DIR = "../"
TEXT_DATA_DIR = "{}data/".format(BASE_DIR)
TEXT_DATA_FILE = 'movie_reviews.csv'
HEADER = True  # when true, load_data() skips the first line of the file

# Share of the data held out for validation, and the seed that makes the split repeatable
VALIDATION_SPLIT = 0.1
RANDOM_SEED = 42


def load_data(path=None, header=None):
    """Read the labelled reviews file into parallel lists of texts and labels.

    Every data line is expected to look like ``<label>,<text>``. Only the
    first comma separates the two fields, so commas inside the review text
    are preserved. Apostrophes are removed from the text. Blank lines are
    skipped.

    Args:
        path: file to read; defaults to ``TEXT_DATA_DIR/TEXT_DATA_FILE``.
        header: whether the first line is a header to skip; defaults to
            ``HEADER``.

    Returns:
        A tuple ``(texts, labels)`` of two equally long lists of strings.

    Raises:
        ValueError: if a non-blank line contains no comma.
    """
    if path is None:
        path = os.path.join(TEXT_DATA_DIR, TEXT_DATA_FILE)
    if header is None:
        header = HEADER

    x = []
    y = []
    with open(path, "r", encoding="utf-8") as f:
        if header:
            # default of None: an empty file simply has no header to skip
            next(f, None)
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            temp_y, temp_x = line.split(",", 1)
            x.append(temp_x.replace("'", ""))
            y.append(temp_y)
    return x, y

data, labels = load_data()
# labels are read as strings; convert them once to a compact integer array
labels = np.asarray(labels, dtype='int8')

# splitting our original data into train and validation sets
# (stratify=labels is passed so the split is stratified by class)
data_train, data_val, labels_train, labels_val = train_test_split(
    data, labels,
    test_size=VALIDATION_SPLIT, random_state=RANDOM_SEED, stratify=labels)

# initialize dictionary size and maximum sentence length
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 40

# create a dictionary with Tokenizer; it is fitted on the training texts only,
# so the validation set does not influence the vocabulary
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='#$%&()*+-/:;<=>@[\\]^{|}~\t\n,.!"?`')
tokenizer.fit_on_texts(data_train)

# replacing words with their indexes from our dictionary
X_train = tokenizer.texts_to_sequences(data_train)
X_val = tokenizer.texts_to_sequences(data_val)

# pad / truncate every sentence to exactly MAX_SEQUENCE_LENGTH tokens
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)

Using TensorFlow backend.


In [2]:
# path to embeddings file
EMBEDDINGS_DIR = BASE_DIR + 'embeddings'
EMBEDDINGS_FILE = 'glove.6B.50d.txt'

EMBEDDING_DIM = 50

# keep only the words whose index is below the dictionary limit, i.e. the
# ones that fit into an embedding matrix with MAX_NB_WORDS rows
first_10000 = {k: v for k, v in tokenizer.word_index.items() if v < MAX_NB_WORDS}

# load the pretrained vectors; each line is "<word> <coef_1> ... <coef_n>"
embeddings = {}
with zipfile.ZipFile(os.path.join(EMBEDDINGS_DIR, EMBEDDINGS_FILE + '.zip')) as myzip:
    with myzip.open(EMBEDDINGS_FILE) as f:
        for line in f:
            # the archive member yields bytes, hence the explicit decode below
            values = line.split()
            if not values:
                continue
            word = values[0].decode('UTF-8')
            embeddings[word] = np.asarray(values[1:], dtype='float32')

In [3]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# whose dictionary index is i. Words without a pretrained vector keep an
# all-zero row.
embedding_matrix = np.zeros(shape=(tokenizer.num_words, EMBEDDING_DIM))
for word, i in first_10000.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

First of all, let's create a simple convolutional neural network on pre-trained embeddings.

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

NAME = "words_cnn"

# embedding layer initialised with the pretrained vectors and kept frozen
embedding_layer = Embedding(
    tokenizer.num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# three stacked width-4 convolutions, max pooling over the whole sequence,
# then a small dense classifier with a sigmoid output
model = Sequential([
    embedding_layer,
    Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu"),
    Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu"),
    Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu"),
    GlobalMaxPooling1D(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
# To train (callback_1..callback_3 have to be defined beforehand):
# model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
#           batch_size=1024, epochs=100, callbacks=[callback_1, callback_2, callback_3])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 50)            500000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 37, 100)           20100     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 34, 100)           40100     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 31, 100)           40100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
__________

Accuracy is worse than with RNNs: train – **80.55%**, validation – **76.10%**.

## Task 1

Try to improve this architecture and achieve better score.

<details>
  <summary>My solution</summary>
    <pre>
      <code>
# these two layers are not imported anywhere else in the notebook
from keras.layers import SpatialDropout1D, BatchNormalization

# same stack as the baseline, with spatial dropout + batch normalization
# after the first two convolutions and dropout + batch normalization
# after each hidden dense layer
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(activation="relu", filters=100, kernel_size=4, padding="valid"))
model.add(SpatialDropout1D(0.2))
model.add(BatchNormalization())
model.add(Conv1D(activation="relu", filters=100, kernel_size=4, padding="valid"))
model.add(SpatialDropout1D(0.2))
model.add(BatchNormalization())
model.add(Conv1D(activation="relu", filters=100, kernel_size=4, padding="valid"))
model.add(GlobalMaxPooling1D())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))
      </code>
    </pre>

</details>

***

My new score is on train – **79.30%** and on validation – **77.88%**.

About some tricks:

### [Spatial Dropout](https://arxiv.org/pdf/1411.4280.pdf)

This version performs the same function as Dropout, however it drops entire 1D feature maps instead of individual elements. If adjacent frames within feature maps are strongly correlated (as is normally the case in early convolution layers) then regular dropout will not regularize the activations and will otherwise just result in an effective learning rate decrease. In this case, **spatial dropout** will help promote independence between feature maps and should be used instead.

Here is an illustration of usual dropout:
<img src="https://photos-6.dropbox.com/t/2/AAD9bnES_baGH4NCCu8qwAiynRYb9n9NcDZ--ys1sO-axw/12/533843084/png/32x32/1/_/1/2/dropout.png/EKyZ0aIEGN8iIAIoAg/MMO1Y426ocFL2W0DQE8thr6a1n-lGkpMtl6tW7Y0FuA?size=2048x1536&size_mode=3" alt="dropout" style="width: 700px;"/>

And spatial dropout:

<img src="https://photos-2.dropbox.com/t/2/AAAwzLqr2EYT0Z3ehe_7NBz0AfqhCkOlBg6SrtdS1E7QYA/12/533843084/png/32x32/1/_/1/2/spacial_dropout.png/EKyZ0aIEGN8iIAIoAg/yvRhdcui-E_uvIMJHQ6WS_vFuD6i11Gc0BA9inTSqAQ?size=2048x1536&size_mode=3" alt="dropout" style="width: 700px;"/>

### [BatchNormalization](https://arxiv.org/abs/1502.03167)

Consider a batch of activations at some layer. To make each dimension unit gaussian, apply:

$$ \Large \widehat{x}^{(k)} = \frac{x^{(k)} - E[x^{(k)}]}{\sqrt{Var[x^{(k)}]}} $$ where the expectation and variance are computed over the training data set.

However, the layer should still be able to represent the identity transformation, so we scale and shift the normalized values with the learnable parameters $\gamma^{(k)}$ and $\beta^{(k)}$.

So our output will be: $$ \Large y^{(k)} = \gamma^{(k)}\widehat{x}^{(k)} + \beta^{(k)} $$

Algorithm:
1. Compute the empirical mean and variance independently for each dimension.
2. Normalize

Profit:
- Improves gradient flow through the network
- Allows higher learning rates
- Reduces the strong dependence on initialization
- Acts as a form of regularization in a funny way, and slightly reduces the need for dropout, maybe

**Note: at test time BatchNorm layer functions differently:**
The mean/std are not computed based on the batch. Instead, fixed empirical estimates of the mean and variance of the activations, collected during training, are used.

***

Another approach is not to use pretrained word vectors at all, but to train the embeddings jointly with the other weights of the network.


In [5]:
# No `weights=[embedding_matrix]` argument this time: the embedding table is
# not initialised from the pretrained vectors and is trained with the model.
embedding_layer = Embedding(input_dim=tokenizer.num_words,
                            output_dim=EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

# same architecture as the frozen-embedding model: three width-4
# convolutions, global max pooling and a dense classifier
conv_stack = [
    Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu")
    for _ in range(3)
]
classifier = [
    GlobalMaxPooling1D(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
]
model = Sequential([embedding_layer] + conv_stack + classifier)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
# To train (callback_1..callback_3 have to be defined beforehand):
# model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
#           batch_size=1024, epochs=100, callbacks=[callback_1, callback_2, callback_3])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 50)            500000    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 37, 100)           20100     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 34, 100)           40100     
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 31, 100)           40100     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
__________

It trains faster (4sec on GPU) and accuracy is higher: train – **81.53%**, validation – **79.77%**. 


For some NLP tasks it is helpful to use positional embeddings. Unfortunately, on our task they do not increase accuracy :(

In [6]:
from keras.layers import concatenate, Input
from keras.models import Model

POS_EMB_SIZE = 20

# position ids 0 .. MAX_SEQUENCE_LENGTH-1, one identical row per sample
X_train_emb = np.tile(np.arange(MAX_SEQUENCE_LENGTH), (len(X_train), 1))
X_val_emb = np.tile(np.arange(MAX_SEQUENCE_LENGTH), (len(X_val), 1))

# trainable embedding of the position ids
# NOTE(review): only MAX_SEQUENCE_LENGTH distinct ids are ever fed in, so a
# table with tokenizer.num_words rows looks larger than needed - confirm
# before shrinking, since it changes the parameter count.
embedding_layer_pos = Embedding(tokenizer.num_words,
                                POS_EMB_SIZE,
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

# branch 1: frozen pretrained word embeddings
words_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
x1 = Embedding(tokenizer.num_words,
               EMBEDDING_DIM,
               weights=[embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(words_input)

# branch 2: positional embeddings (reuses the layer defined above instead of
# building a second, identical one)
pos_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
x2 = embedding_layer_pos(pos_input)

# every token is described by its word vector followed by its position vector
x = concatenate([x1, x2])

x = Conv1D(activation="relu", filters=100, kernel_size=4, padding="valid")(x)
x = GlobalMaxPooling1D()(x)
x = Dense(100, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[words_input, pos_input], outputs=output)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


model.summary()
#model.fit([X_train, X_train_emb], labels_train, validation_data=([X_val, X_val_emb], labels_val),
#          batch_size=1024, epochs=100)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 40)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 40)            0                                            
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 40, 50)        500000      input_1[0][0]                    
____________________________________________________________________________________________________
embedding_5 (Embedding)          (None, 40, 20)        200000      input_2[0][0]                    
___________________________________________________________________________________________

## Dilated Convolutions

In simple terms, a dilated convolution is just a convolution applied to the input with defined gaps. With this definition, given that our input is a 2D image, dilation rate k=1 is a normal convolution, k=2 means skipping one pixel between sampled inputs, and k=3 means skipping 2 pixels.

<img src="https://qph.ec.quoracdn.net/main-qimg-d9025e88d7d792e26f4040b767b25819" alt="dilated_conv" style="width: 700px;"/>

In a dilated convolution architecture, each layer in the hierarchy of convolution layers has a stride size of k - 1. This allows an exponential growth in the effective window size as a function of the number of layers. For this reason, it is used in applications that care more about integrating knowledge of a wider context at a lower cost.

Let's compare:

**Usual convolutions**
<img src="https://photos-3.dropbox.com/t/2/AADAz0GQyUNqhNUJZZinmOkeHZIxheD_7IX4lCYytgUJ8Q/12/533843084/png/32x32/1/_/1/2/conv.png/EKyZ0aIEGOEiIAIoAg/k_P0Y9i1PEsQSbw2wWtPyd_WLhEEQ-WeU-RUL3vMWzw?size=2048x1536&size_mode=3" alt="simple_conv" style="width: 700px;"/>

**Dilated convolutions**
<img src="https://photos-2.dropbox.com/t/2/AAA5X_b80s2Ex4xai2phgske3Noh9fyQe6zjXReaSQc1tg/12/533843084/png/32x32/1/_/1/2/dil_conv.png/EKyZ0aIEGOEiIAIoAg/q_iqx-TzLxuzE_vOPw77As1zM9rV6l8LOC5Ds7wCAKU?size=2048x1536&size_mode=3" alt="simple_conv" style="width: 700px;"/>

In [11]:
# SpatialDropout1D is not part of the imports made in the earlier cells
from keras.layers import SpatialDropout1D

# trainable embeddings (no pretrained weights are passed)
embedding_layer = Embedding(tokenizer.num_words,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

model = Sequential()
model.add(embedding_layer)
# ordinary convolution (dilation_rate=1) ...
model.add(Conv1D(activation="relu", filters=100, kernel_size=5, padding="valid", dilation_rate=1))
model.add(SpatialDropout1D(0.2))
# ... followed by a dilated one (dilation_rate=2) that sees a wider context
model.add(Conv1D(activation="relu", filters=100, kernel_size=5, padding="valid", dilation_rate=2))
model.add(GlobalMaxPooling1D())
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()
# To train (callback_1..callback_3 have to be defined beforehand):
# model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
#           batch_size=1024, epochs=100, callbacks=[callback_1, callback_2, callback_3])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 40, 50)            500000    
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 31, 100)           20100     
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 22, 100)           40100     
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 13, 100)           40100     
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 100)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
__________

This architecture gives a small increase in accuracy: train – **82.28%**, validation – **80.39%**.

Recall that a convolutional neural network works as an ngram model. In the previous examples we used only one kernel size. What if we create several separate convolutions with different kernel sizes?

In [16]:
from keras.layers import concatenate, Input
from keras.models import Model


words_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')

# trainable word embeddings shared by all convolution branches
x = Embedding(input_dim=tokenizer.num_words,
              output_dim=EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=True)(words_input)

# one convolution branch per ngram width (2, 3, 4 and 5 tokens), all with
# "same" padding
x1, x2, x3, x4 = [
    Conv1D(filters=100, kernel_size=width, padding="same", activation="relu")(x)
    for width in (2, 3, 4, 5)
]
x = concatenate([x1, x2, x3, x4])
x = GlobalMaxPooling1D()(x)
x = Dense(100, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=words_input, outputs=output)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


model.summary()
# To train (this model takes the word indices as its only input):
# model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
#           batch_size=1024, epochs=100)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_6 (InputLayer)             (None, 40)            0                                            
____________________________________________________________________________________________________
embedding_14 (Embedding)         (None, 40, 50)        500000      input_6[0][0]                    
____________________________________________________________________________________________________
conv1d_35 (Conv1D)               (None, 40, 100)       10100       embedding_14[0][0]               
____________________________________________________________________________________________________
conv1d_36 (Conv1D)               (None, 40, 100)       15100       embedding_14[0][0]               
___________________________________________________________________________________________

As expected it was a good idea to do it. Train accuracy is **84.88%** and validation is **81.61%**.

***

## Neural networks on symbols

There are two main types of preprocessing for neural networks on symbols:
- similarly to words, train embeddings of symbols;
- represent symbols as one-hot-encoded (OHE) vectors.

First, we'll figure out which characters to include. We can include any characters, but the general practice for English is to use 70 characters: lowercase letters, numbers and punctuation. Depending on the task or language, you can vary the number of characters.

Let's start with a small task.

## Task 2

Create a dictionary of 70 characters and replace the symbols with their indexes. 

*Tip: use the `string` library and the `pad_sequences` function (`from keras.preprocessing.sequence import pad_sequences`).*

In [None]:
import string
from keras.preprocessing.sequence import pad_sequences

def create_vocab_set():
    """Exercise stub: build the character dictionary.

    Expected to return ``(vocab, vocab_size)``, where ``vocab`` maps each
    character to its index.
    """
    # 1. Your code
    return vocab, vocab_size


def text2sequence(text, vocab):
    """Exercise stub: replace the characters of every text with their indexes.

    Args:
        text: the texts to convert.
        vocab: the character dictionary produced by ``create_vocab_set``.

    Returns:
        ``temp``, the list that the exercise code is expected to fill.
    """
    temp = []
    # 2. Your code
    return temp


vocab, vocab_size = create_vocab_set()

# the validation part of the split is called data_val / X_val in this notebook
X_train = text2sequence(data_train, vocab)
X_val = text2sequence(data_val, vocab)

# 3. Your code: replace the `...` placeholders
X_train = ...
X_val = ...

<details>
  <summary>Here is a right answer!</summary>
      <pre>
            <code>
              1. alphabet = (list(string.ascii_lowercase) + list(string.digits) + 
                              list(string.punctuation) + [' ', '\n'])
                 vocab_size = len(alphabet)
                 vocab = {}
                 for ix, t in enumerate(alphabet):
                     vocab[t] = ix+1
              2. for review in text:
                     temp.append([])
                     for i in review:
                         char = vocab.get(i,0)
                         if char != 0:
                             temp[-1].append(char)
              3. X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, value=0)
                 X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH, value=0)
            </code>
      </pre>

</details>

In [8]:
from keras.layers import GlobalMaxPooling1D
NAME = "char_cnn_emb"
EMBEDDING_DIM = 50
vocab_size = 70

# initialize the model: trainable character embeddings (vocab_size + 1 rows),
# three width-4 convolutions, global max pooling and a dense classifier
model = Sequential([
    Embedding(vocab_size+1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True),
    Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu"),
    Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu"),
    Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu"),
    GlobalMaxPooling1D(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
# To train (callback_1..callback_3 have to be defined beforehand):
# model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
#           batch_size=1024, epochs=100, callbacks=[callback_1, callback_2, callback_3])



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 40, 50)            3550      
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 37, 100)           20100     
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 34, 100)           40100     
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 31, 100)           40100     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
__________

Train accuracy is **82.21%**, validation – **78.31%**.

The main drawback of this approach is that we do not take into account the order of words (nonlinear transformations are performed, and then the best features are selected and several fully connected layers are applied on them).

Now consider the second approach to character models – **One-Hot-Encoding (OHE)**. Suppose we have a dictionary of three characters "a", "b", "c". The OHE representation of "abca" (with the first character encoded as the all-zero vector) will be $$ a - 0 0 \\ b - 1 0 \\ c - 0 1 \\ a - 0 0 $$

For the implementation, we will use additional functions from the Keras API and TensorFlow.

In [17]:
import tensorflow as tf
# one-hot encoding helper
def ohe(x, sz):
    """One-hot encode integer indices along a new last axis.

    Args:
        x: tensor of integer indices.
        sz: depth of the one-hot axis.

    Returns:
        A float32 tensor with one extra trailing axis of size ``sz``,
        holding 1.0 at the index position and 0.0 elsewhere.
    """
    # one_hot emits float32 directly, so the deprecated tf.to_float cast
    # is not needed
    return tf.one_hot(x, sz, on_value=1.0, off_value=0.0, axis=-1, dtype=tf.float32)

In [19]:
from keras.models import Model
from keras.layers import Input, Lambda
from keras.layers import MaxPooling1D, LSTM

NAME = "char_cnn_ohe"
# model input: a fixed-length sequence of character indices
in_sentence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
# Lambda layer that performs the one-hot transformation
# NOTE(review): the one-hot depth is vocab_size, while the Task 2 answer
# assigns indices 1..vocab_size with 0 used for padding - confirm that the
# highest index is meant to fall outside the one-hot range.
embedded = Lambda(
    ohe,
    output_shape=lambda shape: (shape[0], shape[1], vocab_size),
    arguments={"sz": vocab_size},
)(in_sentence)
block = embedded
# first convolution, followed by max pooling
block = Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu")(block)
block = MaxPooling1D(pool_size=5)(block)
# two more convolutions without pooling
for i in range(1, 3):
    block = Conv1D(filters=100, kernel_size=4, padding="valid", activation="relu")(block)
# LSTM cell over the remaining time steps
block = LSTM(128, dropout=0.1, recurrent_dropout=0.1)(block)
block = Dense(100, activation='relu')(block)
block = Dense(1, activation='sigmoid')(block)
# assemble the model
model = Model(inputs=in_sentence, outputs=block)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


model.summary()

# To train (callback_1..callback_3 have to be defined beforehand):
# model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
#           batch_size=1024, epochs=100, callbacks=[callback_1, callback_2, callback_3])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 40)                0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 40, 70)            0         
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 37, 100)           28100     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 7, 100)            0         
_________________________________________________________________
conv1d_43 (Conv1D)           (None, 4, 100)            40100     
_________________________________________________________________
conv1d_44 (Conv1D)           (None, 1, 100)            40100     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
__________

It slightly improves our previous model: train – **80.91%**, validation – **78.65%**.