In [1]:
import numpy as np

import json
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.optimizers import *

import tensorflow as tf

Using TensorFlow backend.


In [2]:
with open('data/sentipolc_word_index.json', 'r') as f:
    word_index = json.load(f)

In [3]:
with open('data/words.tsv', 'w') as f:
    for w,_ in word_index.items():
        f.write('{}\n'.format(w))

In [4]:
words = [w for w,_ in word_index.items()]

In [5]:
data = np.load('data/sentipolc_seq.npz')

x_train = data['x_train']
y_train = data['y_train']

x_test = data['x_test']
y_test = data['y_test']

Check the max length of the text

In [6]:
max_len_seq = max([len(x) for x in x_train])
print('max len seq {}'.format(max_len_seq))
max_idx = max(x_train.max())
print('max id {}'.format(max_idx))

max len seq 40
max id 15204


In [7]:
x_train_pad = sequence.pad_sequences(x_train, maxlen=max_len_seq, padding='post')
x_test_pad = sequence.pad_sequences(x_test, maxlen=max_len_seq, padding='post')

In [8]:
x_train_pad.shape

(7410, 40)

In [9]:
x_train_pad[:2]

array([[15204,    11,   980,    15, 15204, 15204,    22, 15204, 15204,
          109,    10, 15204,    34,    33, 15204, 15204, 15204,    47,
        15204,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0],
       [15204, 15204,  1381,   474, 15204, 15204, 15204,    47, 15204,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]], dtype=int32)

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

split_train_val = StratifiedShuffleSplit(1,test_size=0.2, random_state=123456)

for train, test in split_train_val.split(x_train_pad, y_train):
    x_train_pad_split, y_train_pad_split = x_train_pad[train], y_train[train]
    x_val_pad_split, y_val_pad_split = x_train_pad[test], y_train[test]

In [11]:
callbacks = [
    TensorBoard(log_dir='./logs_word_lstm', histogram_freq=1, batch_size=128, embeddings_freq=1, embeddings_metadata=words),
    ReduceLROnPlateau(patience=2, verbose=1)
]

Instructions for updating:
Use the retry module or similar alternatives.


In [33]:
def build_model():
    model = Sequential()
    model.add(Embedding(input_dim=max_idx, output_dim=100, input_length=max_len_seq))
    model.add(Bidirectional(GRU(64,  dropout=0.2, recurrent_dropout=0.2, activation='relu')))
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [34]:
model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 40, 100)           1520400   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 128)               63360     
_________________________________________________________________
dense_9 (Dense)              (None, 10)                1290      
_________________________________________________________________
dropout_4 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 11        
Total params: 1,585,061
Trainable params: 1,585,061
Non-trainable params: 0
_________________________________________________________________


In [35]:
!rm -rf logs_word_lstm
!mkdir -p logs_word_lstm

In [36]:
history = model.fit(x_train_pad_split, y_train_pad_split, validation_data=(x_val_pad_split, y_val_pad_split),
                    batch_size=128, epochs=5, callbacks=callbacks)

Train on 5928 samples, validate on 1482 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [37]:
del model

## Evaluate on the test set

In [43]:
callbacks = [
    TensorBoard(histogram_freq=0, batch_size=128, embeddings_freq=1, embeddings_metadata=words),
]

In [44]:
model = build_model()

history = model.fit(x_train_pad, y_train, batch_size=128, epochs=3, callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [45]:
loss, acc = model.evaluate(x_test_pad, y_test)



In [46]:
print('loss {}, acc {}'.format(loss, acc))

loss 0.6503494024276734, acc 0.7835


In [47]:
model.save('models/sentipolc_word_lstm.hdf5')