In [1]:
import numpy as np

import json
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.optimizers import *
import keras.backend as K
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the vocabulary index. Its keys are the words (they are written out one
# per line in the next cell); the values are presumably integer word ids --
# TODO confirm against the script that produced the file.
# JSON is UTF-8 by specification, so state the encoding explicitly instead of
# relying on the platform's locale default.
with open('data/sentipolc_word_index.json', 'r', encoding='utf-8') as f:
    word_index = json.load(f)

In [3]:
# Dump the vocabulary to disk, one word per line, in the dict's iteration order.
# The encoding is pinned to UTF-8 so that non-ASCII words cannot raise
# UnicodeEncodeError (or be written in a locale-specific encoding) on platforms
# whose default encoding is not UTF-8.
with open('data/words.tsv', 'w', encoding='utf-8') as f:
    # Iterating a dict yields its keys; the values are not needed here.
    f.writelines('{}\n'.format(w) for w in word_index)

In [4]:
# Vocabulary as a plain list of words, in the same order as the dict's keys.
words = list(word_index)

In [5]:
# Load the pre-computed train/test arrays from the .npz archive.
# The archive is opened as a context manager so the underlying file handle is
# closed once the four arrays have been read (they are materialised on access).
# NOTE(review): allow_pickle=True is required by NumPy >= 1.16.3 when the
# sequences are stored as ragged object arrays -- presumably the case here,
# since a later cell takes len() of every row. Only acceptable because this is
# a trusted, locally produced file; never enable it for untrusted archives.
with np.load('data/sentipolc_seq.npz', allow_pickle=True) as data:
    x_train = data['x_train']
    y_train = data['y_train']

    x_test = data['x_test']
    y_test = data['y_test']

Check the maximum sequence length and the largest word id in the training set

In [6]:
# Length of the longest training sequence.
max_len_seq = max(len(x) for x in x_train)
print('max len seq {}'.format(max_len_seq))

# Largest word id over *all* training sequences.
# Calling `.max()` on the array itself is not equivalent: on a ragged
# (object-dtype) array it picks the lexicographically largest sequence and the
# outer max() then only looks inside that one sequence; on a regular 2-D array
# it returns a scalar and the outer max() raises TypeError.
# Empty sequences are skipped because max() of an empty sequence raises.
max_idx = max(max(x) for x in x_train if len(x) > 0)
print('max id {}'.format(max_idx))

max len seq 40
max id 10208


In [7]:
# Width of the bag-of-words vectors: it is passed to vectorize() as the id
# cut-off (ids >= this value are dropped) and used as the model's input size.
max_features = 5000

In [8]:
def vectorize(data, max_idx):
    """Encode integer sequences as multi-hot (bag-of-words) row vectors.

    Args:
        data: sized iterable of sequences of integer word ids.
        max_idx: width of the output; only ids in ``[0, max_idx)`` are kept.

    Returns:
        Float array of shape ``(len(data), max_idx)`` where entry ``[i, j]``
        is 1.0 if id ``j`` occurs in ``data[i]`` and 0.0 otherwise.
    """
    results = np.zeros((len(data), max_idx))
    # Named `tokens` rather than `sequence` so the loop variable does not
    # shadow the `sequence` module imported from keras.preprocessing.
    for i, tokens in enumerate(data):
        # The lower bound matters: a negative id would otherwise pass the
        # filter and, through NumPy's negative indexing, silently set a column
        # counted from the end of the row.
        filtered = [x for x in tokens if 0 <= x < max_idx]
        results[i, filtered] = 1.
    return results

In [9]:
# Multi-hot encode both splits with the same vocabulary cut-off.
x_train_vect, x_test_vect = (
    vectorize(split, max_features) for split in (x_train, x_test)
)

In [10]:
# Sanity check: vectorize() returns (number of sequences, max_features).
x_train_vect.shape

(4154, 5000)

# Define metrics

In [11]:
def precision(y_true, y_pred):
    """Batch-wise precision for binary / multi-label predictions.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to ``[0, 1]`` and rounded before being summed, so the result is
    (rounded true positives) / (rounded predicted positives) for the tensors
    passed in. ``K.epsilon()`` is added to the denominator to avoid a division
    by zero when nothing is predicted positive.

    Returns:
        A scalar backend tensor.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    selected = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(selected) + K.epsilon())

In [12]:
def recall(y_true, y_pred):
    """Batch-wise recall for binary / multi-label predictions.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to ``[0, 1]`` and rounded before being summed, so the result is
    (rounded true positives) / (rounded actual positives) for the tensors
    passed in. ``K.epsilon()`` is added to the denominator to avoid a division
    by zero when there are no positive labels.

    Returns:
        A scalar backend tensor.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(relevant) + K.epsilon())

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split holding out 20% of the training data for
# validation; the fixed random_state makes the split repeatable.
split_train_val = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123456)

# With n_splits=1 the generator yields exactly one (train, val) index pair.
train, val = next(split_train_val.split(x_train_vect, y_train))
x_train_split, x_val_split = x_train_vect[train], x_train_vect[val]
y_train_split, y_val_split = y_train[train], y_train[val]

In [14]:
# Callbacks used for the validation run below.
callbacks = [
    # Reduce the learning rate after 2 epochs without improvement of the
    # callback's (default) monitored quantity.
    ReduceLROnPlateau(verbose=1, patience=2),
    # Stop training once val_loss has not improved for 5 epochs.
    EarlyStopping(
        monitor='val_loss',
        mode='auto',
        min_delta=0,
        patience=5,
        verbose=1,
    ),
]

In [18]:
def build_model():
    """Create and compile a fresh bag-of-words binary classifier.

    Architecture: ``max_features`` inputs -> Dense(8, relu) ->
    Dense(1, sigmoid), compiled with binary cross-entropy, a default ``Adam``
    optimizer and accuracy as the only metric.

    Returns:
        A newly constructed, compiled ``Sequential`` model.
    """
    stack = [
        Dense(8, activation='relu', input_shape=(max_features,)),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(stack)
    net.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    return net

In [19]:
# Build a new compiled model for the validation run.
model = build_model()
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 8)                 40008     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 40,017
Trainable params: 40,017
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Fit on the training part of the stratified split and monitor the held-out
# part; the callbacks may lower the learning rate or stop before 20 epochs.
history = model.fit(
    x_train_split,
    y_train_split,
    batch_size=128,
    epochs=20,
    validation_data=(x_val_split, y_val_split),
    callbacks=callbacks,
)

Train on 3323 samples, validate on 831 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 17/20
Epoch 18/20

Epoch 00018: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 00018: early stopping


## Evaluate on the test set

In [22]:
# Start again from a newly built model, so nothing learned during the
# validation run above carries over.
model = build_model()

# Retrain on the whole training set (no validation data, no callbacks) for a
# fixed 5 epochs. `history` from the validation run is overwritten here.
history = model.fit(x_train_vect, y_train, batch_size=128, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
# Score the retrained model on the held-out test set. The model is compiled
# with a single metric, so evaluate() returns [loss, accuracy].
# The third positional argument of evaluate() is the batch size; it is spelled
# out here so the `3` is not mistaken for something else.
loss, acc = model.evaluate(x_test_vect, y_test, batch_size=3)



In [25]:
# Report the test-set loss and accuracy returned by model.evaluate().
print('loss {}, acc {}'.format(loss, acc))

loss 0.5671231577651842, acc 0.7133333441189357


In [None]:
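# Saving is disabled. NOTE: the file name says "lstm", but the model trained
# above is a dense bag-of-words network -- rename before re-enabling.
#model.save('models/sentipolc_word_lstm.hdf5')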