In [1]:
import numpy as np

import json
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.optimizers import *

Using TensorFlow backend.


In [2]:
with open('data/sentipolc_char_index.json', 'r') as f:
    word_index = json.load(f)

In [3]:
with open('data/chars.tsv', 'w') as f:
    for w,_ in word_index.items():
        f.write('{}\n'.format(w))

In [4]:
words = [w for w,_ in word_index.items()]

In [5]:
data = np.load('data/sentipolc_char_seq.npz')

x_train = data['x_train']
y_train = data['y_train']

x_test = data['x_test']
y_test = data['y_test']

Check the max length of the text

In [6]:
max_len_seq = max([len(x) for x in x_train])
print('max len seq {}'.format(max_len_seq))
max_idx = max(x_train.max())
print('max id {}'.format(max_idx))

max len seq 238
max id 126


In [7]:
x_train_pad = sequence.pad_sequences(x_train, maxlen=max_len_seq, padding='post')
x_test_pad = sequence.pad_sequences(x_test, maxlen=max_len_seq, padding='post')

In [8]:
x_train_pad.shape

(7410, 238)

In [9]:
x_train_pad[:2]

array([[28,  6,  7,  3,  6,  7,  4,  1, 10,  3,  1, 17,  3,  8,  7,  2,  7,
         3,  1, 17,  5,  8,  1, 50,  2,  3,  1, 21,  3, 36,  2,  4,  6,  3,
        10,  5,  1, 11,  2,  1, 12,  4, 18, 17, 10,  2, 12,  3, 22,  1,  9,
        15, 13, 24, 15, 19, 13, 26,  9,  1, 14,  2, 12,  5,  1, 12, 29,  5,
         1, 47, 18,  2, 12,  3,  1,  7, 16,  7,  7,  2,  1, 11,  4,  6,  4,
         1, 20,  3,  8,  2,  4,  1,  9, 15, 13, 24, 15, 19, 13, 26,  9, 47,
         1,  9, 34, 31, 33,  9,  1, 23,  2,  3,  1,  9, 20, 27, 21, 19, 28,
        30, 21,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

split_train_test = StratifiedShuffleSplit(1,test_size=0.2, random_state=123456)

for train, test in split_train_test.split(x_train_pad, y_train):
    x_train_pad_split, y_train_pad_split = x_train_pad[train], y_train[train]
    x_val_pad_split, y_val_pad_split = x_train_pad[test], y_train[test]

In [11]:
callbacks = [
    TensorBoard(histogram_freq=1, batch_size=128, embeddings_freq=1),
    ReduceLROnPlateau(patience=2, verbose=1)
]

Instructions for updating:
Use the retry module or similar alternatives.


In [12]:
def build_model():
    model = Sequential()
    model.add(Embedding(input_dim=max_idx, output_dim=100, input_length=max_len_seq))
    model.add(Conv1D(filters=64, kernel_size=5, padding='same', activation='relu', strides=1))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu', strides=1))
    model.add(GlobalMaxPool1D())
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [13]:
model = build_model()
model.summary()

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 238, 100)          12600     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 238, 64)           32064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 119, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 119, 32)           10272     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                1650      
___________________________________________________________

In [14]:
history = model.fit(x_train_pad_split, y_train_pad_split, validation_data=(x_val_pad_split, y_val_pad_split),
                    batch_size=128, validation_split=0.2, epochs=10, callbacks=callbacks)

Train on 5928 samples, validate on 1482 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 10/10


## Evaluate on the test set

In [3]:
callbacks = [
    TensorBoard(histogram_freq=0, batch_size=128, embeddings_freq=1),
]

  return f(*args, **kwds)
  return f(*args, **kwds)


Instructions for updating:
Use the retry module or similar alternatives.


In [4]:
model = build_model()

history = model.fit(x_train_pad, y_train, batch_size=128, epochs=6, callbacks=callbacks)

NameError: name 'build_model' is not defined

In [19]:
model.evaluate(x_test_pad, y_test)



[0.45416203880310058, 0.8105]

In [20]:
model.save('models/sentipolc_cnn_char.hdf5')