In [1]:
# GPU selection
import os

# Set before Keras/TensorFlow is imported in the next cell: enumerate devices
# in PCI bus order and expose only device "2" to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
})

In [2]:
# Explicit import: later cells use `np`, which no visible cell imports by name.
import numpy as np

from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.models import Model

Using TensorFlow backend.


In [3]:
# --- Text / vocabulary settings ---
MAX_NB_WORDS = 500000        # passed as Tokenizer(num_words=...) when a tokenizer is built
MAX_SEQUENCE_LENGTH = 128    # final padded length of every input row

# --- Train / validation partition ---
VALIDATION_SPLIT = 0.3       # fraction of rows held out for validation

# --- Pre-trained embeddings ---
EMBEDDING_DIM = 50           # vector width; also selects which GloVe file is read
# NOTE: despite the name, this is the path of a single GloVe text file.
GLOVE_DIR = "".join([
    "glove/glove.twitter.27B/glove.twitter.27B.",
    str(EMBEDDING_DIM),
    "d.txt",
])

In [4]:
import pickle
# Restore the pre-pickled corpus (`texts`) and its label rows (`labels`) from
# the working directory. pickle.load can execute arbitrary code, so these
# files must come from a trusted source.
with open('texts.pickle', 'rb') as texts_file:
    texts = pickle.load(texts_file)

with open('labels.pickle', 'rb') as labels_file:
    labels = pickle.load(labels_file)

In [5]:
""" #uncomment this chunk to create a new Tokenizer
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("[i] Saved word tokenizer to file: tokenizer.pickle")
"""
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle) # load a previously generated Tokenizer

In [6]:
# Vocabulary mapping (word -> integer id) taken from the loaded tokenizer.
word_index = tokenizer.word_index
# Encode every text as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)
print('[i] Found %s unique tokens.' % len(word_index))

# Two-stage padding:
#   1. bring every row to MAX_SEQUENCE_LENGTH - 3 entries, padding at the front;
#   2. extend to MAX_SEQUENCE_LENGTH by padding at the back,
# so every row of `data` ends with three pad entries.
data_int = pad_sequences(
    sequences,
    maxlen=(MAX_SEQUENCE_LENGTH - 3),
    padding='pre',
)
data = pad_sequences(
    data_int,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

[i] Found 195202 unique tokens.


In [7]:
# Convert the unpickled labels to a NumPy array so the next cell can reorder
# them with an index array and slice off the validation rows.
labels = np.asarray(labels)

In [8]:
print('[i] Shape of data tensor:', data.shape)
print('[i] Shape of label tensor:', labels.shape)

# Shuffle rows with a fixed seed so the train/validation partition is the same
# on every Restart & Run All. A dedicated RandomState is used so the global
# NumPy RNG stream is left untouched.
# NOTE: `data` and `labels` are rebound to their shuffled versions, so
# re-running only this cell shuffles the already-shuffled arrays again;
# re-run from the loading cells to get the canonical split.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice by an explicit boundary rather than a negative offset: with zero
# validation samples `data[:-0]` would be empty and `data[-0:]` would be the
# whole array, i.e. the split would be inverted.
nb_train_samples = data.shape[0] - nb_validation_samples
x_train = data[:nb_train_samples]
y_train = labels[:nb_train_samples]
x_val = data[nb_train_samples:]
y_val = labels[nb_train_samples:]

# Column-wise sums of the label arrays, i.e. positives per category.
print('[i] Number of entries in each category:')
print("[+] Training:",y_train.sum(axis=0))
print("[+] Validation:",y_val.sum(axis=0))

[i] Shape of data tensor: (159571, 128)
[i] Shape of label tensor: (159571, 7)
[i] Number of entries in each category:
[+] Training: [ 10779   1132   5962    330   5598    993 100278]
[+] Validation: [ 4515   463  2487   148  2279   412 43068]


In [9]:
# Build a word -> vector lookup from the GloVe text file, then assemble the
# initial weight matrix for the Embedding layer.
embeddings_index = {}
# `with` guarantees the file is closed even if parsing raises; the explicit
# encoding avoids depending on the platform default.
with open(GLOVE_DIR, encoding="utf-8") as f:
    print("[i] Loading GloVe from:",GLOVE_DIR,"...",end="")
    for line in f:
        # Peel exactly EMBEDDING_DIM numeric fields off the right-hand side.
        # Whatever remains on the left is the token, even if it is itself a
        # whitespace-like character that a bare split() would swallow.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Blank or wrong-width line: skip it instead of storing a vector
            # that could not be assigned into the matrix below.
            continue
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\n[+] Proceeding with Embedding Matrix...", end="")

# One row per word id plus one extra row (index 0, presumably the padding id --
# TODO confirm). Rows start as uniform-random values.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe keep their random initial row (they are
        # NOT zeroed); only known words are overwritten here.
        embedding_matrix[i] = embedding_vector
print("[i] Completed!")

[i] Loading GloVe from: glove/glove.twitter.27B/glove.twitter.27B.50d.txt ...Done.
[+] Proceeding with Embedding Matrix...[i] Completed!


In [21]:
# Model input: one row of MAX_SEQUENCE_LENGTH integer word ids per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Embedding table seeded from `embedding_matrix` and left trainable, so the
# vectors are fine-tuned together with the rest of the network.
vocab_size = len(word_index) + 1
embedding_layer = Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)
embedded_sequences = embedding_layer(sequence_input)

In [22]:
# Dropout applied to the embedded sequence before the recurrent stack.
l_dropout = SpatialDropout1D(0.5)(embedded_sequences)

# Recurrent stack: bidirectional LSTM feeding a bidirectional GRU, both
# returning the full sequence. Layers are constructed in the same order as
# before so auto-generated layer names are unchanged.
lstm_layer = LSTM(40, return_sequences=True)
l_lstm = Bidirectional(lstm_layer)(l_dropout)

gru_layer = GRU(40, return_sequences=True, reset_after=True, recurrent_activation='sigmoid')
l_gru = Bidirectional(gru_layer)(l_lstm)

# Max- and average-pooled summaries of the GRU output, stacked with the
# un-pooled GRU output along axis 1 and flattened into one vector per sample.
l_maxpool = MaxPooling1D(pool_size=4)(l_gru)
l_avgpool = AveragePooling1D(pool_size=4)(l_gru)
merge_inputs = [l_maxpool, l_avgpool, l_gru]
l_merge = Concatenate(axis=1)(merge_inputs)
l_flat = Flatten()(l_merge)

# Seven sigmoid outputs, one per label column.
preds = Dense(7, activation="sigmoid")(l_flat)

In [23]:
# Wire the input tensor to the 7-way sigmoid head.
model = Model(sequence_input, preds)

# Adadelta with an explicit learning-rate decay; epsilon=None is passed
# through unchanged.
adadelta = optimizers.Adadelta(
    lr=1.0,
    rho=0.95,
    epsilon=None,
    decay=0.01,
)

# Binary cross-entropy is applied to each of the sigmoid outputs.
compile_options = dict(
    loss='binary_crossentropy',
    optimizer=adadelta,
    metrics=['acc'],
)
model.compile(**compile_options)

In [24]:
# Warm start: load weights from checkpoint-0.076.h5 into the model built above
# instead of training from the freshly initialised weights.
# NOTE(review): presumably this file was written by an earlier training run of
# the same architecture -- confirm; the cell fails if the file is absent.
model.load_weights("checkpoint-0.076.h5")

In [25]:
# Checkpoint callback: after an epoch whose val_loss is a new best, save the
# full model (not just weights) to a file named after that val_loss.
# NOTE(review): period=0 is kept as-is; the recorded run saved on every
# improving epoch. Confirm 0 (rather than 1) is the intended interval.
model_checkpoints = callbacks.ModelCheckpoint(
    "checkpoint2-{val_loss:.3f}.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=0,
)

In [26]:
print("Training Progress:")

# Train for 5 epochs in mini-batches of 128, validating on the held-out split
# after each epoch; the checkpoint callback saves improving models.
fit_options = dict(
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=128,
    callbacks=[model_checkpoints],
)
model_log = model.fit(x_train, y_train, **fit_options)

Training Progress:
Train on 111700 samples, validate on 47871 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.07491, saving model to checkpoint2-0.075.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.07491 to 0.07415, saving model to checkpoint2-0.074.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.07415 to 0.07273, saving model to checkpoint2-0.073.h5
Epoch 4/5
 18944/111700 [====>.........................] - ETA: 8:18 - loss: 0.0859 - acc: 0.9706

KeyboardInterrupt: 

In [27]:
# Persist the model in its current in-memory state to toxic.h5.
# NOTE(review): the recorded run above was interrupted during epoch 4, so this
# state is not necessarily the best-val_loss model written by the checkpoint
# callback -- confirm which file downstream code should use.
model.save("toxic.h5")