In [None]:
import pickle
# Restore the pre-cleaned corpus and its category labels from disk.
# SECURITY: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('cleaned_texts.pickle', 'rb') as texts_file:
    texts = pickle.load(texts_file)

with open('labels.pickle', 'rb') as labels_file:
    labels = pickle.load(labels_file)

In [None]:
# Keep only the first 401 items of every entry (characters, if the entries are strings).
# NOTE(review): MAX_SEQUENCE_LENGTH is 400 *tokens* — confirm that 401 and the unit used here are intended.
shortened_texts = [text[:401] for text in texts]

In [None]:
# --- Configuration shared by the cells below ---

# Embedding settings
EMBEDDING_DIM = 100  # dimensionality of the pre-trained word vectors (word2vec/GloVe)
# Despite its name, GLOVE_DIR is the path of the GloVe text file matching EMBEDDING_DIM.
GLOVE_DIR = "glove/glove.6B." + str(EMBEDDING_DIM) + "d.txt"

# Tokenisation settings
MAX_NB_WORDS = 100000  # vocabulary cap passed to the tokenizer
MAX_SEQUENCE_LENGTH = 400  # length of every entry after padding

# Fraction of the samples held out for validation
VALIDATION_SPLIT = 0.2

In [None]:
import numpy as np
import pandas as pd
import re, sys, os, csv, keras, pickle
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
# Record the installed Keras version in the cell output for later reference.
print("[i] Using Keras version",keras.__version__)

In [None]:
#""" # toggle marker: the chunk below is ACTIVE as written; delete the leading '#' on both marker lines to skip re-fitting
# Build a vocabulary (capped at MAX_NB_WORDS) from the shortened texts and persist it.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(shortened_texts)
with open('tokenizer.pickle', 'wb') as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out, protocol=pickle.HIGHEST_PROTOCOL)
print("[i] Saved word tokenizer to file: tokenizer.pickle")
#"""
# Always read the tokenizer back from disk so both paths (fresh fit or reuse) end in the same state.
# SECURITY: pickle.load can execute arbitrary code; only load a tokenizer.pickle you created.
with open('tokenizer.pickle', 'rb') as tokenizer_in:
    tokenizer = pickle.load(tokenizer_in)

In [None]:
# Vocabulary learned by the tokenizer: maps each word to its integer index.
word_index = tokenizer.word_index
print('[i] Found %s unique tokens.' % len(word_index))

# Encode every text as a list of word indices, then pad at the end so that
# all rows have exactly MAX_SEQUENCE_LENGTH entries.
sequences = tokenizer.texts_to_sequences(shortened_texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [None]:
# Convert the integer category labels to one-hot rows.
# NOTE: this rebinds `labels`; re-running this cell without reloading the labels
# would try to one-hot encode data that is already one-hot.
labels = to_categorical(np.asarray(labels))
print('[i] Shape of data tensor:', data.shape)
print('[i] Shape of label tensor:', labels.shape)

# Shuffle samples and labels with the same permutation. A dedicated, seeded
# generator keeps the train/validation split reproducible across kernel restarts
# without touching NumPy's global random state.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]

# Hold out the last VALIDATION_SPLIT fraction of the shuffled samples.
# Slice with an explicit boundary index: the negative-index form (`data[:-k]`,
# `data[-k:]`) breaks when k == 0, yielding an empty training set and putting
# every sample into validation.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

# Column sums of the one-hot labels give the per-category sample counts.
print('[i] Number of entries in each category:')
print("[+] Training:",y_train.sum(axis=0))
print("[+] Validation:",y_val.sum(axis=0))

In [None]:
# Sanity check: display the first row of `data` (padded word indices) and the matching row of `labels`.
print("Tokenized sequence:\n", data[0])
print("One-hot label:", labels[0])

In [None]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each non-empty line is a token followed by its vector components.
embeddings_index = {}
# flush=True so the progress message is visible before the (long) load starts.
print("[i] (long) Loading GloVe from:",GLOVE_DIR,"...",end="", flush=True)
# The context manager closes the file even if a line fails to parse; the explicit
# encoding avoids decode errors on platforms whose default encoding is not UTF-8.
with open(GLOVE_DIR, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\n[+] Proceeding with Embedding Matrix...", end="", flush=True)

# One row per tokenizer index, plus one extra row for index 0.
# Rows start as uniform random values in [0, 1) and are overwritten only for
# words present in GloVe, so words missing from GloVe (and row 0) keep their
# random initialisation — they are NOT all-zeros.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(" Completed!")

In [None]:
# Model input: one row of MAX_SEQUENCE_LENGTH integer word indices per sample.
sequence_input = Input(dtype='int32', shape=(MAX_SEQUENCE_LENGTH,))

# Embedding lookup seeded with the prepared embedding matrix; trainable=True
# lets the vectors be updated during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)
embedded_sequences = embedding_layer(sequence_input)

In [None]:
# Three parallel 256-filter ReLU convolutions over the embedded sequence,
# one per window width (3, 4 and 5), created in that order.
l_conv_3, l_conv_4, l_conv_5 = (
    Conv1D(filters=256, kernel_size=width, activation='relu')(embedded_sequences)
    for width in (3, 4, 5)
)

# Join the three branch outputs along axis 1.
# NOTE(review): axis 1 is presumably the sequence/steps axis of the Conv1D output —
# confirm that stacking along steps (rather than along the filter axis) is intended.
l_conv = Concatenate(axis=1)([l_conv_3, l_conv_4, l_conv_5])

In [None]:
# Classifier head: max-pool (window 4) -> dropout (20%) -> flatten -> dense ReLU -> softmax.
l_pool = MaxPooling1D(pool_size=4)(l_conv)
l_drop = Dropout(rate=0.2)(l_pool)
l_flat = Flatten()(l_drop)
l_dense = Dense(units=32, activation='relu')(l_flat)
# The output width is hard-coded and must equal the number of classes
# (i.e. the number of columns of the one-hot label matrix).
preds = Dense(units=11, activation='softmax')(l_dense)

In [None]:
# Wire the input tensor to the softmax output and compile for multi-class classification.
model = Model(inputs=sequence_input, outputs=preds)

# Adadelta with a custom learning rate and learning-rate decay.
adadelta = optimizers.Adadelta(lr=1.4, rho=0.95, epsilon=None, decay=0.1)

model.compile(optimizer=adadelta, loss='categorical_crossentropy', metrics=['acc'])
model.summary()

In [None]:
# Train for 30 epochs in mini-batches of 32, evaluating on the held-out split after
# each epoch; the object returned by fit() is kept in model_log.
print("Training Progress:")
model_log = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=30,
    validation_data=(x_val, y_val),
)