## RNN (Bidirectional LSTM) Text Classification with Keras and TensorFlow

In [1]:
import numpy as np
import re, sys, os, csv, keras
from many_stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
print("Using Keras version",keras.__version__)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Using Keras version 2.1.4


Read from dataset (`csv` file)

In [2]:
# Load the raw dataset: column 0 of every CSV row is the text, column 1 its label.
texts, labels = [], []
print("Reading from csv file...")
# newline='' is what the csv module asks for, so that newlines embedded in
# quoted fields are interpreted by the reader instead of the file object.
with open('data.csv', newline='') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        if not row:
            # csv.reader yields [] for a blank line; there is nothing to load.
            continue
        if len(row) < 2:
            # Fail with the offending line instead of a bare IndexError.
            raise ValueError(
                "data.csv line %d: expected text and label columns, got %r"
                % (readCSV.line_num, row))
        texts.append(row[0])
        labels.append(row[1])
print("Done!")

Reading from csv file...
Done!


In [3]:
# Fraction of the samples held out for validation.
VALIDATION_SPLIT = 0.1
# Length (in words) handed to pad_sequences as maxlen.
MAX_SEQUENCE_LENGTH = 30
# Vocabulary cap handed to the Tokenizer as num_words.
MAX_NB_WORDS = 40000

In [4]:
# Fit the vocabulary on the raw texts and keep the word -> index mapping,
# which is reused later to build the embedding matrix.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Turn every text into a list of word indices, then bring all of them to a
# common length of MAX_SEQUENCE_LENGTH.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 30542 unique tokens.


In [5]:
# One-hot encode the class labels read from the CSV.
# The ndim guard keeps this cell safe to re-run: once `labels` is already a
# 2-D one-hot matrix, encoding it again would produce a 3-D tensor.
labels = np.asarray(labels)
if labels.ndim == 1:
    labels = to_categorical(labels) # convert to one-hot encoding vectors
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (40000, 30)
Shape of label tensor: (40000, 4)


In [6]:
# Shuffle samples and labels with one shared permutation so rows stay aligned.
# A dedicated, seeded generator makes the train/validation split identical on
# every Restart & Run All without touching NumPy's global random state.
SHUFFLE_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SHUFFLE_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
# Number of samples (rounded down) reserved for validation.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [7]:
# Split at an explicit index instead of slicing with -nb_validation_samples:
# when nb_validation_samples is 0, data[:-0] is empty and data[-0:] is the
# whole array, which would silently swap the training and validation sets.
split_at = data.shape[0] - nb_validation_samples
x_train, y_train = data[:split_at], labels[:split_at]
x_val, y_val = data[split_at:], labels[split_at:]

# Column sums of the one-hot labels give the per-class sample counts.
print('Number of entries in each category:')
print("Training:\n",y_train.sum(axis=0))
print("Validation:\n",y_val.sum(axis=0))

Number of entries in each category:
Training:
 [ 8698. 12243. 13767.  1292.]
Validation:
 [ 946. 1381. 1532.  141.]


### Preparing the Embedding layer
Build an index that maps each word to its pre-trained embedding by parsing the [GloVe](https://nlp.stanford.edu/projects/glove/) vector file from Stanford NLP. Words in our vocabulary that have no GloVe entry keep a randomly initialised vector.

In [8]:
EMBEDDING_DIM = 100  # must match the dimensionality of the GloVe file below
GLOVE_DIR = "dataset/glove/glove.twitter.27B.100d.txt"  # path to the GloVe text file itself

# Parse the GloVe dump into {word: vector}. Each line is the word followed by
# its whitespace-separated vector components.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform's default codec.
with open(GLOVE_DIR, encoding="utf-8") as f:
    print("Loading GloVe from:",GLOVE_DIR,"...",end="")
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\nProceeding with Embedding Matrix...")

# One row per word index (plus row 0, which word_index does not assign).
# Rows start as uniform-random values, so words missing from GloVe keep a
# random vector; rows of known words are overwritten with the GloVe vector.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print("Completed!")

Loading GloVe from: dataset/glove/glove.twitter.27B.100d.txt ...Done.
Proceeding with Embedding Matrix...
Completed!


After computing the embedding matrix, load it into an `Embedding` layer. Set `trainable=False` to prevent the weights from being updated during training.

In [9]:
# Embedding lookup initialised from the pre-computed matrix; input_dim matches
# the number of rows in embedding_matrix and the weights are kept frozen.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

### RNN Structure
[Reference](https://github.com/richliao/textClassifier), [LSTM](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)

In [10]:
# Input: one row of MAX_SEQUENCE_LENGTH word indices per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# A single bidirectional LSTM (8 units per direction) with input and
# recurrent dropout, followed by a softmax over the 4 label classes.
lstm_core = LSTM(units=8, dropout=0.3, recurrent_dropout=0.3)
l_lstm1 = Bidirectional(lstm_core)(embedded_sequences)
preds = Dense(units=4, activation='softmax')(l_lstm1)

In [11]:
# Wire the graph into a trainable model and compile it for multi-class
# classification on one-hot labels, tracking accuracy as the metric.
model = Model(inputs=sequence_input, outputs=preds)
adadelta = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
model.compile(
    optimizer=adadelta,
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [12]:
def step_cyclic(epoch, *, base_lr=1.0, decay=0.00001, period=20, boost=6):
    """Learning-rate schedule with a periodic boost and inverse-time decay.

    Every `period`-th epoch (including epoch 0) the base rate is multiplied by
    `boost`; on all other epochs it is used as is. The result is then divided
    by ``1 + decay * epoch``.

    The tuning knobs are keyword-only on purpose: a caller that passes a
    second positional argument cannot silently overwrite one of them.

    Args:
        epoch: Zero-based epoch index.
        base_lr: Un-boosted learning rate.
        decay: Inverse-time decay factor applied per epoch.
        period: Distance, in epochs, between two boosted epochs.
        boost: Multiplier applied to `base_lr` on boosted epochs.

    Returns:
        The learning rate for `epoch` as a float. If the rate cannot be
        computed, the error is printed and `base_lr` is returned so that
        the caller is not interrupted by a scheduling problem.
    """
    try:
        multiplier = boost if epoch % period == 0 else 1
        rate = float(multiplier * base_lr / (1 + decay * epoch))
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back.
        print("Error in lr_schedule:", str(e))
        return float(base_lr)
    print("Epoch", epoch + 1, "- learning_rate", rate)
    return rate
    
def initial_boost(epoch):
    """Return a learning rate of 6.0 for the first epoch (index 0), else 1.0."""
    return 6.0 if epoch == 0 else 1.0
        
# TensorBoard logging into ./logs.
# NOTE(review): per the Keras docs write_grads only takes effect when
# histogram_freq > 0 - confirm whether gradient histograms are wanted.
tensorboard = callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    batch_size=50,
    write_grads=True,
    write_graph=True,
)

# Save the full model (not just the weights) to "checkpoints" whenever
# val_loss improves.
model_checkpoints = callbacks.ModelCheckpoint(
    "checkpoints",
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)

# Active schedule: initial_boost. The cyclic alternative is kept for reference:
#lr_schedule = callbacks.LearningRateScheduler(step_cyclic)
lr_schedule = callbacks.LearningRateScheduler(initial_boost)

In [15]:
# Show the architecture, then train with TensorBoard logging, best-model
# checkpointing and the learning-rate schedule attached as callbacks.
model.summary()
print("Training Progress:")
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=200,
    batch_size=50,
    callbacks=[tensorboard, model_checkpoints, lr_schedule],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 30, 100)           3054300   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 16)                6976      
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 68        
Total params: 3,061,344
Trainable params: 7,044
Non-trainable params: 3,054,300
_________________________________________________________________
Training Progress:
Train on 36000 samples, validate on 4000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 

KeyboardInterrupt: 