In [1]:
from __future__ import print_function
import numpy as np
from keras.preprocessing.text import Tokenizer ## Converts text/strings into lists of tokens
from keras.preprocessing.sequence import pad_sequences ## Converts arbitrary length sequences into fixed length sequences by padding with zeros
from keras.utils import to_categorical
from keras.layers import Dense, Input, Dropout
from keras.layers import LSTM, GRU, SimpleRNN, Embedding
from keras.models import Sequential
import pickle
from keras import callbacks ##Special functions such as early stopping of training and reduction of learning rate

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  % self._get_c_name())


### Loading and inspecting datasets.

First we load the train and test files using pickle.load. pickle serializes a Python object into a byte stream (and deserializes it back), which lets us save the object to a file on disk and reload it later.

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    ## pickle can run arbitrary code while loading, so only open trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


raw_train = _load_pickle('20news_small_train.pkl')
raw_test = _load_pickle('20news_small_test.pkl')

## Quick look at what was loaded: container type, size and the first example.
print('Type of raw_train: {}'.format(type(raw_train)))
print('Number of train examples = {}'.format(len(raw_train)))
print(raw_train[0])

Type of raw_train: <type 'list'>
Number of train examples = 11314
('From: mathew <mathew@mantis.co.uk>\nSubject: Alt.Atheism FAQ: Atheist Resources\nSummary: Books, addresses, music -- anything related to atheism\nKeywords: FAQ, atheism, books, music, fiction, addresses, contacts\nExpires: Thu, 29 Apr 1993 11:57:19 GMT\nDistribution: world\nOrganization: Mantis Consultants, Cambridge. UK.\nSupersedes: <19930301143317@mantis.co.uk>\nLines: 290\n\nArchive-name: atheism/resources\nAlt-atheism-archive-name: resources\nLast-modified: 11 December 1992\nVersion: 1.0\n\n                              Atheist Resources\n\n                      Addresses of Atheist Organizations\n\n                                     USA\n\nFREEDOM FROM RELIGION FOUNDATION\n\nDarwin fish bumper stickers and assorted other atheist paraphernalia are\navailable from the Freedom From Religion Foundation in the US.\n\nWrite to:  FFRF, P.O. Box 750, Madison, WI 53701.\nTelephone: (608) 256-8900\n\nEVOLUTION DESIGNS\n\

We see that every example is given as ('text', 'class'), so we split the texts and the classes into two separate sequences using zip(\*list)

In [3]:
## zip(*[(a1, b1), (a2, b2), (a3, b3)]) ==> (a1, a2, a3), (b1, b2, b3)
(train_X, train_Y) = zip(*raw_train)
(test_X, test_Y) = zip(*raw_test)

## The classes are strings, so assign an integer index to each distinct class.
## The class names are sorted first so the mapping is deterministic: iterating a
## bare set can give a different order from one interpreter session to the next,
## which would silently change which index stands for which class.
class_to_idx = {cls: idx for idx, cls in enumerate(sorted(set(train_Y)))}
train_Y = [class_to_idx[item] for item in train_Y] ## class names -> class indices
test_Y = [class_to_idx[item] for item in test_Y] ## KeyError here means a test class never appears in train

print('Number of train docs found: {}'.format(len(train_X)))
print('Number of test docs found: {}'.format(len(test_X)))

Number of train docs found: 11314
Number of test docs found: 7532


In [4]:
## Model and data parameters

# Text preprocessing
MAX_NB_WORDS = 20000        # vocabulary cap handed to the Tokenizer (num_words)
MAX_SEQUENCE_LENGTH = 200   # every document is padded to this many tokens

# Model / training
EMBEDDING_DIM = 100         # size of each word vector in the Embedding layer
VALIDATION_SPLIT = 0.2      # fraction of the training docs held out for validation

In [5]:
## Cap the vocabulary at the most frequent words. With num_words=N the Keras
## Tokenizer only emits indices 1..N-1; because no oov_token is configured,
## rarer words are dropped from the sequences (they are NOT mapped to <UNK>).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_X) ## Build the word -> index vocabulary from the training texts only
sequences = tokenizer.texts_to_sequences(train_X) ## Replace each text by the indices of its words
test_sequences = tokenizer.texts_to_sequences(test_X)
word_index = tokenizer.word_index ## contains every word seen, not just the ones kept
print('Found %s unique tokens.' % len(word_index))
## Number of rows the Embedding layer needs. Word indices start at 1 (0 is the
## padding value), so a vocabulary smaller than the cap has its largest index at
## len(word_index) and needs len(word_index) + 1 rows.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) ## Pads into fixed length sequences
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 134142 unique tokens.


The tokenizer builds a dictionary (`word_index`) that remembers which index was assigned to each word

```python
list(word_index.items())[:4] ## Printing first four word index pairs
'''
[('8il4qs9', 114838),
 ('posteriorly', 53326),
 ('belt\r', 36222),
 ('10001110\r', 62214)]
'''
```
texts_to_sequences converts any string into a list of numbers
```python
tokenizer.texts_to_sequences(['I am laptop', 'green duck', 'I am green duck']) 
# [[9, 131, 7978], [1422, 6928], [9, 131, 1422, 6928]]
```

pad_sequences pads variable-length lists with zeros so that they all have the same fixed length (`maxlen`)

```python
pad_sequences([[9, 131, 7978], [1422, 6928], [9, 131, 1422, 6928]], maxlen=6)
#array([[   0,    0,    0,    9,  131, 7978],
#       [   0,    0,    0,    0, 1422, 6928],
#       [   0,    0,    9,  131, 1422, 6928]])
```

In [6]:
## One-hot encode the integer class indices.
## num_classes is pinned to the size of the label mapping so the train and test
## matrices always have the same width; left to be inferred, each call would use
## (largest index present + 1) and the two could disagree.
train_Y = to_categorical(np.asarray(train_Y), num_classes=len(class_to_idx))
test_Y = to_categorical(np.asarray(test_Y), num_classes=len(class_to_idx))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', train_Y.shape)

Shape of data tensor: (11314, 200)
Shape of label tensor: (11314, 20)


In [7]:
# split the data into a training set and a validation set
## Shuffle with a dedicated, fixed-seed generator so that a fresh run over the
## same inputs produces the same train/validation split.
SPLIT_SEED = 42
shuffle_rng = np.random.RandomState(SPLIT_SEED)
indices = np.arange(data.shape[0])
shuffle_rng.shuffle(indices)
data = data[indices] ## apply the same permutation to the data and the labels
train_Y = train_Y[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

## Slice at a positive index. The negative-index form goes wrong when
## num_validation_samples is 0: data[:-0] is empty and data[-0:] is everything.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = train_Y[:split_at]
x_val = data[split_at:]
y_val = train_Y[split_at:]

In [10]:
## Callbacks, both watching the validation loss:
##  - early_stop ends training once 'val_loss' has failed to improve by more
##    than <min_delta> for <patience> epochs in a row.
##  - reduce_lr multiplies the learning rate by <factor> once 'val_loss' has
##    failed to improve by more than <epsilon> for <patience> epochs, and never
##    lowers it below <min_lr>.
early_stop = callbacks.EarlyStopping(monitor='val_loss', mode='auto',
                                     min_delta=0, patience=7, verbose=1)
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', epsilon=0.001,
                                        patience=5, factor=0.2,
                                        min_lr=0.00001, verbose=1)

## Network: word embeddings -> recurrent encoder -> softmax over the classes.
model = Sequential([
    Embedding(num_words, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    LSTM(128),  # SimpleRNN, GRU, LSTM
    Dense(y_train.shape[1], activation='softmax'),
])
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])


In [11]:
## Training: fit on the training split, validating on the held-out split after every epoch
print('Training model.')

model.fit(x=x_train,
          y=y_train,
          epochs=10,
          batch_size=64,
          callbacks=[reduce_lr, early_stop],
          validation_data=(x_val, y_val))


## Evaluation on test data (left as the last expression so the notebook displays the result)
model.evaluate(x=test_data, y=test_Y)

Training model.
Train on 9052 samples, validate on 2262 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[1.5870883245766954, 0.59227296867097745]

In [None]:
# model.save('lstm_20news_solution.h5')