In [1]:
# LSTM and CNN for sequence classification in the IMDB dataset
import numpy as np
import keras
from keras.datasets import imdb, reuters
from keras.models import Sequential, Model
from keras.layers import Input, Dense, LSTM, Dropout, RepeatVector

from keras.layers.noise import GaussianNoise
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution1D, MaxPooling1D

from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras_tqdm import TQDMNotebookCallback


Using TensorFlow backend.


#### Keras recently rolled out V2.0 with some breaking changes. I'll try to keep this tutorial as v1/v2 compatible as possible in case people are still running v1. 
If you see errors like `TypeError: Received unknown keyword arguments: {'epochs': 3}`, or size/shape mismatches, you probably have a version mismatch

In [2]:
print(keras.__version__)

2.0.2


In [3]:
# Fix random seed for reproducibility
np.random.seed(7)

# Preprocess the dataset into suitable shape to feed to NN

In [4]:

nb_top_words           = 5000  # Load the dataset but only keep the top n words, zero the rest. These will be our 'symbols' in our abridged corpus
sequence_len      = 50   # This is the length in the "time axis" 
embed_vector_len = 32    # Size of the feature vector each word will map to
nb_lstm                = 100   # Number of LSTM nodes
batch_size             = 64    # Number of samples to feed into the model for each forward/backward pass

In [5]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(nb_words=nb_top_words, skip_top=0, index_from=3)#nb_words=nb_top_words)



In [5]:
twitter_x = np.load('twitter_phrases.npy') # pre-converted indexes from Twitter set using IMDB word list
twitter_y = np.load('twitter_labels.npy')
rotten_x = np.load('rotten_phrases.npy') # pre-converted indexes from Twitter set using IMDB word list
rotten_y = np.load('rotten_labels.npy')

FileNotFoundError: [Errno 2] No such file or directory: 'twitter_phrases.npy'

## Let's peer into the data. I always like to start by getting a feel for the data
## This data is in the form of a sequence of integers. Each int maps to a word in the corpus dictionary.

In [6]:
x_train.dtype, x_train.shape

(dtype('O'), (25000,))

In [7]:
x_train[0][-10:]

[4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]

In [8]:
word_index = imdb.get_word_index()
index_word = {value: key for (key, value) in word_index.items()} # flip key:value pairs to get the integer as the key
index_word.update({0: "~"})
list(index_word.items())[:10]

[(0, '~'),
 (1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it')]

In [9]:
' '.join([index_word[idx] for idx in x_train[1]])

"the thought solid thought and do making to is spot nomination and while he of jack in where picked as getting on was did hands fact characters to always life thrillers not as me can't in at are br of sure your way of little it strongly random to view of love it so and of guy it used producer of where it of here icon film of outside to don't all unique some like of direction it if out her imagination below keep of queen he and to makes this stretch and of solid it thought begins br and and budget worthwhile though ok and and for ever better were and and for budget look kicked any to of making it out and follows for effects show to show cast this family us scenes more it severe making and to and finds tv tend to of and these thing wants but and an and cult as it is video do you david see scenery it in few those are of ship for with of wild to one is very work dark they don't do dvd with those them"

In [10]:
' '.join([index_word[idx] for idx in rotten_x[1]])

NameError: name 'rotten_x' is not defined

### Since we are using only the top *nb_top_words*, a lot of the more obscure words are dropped, resulting in something which is somewhat nonsensical to us, but will still contain an 'essence' of the sentiment that the algorithm will be able to extract.

#### Next, we need to pad the tensors out to the proper dimension in the time axis. Even though LSTM can handle variable length data, the backend still prefers rectangular tensors.

In [11]:
x_train = sequence.pad_sequences(x_train, maxlen=sequence_len)
x_test = sequence.pad_sequences(x_test, maxlen=sequence_len)

In [12]:
twitter_x = sequence.pad_sequences(twitter_x, maxlen=sequence_len)
rotten_x = sequence.pad_sequences(rotten_x, maxlen=sequence_len)
print(x_train.shape, x_test.shape, twitter_x.shape, rotten_x.shape)

NameError: name 'twitter_x' is not defined

# Without further ado, let's build an RNN in 5 lines of code. I'll walk you through each layer in detail. 

First, we initialize the model.

> ```python
model = Sequential()```

The [Keras Sequential Model](https://keras.io/getting-started/sequential-model-guide/) is based around building up the model layer-by-layer, like a cake. This is the easiest to graps for beginners, and works well, since many, if not most, neural networks can be represented this way. 
Calling model.add(layer) sticks the layer onto the topmost, and that becomes the new top. 



### Embedding

> ```python
model.add(Embedding(nb_top_words,                 
                    embed_vector_len, 
                    input_length=sequence_len
                   ))
```

Currently, the "shape" of each data vector is 1x5000, since it can be thought of as a [*one hot* representation](https://www.quora.com/What-is-one-hot-encoding-and-when-is-it-used-in-data-science), where the dimension associated with the integer has value 1 ('hot') and all other are 0. Each word would look like this:

[0,0,0,0,0,0,0,...,0,**1**,0,0,...,0,0,0] 

except 5000 long. That's really problematic, due to the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) and also computational constraints. We want to transform our **uncondensed** and **sparse** data into a **compact** and **dense** representation, in our case a 32-vector. This process is known as embedding. **Word2Vec** is a popular word embedding system. Keras Embedding layer works very similarly, so if you want to know more, I suggest checking out how Word2Vec works. 

### LSTM

> ```python
model.add(LSTM(nb_lstm))```


The LSTM (long short-term memory) cell is a neuron with memory. It accomplishes this by have a memory state, which can be written to and read from. It's like a tiny ~~casette tape~~  [~~floppy disk~~](http://i.imgur.com/Osxo1UF.jpg)  USB flash drive. In short, the inputs from the prior layer (mathematically) control gates. These gates determine whether to erase, write, and/or read from the memory cell. LSTMs have been covered really well in depth  in a lot of places. In particular I recommend the articles by [colah](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) and [Karpathy](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). 

![LSTM](http://tc.sinaimg.cn/maxwidth.800/tc.service.weibo.com/cdn_images_1_medium_com/58ad765e09eacb5116c9dfc5897c7296.png)

### Dense

> ```python
model.add(Dense(1, activation='sigmoid'))```


Finally, we have a densely-connected layer. This is your "typical" neural network layer - each node from the prior layer connects to each node of the following. In this case, we are crunching down to a single node since we want a single answer - "Positive" or "Negative". We'll use a sigmoid activation to squash the output to the range 0-1. 

That's it for our network! Pretty simple, right? 

> ["Keras is so good that it is effectively cheating in machine learning"](https://news.ycombinator.com/item?id=13872670)

### Compiling 

> ```python
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])```

All that's left is compiling the model. This tells the model which loss function, optimizer, and metrics to use. 

- **Optimizer**: This is the algorithm which describes how we approach gradient descent. Adam is pretty modern and works quite well for a lot of problems, so it is typically the first go-to when picking hyperparameters
- **Metrics**: This does not actually affect the direct training of the model. Rather, it gives us humans a way to track the performance of the model over time. This can also be used for automatically early-stopping to avoid overfitting. 
- **Loss function**: This determines how the "penalty" for incorrect predictions is calculated. 

### Brief tangent: Cross-entropy
I'm working on a [simple summary of Cross-Entropy](https://github.com/xkortex/TechValleyMachineLearning/blob/master/CrossEntropy.ipynb). If you would like to know more, then check out that post, otherwise it's a bit of a tangent for this particular project. 

For now, all we need to know is cross-entropy is a very common loss function, and many Keras models use binary (yes/no problems) or categorical (multiple labels) cross-entropy. You are probably familiar with Mean Squared Error, which is a commonly used loss function if you are performing continuous regression. Cross-entropy is used to predict labels (logistic regression). The legendary [Andrew Ng Coursera Course](https://www.coursera.org/learn/machine-learning) covers this in more detail. 


In [13]:
model = Sequential()
model.add(Embedding(nb_top_words,                 
                    embed_vector_len, 
                    input_length=sequence_len
                   ))
model.add(LSTM(nb_lstm))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# Have a look at our model
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 32)            160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301.0
Trainable params: 213,301
Non-trainable params: 0.0
_________________________________________________________________
None


In [15]:
# This will take about 3-5 minutes per epoch on a GPU. CPU folks will be here for quite some time. 
# We'll be making improvements to the model, so you don't have to run this just yet.
RUN_MODEL1 = True# I'm just using a switch here so I can run Kernel -> Restart and Run All
if RUN_MODEL1: 
    model.fit(x_train, y_train, 
              validation_data=(x_test, y_test), 
              epochs=3, 
              batch_size=batch_size, 
              verbose=0, # Some versions of Jupyter bork on Keras' progress bar. We replace it with Keras-TQDM instead
              callbacks=[TQDMNotebookCallback()])

KeyboardInterrupt: 

This network will get us to about 87% accuracy. However, we had to stop pretty early because of the risk of overfitting. A model that overfits easy is often a strong sign that the model will generalize poorly to new, unseen data, or data from different distributions. Let's see how it performs on a similar dataset, the Twitter dataset. This will be challenging, as it is much shorter, which gives the RNN less time to 'get up to speed'.  

In [17]:
if RUN_MODEL1:
    loss, metric = model.evaluate(twitter_x, twitter_y)
    print('Label ratio: {:.2f}%'.format(np.mean(twitter_y*100)))
    print('Loss: {}\nMetric: {:.2f}%'.format(loss, metric*100))

NameError: name 'twitter_x' is not defined

#### Ouch. No better than chance.
Let's see if we can do a bit better with the generalization. Since the dataset we want to extrapolate to (Twitter, Rotten Tomatoes, etc) is different in several ways, we want our IMDB-trained network to be robust to noise, idiosyncracies, and quirks unique to IMDB that do not generalize to other formats.  

### Adding dropout and noise

**Dropout** is a technique for reducing overfitting in neural networks by preventing over-adaptation to the training data. The general idea is, every cycle you randomly drop a certain percentage of nodes or connections. This forces the network to compensate by distributing over multiple nodes and prevents any given node from getting too "specialized". 

Adding noise is another way of reducing overfitting and improving generalization. By adding noise to the vectors, this forces the network to learn to compensate, just as humans process stimuli in a noisy environment.

In [None]:
dropout_rate = 0.2 # Rate of input units to drop
sigma=0.5         # Amount of noise to add (in terms of standard deviation)

model=Sequential()
model.add(Embedding(nb_top_words,                 
                    embed_vector_len, 
                    input_length=sequence_len
                   ))
model.add(GaussianNoise(sigma=sigma))
model.add(Dropout(dropout_rate))
model.add(LSTM(nb_lstm))
model.add(Dropout(dropout_rate))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# This will take about 4-6 minutes per epoch on a GPU. 
RUN_MODEL2 = False
if RUN_MODEL2:
    model.fit(x_train, y_train, 
              validation_data=(x_test, y_test), 
              nb_epoch=3, 
              batch_size=batch_size, 
              verbose=0, # Some versions of Jupyter bork on Keras' progress bar. We replace it with Keras-TQDM instead
              callbacks=[TQDMNotebookCallback()])

In [None]:
if RUN_MODEL2:
    loss, metric = model.evaluate(twitter_x, twitter_y)
    print('Label ratio: {:.2f}%'.format(np.mean(twitter_y*100)))
    print('Loss: {}\nMetric: {:.2f}%'.format(loss, metric*100))

# LSTM + CNN = REAL ULTIMATE POWER!

#### LSTMs are awesome. Convolutional neural networks are awesome. I was blown away when I learned you can simply and easily combine both into the same model!

Convolutional layers look at local structures in the data. In image classifiers, this is visual features. In NLP, this looks at groups of words, or N-grams (sequence of N words). For instance, many English sentences contain N-grams of the form [**subject verb object**]. For example (dropping articles for simplicity):

- John rode bike
- Suzie hit ball
- Bobby made memes

Another common sequence is [**subject copula predicate**] (copula verbs: is, are, was, will be: 

- Roses are red
- Movie was bad
- Keras is awesome

In [None]:
nb_filter = 32     # This is the number of convolutional filters to use
filter_length = 5  # This is the size of the filter kernel. Since this is 1D, the kernel is Nx1
pool_length = 2    # Size of our max pooling structures

model = Sequential()
model.add(Embedding(nb_top_words, embed_vector_len, input_length=sequence_len))
model.add(GaussianNoise(sigma=sigma))
model.add(Convolution1D(nb_filter=nb_filter, filter_length=filter_length, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=pool_length))
model.add(Dropout(dropout_rate))
model.add(LSTM(100))
model.add(Dropout(dropout_rate))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

## some stuff about sttructure

In [None]:
# This will take about 4-6 minutes per epoch on a GPU. 
RUN_MODEL3 = True
if RUN_MODEL3:
    model.fit(x_train, y_train, 
              validation_data=(x_test, y_test), 
              nb_epoch=3, 
              batch_size=batch_size, 
              verbose=0, # Some versions of Jupyter bork on Keras' progress bar. We replace it with Keras-TQDM instead
              callbacks=[TQDMNotebookCallback()])
    
loss, metric = model.evaluate(x_test, y_test, verbose=0)
print('Final validation accuracy: {:.2f}%'.format(metric*100))

In [None]:
if RUN_MODEL3:
    loss, metric = model.evaluate(twitter_x, twitter_y)
    print('Label ratio: {:.2f}%'.format(np.mean(twitter_y*100)))
    print('Loss: {}\nMetric: {:.2f}%'.format(loss, metric*100))
    loss, metric = model.evaluate(rotten_x, rotten_y)
    print('Label ratio: {:.2f}%'.format(np.mean(rotten_y*100)))
    print('Loss: {}\nMetric: {:.2f}%'.format(loss, metric*100))

In [None]:
nb_top_words           = 20000  # Load the dataset but only keep the top n words, zero the rest. These will be our 'symbols' in our abridged corpus
(x_train, y_train), (x_test, y_test) = imdb.load_data(nb_words=nb_top_words, skip_top=0, index_from=3)#nb_words=nb_top_words)
x_train = sequence.pad_sequences(x_train, maxlen=sequence_len)
x_test = sequence.pad_sequences(x_test, maxlen=sequence_len)

In [None]:
model = Sequential()
model.add(Embedding(nb_top_words, embed_vector_len, input_length=sequence_len))
model.add(GaussianNoise(sigma=sigma))
model.add(Dropout(dropout_rate))
model.add(LSTM(nb_lstm))
model.add(Dropout(dropout_rate))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
RUN_MODEL4 = True
if RUN_MODEL4:
    model.fit(rotten_x, rotten_y, 
              validation_data=(x_test, y_test), 
              nb_epoch=10, 
              batch_size=batch_size, 
              verbose=0, # Some versions of Jupyter bork on Keras' progress bar. We replace it with Keras-TQDM instead
              callbacks=[TQDMNotebookCallback()])
    
loss, metric = model.evaluate(x_test, y_test, verbose=0)
print('Final validation accuracy: {:.2f}%'.format(metric*100))

In [None]:
model.fit(x_train, y_train, 
              validation_data=(x_test, y_test), 
              nb_epoch=10, 
              batch_size=batch_size, 
              verbose=0, # Some versions of Jupyter bork on Keras' progress bar. We replace it with Keras-TQDM instead
              callbacks=[TQDMNotebookCallback()])

In [None]:
if RUN_MODEL4:
    loss, metric = model.evaluate(x_test, y_test, verbose=0)
    print('IMDB:\nLabel ratio: {:.2f}%'.format(np.mean(twitter_y*100)))
    print('Loss: {}\nMetric: {:.2f}%'.format(loss, metric*100))
    loss, metric = model.evaluate(twitter_x, twitter_y)
    print('Twitter:\nLabel ratio: {:.2f}%'.format(np.mean(twitter_y*100)))
    print('Loss: {}\nMetric: {:.2f}%'.format(loss, metric*100))
    loss, metric = model.evaluate(rotten_x, rotten_y)
    print('Rotten Tomatoes:\nLabel ratio: {:.2f}%'.format(np.mean(rotten_y*100)))
    print('Loss: {}\nMetric: {:.2f}%'.format(loss, metric*100))

In [None]:
input_dim = nb_top_words
latent_dim = 24
inputs = Input(shape=(sequence_len, input_dim))
encoded = LSTM(latent_dim)(inputs)
decoded = RepeatVector(sequence_len)(encoded)
decoded = LSTM(input_dim, return_sequences=True)(decoded)
sequence_autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)

In [None]:
print(sequence_autoencoder.summary())

# References

## Datasets

http://www.cs.cornell.edu/people/pabo/movie-review-data/
http://snap.stanford.edu/data/web-Amazon.html

## Papers
[Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215)
[Semi-supervised Sequence Learning](https://arxiv.org/abs/1511.01432)

## Code

- [Keras Examples: IMDB CNN](https://github.com/fchollet/keras/blob/master/examples/imdb_cnn.py)
- [Machine Learning Mastery: Sequence Classification with LSTM Recurrent Neural Networks in Python with Keras](http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/)
- [Microway: Building a Movie Review Sentiment Classifier using Keras and Theano Deep Learning Frameworks](https://www.microway.com/hpc-tech-tips/keras-theano-deep-learning-frameworks/)

In [None]:
'''This example demonstrates the use of Convolution1D for text classification.
Gets to 0.89 test accuracy after 2 epochs.
90s/epoch on Intel i5 2.4Ghz CPU.
10s/epoch on Tesla K40 GPU.
'''

from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(nb_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

In [None]:
print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu'))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test), verbose=0, callback)