# Big Data Content Analytics - AUEB

## Introduction to Convolutional Networks for Text Classification

* Lab Assistant: George Perakis
* Email: gperakis[at]aueb.gr | perakisgeorgios[at]gmail.com

### Importing Modules

In [None]:
import numpy as np

from tensorflow.python import keras

from tensorflow.python.keras.preprocessing import sequence

from tensorflow.python.keras.models import Sequential

from tensorflow.python.keras.layers import Dense, Dropout, Activation
from tensorflow.python.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D

from tensorflow.python.keras.datasets import imdb

### Setting experiment hyperparameters

In [None]:
# Experiment-wide settings.

# Maximum number of tokens used for each review.
maxlen = 400

# Total vocabulary size.
max_features = 15_000

In [None]:
# Describe the IMDB dataset; adjacent literals are concatenated into one string.
print(
    'Dataset of 25,000 movies reviews from IMDB, labeled by sentiment (positive/negative).\n\n'
    'Reviews have been preprocessed, and each review is encoded as a sequence \n'
    'of word indexes (integers).\n\n'
    'For convenience, words are indexed by overall frequency in the dataset,\n'
    'so that for instance the integer "3" encodes the 3rd most frequent word in the data.\n\n'
    'This allows for quick filtering operations such as: "only consider the top 10,000 \n'
    'most common words, but eliminate the top 20 most common words".\n'
)

In [None]:
print('Loading data...')

# Load the IMDB reviews, restricted to a vocabulary of max_features words.
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

# If loading fails, install this numpy version:
# !pip install numpy==1.16.1
# and run the jupyter notebook from scratch.

In [None]:
# peek at the first training review
first_review = x_train[0]
print(first_review)

In [None]:
# length (number of entries) of the first training review
len(x_train[0])

In [None]:
# distinct label values, i.e. the classes present in the training labels
label_values = set(y_train)
print(label_values)

In [None]:
# number of reviews in each split
print(f'Train sequences: {len(x_train)}')
print(f'Test sequences: {len(x_test)}')

In [None]:
print('Pad sequences (samples x time)')

# Both splits are padded with identical settings so every review ends up with
# exactly maxlen entries: padding (value 0) and truncation both happen at the
# start ('pre') of a sequence.
pad_options = dict(
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
    value=0.0,
)

x_train = sequence.pad_sequences(x_train, **pad_options)
x_test = sequence.pad_sequences(x_test, **pad_options)

In [None]:
# help(sequence.pad_sequences)

In [None]:
# array shapes of both splits
print(f'x_train shape: {x_train.shape}')
print(f'x_test shape: {x_test.shape}')

In [None]:
# the first review once more, to see the effect of the padding
padded_first_review = x_train[0]
print(padded_first_review)

In [None]:
# label of the first training review
first_label = y_train[0]
print(first_label)

## How Convolutions Work

<img src="http://deeplearning.stanford.edu/wiki/images/6/6c/Convolution_schematic.gif">

## What are Strides and Padding

<img src="http://deeplearning.net/software/theano/_images/numerical_padding_strides.gif">

## How Max Pooling Works

<img src="http://cs231n.github.io/assets/cnn/maxpool.jpeg">

## Build Text CNN Model

In [None]:
# Network hyperparameters.

# Embedding: size of the vector each vocabulary index is mapped to.
embedding_dims = 50

# Convolution: number of filters and how many consecutive tokens each one spans.
nof_filters = 25
kernel_size = 3

# Units of the dense layer that follows the pooling step.
hidden_dims = 50

In [None]:
print('Build model...')

# The layers are listed in the order the data flows through them.
model = Sequential([
    # Embedding layer: maps our vocab indices into embedding_dims dimensions.
    Embedding(max_features, embedding_dims, input_length=maxlen),

    # (A Dropout(0.2) layer at this position is left disabled.)

    # 1D convolution that learns nof_filters word-group filters, each
    # covering kernel_size consecutive tokens.
    Conv1D(
        filters=nof_filters,
        kernel_size=kernel_size,
        strides=1,
        padding='valid',
        activation='relu',
    ),

    # Collapse the sequence axis by keeping the maximum of each filter.
    GlobalMaxPooling1D(),

    # Hidden dense layer, with dropout applied before its ReLU activation.
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),

    # Binary classification: ONE output neuron with a SIGMOID activation.
    Dense(1),
    Activation('sigmoid'),
])

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (this is a
# binary classification task) and accuracy reported as an extra metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Training-loop settings: samples per batch and number of epochs.
batch_size, epochs = 128, 100

In [None]:
from tensorflow.python.keras.callbacks import EarlyStopping

# Early-stopping callback: ends training once the validation loss stops
# improving. Uses the EarlyStopping name imported above.
es = EarlyStopping(
    monitor='val_loss',          # metric used as the criterion to stop training
    min_delta=0,                 # minimum change in the monitored quantity to qualify as an improvement
    patience=4,                  # wait 4 epochs without improvement before stopping
    verbose=1,                   # verbosity level
    mode='auto',
    restore_best_weights=True,   # restore the weights from the best epoch
)

In [None]:
# Train the model; the returned object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,  # hold out 10% of the training data for validation
    callbacks=[es],        # callbacks applied during training
    verbose=1,
)


In [None]:
# Evaluate on the test split. With a single compiled metric, `score` holds the
# loss at index 0 and the accuracy at index 1.
score = model.evaluate(
    x_test,                  # features
    y_test,                  # labels
    batch_size=batch_size,   # batch size
    verbose=1
)

# The model is compiled with the binary_crossentropy loss, so the reported
# loss is labelled accordingly.
print('\nTest binary_crossentropy:', score[0])
print('\nTest accuracy:', score[1])