In [1]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import math
import json

from sklearn.model_selection import *
from sklearn import metrics

from daf.datasets import atti_dataset
from daf.utils import dataset_utils
from daf.utils import keras_util

import random

  from ._conv import register_converters as _register_converters


# Recurrent Neural Networks

In [2]:
# Fetch the train/test splits and the label/index mapping from the atti dataset
# helper; num_words=None is forwarded unchanged to the loader.
(x_train, y_train), (x_test, y_test) = atti_dataset.load_data(num_words=None)
label_index_dict = atti_dataset.get_label_index()

# One more than the largest value found in any training sequence; passed later
# as the Embedding layer's input_dim.
num_words = 1 + max(max(sequence) for sequence in x_train)
num_words

A local file was found, but it seems to be incomplete or outdated because the auto file hash does not match the original value of 36d64dbf4288c8ba3d30b5180037ed28 so we will re-download the data.
Downloading data from  https://media.githubusercontent.com/media/teamdigitale/daf-models/master/daf-datasets/data/atti/atti_dataset.npz
A local file was found, but it seems to be incomplete or outdated because the auto file hash does not match the original value of bda94d98e9f1771f4131107346a0898f so we will re-download the data.
Downloading data from  https://media.githubusercontent.com/media/teamdigitale/daf-models/master/daf-datasets/data/atti/label_index.json


52396

In [None]:
# Report how many entries the label mapping holds, then display the mapping.
class_total = len(label_index_dict)
print('Total of {} classes'.format(class_total))
label_index_dict

## Data Preparation

We need to define the functions that transform `x` and `y`.
In this case we need to:
- x: pad the sequences to a common length
- y: apply one-hot encoding

In [None]:
# Length of the longest training sequence; used as the padding length and as
# the model input size further down.
maxlen = max(map(len, x_train))
maxlen

In [None]:
from functools import partial

def x_transformer(x_data):
    """Return a deferred call that post-pads `x_data` to `maxlen`.

    The result is a `functools.partial` binding Keras' `pad_sequences` to
    `x_data`, the notebook-level `maxlen` and `padding='post'`. Nothing is
    padded until the returned object is itself called.

    NOTE(review): presumably `dataset_utils.dataset_generator_fun` invokes the
    returned partial -- confirm against that helper.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return partial(pad_sequences, x_data, maxlen, padding='post')

def y_tranformer(y_data):
    """Return a deferred call that one-hot encodes `y_data`.

    The result is a `functools.partial` binding `dataset_utils.to_one_hot` to
    `y_data` and the notebook-level `num_classes` (looked up when this function
    runs). Nothing is encoded until the returned object is itself called.

    NOTE(review): presumably `dataset_utils.dataset_generator_fun` invokes the
    returned partial -- confirm against that helper.
    """
    encode = dataset_utils.to_one_hot
    return partial(encode, y_data, num_classes)

In [None]:
# Batch size handed to the data generators below.
batch_size = 128
# Number of distinct label values present in the training targets.
num_classes = len(set(y_train))
# Index at 70% of the training data (rounded up); it separates the part used
# for fitting from the part used for validation.
train_val_split = math.ceil(0.7 * len(x_train))
validation_size = len(x_train) - train_val_split

print('num classes {}'.format(num_classes))
print('training size {}, validation size {}'.format(train_val_split, validation_size))

In [None]:
# Both generators are built from the training arrays. The three trailing
# arguments are presumably (start index, end index, shuffle flag) -- confirm
# against dataset_utils.dataset_generator_fun.
total_samples = len(x_train)

train_generator = dataset_utils.dataset_generator_fun(
    x_train, y_train, x_transformer, y_tranformer,
    batch_size, 0, train_val_split, True)

val_generator = dataset_utils.dataset_generator_fun(
    x_train, y_train, x_transformer, y_tranformer,
    batch_size, train_val_split, total_samples, False)

# NOTE(review): `n // batch_size + 1` yields one step more than needed when n
# is an exact multiple of batch_size -- confirm this matches the generator.
train_steps = 1 + train_val_split // batch_size
val_steps = 1 + (total_samples - train_val_split) // batch_size

## Models

We evaluate:
- a CNN
- LSTM and GRU models with dropout
- a bidirectional LSTM, which also reads the text in reverse order

### CNN

In [None]:
def build_model_cnn(num_words, num_classes, embed_size):
    """Build and compile the 1D-convolutional text classifier.

    Args:
        num_words: `input_dim` of the embedding layer.
        num_classes: number of units in the final softmax layer.
        embed_size: `output_dim` of the embedding layer.

    Returns:
        A compiled `tf.keras.Model` whose input is an int32 tensor of shape
        `(maxlen,)`; `maxlen` is read from the notebook's global scope.
    """
    keras_util.new_session()
    layers = tf.keras.layers

    tokens = tf.keras.Input(shape=(maxlen,), dtype='int32')
    features = layers.Embedding(input_dim=num_words, output_dim=embed_size, name='embed')(tokens)

    # Two convolution + max-pooling stages, then a last convolution that is
    # reduced with a global max pool.
    for filters, kernel_size in ((32, 5), (64, 3)):
        features = layers.Conv1D(filters, kernel_size, activation='relu')(features)
        features = layers.MaxPooling1D(2)(features)
    features = layers.Conv1D(128, 3, activation='relu')(features)
    features = layers.GlobalMaxPool1D()(features)

    probabilities = layers.Dense(num_classes, activation='softmax')(features)
    cnn = tf.keras.Model(inputs=tokens, outputs=probabilities)

    cnn.compile(
        optimizer=tf.keras.optimizers.RMSprop(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return cnn

In [None]:
# `embed_size` is not assigned in any earlier cell, so it is defined here to
# keep the notebook runnable from a fresh kernel. 64 is the embedding size the
# "Build the final model" section reports as best; change it to experiment.
embed_size = 64

cnn_model = build_model_cnn(num_words, num_classes, embed_size)
cnn_model.summary()

In [None]:
# Callbacks for the CNN run: TensorBoard logging under logs/, early stopping
# with patience 5 and learning-rate reduction with patience 2.
cnn_log_dir = "logs/{}_{}".format('cnn', '32_64_128')

callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir=cnn_log_dir),
    tf.keras.callbacks.EarlyStopping(verbose=1, patience=5),
    tf.keras.callbacks.ReduceLROnPlateau(verbose=1, patience=2),
]

In [None]:
# Train the CNN for up to 30 epochs, validating on the held-out part of the
# training data after each epoch.
cnn_model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=30,
    validation_data=val_generator,
    validation_steps=val_steps,
    callbacks=callbacks,
)

### LSTM

In [None]:
def build_model_rnn(neurons, num_words, num_class, embed_size, dropout, rec_dropout, cell=tf.keras.layers.GRU):
    """Build and compile a bidirectional recurrent text classifier.

    Args:
        neurons: number of units of the recurrent cell (per direction).
        num_words: `input_dim` of the embedding layer.
        num_class: number of units in the final softmax layer.
        embed_size: `output_dim` of the embedding layer.
        dropout: `dropout` argument of the recurrent cell.
        rec_dropout: `recurrent_dropout` argument of the recurrent cell.
        cell: recurrent layer class to wrap in `Bidirectional`
            (defaults to `tf.keras.layers.GRU`).

    Returns:
        A compiled `tf.keras.Model` whose input is an int32 tensor of shape
        `(maxlen,)`; `maxlen` is read from the notebook's global scope.
    """
    keras_util.new_session()
    input_l = tf.keras.Input(shape=(maxlen,), dtype='int32')
    # The embedding is required: the recurrent layer consumes `embed_l`, and
    # without this line the function fails with a NameError.
    embed_l = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embed_size, name='embed')(input_l)
    l = tf.keras.layers.Bidirectional(cell(neurons, activation='relu', dropout=dropout, recurrent_dropout=rec_dropout,
                           return_sequences=False))(embed_l)
    # NOTE(review): this Dense layer has no activation (linear) -- confirm that is intended.
    l = tf.keras.layers.Dense(64)(l)
    # Size the output from the `num_class` argument rather than the notebook global.
    output_l = tf.keras.layers.Dense(num_class, activation='softmax')(l)
    model = tf.keras.Model(inputs=input_l, outputs=output_l)

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [None]:
# Bidirectional GRU (the builder's default cell): 64 units, embedding size 100,
# no dropout.
gru_model = build_model_rnn(
    neurons=64,
    num_words=num_words,
    num_class=num_classes,
    embed_size=100,
    dropout=0.0,
    rec_dropout=0.0,
)
gru_model.summary()

In [None]:
# Callbacks for the GRU run: TensorBoard logging under logs/, early stopping
# with patience 3 and learning-rate reduction by a factor of 0.2 with patience 2.
gru_log_dir = "logs/{}_{}".format('gru', '64')

callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir=gru_log_dir),
    tf.keras.callbacks.EarlyStopping(verbose=1, patience=3),
    tf.keras.callbacks.ReduceLROnPlateau(
        mode='min',
        factor=0.2,
        patience=2,
        cooldown=0,
        min_lr=0,
        verbose=1,
    ),
]

In [None]:
# Train the GRU model for up to 10 epochs and keep the returned history object.
history = gru_model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=10,
    validation_data=val_generator,
    validation_steps=val_steps,
    callbacks=callbacks,
)

In [None]:
# Run a full garbage-collection pass after the training runs above; the cell
# displays the value returned by gc.collect().
import gc
gc.collect()

We can see that there is no improvement in using an embedding layer, but we have millions of parameters compared with only thousands of observations.

## Build the final model

The best model is obtained with an embedding of size 64.

In [None]:
# Run a full garbage-collection pass before building the final model; the cell
# displays the value returned by gc.collect().
import gc
gc.collect()

In [None]:
# For the final model the generator spans the whole training set (no
# validation hold-out), so the step count is recomputed from its full length.
full_train_size = len(x_train)
train_steps = 1 + full_train_size // batch_size

train_generator = dataset_utils.dataset_generator_fun(
    x_train, y_train, x_transformer, y_tranformer,
    batch_size, 0, full_train_size, True)

In [None]:
# Embedding size for the final model; 64 is the value this section's text
# reports as best. Defined here so the cell does not rely on an earlier
# assignment of `embed_size`.
embed_size = 64

model = build_model_cnn(num_words, num_classes, embed_size)
# Summarise the model that was just built (`model`), not the earlier `cnn_model`.
model.summary()

In [None]:
# Fit the final model on the full training set for 8 epochs.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=8,
)

## Evaluate on the test set

In [None]:
# The model's input layer takes integer sequences of length `maxlen`, so the
# test sequences get the same post-padding that x_transformer sets up for the
# training batches. Sequences longer than `maxlen` are truncated by
# pad_sequences.
# NOTE(review): `num_words` was derived from x_train only -- confirm that x_test
# contains no value >= num_words, which the embedding layer cannot look up.
x_test_v = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen, padding='post')
y_test_v = dataset_utils.to_one_hot(y_test, num_classes)

In [None]:
# Run the final model on the prepared test inputs; the output layer is a
# softmax, so each row holds one score per class.
test_predictions = model.predict(x_test_v, verbose=1)

In [None]:
# Predicted class = position of the highest score in each prediction row.
predictions = np.argmax(test_predictions, axis=1)

# Support-weighted precision / recall / F-score and plain accuracy, computed
# against the integer test labels.
precision, recall, fscore, _ = metrics.precision_recall_fscore_support(y_test, predictions, average='weighted')
accuracy = metrics.accuracy_score(y_test, predictions)

# Weighted ROC AUC computed from the one-hot targets and the raw class scores.
auc_score = metrics.roc_auc_score(y_test_v, test_predictions, average='weighted')

print('accuracy ', accuracy)
print('precision ', precision)
print('recall ', recall)
print('f-measure ', fscore)
# The AUC was computed but never shown; report it with the other metrics.
print('auc ', auc_score)

### Report the classification result for each class

In [None]:
# Per-class precision / recall / F-score table; print() is used because
# classification_report returns a pre-formatted string.
print(metrics.classification_report(y_test, predictions))

### Confusion Matrix

In [None]:
# Confusion matrix with true labels on the rows and predicted labels on the columns.
conf_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line banner and draw the confusion matrix on the current pyplot figure.

    Args:
        cm: 2-D numpy array holding the confusion matrix (rows are the true
            labels, columns the predicted labels, as the axis labels state).
        classes: tick labels for both axes; its length sets the number of ticks.
        normalize: when True, each row is divided by its sum before plotting.
            Rows that sum to zero are left at 0 instead of becoming NaN.
        title: figure title.
        cmap: matplotlib colormap used for the heat map.
    """
    if normalize:
        # A class without any true sample has an all-zero row; dividing it by
        # its sum would produce NaN and break the colour threshold below.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    # Write each cell's value on top of it, in white on dark cells and in
    # black on light ones.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Show numpy arrays with two decimals from here on.
np.set_printoptions(precision=2)

In [None]:
# Enlarge the default figure size so the per-class tick labels stay readable,
# then draw the raw (non-normalised) confusion matrix.
# NOTE(review): `classes` receives the label/index mapping itself -- confirm
# that iterating it yields the class names in the same order as the rows and
# columns of conf_matrix.
plt.rcParams['figure.figsize'] = (20,20)
plot_confusion_matrix(conf_matrix, classes=label_index_dict,
                      title='Confusion matrix, without normalization')