In [1]:
import numpy as np
import pandas as pd
import pickle

from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import activations
from keras import metrics
from keras import regularizers
import math

import random

from dataset import atti_dirigenti

# Seed the random number generators to make the experiments replicable.
# Python's `random` alone is not enough: the Keras reproducibility FAQ also
# asks for NumPy to be seeded. NOTE(review): the TensorFlow backend keeps its
# own RNG, which must be seeded separately for fully deterministic runs.
random.seed(123456)
np.random.seed(123456)

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the train / validation / test splits from the project-local
# `atti_dirigenti` dataset module; each split is unpacked as an (x, y) pair.
# NOTE(review): `num_words` presumably caps the vocabulary size — confirm
# against the loader, since a larger index range is computed further down.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = atti_dirigenti.load_data(num_words=10000, remove_stopwords=True)

In [3]:
# Label lookup provided by the dataset module. Its length is reused below as
# the number of units of every model's softmax output layer.
label_index = atti_dirigenti.get_labels()
len(label_index)

20

### Preparing Data

Encode the input sequences as multi-hot vectors and the labels as one-hot vectors.

In [4]:
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a dense float32 matrix.

    Row ``r`` of the result holds 1.0 in every column listed in
    ``sequences[r]`` and 0.0 everywhere else; an index that occurs several
    times in a sequence still yields 1.0.

    Args:
        sequences: sized collection of integer index sequences.
        dimension: number of columns of the returned matrix.

    Returns:
        ``np.ndarray`` of shape ``(len(sequences), dimension)``, dtype float32.
    """
    shape = (len(sequences), dimension)
    multi_hot = np.zeros(shape, dtype=np.float32)
    for row, token_ids in enumerate(sequences):
        # Fancy indexing switches on all columns of this row in one assignment.
        multi_hot[row, token_ids] = 1.
    return multi_hot

In [9]:
dimension = max(x_train.max() + x_val.max() + x_test.max()) + 1
print('max dimension {}'.format(dimension))

max dimension 10125


In [10]:
# Replace every split by its dense float32 multi-hot matrix of shape
# (n_samples, dimension).
# NOTE(review): the recorded run of this cell ended in a MemoryError. With
# 99390 training rows and dimension 10125, the training matrix alone needs
# about 4 GB (99390 * 10125 * 4 bytes).
# NOTE(review): the inputs are overwritten, so this cell cannot be re-run
# without reloading the data first.
x_train = vectorize_sequences(x_train, dimension)
x_val = vectorize_sequences(x_val, dimension)
x_test = vectorize_sequences(x_test, dimension)

MemoryError: 

In [7]:
# Display the encoded training matrix; the `[0:]` slice selects every row.
x_train[0:]

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [8]:
# (n_samples, dimension) of the encoded training set.
# NOTE(review): the recorded output shows 11000 columns while the dimension
# printed above is 10125, so this output comes from an earlier run.
x_train.shape

(99390, 11000)

One hot encoding for the labels

In [9]:
def to_one_hot(labels, num_classes=None):
    """One-hot encode integer class labels into a float16 matrix.

    Args:
        labels: sized sequence of non-negative integer class ids.
        num_classes: number of columns of the result. When omitted it is
            ``max(labels) + 1`` (0 for empty input), so the width stays
            correct even if some class id never occurs in ``labels``.
            Pass it explicitly to give every split the same width.

    Returns:
        ``np.ndarray`` of shape ``(len(labels), num_classes)``, dtype float16,
        with a single 1.0 per row at the column given by the label.
    """
    n_samples = len(labels)
    if num_classes is None:
        # Counting distinct labels would under-size the matrix whenever a
        # class id is missing from this particular split.
        num_classes = int(max(labels)) + 1 if n_samples else 0
    results = np.zeros((n_samples, num_classes), dtype=np.float16)
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results

In [10]:
# One-hot encode the labels of every split, overwriting the integer labels.
# NOTE(review): `to_one_hot` is called per split without a shared class count,
# so verify that all three matrices end up with len(label_index) columns.
# The cell overwrites its inputs and cannot be re-run without reloading.
y_train = to_one_hot(y_train)
y_val = to_one_hot(y_val)
y_test = to_one_hot(y_test)

In [11]:
# Display the one-hot encoded training labels.
y_train

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float16)

## Build the Models

Apart from the base model, we add new models with different kinds of regularizers. In particular, we take into account:
- L2 regularization, which adds to the loss a penalty proportional to the squared L2 norm of the layer weights. The general idea is that we prefer a simple model where the distribution of parameter values has less entropy.
- dropout, where the idea is to randomly set a fraction of a layer's outputs to zero during training, so that neurons do not start to memorize noise patterns.

In [None]:
def build_model(neuron_layers):
    """Build and compile the unregularized baseline classifier.

    The network has three ReLU hidden layers of equal width followed by a
    softmax layer with one unit per label.

    Args:
        neuron_layers: number of units in each of the three hidden layers.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).

    Note:
        Reads the notebook-level ``x_train`` (input width) and
        ``label_index`` (number of classes); both must exist when called.
    """
    model = models.Sequential()
    # Only the first layer declares the input width; later layers infer it.
    model.add(layers.Dense(neuron_layers, activation='relu', input_shape=(x_train.shape[-1], )))
    model.add(layers.Dense(neuron_layers, activation='relu'))
    model.add(layers.Dense(neuron_layers, activation='relu'))
    model.add(layers.Dense(len(label_index), activation='softmax'))

    model.compile(optimizer=optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [None]:
def build_model_l2(neurons, l2_factor=0.0001):
    """Build and compile the classifier with L2 weight regularization.

    Same topology as the baseline (three ReLU hidden layers plus a softmax
    output), with an L2 penalty on the kernel of every hidden layer.

    Args:
        neurons: number of units in each of the three hidden layers.
        l2_factor: L2 penalty coefficient applied to all hidden layers. The
            first layer previously fell back to the library default of
            ``regularizers.l2()`` instead of the 0.0001 used by the other
            layers; all layers now share this single value.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).

    Note:
        Reads the notebook-level ``x_train`` (input width) and
        ``label_index`` (number of classes); both must exist when called.
    """
    model = models.Sequential()
    # Only the first layer declares the input width; later layers infer it.
    model.add(layers.Dense(neurons, activation='relu',
                           kernel_regularizer=regularizers.l2(l2_factor),
                           input_shape=(x_train.shape[-1], )))
    for _ in range(2):
        model.add(layers.Dense(neurons, kernel_regularizer=regularizers.l2(l2_factor), activation='relu'))
    model.add(layers.Dense(len(label_index), activation='softmax'))

    model.compile(optimizer=optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [12]:
def build_model_dropout(neurons, dropout=0.1):
    """Build and compile the classifier with dropout after each hidden layer.

    Args:
        neurons: iterable with the number of units of each hidden layer, in
            order. A single integer is accepted as shorthand for one hidden
            layer of that width.
        dropout: dropout rate applied after every hidden layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).

    Note:
        Reads the notebook-level ``x_train`` (input width) and
        ``label_index`` (number of classes); both must exist when called.
    """
    if np.isscalar(neurons):
        # Accept `build_model_dropout(256)` instead of failing on iteration.
        neurons = [neurons]

    model = models.Sequential()

    for position, units in enumerate(neurons):
        if position == 0:
            # Only the first layer declares the input width.
            model.add(layers.Dense(units, activation='relu', input_shape=(x_train.shape[-1], )))
        else:
            model.add(layers.Dense(units, activation='relu'))
        model.add(layers.Dropout(dropout))

    model.add(layers.Dense(len(label_index), activation='softmax'))

    model.compile(optimizer=optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [None]:
# Baseline model; 256 is the hidden-layer width passed to `build_model`.
model_base = build_model(256)

In [None]:
# Train the baseline for 10 epochs, scoring the validation split after each
# epoch; the returned History feeds the charts below.
history_base = model_base.fit(x=x_train, y=y_train, validation_data=(x_val, y_val),
                   epochs=10, batch_size=256)

In [None]:
# L2-regularized model with the same hidden-layer width as the baseline.
model_l2 = build_model_l2(256)

In [None]:
# Train the L2 model with the same epochs and batch size as the baseline so
# that the two histories are directly comparable.
history_l2 = model_l2.fit(x=x_train, y=y_train, validation_data=(x_val, y_val),
                   epochs=10, batch_size=256)

In [23]:
# Dropout model: two hidden layers (256 then 128 units), dropout rate 0.3.
model_dropout = build_model_dropout(neurons = [256, 128], dropout = 0.3)

In [25]:
# Train the dropout model, scoring the validation split after each epoch.
# NOTE(review): this run uses epochs=5 while the base and L2 runs use
# epochs=10, and the final training further down uses epochs=6 — confirm the
# intended epoch count so the histories can be compared like for like.
history_dropout = model_dropout.fit(x=x_train, y=y_train, validation_data=(x_val, y_val),
                   epochs=5, batch_size=256)

Train on 99390 samples, validate on 11044 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Printing The Loss

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def chart_loss(history, name):
    """Plot the training and validation loss of one run, epoch by epoch.

    Args:
        history: object whose ``history`` dict holds the per-epoch ``'loss'``
            and ``'val_loss'`` lists (as returned by ``model.fit``).
        name: label appended to the chart title.
    """
    log = history.history
    train_curve = log['loss']
    val_curve = log['val_loss']
    epoch_axis = range(1, len(train_curve) + 1)

    # Training loss as markers, validation loss as a line, both in blue.
    curves = (
        (train_curve, 'b+', 'Training Loss'),
        (val_curve, 'b', 'Validation Loss'),
    )
    for curve, fmt, label in curves:
        plt.plot(epoch_axis, curve, fmt, label=label)

    plt.title('Training and validation loss {}'.format(name))
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.yticks(np.arange(0, 2.2, step=0.2))
    plt.xticks(epoch_axis)
    plt.legend()
    plt.show()

In [None]:
# One loss chart per model, in the order base -> L2 -> dropout.
chart_loss(history_base, 'Base')
chart_loss(history_l2, 'L2')
chart_loss(history_dropout, 'Dropout')

From the charts we can see that: 
- the model with l2 regularization is able to avoid overfitting during the training. 
- the model that uses dropout (0.3) has a lower loss than the L2 model while still combating overfitting

In [None]:
def chart_acc(history, name):
    """Plot the training and validation accuracy of one run, epoch by epoch.

    Args:
        history: object whose ``history`` dict holds the per-epoch accuracy
            lists (as returned by ``model.fit``). Depending on the Keras
            version, ``metrics=['accuracy']`` is logged as ``'acc'`` or as
            ``'accuracy'``; both spellings are accepted.
        name: label appended to the chart title.

    Raises:
        KeyError: if neither accuracy key is present in the history.
    """
    log = history.history
    acc_key = 'acc' if 'acc' in log else 'accuracy'
    acc = log[acc_key]
    val_acc = log['val_' + acc_key]

    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, acc, 'b+', label='Training Acc')
    plt.plot(epochs, val_acc, 'b', label='Validation Acc')
    plt.title('Training and validation acc {}'.format(name))
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.yticks(np.arange(0.5, 1.05, step=0.05))
    plt.xticks(epochs)
    plt.legend()
    plt.show()

In [None]:
# One accuracy chart per model, in the order base -> L2 -> dropout.
chart_acc(history_base, 'Base')
chart_acc(history_l2, 'L2')
chart_acc(history_dropout, 'Dropout')

From the charts above we can see that the best model is the one that uses dropout, and that the best epoch is the 6th, where the validation accuracy crosses the training accuracy.

In [None]:
def compare_loss(histories):
    """Overlay the validation-loss curves of several training runs.

    Args:
        histories: mapping from a display name to an object whose
            ``history`` dict holds the per-epoch ``'val_loss'`` list.
    """
    for name, history in histories.items():
        val_loss = history.history['val_loss']
        # Build the x axis per run: runs may have been trained for different
        # numbers of epochs, and plotting x/y of unequal length fails.
        epochs = range(1, len(val_loss) + 1)
        plt.plot(epochs, val_loss, label='Validation Loss {}'.format(name))

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Validation loss of the three models on a single chart.
compare_loss({'Base': history_base, 'L2': history_l2, 'Dropout': history_dropout})

In [None]:
def compare_accuracy(histories):
    """Overlay the validation-accuracy curves of several training runs.

    Args:
        histories: mapping from a display name to an object whose
            ``history`` dict holds the per-epoch validation accuracy, logged
            as ``'val_acc'`` or ``'val_accuracy'`` depending on the Keras
            version.

    Raises:
        KeyError: if a history contains neither validation-accuracy key.
    """
    for name, history in histories.items():
        log = history.history
        val_key = 'val_acc' if 'val_acc' in log else 'val_accuracy'
        val_acc = log[val_key]
        # Build the x axis per run: runs may have been trained for different
        # numbers of epochs, and plotting x/y of unequal length fails.
        epochs = range(1, len(val_acc) + 1)
        plt.plot(epochs, val_acc, label='Validation Accuracy {}'.format(name))

    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

In [None]:
# Validation accuracy of the three models on a single chart.
compare_accuracy({'Base': history_base, 'L2': history_l2, 'Dropout': history_dropout})

In [None]:
def min_loss(history):
    """Return the 1-based epoch at which the validation loss is lowest.

    Despite its name, the function returns an epoch number, not a loss value.
    """
    val_curve = np.asarray(history.history['val_loss'])
    best_index = val_curve.argmin()
    return best_index + 1

def accuracy(history, epoch):
    """Return the validation accuracy recorded at a given 1-based epoch.

    Args:
        history: object whose ``history`` dict holds the per-epoch validation
            accuracy, logged as ``'val_acc'`` or ``'val_accuracy'`` depending
            on the Keras version.
        epoch: 1-based epoch number.

    Raises:
        KeyError: if neither validation-accuracy key is present.
        IndexError: if ``epoch`` is outside ``1..number_of_epochs``.
    """
    log = history.history
    val_key = 'val_acc' if 'val_acc' in log else 'val_accuracy'
    val_acc = log[val_key]
    if epoch < 1:
        # Without this guard epoch 0 would wrap around to the last epoch.
        raise IndexError('epoch must be >= 1, got {}'.format(epoch))
    return val_acc[epoch - 1]

In [None]:
# `min_loss` returns the 1-based epoch with the lowest validation loss, not
# the loss value, so the messages name the quantity that is actually printed.
print('epoch with min validation loss for model base is {}'.format(min_loss(history_base)))
print('epoch with min validation loss for model L2 is {}'.format(min_loss(history_l2)))
print('epoch with min validation loss for model Dropout is {}'.format(min_loss(history_dropout)))

In [None]:
# The value printed is the validation accuracy at the epoch with the lowest
# validation loss. That is not necessarily the highest accuracy of the run,
# so the messages avoid calling it "best".
print('validation accuracy at the min-loss epoch for model base {}'.format(
    accuracy(history_base, min_loss(history_base))))
print('validation accuracy at the min-loss epoch for model L2 {}'.format(
    accuracy(history_l2, min_loss(history_l2))))
print('validation accuracy at the min-loss epoch for model Dropout {}'.format(
    accuracy(history_dropout, min_loss(history_dropout))))

### Evaluate on the Test Set

- train the best model for the best epochs

In [None]:
# Force a garbage-collection pass before building the final model.
# NOTE(review): presumably meant to release memory ahead of the train+val
# concatenation below — confirm it is still needed.
import gc
gc.collect()

In [None]:
# Rebuild the dropout configuration that was validated above: two hidden
# layers (256 then 128 units) with dropout rate 0.3. The layer sizes are
# passed as a list, one entry per hidden layer.
model = build_model_dropout(neurons=[256, 128], dropout=0.3)

In [None]:
# Final training on train + validation data together; no validation split is
# passed because the held-out test set is used for the final score.
# `np.concatenate` builds new arrays holding both splits.
# NOTE(review): epochs=6 is hard-coded while the dropout run recorded above
# lasted 5 epochs — confirm where 6 comes from, or derive it from
# `min_loss(history_dropout)`.
history = model.fit(x=np.concatenate([x_train, x_val]), y=np.concatenate([y_train, y_val]), epochs=6, batch_size=256)

In [None]:
# Score the final model on the held-out test split; with one compiled metric
# the result unpacks into (loss, accuracy).
loss, acc = model.evaluate(x_test, y_test)

In [None]:
# Report the test-set loss and accuracy computed in the previous cell.
print('loss {}'.format(loss))
print('acc {}'.format(acc))

## Conclusion

To recap, using regularization, and dropout in particular, raises the accuracy from 0.83 to 0.84.