In [1]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import math
import json

from sklearn.model_selection import *
from sklearn import metrics

from daf.datasets import atti_dataset
from daf.utils import dataset_utils
from daf.utils import keras_util

from keras.callbacks import TensorBoard

import random

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Embeddings

In [2]:
# Load the pre-split dataset; num_words=None presumably means "no vocabulary cap" -- confirm in atti_dataset.
(x_train, y_train), (x_test, y_test) = atti_dataset.load_data(num_words=None)
label_index_dict = atti_dataset.get_label_index()

# Vocabulary size: the highest token id found in the training sequences, plus one.
# NOTE(review): only x_train is scanned; confirm x_test cannot contain larger ids.
num_words = max(max(sequence) for sequence in x_train) + 1
num_words

52396

In [3]:
# Report how many target labels exist, then display the label -> index mapping.
print('Total of {} classes'.format(len(label_index_dict)))
label_index_dict

Total of 34 classes


{'ALTRI UFFICI': 0,
 'AVVOCATURA REGIONALE                                  ': 1,
 'D.G.  AVVOCATURA                                      ': 2,
 "D.G. COMPETITIVITA' DEL SISTEMA REGIONALE E SVILUPPO D": 3,
 'D.G. PRESIDENZA                                       ': 4,
 'DIPARTIMENTO BILANCIO E FINANZE                       ': 5,
 'DIPARTIMENTO ORGANIZZAZIONE                           ': 6,
 'DIPARTIMENTO ORGANIZZAZIONE E RISORSE                 ': 7,
 'DIPARTIMENTO POLITICHE FORMATIVE E BENI CULTURALI     ': 8,
 'DIPARTIMENTO POLITICHE TERRITORIALI E AMBIENTALI      ': 9,
 'DIPARTIMENTO PRESIDENZA AFFARI LEGISLATIVI E GIURIDICI': 10,
 'DIPARTIMENTO SALUTE E POLITICHE SOLIDARIETA           ': 11,
 'DIPARTIMENTO SVILUPPO ECONOMICO                       ': 12,
 'DIREZIONE AFFARI LEGISLATIVI, GIURIDICI ED ISTITUZIONALI': 13,
 'DIREZIONE AGRICOLTURA E SVILUPPO RURALE': 14,
 "DIREZIONE ATTIVITA' PRODUTTIVE": 15,
 'DIREZIONE CULTURA E RICERCA': 16,
 'DIREZIONE DIFESA DEL SUOLO E PROTEZIONE CIV

## Data Preparation

We need to create the functions that transform x and y.
In this case we need to:
- x: pad the sequences
- y: one-hot encode the labels

In [4]:
# Length of the longest training sequence; used below as the padded input length.
maxlen = max(map(len, x_train))
maxlen

101

In [5]:
from functools import partial

def x_transformer(x_data):
    """Return a deferred call that pads `x_data` with Keras `pad_sequences`.

    Nothing is padded here: the returned `functools.partial` binds `x_data`
    and the module-level `maxlen` as the first two positional arguments of
    `pad_sequences`, and the padding happens when the partial is invoked.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return partial(pad_sequences, x_data, maxlen)

def y_tranformer(y_data):
    """Return a deferred call that one-hot encodes `y_data`.

    The returned `functools.partial` binds `y_data` and the module-level
    `num_classes` as the first two positional arguments of
    `dataset_utils.to_one_hot`; the encoding runs when the partial is invoked.
    """
    to_one_hot = dataset_utils.to_one_hot
    return partial(to_one_hot, y_data, num_classes)

In [6]:
# Mini-batch size used when building the data generators.
batch_size = 128
# Number of distinct labels observed in the training targets.
num_classes = len(set(y_train))
# Index at 80% of x_train (rounded up): samples before it are used for training,
# the remainder for validation.
train_val_split = math.ceil(len(x_train) * 0.8)
print('num classes {}'.format(num_classes))
print('training size {}, validation size {}'.format(train_val_split, len(x_train) - train_val_split))

num classes 34
training size 117777, validation size 29444


In [7]:
# Training generator over the index range [0, train_val_split).
# NOTE(review): the two integers are presumably start/end indices and the trailing
# boolean presumably toggles shuffling -- confirm in dataset_utils.dataset_generator_fun.
train_generator = dataset_utils.dataset_generator_fun(x_train, y_train, x_transformer, y_tranformer,
                                                      batch_size, 0, train_val_split, True)

# Validation generator over the remaining range [train_val_split, len(x_train)).
val_generator = dataset_utils.dataset_generator_fun(x_train, y_train, x_transformer, y_tranformer,
                                                    batch_size, train_val_split, len(x_train), False)

# Batches needed to cover every sample once. Integer ceil division avoids the extra
# step that `n // batch_size + 1` schedules whenever n is an exact multiple of batch_size.
train_steps = (train_val_split + batch_size - 1) // batch_size
val_steps = (len(x_train) - train_val_split + batch_size - 1) // batch_size

## The Effects of Word Embeddings

Before using a word embedding as a layer in our network, let's evaluate the effect of embeddings by training a simple classifier that has the embedding as its only hidden layer.

In [8]:
def build_embed_model(num_words, num_classes, embed_size):
    """Build and compile a classifier made of an embedding plus a softmax layer.

    Args:
        num_words: vocabulary size, used as `input_dim` of the embedding.
        num_classes: number of units in the softmax output layer.
        embed_size: dimensionality of each embedding vector.

    Returns:
        A compiled `tf.keras.Model` (Adam, categorical cross-entropy, accuracy).
        The input length is read from the module-level `maxlen`.
    """
    # Project helper, invoked before any layer is created.
    keras_util.new_session()

    layers = tf.keras.layers
    tokens = tf.keras.Input(shape=(maxlen,), dtype='int32')
    embedded = layers.Embedding(input_dim=num_words, output_dim=embed_size, name='embed')(tokens)
    flattened = layers.Flatten()(embedded)
    probabilities = layers.Dense(num_classes, activation='softmax')(flattened)

    classifier = tf.keras.Model(inputs=tokens, outputs=probabilities)
    classifier.compile(optimizer=tf.keras.optimizers.Adam(),
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    return classifier

In [9]:
# Embedding-only baseline with 64-dimensional word vectors.
embed_model = build_embed_model(num_words, num_classes, 64)
embed_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 101)               0         
_________________________________________________________________
embed (Embedding)            (None, 101, 64)           3353344   
_________________________________________________________________
flatten_1 (Flatten)          (None, 6464)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 34)                219810    
Total params: 3,573,154
Trainable params: 3,573,154
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Training callbacks: TensorBoard logging under logs/only_embedding_64, early stopping,
# and learning-rate reduction on plateau.
# NOTE(review): EarlyStopping and ReduceLROnPlateau share patience=3, so training may
# stop at the same epoch the learning rate would first be reduced -- confirm intent.
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir="logs/{}_{}".format('only_embedding', '64')),
    tf.keras.callbacks.EarlyStopping(patience=3, verbose=1),
    tf.keras.callbacks.ReduceLROnPlateau(patience=3)
    ]  

In [11]:
# Train for at most 10 epochs, evaluating on the validation generator after each one.
embed_model.fit_generator(train_generator, steps_per_epoch=train_steps, epochs=10, 
                          validation_data=val_generator, validation_steps=val_steps, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: early stopping


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7ff8401c9a20>

In [12]:
# Same embedding-only baseline, now with 128-dimensional word vectors.
embed_model = build_embed_model(num_words, num_classes, 128)
embed_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 101)               0         
_________________________________________________________________
embed (Embedding)            (None, 101, 128)          6706688   
_________________________________________________________________
flatten_1 (Flatten)          (None, 12928)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 34)                439586    
Total params: 7,146,274
Trainable params: 7,146,274
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Training callbacks: TensorBoard logging under logs/only_embedding_128, early stopping,
# and learning-rate reduction on plateau.
# NOTE(review): EarlyStopping and ReduceLROnPlateau share patience=3, so training may
# stop at the same epoch the learning rate would first be reduced -- confirm intent.
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir="logs/{}_{}".format('only_embedding', '128')),
    tf.keras.callbacks.EarlyStopping(patience=3, verbose=1),
    tf.keras.callbacks.ReduceLROnPlateau(patience=3)
    ]  

In [14]:
# Train for at most 10 epochs, evaluating on the validation generator after each one.
embed_model.fit_generator(train_generator, steps_per_epoch=train_steps, epochs=10, 
                          validation_data=val_generator, validation_steps=val_steps, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: early stopping


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7ff8436328d0>

## Add an embedding layer to our previous best classifier

In [15]:
def build_model_embedding(neurons, num_words, num_class, embed_size, maxlen, dropout):
    """Build and compile an embedding + two-hidden-layer dense classifier.

    Args:
        neurons: number of units in each hidden Dense layer.
        num_words: vocabulary size, used as `input_dim` of the embedding.
        num_class: number of units in the softmax output layer.
        embed_size: dimensionality of each embedding vector.
        maxlen: length of the (padded) input sequences.
        dropout: dropout rate applied after each hidden Dense layer.

    Returns:
        A compiled `tf.keras.Model` (Adam, categorical cross-entropy, accuracy).
    """
    # Project helper, invoked before any layer is created.
    keras_util.new_session()
    input_l = tf.keras.Input(shape=(maxlen,), dtype='int32')
    embed_l = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embed_size, name='embed')(input_l)
    l = tf.keras.layers.Flatten()(embed_l)
    l = tf.keras.layers.Dense(neurons, activation='relu')(l)
    l = tf.keras.layers.Dropout(dropout)(l)
    l = tf.keras.layers.Dense(neurons, activation='relu')(l)
    l = tf.keras.layers.Dropout(dropout)(l)
    # Size the output layer from the `num_class` argument; the body previously
    # ignored it and silently read the global `num_classes` instead.
    output_l = tf.keras.layers.Dense(num_class, activation='softmax')(l)
    model = tf.keras.Model(inputs=input_l, outputs=output_l)

    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [16]:
def train(embeds, epochs):
    """Train one dense classifier per embedding size and collect the results.

    Uses the module-level `num_words`, `num_classes`, `maxlen`, generators and
    step counts. Each model has 128 units per hidden layer and dropout 0.5.

    Args:
        embeds: iterable of embedding sizes to try.
        epochs: maximum number of epochs for each model.

    Returns:
        dict mapping 'embed_<size>' to the object returned by `fit_generator`
        for that embedding size.
    """
    histories = {}
    for embed_size in embeds:
        print('*** Network with embedding {} ***'.format(embed_size))
        model = build_model_embedding(128, num_words, num_classes, embed_size, maxlen, 0.5)
        # summary() prints the table itself and returns None, so wrapping it in
        # print() only added a stray "None" line.
        model.summary()

        callbacks = [
            tf.keras.callbacks.TensorBoard(log_dir="logs/embeds_{}".format(embed_size)),
            tf.keras.callbacks.EarlyStopping(patience=3, verbose=1),
            tf.keras.callbacks.ReduceLROnPlateau(patience=3)
            ]

        history = model.fit_generator(train_generator, steps_per_epoch=train_steps, epochs=epochs,
                                      validation_data=val_generator, validation_steps=val_steps,
                                      callbacks=callbacks)

        histories['embed_{}'.format(embed_size)] = history

    # Hand the collected histories back; without this the function returned None.
    return histories

In [17]:
# Force a garbage-collection pass before the next training run.
import gc
gc.collect()

845

In [18]:
# Embedding sizes to compare; each model is trained for at most 10 epochs.
embeds = [64, 128]

histories = train(embeds, 10)

*** Network with embedding 64 ***
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 101)               0         
_________________________________________________________________
embed (Embedding)            (None, 101, 64)           3353344   
_________________________________________________________________
flatten_1 (Flatten)          (None, 6464)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               827520    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 34)                4386      
Total params: 4,201,762
Trainable params: 4,201,762
Non-trainable params: 0
________________________________

We can see that there is no improvement in using an embedding layer, but we have millions of parameters with respect to thousands of observations. In the next part we are going to investigate the usage of a pretrained embedding model.

### Use Pre-trained embeddings

In [22]:
id_word_dict = atti_dataset.get_word_index()

import gensim

word2vec_model = gensim.models.Word2Vec.load('../data/dataset/atti.word2vec')
# Look words up through `.wv` (the model's KeyedVectors). Membership tests and
# indexing on the Word2Vec object itself are deprecated in gensim 3.x and
# removed in gensim 4.
word_vectors = word2vec_model.wv

embed_size = word2vec_model.vector_size

# Row i of the matrix holds the vector of the word with id i; words missing
# from the word2vec vocabulary keep an all-zero row and are counted.
counter_not_present = 0
embed_matrix = np.zeros((num_words, embed_size))
for i, word in id_word_dict.items():
    if word in word_vectors:
        embed_matrix[int(i)] = word_vectors[word]
    else:
        counter_not_present += 1

counter_not_present

In [64]:
def build_model_embedding(neurons, num_words, num_class, embed_size, maxlen, dropout):
    """Build and compile a dense classifier on top of a frozen, pre-trained embedding.

    The embedding layer is initialised from the module-level `embed_matrix`
    and is not trainable. Batch normalisation is applied after the flatten
    step and after the first hidden layer.

    Args:
        neurons: number of units in each hidden Dense layer.
        num_words: vocabulary size, used as `input_dim` of the embedding.
        num_class: number of units in the softmax output layer.
        embed_size: dimensionality of each embedding vector.
        maxlen: length of the (padded) input sequences.
        dropout: dropout rate applied after the first hidden block.

    Returns:
        A compiled `tf.keras.Model` (Adam, categorical cross-entropy, accuracy).
    """
    # Project helper, invoked before any layer is created.
    keras_util.new_session()
    input_l = tf.keras.Input(shape=(maxlen,), dtype='int32')
    embed_l = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embed_size,
                                        weights=[embed_matrix], trainable=False, name='embed')(input_l)
    l = tf.keras.layers.Flatten()(embed_l)
    l = tf.keras.layers.BatchNormalization()(l)
    l = tf.keras.layers.Dense(neurons, activation='relu')(l)
    l = tf.keras.layers.BatchNormalization()(l)
    l = tf.keras.layers.Dropout(dropout)(l)
    l = tf.keras.layers.Dense(neurons, activation='relu')(l)
    # Size the output layer from the `num_class` argument; the body previously
    # ignored it and silently read the global `num_classes` instead.
    output_l = tf.keras.layers.Dense(num_class, activation='softmax')(l)
    model = tf.keras.Model(inputs=input_l, outputs=output_l)

    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [65]:
# Build the classifier with 256 units per hidden layer and a dropout rate of 0.1.
model = build_model_embedding(256, num_words, num_classes, embed_size, maxlen, 0.1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 101)               0         
_________________________________________________________________
embed (Embedding)            (None, 101, 100)          5239600   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10100)             0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 10100)             40400     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               2585856   
_________________________________________________________________
batch_normalization_2 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
__________

In [66]:
# Training callbacks; the TensorBoard run directory is keyed by the embedding size.
# NOTE(review): EarlyStopping and ReduceLROnPlateau share patience=3, so training may
# stop at the same epoch the learning rate would first be reduced -- confirm intent.
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir="logs/pre_local_trained_embeds_{}".format(embed_size)),
    tf.keras.callbacks.EarlyStopping(patience=3, verbose=1),
    tf.keras.callbacks.ReduceLROnPlateau(patience=3)
    ]  

# Train for at most 10 epochs, evaluating on the validation generator after each one.
history = model.fit_generator(train_generator, steps_per_epoch=train_steps, epochs=10, 
              validation_data=val_generator, validation_steps=val_steps, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 00007: early stopping


### Use Italian Pre-trained embeddings

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Location of the pre-trained vectors, read in word2vec *text* format (binary=False).
# NOTE(review): presumably the fastText Common Crawl Italian 300-d vectors -- confirm.
embed_path = '../../utils/cc.it.300.vec/data'
fasttext_model = KeyedVectors.load_word2vec_format(embed_path, binary=False)

We do some queries on the model

In [None]:
# Sanity check: list the nearest neighbours of a sample word.
fasttext_model.most_similar('approvazione')

Load the words dictionary

In [None]:
# Force a garbage-collection pass before allocating the next embedding matrix.
import gc
gc.collect()

In [None]:
# Take the dimensionality from the loaded vectors instead of hard-coding 300, so
# the matrix always matches the model that was actually loaded.
embed_size = fasttext_model.vector_size

# Restart the out-of-vocabulary count for this embedding; otherwise it keeps
# accumulating on top of whatever an earlier cell left in the variable.
counter_not_present = 0
# Row i holds the vector of the word with id i; unknown words keep an all-zero row.
embed_matrix = np.zeros((num_words, embed_size))
for i, word in id_word_dict.items():
    # `in` on KeyedVectors works across gensim versions; `.vocab` was removed in gensim 4.
    if word in fasttext_model:
        embed_matrix[int(i)] = fasttext_model.get_vector(word)
    else:
        counter_not_present += 1

In [None]:
def build_model_embedding(neurons, num_words, num_class, embed_size, maxlen, dropout):
    """Build and compile a three-hidden-layer classifier on a frozen embedding.

    The embedding layer is initialised from the module-level `embed_matrix`
    and is not trainable. Each hidden Dense layer is followed by dropout.

    Args:
        neurons: number of units in each hidden Dense layer.
        num_words: vocabulary size, used as `input_dim` of the embedding.
        num_class: number of units in the softmax output layer.
        embed_size: dimensionality of each embedding vector.
        maxlen: length of the (padded) input sequences.
        dropout: dropout rate applied after each hidden Dense layer.

    Returns:
        A compiled `tf.keras.Model` (Adam, categorical cross-entropy, accuracy).
    """
    # Project helper, invoked before any layer is created.
    keras_util.new_session()
    input_l = tf.keras.Input(shape=(maxlen,), dtype='int32')
    embed_l = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embed_size,
                                        weights=[embed_matrix], trainable=False, name='embed')(input_l)
    l = tf.keras.layers.Flatten()(embed_l)
    l = tf.keras.layers.Dense(neurons, activation='relu')(l)
    l = tf.keras.layers.Dropout(dropout)(l)
    l = tf.keras.layers.Dense(neurons, activation='relu')(l)
    l = tf.keras.layers.Dropout(dropout)(l)
    l = tf.keras.layers.Dense(neurons, activation='relu')(l)
    l = tf.keras.layers.Dropout(dropout)(l)
    # Size the output layer from the `num_class` argument; the body previously
    # ignored it and silently read the global `num_classes` instead.
    output_l = tf.keras.layers.Dense(num_class, activation='softmax')(l)
    model = tf.keras.Model(inputs=input_l, outputs=output_l)

    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
# Build the classifier with 512 units per hidden layer and a dropout rate of 0.01.
model = build_model_embedding(512, num_words, num_classes, embed_size, maxlen, 0.01)
model.summary()

In [None]:
# Training callbacks; the TensorBoard run directory is keyed by the embedding size.
# NOTE(review): EarlyStopping and ReduceLROnPlateau share patience=3, so training may
# stop at the same epoch the learning rate would first be reduced -- confirm intent.
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir="logs/pre_trained_embeds_{}".format(embed_size)),
    tf.keras.callbacks.EarlyStopping(patience=3, verbose=1),
    tf.keras.callbacks.ReduceLROnPlateau(patience=3)
    ]  

# Train for at most 10 epochs, evaluating on the validation generator after each one.
history = model.fit_generator(train_generator, steps_per_epoch=train_steps, epochs=10, 
              validation_data=val_generator, validation_steps=val_steps, callbacks=callbacks)

## Build the final model

The best model is obtained with an embedding of size 64.

In [None]:
# Force a garbage-collection pass before building the final model.
import gc
gc.collect()

In [None]:
# Batches needed to cover the whole training set once. Integer ceil division avoids
# the extra step that `n // batch_size + 1` schedules when n is a multiple of batch_size.
train_steps = (len(x_train) + batch_size - 1) // batch_size

# For the final model the generator spans all of x_train (no validation slice is held out).
train_generator = dataset_utils.dataset_generator_fun(x_train, y_train, x_transformer, y_tranformer,
                                                      batch_size, 0, len(x_train), True)

In [None]:
# Rebuild the embedding-only model with 64-dimensional vectors for the final training run.
model = build_embed_model(num_words, num_classes, 64)
model.summary()

In [None]:
# Final training run: 8 epochs, no validation data.
history = model.fit_generator(train_generator, steps_per_epoch=train_steps, epochs=8)

## Evaluate on the test set

In [None]:
# Force a garbage-collection pass before materialising the test arrays.
import gc
gc.collect()

In [None]:
# Prepare the test set: pad the inputs to maxlen and one-hot encode the labels.
x_test_v = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
y_test_v = dataset_utils.to_one_hot(y_test, num_classes)

In [None]:
# Model outputs (softmax scores, one row per test sample).
test_predictions = model.predict(x_test_v, verbose=1)

In [None]:
# Predicted class = index of the highest score in each row (vectorised argmax).
predictions = np.argmax(test_predictions, axis=1)
precision, recall, fscore, _ = metrics.precision_recall_fscore_support(y_test, predictions, average='weighted')
accuracy = metrics.accuracy_score(y_test, predictions)

# Weighted one-vs-rest AUC computed from the one-hot labels and the raw scores.
auc_score = metrics.roc_auc_score(y_test_v, test_predictions, average='weighted')

print('accuracy ', accuracy)
print('precision ', precision)
print('recall ', recall)
print('f-measure ', fscore)
# The AUC was computed above but never reported.
print('auc ', auc_score)

### Report the classification result for each class

In [None]:
# Text report of the per-class metrics on the test set.
print(metrics.classification_report(y_test, predictions))

### Confusion Matrix

In [None]:
# Pass the full list of class indices so the matrix always has one row/column per
# known label. By default scikit-learn only includes labels that occur in y_test or
# predictions, which would shrink the matrix and misalign it with the label ticks.
conf_matrix = metrics.confusion_matrix(y_test, predictions,
                                       labels=sorted(label_index_dict.values()))

In [None]:
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line banner and plot the confusion matrix as an annotated heat map.

    Args:
        cm: 2-D array of counts, indexed as cm[true, predicted].
        classes: tick labels, one per row/column of `cm`.
        normalize: when True, each row is divided by its total before plotting
            and cells are annotated with two decimals instead of integers.
        title: figure title.
        cmap: matplotlib colormap used for the heat map.
    """
    if normalize:
        # Divide every row by its total. Rows that sum to zero are left at 0
        # instead of turning into NaN, which would also make `thresh` NaN.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Set the axis labels before tight_layout() so the layout accounts for them.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
# Show floats with two decimals when NumPy arrays are printed.
np.set_printoptions(precision=2)

In [None]:
plt.rcParams['figure.figsize'] = (20, 20)
# Order the tick labels by class index so they line up with the matrix rows and
# columns instead of relying on the dict's iteration order; strip the padding
# whitespace carried by some label names.
class_names = [name.strip()
               for name, _ in sorted(label_index_dict.items(), key=lambda item: item[1])]
plot_confusion_matrix(conf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')