In [1]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import math
import json

from sklearn.model_selection import *
from sklearn import metrics

from daf.datasets import atti_dataset
from daf.utils import dataset_utils
from daf.utils import keras_util
import keras

import random

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the train/test splits (num_words=None is passed through to the loader)
# together with the label-name -> class-index mapping.
(x_train, y_train), (x_test, y_test) = atti_dataset.load_data(num_words=None)
label_index_dict = atti_dataset.get_label_index()

# Vocabulary size: one more than the largest token id found in the training samples.
num_words = 1 + max(max(sample) for sample in x_train)
num_words

A local file was found, but it seems to be incomplete or outdated because the auto file hash does not match the original value of d6eaee8bd106697b93982053ec65aad1 so we will re-download the data.
Downloading data from  https://media.githubusercontent.com/media/teamdigitale/daf-models/master/daf-datasets/data/atti/atti_dataset.npz


34731

In [3]:
# Display the label-name -> class-index mapping returned by atti_dataset.get_label_index().
label_index_dict

{'ALTRI UFFICI': 0,
 'AVVOCATURA REGIONALE                                  ': 1,
 'D.G.  AVVOCATURA                                      ': 2,
 "D.G. COMPETITIVITA' DEL SISTEMA REGIONALE E SVILUPPO D": 3,
 'D.G. PRESIDENZA                                       ': 4,
 'DIPARTIMENTO BILANCIO E FINANZE                       ': 5,
 'DIPARTIMENTO ORGANIZZAZIONE                           ': 6,
 'DIPARTIMENTO ORGANIZZAZIONE E RISORSE                 ': 7,
 'DIPARTIMENTO POLITICHE FORMATIVE E BENI CULTURALI     ': 8,
 'DIPARTIMENTO POLITICHE TERRITORIALI E AMBIENTALI      ': 9,
 'DIPARTIMENTO PRESIDENZA AFFARI LEGISLATIVI E GIURIDICI': 10,
 'DIPARTIMENTO SALUTE E POLITICHE SOLIDARIETA           ': 11,
 'DIPARTIMENTO SVILUPPO ECONOMICO                       ': 12,
 'DIREZIONE AGRICOLTURA E SVILUPPO RURALE': 13,
 "DIREZIONE ATTIVITA' PRODUTTIVE": 14,
 'DIREZIONE DIFESA DEL SUOLO E PROTEZIONE CIVILE': 15,
 'DIREZIONE DIRITTI DI CITTADINANZA E COESIONE SOCIALE': 16,
 'DIREZIONE GENERALE BILANCIO 

In [6]:
# Stitch the train and test splits back together into single arrays.
x = np.concatenate([x_train, x_test])
y = np.concatenate([y_train, y_test])

In [7]:
# Re-display the vocabulary size computed when the dataset was loaded.
num_words

34731

In [8]:
# Inspect the raw training samples: each entry is a list of integer token ids.
# NOTE(review): this dumps the whole array; a slice such as x_train[:3] would keep the output short.
x_train

array([list([23, 3, 6376, 3, 151, 161, 3, 31775, 7, 2, 8, 1489, 4, 2, 207, 6, 2, 263, 3751, 6399, 3526, 20, 718, 8, 19, 649, 16, 8865, 4, 215, 24324, 14, 2411, 4, 257, 3462, 3, 35, 28, 1804, 25, 2969, 3]),
       list([33, 683, 2, 206, 41, 1564, 1791, 1855, 3545, 4, 65, 28, 126, 7311, 6, 2904, 3, 106, 1719, 1792, 209, 37, 187, 2705]),
       list([23, 3, 1697, 5, 43, 1328, 38, 466, 27, 1047, 4, 8165, 5, 22, 6, 26, 14, 48, 25, 398, 1303, 11, 394, 4105, 4, 211, 2, 5]),
       ...,
       list([85, 14210, 7, 455, 811, 204, 29, 26, 14, 48, 25, 435, 50, 460, 4, 58, 9, 120, 6, 202, 7, 96, 49, 4, 167, 27, 738, 4, 102, 2202, 1841, 193, 49, 3837, 3162]),
       list([2370, 1509, 3, 129, 7, 3701, 7478, 5, 4204, 3768, 15, 307, 25, 448, 2348, 17082, 7873, 10857, 17, 6, 307, 25, 7274, 1024, 6, 484, 8, 32, 3886, 17, 3, 529, 4, 1984, 73, 20, 66, 8, 39, 1260, 4, 511, 4, 920, 6, 4, 129, 4, 5549, 606, 426, 3]),
       list([3245, 1067, 8540, 6286, 8, 32, 49, 222, 6, 10294, 8, 32, 49, 281, 761, 1244, 102

In [10]:
from functools import partial

def x_transformer(x_data):
    """Return a callable that vectorizes ``x_data`` when invoked.

    The returned ``functools.partial`` binds ``x_data`` and the module-level
    ``num_words`` to ``dataset_utils.vectorize_sequences``; no work is done
    until the partial is called.

    :param x_data: the samples to vectorize
    :return: a partial wrapping ``dataset_utils.vectorize_sequences``
    """
    vectorize = dataset_utils.vectorize_sequences
    return partial(vectorize, x_data, num_words)

def y_tranformer(y_data):
    """Return a callable that one-hot encodes ``y_data`` when invoked.

    The returned ``functools.partial`` binds ``y_data`` and the module-level
    ``num_classes`` to ``dataset_utils.to_one_hot``; no work is done until the
    partial is called.

    :param y_data: the labels to encode
    :return: a partial wrapping ``dataset_utils.to_one_hot``
    """
    one_hot = dataset_utils.to_one_hot
    return partial(one_hot, y_data, num_classes)

In [11]:
# Batch size handed to the training data generator.
batch_size = 128
# Number of target classes (size of the softmax output layer).
# NOTE(review): hard-coded; presumably should match len(label_index_dict) -- confirm.
num_classes = 28

In [12]:
# Display the number of target classes.
num_classes

28

In [7]:
def build_dropout_model(neurons, num_words, num_classes, dropout=0.5):
    """Build and compile a fully connected classifier with dropout.

    Architecture: two (Dense relu -> Dropout) pairs, a third Dense relu layer
    without dropout, then a softmax output layer. The model is compiled with
    the Adam optimizer, categorical cross-entropy loss and accuracy metric.

    :param neurons: number of units in each hidden Dense layer
    :param num_words: size of the input vector
    :param num_classes: number of units in the softmax output layer
    :param dropout: rate passed to each Dropout layer
    :return: the compiled keras Model
    """
    inputs = keras.Input(shape=(num_words, ))

    hidden = inputs
    # Two hidden blocks, each a relu Dense layer followed by Dropout.
    for _ in range(2):
        hidden = keras.layers.Dense(neurons, activation='relu')(hidden)
        hidden = keras.layers.Dropout(dropout)(hidden)

    # Last hidden layer has no dropout before the classification head.
    hidden = keras.layers.Dense(neurons, activation='relu')(hidden)
    outputs = keras.layers.Dense(num_classes, activation='softmax')(hidden)

    classifier = keras.Model(inputs=inputs, outputs=outputs)
    classifier.compile(
        optimizer=keras.optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [8]:
# Number of batches needed to cover x_train once: ceiling of len / batch_size.
# (The previous `len // batch_size + 1` scheduled one extra step whenever the
# sample count was an exact multiple of the batch size.)
train_steps = (len(x_train) + batch_size - 1) // batch_size

# Generator feeding vectorized inputs / one-hot labels to the model.
# NOTE(review): the trailing arguments (0, len(x_train), True) are presumably the
# start index, end index and a shuffle flag -- confirm against dataset_utils.
train_generator = dataset_utils.dataset_generator_fun(x_train, y_train, x_transformer, y_tranformer,
                                                      batch_size, 0, len(x_train), True)



In [41]:
# Instantiate the classifier with 512 units per hidden layer and print its layout.
model = build_dropout_model(neurons=512, num_words=num_words, num_classes=num_classes)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 34731)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               17782784  
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_8 (Dense)              (None, 28)                14364     
Total para

In [42]:
# Checkpoint filename template; the {epoch} and {loss} placeholders are filled in
# by the ModelCheckpoint callback for each saved file.
model_path = '../checkpoints/weights.{epoch:02d}-{loss:.2f}.hdf5'

In [43]:
# Both callbacks watch the training loss (no validation data is supplied to fit).
train_callbacks = [
    # Save a checkpoint only when the monitored loss improves.
    keras.callbacks.ModelCheckpoint(
        model_path,
        monitor='loss',
        save_best_only=True,
        verbose=1,
    ),
    # Lower the learning rate after 2 epochs without improvement of the loss.
    keras.callbacks.ReduceLROnPlateau(
        monitor='loss',
        patience=2,
        verbose=1,
    ),
]

In [44]:
# Train for 3 epochs, drawing `train_steps` batches per epoch from the generator.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=3,
    callbacks=train_callbacks,
)

Epoch 1/3

Epoch 00001: loss improved from inf to 0.87973, saving model to ../checkpoints/weights.01-0.88.hdf5
Epoch 2/3

Epoch 00002: loss improved from 0.87973 to 0.44180, saving model to ../checkpoints/weights.02-0.44.hdf5
Epoch 3/3

Epoch 00003: loss improved from 0.44180 to 0.33017, saving model to ../checkpoints/weights.03-0.33.hdf5


In [114]:
# Persist the trained model to disk.
model.save('../checkpoints/final_model.hdf5')

In [None]:
# Drop the reference to the in-memory model before a model is loaded from disk.
del model

In [4]:
# Load a saved model for inference.
# NOTE(review): this loads 'baseline.hdf5' through tf.keras, while the model trained in this
# notebook was built with standalone `keras` and saved as 'final_model.hdf5' -- confirm
# that the different file and loader are intended.
model = tf.keras.models.load_model('../checkpoints/baseline.hdf5')

In [13]:
# Build the transformer for the first 10 training samples and call it immediately
# to obtain the vectorized input batch.
d = x_transformer(x_train[:10])()

In [14]:
# Run the loaded model on the vectorized batch.
r = model.predict_on_batch(d)

### Compare Results

In [23]:
# Predicted class index for each of the 10 samples (argmax over the model outputs).
np.argmax(r, axis=1)

array([10, 14,  9, 15, 16,  3, 22, 13, 16, 22])

In [22]:
# Ground-truth labels of the same 10 samples, for comparison with the predictions.
y_train[:10]

array([ 4, 14,  9, 15, 16,  3, 22, 13, 16, 22])

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK resources used for preprocessing (no-op when already up to date).
# NOTE(review): tokenization in this notebook uses RegexpTokenizer, so 'punkt' may
# not be required -- confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# Punctuation tokens that are treated like stopwords.
punctuation = [
    '-', '"', "'", ':', ';',
    '(', ')', '[', ']', '{', '}',
    '’', '”', '“', '``', "''",
]
# Italian stopwords from NLTK plus the punctuation tokens above.
stop_words = set(stopwords.words('italian')) | set(punctuation)

# Reserved token ids: padding, sequence start and out-of-vocabulary.
pad_char, start_char, oov_char = 0, 1, 2

In [60]:
# Load the id -> word vocabulary. The encoding is given explicitly so that
# accented Italian words decode the same way on every platform (JSON is UTF-8).
with open('../data/dataset/id_word_dict.json', 'r', encoding='utf-8') as f:
    id_word_dict = json.load(f)

# Load the label -> class-index mapping and derive its inverse for decoding predictions.
with open('../data/dataset/label_index.json', 'r', encoding='utf-8') as f:
    label_id_dict = json.load(f)

id_label_dict = {v: k for k, v in label_id_dict.items()}

In [87]:
# Invert the vocabulary: word -> integer id (JSON object keys are strings, hence int()).
word_id_dict = {word: int(idx) for idx, word in id_word_dict.items()}

In [89]:
# Tokenizer that keeps runs of word characters (\w+), so punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')

In [90]:
def hasnumbers(value):
    """
    Tell whether at least one character of ``value`` is a digit.

    :param value: the string (or iterable of strings) to inspect
    :return: True if any element satisfies ``str.isdigit``, False otherwise
    """
    for ch in value:
        if ch.isdigit():
            return True
    return False


def tokenize_sentence(sentence, remove_stopwords=False, tokenizer=tokenizer.tokenize):
    """
    Tokenize a sentence into lowercase words, optionally dropping stopwords.

    Quote-like characters are replaced with spaces before tokenizing. Tokens
    that contain a digit or are at most two characters long are discarded,
    and underscores are stripped from the remaining tokens.

    :param sentence: the sentence to be tokenized
    :param remove_stopwords: True to remove the tokens found in ``stop_words``
    :param tokenizer: callable that splits a string into tokens
    :return: a generator yielding a single list with the kept, lowercased words
    """
    # Turn the various quote characters into separators.
    for quote_char in ('`', "'", "”", "“"):
        sentence = sentence.replace(quote_char, ' ')

    words = []
    for raw in tokenizer(sentence):
        if hasnumbers(raw) or len(raw) <= 2:
            continue
        token = raw.replace('_', '')
        lowered = token.lower()
        if remove_stopwords:
            # Compare the lowercased form so capitalized stopwords are removed too,
            # and skip tokens that became empty once underscores were stripped.
            if lowered and lowered not in stop_words:
                words.append(lowered)
        elif token in stop_words or len(token) > 1:
            words.append(lowered)
    # The word list is yielded (not returned): callers wrap the generator in list()
    # to obtain a one-element batch.
    yield words


def sentence_to_idxs(tokenized_sentence):
    """
    Encode tokenized samples as sequences of vocabulary ids.

    Words missing from ``word_id_dict`` are encoded as ``oov_char``.

    :param tokenized_sentence: iterable of samples, each an iterable of words
    :return: a list with one list of integer ids per sample
    """
    return [
        [word_id_dict.get(token, oov_char) for token in sample]
        for sample in tokenized_sentence
    ]


def vectorize_sequences(sequences, num_words):
    """
    Multi-hot encode sequences of token ids.

    :param sequences: the sequences of integer ids, one per sample
    :param num_words: length of each output vector
    :return: array of shape (len(sequences), num_words) holding 1.0 at every
             id that occurs in the corresponding sequence and 0.0 elsewhere
    """
    n_samples = len(sequences)
    encoded = np.zeros(shape=(n_samples, num_words))
    for row, token_ids in enumerate(sequences):
        # Advanced indexing sets every listed column of this row in one assignment.
        encoded[row, token_ids] = 1.
    return encoded

def sentence_pipeline(sentence):
    """
    Run the full preprocessing of a raw sentence: tokenize, map to ids, vectorize.

    :param sentence: the raw sentence
    :return: the sentence in its vectorized form, an array of shape (1, num_words)
    """
    # tokenize_sentence yields a single word list, so this is a one-sample batch.
    tokenized = list(tokenize_sentence(sentence))
    sequences = sentence_to_idxs(tokenized)
    # `sequences` is already a batch (list of id lists); pass it directly rather than
    # wrapping it in another list, which only worked through NumPy advanced indexing.
    return vectorize_sequences(sequences, num_words)

In [131]:
# Sample text used to exercise the inference pipeline end to end.
sentence = """
Approvazione di una seconda variante al progetto esecutivo delle opere di adeguamento di un invaso artificiale in loc. Piano S. Croce, nel Comune di Monterotondo Marittimo (GR) - ditta Piazzi s.r.l. - L.R.T. n° 64 del 5.11.2009 e s.m.i. e Regolamento n°18/r del 25.02.2010 a s.m.i - pratica n. 222.
"""

In [132]:
# Tokenize, encode and vectorize the sample sentence.
vectorized = sentence_pipeline(sentence)

In [133]:
# Sanity check: one row, num_words columns.
vectorized.shape

(1, 34731)

In [134]:
# List the (row, token id) positions that are set in the vectorized sentence.
np.argwhere(vectorized)

array([[    0,     7],
       [    0,    24],
       [    0,    25],
       [    0,    33],
       [    0,    41],
       [    0,    45],
       [    0,    67],
       [    0,   125],
       [    0,   166],
       [    0,   182],
       [    0,   247],
       [    0,   324],
       [    0,   333],
       [    0,   377],
       [    0,   466],
       [    0,   687],
       [    0,   708],
       [    0,  1032],
       [    0,  1224],
       [    0,  1903],
       [    0,  2282],
       [    0,  2593],
       [    0, 15242]])

In [135]:
# Predicted class index for the sample sentence (argmax over the model output);
# id_label_dict can be used to map the index back to its label name.
np.argmax(model.predict(vectorized))

15