In [1]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import math
import json

from sklearn.model_selection import *
from sklearn import metrics

from daf.datasets import atti_dataset
from daf.utils import dataset_utils
from daf.utils import keras_util
import keras

import random

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the train/test splits; num_words=None presumably keeps the full
# vocabulary (confirm in atti_dataset). Also fetch the label -> index mapping.
(x_train, y_train), (x_test, y_test) = atti_dataset.load_data(num_words=None)
label_index_dict = atti_dataset.get_label_index()

# Width of the indicator encoding: largest word id found in the training
# samples, plus one so that id can be used as a column index.
num_words = 1 + max(max(sample) for sample in x_train)
num_words

37658

In [3]:
label_index_dict

{'ALTRI UFFICI': 0,
 'AVVOCATURA REGIONALE                                  ': 1,
 'D.G.  AVVOCATURA                                      ': 2,
 "D.G. COMPETITIVITA' DEL SISTEMA REGIONALE E SVILUPPO D": 3,
 'D.G. DIRITTI DI CITTADINANZA E COESIONE SOCIALE       ': 4,
 'D.G. ORGANIZZAZIONE                                   ': 5,
 'D.G. ORGANIZZAZIONE E RISORSE                         ': 6,
 'D.G. POLITICHE TERRITORIALI, AMBIENTALI E PER LA MOBIL': 7,
 'D.G. PRESIDENZA                                       ': 8,
 'DIPARTIMENTO BILANCIO E FINANZE                       ': 9,
 'DIPARTIMENTO ORGANIZZAZIONE                           ': 10,
 'DIPARTIMENTO ORGANIZZAZIONE E RISORSE                 ': 11,
 'DIPARTIMENTO POLITICHE FORMATIVE E BENI CULTURALI     ': 12,
 'DIPARTIMENTO POLITICHE TERRITORIALI E AMBIENTALI      ': 13,
 'DIPARTIMENTO PRESIDENZA AFFARI LEGISLATIVI E GIURIDICI': 14,
 'DIPARTIMENTO SALUTE E POLITICHE SOLIDARIETA           ': 15,
 'DIPARTIMENTO SVILUPPO ECONOMICO           

In [4]:
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))

In [5]:
num_words

37658

In [6]:
x_train

array([list([398, 1536, 6, 615, 3, 13, 484, 7, 148, 4, 676, 8, 68, 698, 23, 633, 4, 1498, 1463, 105, 1499, 4, 158, 16, 3, 100, 44, 498, 3, 1881, 50, 237, 3]),
       list([125, 18, 26, 2, 7, 261, 157, 6, 611, 34, 10, 1477, 8271, 258, 35, 414, 4, 2, 2]),
       list([236, 384, 20, 412, 364, 297, 9, 11, 1196, 5, 60, 755, 49, 56, 1367, 43, 160, 24, 319, 756, 3, 172, 24, 512, 8, 37, 597, 4, 1312, 56, 1453, 36, 48, 92, 499, 491, 285, 11, 20167, 3]),
       ...,
       list([1045, 4, 257, 30, 13, 17, 16, 5, 68, 771, 163, 95, 23, 1036, 18, 1044, 3, 5, 13, 307, 16, 5, 771, 53, 284, 386, 228, 99, 6, 228, 648, 15, 1028, 23, 410, 1575, 30, 37, 4555, 309, 3, 26, 4, 41, 8, 37, 685, 958, 336, 3]),
       list([273, 21, 379, 171, 21, 380, 631, 8, 31, 75, 4, 748, 428, 942, 228, 46, 32, 14, 2, 12, 3, 135, 15, 202, 1159, 162, 20, 3]),
       list([77, 2765, 763, 6, 77, 1841, 2520, 9, 736, 5, 342, 9, 70, 9, 26, 6, 34, 7, 44, 971, 15, 666, 27, 850, 3, 53, 3, 184, 128, 3, 4, 59, 8, 76, 72, 4, 2428, 337, 43

In [7]:
from functools import partial

def x_transformer(x_data):
    """Return a callable that vectorizes ``x_data`` when it is invoked.

    The work is deferred: the returned ``functools.partial`` wraps
    ``dataset_utils.vectorize_sequences(x_data, num_words)``. ``num_words`` is
    read from the notebook globals at the moment this function is called.
    """
    deferred_vectorize = partial(dataset_utils.vectorize_sequences, x_data, num_words)
    return deferred_vectorize

def y_tranformer(y_data):
    """Return a callable that one-hot encodes ``y_data`` when it is invoked.

    The work is deferred: the returned ``functools.partial`` wraps
    ``dataset_utils.to_one_hot(y_data, num_classes)``. ``num_classes`` is read
    from the notebook globals at the moment this function is called.
    """
    deferred_one_hot = partial(dataset_utils.to_one_hot, y_data, num_classes)
    return deferred_one_hot

In [8]:
batch_size = 128
num_classes = len(label_index_dict)

In [9]:
num_classes

35

In [10]:
def build_dropout_model(neurons, num_words, num_classes, dropout=0.5):
    """Build and compile a feed-forward classifier with dropout.

    Architecture: input of width ``num_words`` -> two (Dense relu + Dropout)
    pairs -> one more Dense relu layer -> Dense softmax over ``num_classes``.

    :param neurons: number of units in each hidden Dense layer
    :param num_words: size of the input vector
    :param num_classes: number of output classes
    :param dropout: dropout rate applied after the first two hidden layers
    :return: the model, compiled with Adam, categorical cross-entropy and
        the accuracy metric
    """
    inputs = keras.Input(shape=(num_words, ))

    hidden = inputs
    # The first two hidden layers are each followed by dropout.
    for _ in range(2):
        hidden = keras.layers.Dense(neurons, activation='relu')(hidden)
        hidden = keras.layers.Dropout(dropout)(hidden)
    # The last hidden layer feeds the softmax output directly, without dropout.
    hidden = keras.layers.Dense(neurons, activation='relu')(hidden)
    outputs = keras.layers.Dense(num_classes, activation='softmax')(hidden)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [8]:
# Number of batches needed to cover the training set once. Ceiling division is
# used because `len(x_train) // batch_size + 1` schedules one extra step
# whenever the training size is an exact multiple of batch_size.
train_steps = (len(x_train) + batch_size - 1) // batch_size

# NOTE(review): the trailing arguments (0, len(x_train), True) presumably mean
# start index, end index and shuffle -- confirm against dataset_utils.
train_generator = dataset_utils.dataset_generator_fun(x_train, y_train, x_transformer, y_tranformer,
                                                      batch_size, 0, len(x_train), True)



In [41]:
model = build_dropout_model(512, num_words, num_classes)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 34731)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               17782784  
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_8 (Dense)              (None, 28)                14364     
Total para

In [42]:
model_path = '../checkpoints/weights.{epoch:02d}-{loss:.2f}.hdf5'

In [43]:
# Both callbacks watch the training loss ('loss'); no validation metric is monitored.
train_callbacks = [
    # Write a checkpoint to model_path only when the monitored loss improves.
    keras.callbacks.ModelCheckpoint(
        model_path,
        monitor='loss',
        save_best_only=True,
        verbose=1,
    ),
    # Reduce the learning rate once the loss has not improved for 2 epochs.
    keras.callbacks.ReduceLROnPlateau(
        monitor='loss',
        patience=2,
        verbose=1,
    ),
]

In [44]:
# Train for 3 epochs, drawing train_steps batches per epoch from the generator.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=3,
    callbacks=train_callbacks,
)

Epoch 1/3

Epoch 00001: loss improved from inf to 0.87973, saving model to ../checkpoints/weights.01-0.88.hdf5
Epoch 2/3

Epoch 00002: loss improved from 0.87973 to 0.44180, saving model to ../checkpoints/weights.02-0.44.hdf5
Epoch 3/3

Epoch 00003: loss improved from 0.44180 to 0.33017, saving model to ../checkpoints/weights.03-0.33.hdf5


In [114]:
model.save('../checkpoints/final_model.hdf5')

In [None]:
del model

In [11]:
# NOTE(review): this reloads '../checkpoints/baseline.hdf5' through tf.keras,
# whereas the model trained above is built with the standalone `keras` package
# and saved to '../checkpoints/final_model.hdf5'. Confirm that evaluating the
# baseline checkpoint (and mixing keras / tf.keras) is intended.
model = tf.keras.models.load_model('../checkpoints/baseline.hdf5')

In [12]:
d = x_transformer(x_train[:10])()

In [13]:
r = model.predict_on_batch(d)

### Compare Results

In [14]:
np.argmax(r, axis=1)

array([ 4,  3, 26, 18, 24,  3,  3, 16, 22, 13])

In [15]:
y_train[:10]

array([ 4,  3, 26, 18, 24,  3,  3, 16,  8, 13])

In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Punctuation tokens that are filtered together with the Italian stop words.
punctuation = ['-', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}', '’', '”', '“', '``', "''"]
stop_words = set(stopwords.words('italian')).union(punctuation)

# Special word ids; oov_char is the id given to words missing from the vocabulary.
# NOTE(review): pad_char / start_char presumably mirror the dataset encoding -- confirm.
pad_char, start_char, oov_char = 0, 1, 2

In [18]:
# Read the vocabulary (id -> word) and label (label -> id) mappings. The
# encoding is given explicitly so that non-ASCII words decode the same way on
# every platform instead of depending on the locale default.
with open('../data/dataset/id_word_dict.json', 'r', encoding='utf-8') as f:
    id_word_dict = json.load(f)

with open('../data/dataset/label_index.json', 'r', encoding='utf-8') as f:
    label_id_dict = json.load(f)

# Inverse mapping (id -> label), used to turn predicted class ids into names.
id_label_dict = {label_id: label for label, label_id in label_id_dict.items()}

In [19]:
word_id_dict = {v:int(k) for k,v in id_word_dict.items()}

In [20]:
tokenizer = RegexpTokenizer(r'\w+')

In [21]:
def hasnumbers(value):
    """Return True if at least one character of ``value`` is a digit (``str.isdigit``)."""
    for char in value:
        if char.isdigit():
            return True
    return False


def tokenize_sentence(sentence, remove_stopwords=False, tokenizer=tokenizer.tokenize):
    """
    Tokenize a sentence into lower-cased words, optionally dropping stop words.

    Quote-like characters are replaced by spaces before tokenizing. Tokens that
    contain a digit or are at most two characters long are discarded, and
    underscores are stripped from the remaining ones.

    :param sentence: the sentence to be tokenized
    :param remove_stopwords: True to remove the words contained in ``stop_words``
    :param tokenizer: callable that splits a string into a sequence of tokens
    :return: a generator yielding exactly one item, the list of kept words
    """
    for quote in ('`', "'", "”", "“"):
        sentence = sentence.replace(quote, ' ')

    words = []
    for token in tokenizer(sentence):
        if hasnumbers(token) or len(token) <= 2:
            continue
        # Lower-case BEFORE the stop-word lookups: the lookup is case-sensitive,
        # so a capitalised stop word (e.g. at the start of a sentence) would
        # otherwise never match a lower-case entry of stop_words.
        word = token.replace('_', '').lower()
        if not word:
            # The token was made only of underscores: nothing left to keep.
            continue
        is_stopword = word in stop_words
        if remove_stopwords:
            if not is_stopword:
                words.append(word)
        elif is_stopword or len(word) > 1:
            # Stripping underscores can leave a single character; such a
            # leftover is kept only when it is itself a stop word.
            words.append(word)
    yield words


def sentence_to_idxs(tokenized_sentence):
    """
    Convert tokenized samples into sequences of word ids.

    :param tokenized_sentence: iterable of samples, each an iterable of words
    :return: a list with one list of ids per sample; a word missing from
        ``word_id_dict`` is encoded as ``oov_char``
    """
    return [
        [word_id_dict.get(word, oov_char) for word in sample]
        for sample in tokenized_sentence
    ]


def vectorize_sequences(sequences, num_words):
    """
    Encode sequences of word ids as indicator rows.

    :param sequences: sized collection of word-id sequences
    :param num_words: number of columns of the output; the ids are used as
        column indices
    :return: float array of shape (len(sequences), num_words) holding 1.0 in
        the columns of the ids present in each sequence and 0.0 elsewhere
    """
    indicators = np.zeros((len(sequences), num_words))
    for row, word_ids in enumerate(sequences):
        # Fancy indexing sets every listed column of this row in one step.
        indicators[row, word_ids] = 1.
    return indicators

def sentence_pipeline(sentence):
    """
    Turn a raw sentence into the indicator vector expected by the model.

    Steps: tokenize the sentence, map the words to ids, then encode the ids as
    a row of width ``num_words`` (read from the notebook globals).

    :param sentence: the raw text to encode
    :return: the sentence in its vectorized form, an array of shape (1, num_words)
    """
    # tokenize_sentence is a generator yielding one word list, so this is a
    # batch holding a single tokenized sample.
    tokenized = list(tokenize_sentence(sentence))
    sequences = sentence_to_idxs(tokenized)
    # `sequences` already has one id list per sample, so it is passed as is.
    # Wrapping it in another list only worked because NumPy accepts a 2-D
    # index array when setting the indicator columns.
    return vectorize_sequences(sequences, num_words)

In [22]:
sentence = """
Approvazione di una seconda variante al progetto esecutivo delle opere di adeguamento di un invaso artificiale in loc. Piano S. Croce, nel Comune di Monterotondo Marittimo (GR) - ditta Piazzi s.r.l. - L.R.T. n° 64 del 5.11.2009 e s.m.i. e Regolamento n°18/r del 25.02.2010 a s.m.i - pratica n. 222.
"""

In [23]:
vectorized = sentence_pipeline(sentence)

In [24]:
vectorized.shape

(1, 37658)

In [25]:
np.argwhere(vectorized)

array([[    0,     7],
       [    0,    23],
       [    0,    25],
       [    0,    28],
       [    0,    40],
       [    0,    43],
       [    0,    58],
       [    0,   139],
       [    0,   190],
       [    0,   200],
       [    0,   249],
       [    0,   265],
       [    0,   289],
       [    0,   334],
       [    0,   437],
       [    0,   673],
       [    0,   682],
       [    0,   873],
       [    0,  1311],
       [    0,  1877],
       [    0,  2499],
       [    0,  2759],
       [    0, 15668]])

In [26]:
np.argmax(model.predict(vectorized))

20