In [5]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import math
import json

from sklearn.model_selection import *
from sklearn import metrics

from daf.datasets import atti_dataset
from daf.utils import dataset_utils
from daf.utils import keras_util
import keras

import random

In [6]:
# Load the pre-split dataset and the label -> class-index mapping.
(x_train, y_train), (x_test, y_test) = atti_dataset.load_data(num_words=None)
label_index_dict = atti_dataset.get_label_index()

# Vocabulary size: largest word id found in the training sequences, plus one.
# NOTE(review): only x_train is scanned; confirm x_test cannot hold a larger id.
num_words = max(max(sequence) for sequence in x_train) + 1
num_words

34731

In [7]:
# Display the label -> class-index mapping returned by atti_dataset.get_label_index().
label_index_dict

{'ALTRI UFFICI': 0,
 'AVVOCATURA REGIONALE                                  ': 1,
 'D.G.  AVVOCATURA                                      ': 2,
 "D.G. COMPETITIVITA' DEL SISTEMA REGIONALE E SVILUPPO D": 3,
 'D.G. PRESIDENZA                                       ': 4,
 'DIPARTIMENTO BILANCIO E FINANZE                       ': 5,
 'DIPARTIMENTO ORGANIZZAZIONE                           ': 6,
 'DIPARTIMENTO ORGANIZZAZIONE E RISORSE                 ': 7,
 'DIPARTIMENTO POLITICHE FORMATIVE E BENI CULTURALI     ': 8,
 'DIPARTIMENTO POLITICHE TERRITORIALI E AMBIENTALI      ': 9,
 'DIPARTIMENTO PRESIDENZA AFFARI LEGISLATIVI E GIURIDICI': 10,
 'DIPARTIMENTO SALUTE E POLITICHE SOLIDARIETA           ': 11,
 'DIPARTIMENTO SVILUPPO ECONOMICO                       ': 12,
 'DIREZIONE AGRICOLTURA E SVILUPPO RURALE': 13,
 "DIREZIONE ATTIVITA' PRODUTTIVE": 14,
 'DIREZIONE DIFESA DEL SUOLO E PROTEZIONE CIVILE': 15,
 'DIREZIONE DIRITTI DI CITTADINANZA E COESIONE SOCIALE': 16,
 'DIREZIONE GENERALE BILANCIO 

In [8]:
# Join the train and test splits back into single arrays.
# NOTE(review): `x` is rebound further down (single vectorized sample), so this
# value does not survive a top-to-bottom run; `y` is not used in the visible cells.
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))

In [9]:
from functools import partial

def x_transformer(x_data):
    """
    Bind ``x_data`` and the notebook-level ``num_words`` to
    ``dataset_utils.vectorize_sequences``.

    :param x_data: value passed as the first positional argument
    :return: a ``functools.partial``; calling it runs the vectorization
    """
    vectorize = dataset_utils.vectorize_sequences
    return partial(vectorize, x_data, num_words)

def y_tranformer(y_data):
    """
    Bind ``y_data`` and the notebook-level ``num_classes`` to
    ``dataset_utils.to_one_hot``.

    ``num_classes`` is looked up when this function is called, not when it is
    defined, so it must exist in the notebook namespace by then.

    :param y_data: value passed as the first positional argument
    :return: a ``functools.partial``; calling it runs the one-hot encoding
    """
    one_hot = dataset_utils.to_one_hot
    return partial(one_hot, y_data, num_classes)

In [10]:
# Number of samples per training batch.
batch_size = 128
# Number of distinct label values present in the training split.
# NOTE(review): assumes every class appears in y_train and that class ids are
# contiguous from 0 — confirm against label_index_dict.
num_classes = len(set(y_train))

In [11]:
def build_dropout_model(neurons, num_words, num_classes, dropout=0.5):
    """
    Build and compile a fully connected classifier.

    Layout: two Dense(relu) + Dropout stages, a third Dense(relu) layer without
    dropout, then a softmax output layer. The model is compiled with the Adam
    optimizer, categorical cross-entropy loss and the accuracy metric.

    :param neurons: number of units in each hidden Dense layer
    :param num_words: width of the input vector
    :param num_classes: number of units in the softmax output layer
    :param dropout: rate given to each Dropout layer
    :return: the compiled ``keras.Model``
    """
    inputs = keras.Input(shape=(num_words, ))

    hidden = inputs
    # Two identical Dense -> Dropout stages.
    for _ in range(2):
        hidden = keras.layers.Dense(neurons, activation='relu')(hidden)
        hidden = keras.layers.Dropout(dropout)(hidden)
    # The last hidden layer is not followed by dropout.
    hidden = keras.layers.Dense(neurons, activation='relu')(hidden)
    outputs = keras.layers.Dense(num_classes, activation='softmax')(hidden)

    network = keras.Model(inputs=inputs, outputs=outputs)
    network.compile(
        optimizer=keras.optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [12]:
# Batches needed to cover the training set once. Integer ceiling division:
# the previous `len(x_train) // batch_size + 1` scheduled one surplus step
# whenever len(x_train) was an exact multiple of batch_size.
train_steps = (len(x_train) + batch_size - 1) // batch_size

# Batch generator over the training split; the two transformer callbacks are
# the factories defined above (they return partials of the vectorize / one-hot helpers).
# NOTE(review): the trailing positional arguments (0, len(x_train), True) look
# like start index, end index and a shuffle flag — confirm against
# dataset_utils.dataset_generator_fun.
train_generator = dataset_utils.dataset_generator_fun(x_train, y_train, x_transformer, y_tranformer,
                                                      batch_size, 0, len(x_train), True)



In [13]:
# Instantiate the classifier with 512 units per hidden layer and print its layer table.
model = build_dropout_model(512, num_words, num_classes)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 34731)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               17782784  
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_4 (Dense)              (None, 28)                14364     
Total para

In [15]:
# Checkpoint filename template; {epoch} and {loss} are filled in by ModelCheckpoint.
# NOTE(review): the ../checkpoints directory is assumed to exist already.
model_path = '../checkpoints/weights.{epoch:02d}-{loss:.2f}.hdf5'

In [16]:
# Callbacks used during training. Both monitor the *training* loss, since no
# validation data is passed to the fit call in this notebook:
#  - ModelCheckpoint writes a file only when the monitored loss improves;
#  - ReduceLROnPlateau lowers the learning rate after 2 epochs without improvement.
train_callbacks = [
    keras.callbacks.ModelCheckpoint(model_path, verbose=1, save_best_only=True, monitor='loss'),
    keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1, monitor='loss')
]

In [11]:
# Train for 6 epochs, drawing `train_steps` batches from the generator per epoch.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=6,
    callbacks=train_callbacks,
)

Epoch 1/6
Epoch 00001: loss improved from inf to 1.15781, saving model to ../checkpoints/weights.01-1.16.hdf5
Epoch 2/6
Epoch 00002: loss improved from 1.15781 to 0.69239, saving model to ../checkpoints/weights.02-0.69.hdf5
Epoch 3/6
Epoch 00003: loss improved from 0.69239 to 0.55043, saving model to ../checkpoints/weights.03-0.55.hdf5
Epoch 4/6
Epoch 00004: loss improved from 0.55043 to 0.47026, saving model to ../checkpoints/weights.04-0.47.hdf5
Epoch 5/6
Epoch 00005: loss improved from 0.47026 to 0.41268, saving model to ../checkpoints/weights.05-0.41.hdf5
Epoch 6/6
Epoch 00006: loss improved from 0.41268 to 0.37328, saving model to ../checkpoints/weights.06-0.37.hdf5


In [12]:
# Persist the trained model to a single HDF5 file.
model.save('../checkpoints/final_model.hdf5')

In [None]:
# Drop the in-memory model so the cells below exercise the copy reloaded from disk.
del model

In [3]:
# Reload the saved model with the same library that built and saved it
# (standalone `keras`); the cell previously went through `tf.keras`, and
# mixing the two implementations for save/load is not guaranteed to work.
model = keras.models.load_model('../checkpoints/final_model.hdf5')

In [4]:
# Vectorize the first training sample as a batch of one.
# The sample is wrapped in a list so it becomes a single row of shape
# (1, num_words); passing the bare sequence would treat every word id as its
# own sample and produce one row per word.
# NOTE(review): assumes dataset_utils.vectorize_sequences behaves like the
# vectorize_sequences defined later in this notebook — confirm.
x = dataset_utils.vectorize_sequences([x_train[0]], num_words)

In [5]:
# Index of the highest predicted score. np.argmax is called without an axis, so
# it indexes the flattened prediction array; that equals the class id only when
# the prediction holds a single row.
np.argmax(model.predict(x))

9

In [6]:
# Label of the first training sample, to compare with the prediction above.
y_train[0]

9

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK resources into the local nltk_data directory.
# 'stopwords' backs stopwords.words('italian') used below.
# NOTE(review): 'punkt' does not appear to be needed by the RegexpTokenizer used
# in this notebook — confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
# Punctuation tokens treated as stopwords, merged into the Italian stopword set.
punctuation = ['-', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}', '’', '”', '“', '``', "''"]
stop_words = set(stopwords.words('italian'))
stop_words.update(punctuation)

# Reserved ids. oov_char is the id sentence_to_idxs emits for words missing from
# the vocabulary; pad_char and start_char are not referenced in the visible cells
# (presumably padding / sequence-start ids — confirm against the dataset builder).
pad_char = 0
start_char=1
oov_char=2

In [27]:
# Read both files with an explicit UTF-8 encoding so decoding does not depend
# on the platform default.
# word_id_dict is used by sentence_to_idxs as a word -> id lookup.
# NOTE(review): the file is named id_word_dict.json; confirm its keys are words.
with open('../data/dataset/id_word_dict.json', 'r', encoding='utf-8') as f:
    word_id_dict = json.load(f)

with open('../data/dataset/label_index.json', 'r', encoding='utf-8') as f:
    label_id_dict = json.load(f)

# Inverse mapping (value -> key) of label_id_dict; built after the file is
# closed because it only needs the parsed dict.
id_label_dict = {v: k for k, v in label_id_dict.items()}

In [28]:
# Tokenizer that keeps runs of word characters (letters, digits, underscore);
# its bound .tokenize method is the default tokenizer of tokenize_sentence.
tokenizer = RegexpTokenizer(r'\w+')

In [36]:
def hasnumbers(value):
    """Return True when at least one character of ``value`` is a digit."""
    for char in value:
        if char.isdigit():
            return True
    return False


def tokenize_sentence(sentence, remove_stopwords=False, tokenizer=tokenizer.tokenize):
    """
    Tokenize the sentence and optionally drop stopwords.

    Backticks, apostrophes and curly double quotes are replaced by spaces
    before tokenizing. Tokens that contain a digit or are shorter than three
    characters are discarded; underscores are stripped from the remaining
    tokens and every kept token is lower-cased.

    This is a generator that yields exactly one item: the token list of the
    whole sentence. ``list(tokenize_sentence(s))`` is therefore a one-element
    batch ``[[token, ...]]``.

    :param sentence: the sentence to be tokenized
    :param remove_stopwords: True to drop tokens found in ``stop_words``
    :param tokenizer: callable that splits a string into a list of tokens
    :return: generator yielding a single list of lower-cased tokens
    """
    for quote_char in ('`', "'", "”", "“"):
        sentence = sentence.replace(quote_char, ' ')
    words = []

    for w in tokenizer(sentence):
        if hasnumbers(w) or len(w) <= 2:
            continue
        # Lower-case BEFORE the stopword lookup: the NLTK stopword entries are
        # lower-case, so comparing the original-case token never matched
        # upper-case input such as "PER".
        token = w.replace('_', '').lower()
        if not token:
            # The token consisted only of underscores; nothing is left to keep.
            continue
        if remove_stopwords:
            if token not in stop_words:
                words.append(token)
        elif token in stop_words or len(token) > 1:
            words.append(token)
    yield words


def sentence_to_idxs(tokenized_sentence):
    """
    Map every token of every sample to its id in ``word_id_dict``.

    :param tokenized_sentence: iterable of samples, each an iterable of tokens
    :return: list holding one list of ids per sample; tokens missing from
        ``word_id_dict`` are encoded as ``oov_char``
    """
    return [
        [word_id_dict.get(token, oov_char) for token in sample]
        for sample in tokenized_sentence
    ]


def vectorize_sequences(sequences, num_words):
    """
    Encode each sequence of word ids as an indicator (multi-hot) row.

    :param sequences: sized iterable of index sequences, one per sample
    :param num_words: width of each output row; every index must be below it
    :return: float array of shape (len(sequences), num_words) holding 1.0 at
        every index that occurs in the corresponding sequence and 0.0 elsewhere
    """
    indicator = np.zeros((len(sequences), num_words))
    for row, word_ids in enumerate(sequences):
        indicator[row, word_ids] = 1.0
    return indicator

def sentence_pipeline(sentence):
    """
    Turn a raw sentence into the indicator vector expected by the model.

    The vector width is the notebook-level ``num_words``.
    NOTE(review): ``num_words`` must equal the input width the loaded model was
    built with — confirm it is still the value used at training time.

    :param sentence: the raw sentence text
    :return: array of shape (1, num_words), the sentence in its vectorized form
    """
    # tokenize_sentence yields a single token list, so this is a batch of one:
    # [[token, ...]]
    tokenized = list(tokenize_sentence(sentence))
    # One list of word ids per sample: [[id, ...]]
    sequences = sentence_to_idxs(tokenized)
    # `sequences` is already a batch. Wrapping it in another list (as before)
    # produced a triple-nested index that only worked through NumPy's
    # fancy-indexing rules.
    return vectorize_sequences(sequences, num_words)

In [37]:
# Sample text used to try the inference pipeline end to end.
sentence = 'CONCESSIONE DI COLTIVAZIONE DI RISORSE GEOTERMICHE "PIANCASTAGNAIO" - AUTORIZZAZIONE REALIZZAZIONE MODIFICHE IMPIANTISTICHE PER ELIMINAZIONE TRASCINATO LIQUIDO IN TURBINA PRESSO LA CENTRALE GEOTERMOELETTRICA PIANCASTAGNAIO 5 - COMUNE DI PIANCASTAGNAIO (SI) - ART. 84 D.LGS. 624/96'

In [38]:
# Tokenize, encode and vectorize the sample sentence.
vectorized = sentence_pipeline(sentence)

In [39]:
# Sanity check: one row, num_words columns. The column count must match the
# model's input width for the prediction below to be meaningful.
vectorized.shape

(1, 52396)

In [40]:
# Index of the highest predicted score for the sample sentence (flattened
# argmax over a single-row prediction).
# NOTE(review): id_label_dict presumably maps this index back to a label name — confirm.
np.argmax(model.predict(vectorized))

8