In [1]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import math
import json

from sklearn.model_selection import *
from sklearn import metrics

from daf.datasets import atti_dataset
from daf.utils import dataset_utils
from daf.utils import keras_util
import keras

import random

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
(x_train, y_train), (x_test, y_test) = atti_dataset.load_data(num_words=None)
label_index_dict = atti_dataset.get_label_index()

num_words = max([max(x) for x in x_train]) + 1
num_words

34731

In [3]:
label_index_dict

{'ALTRI UFFICI': 0,
 'AVVOCATURA REGIONALE                                  ': 1,
 'D.G.  AVVOCATURA                                      ': 2,
 "D.G. COMPETITIVITA' DEL SISTEMA REGIONALE E SVILUPPO D": 3,
 'D.G. PRESIDENZA                                       ': 4,
 'DIPARTIMENTO BILANCIO E FINANZE                       ': 5,
 'DIPARTIMENTO ORGANIZZAZIONE                           ': 6,
 'DIPARTIMENTO ORGANIZZAZIONE E RISORSE                 ': 7,
 'DIPARTIMENTO POLITICHE FORMATIVE E BENI CULTURALI     ': 8,
 'DIPARTIMENTO POLITICHE TERRITORIALI E AMBIENTALI      ': 9,
 'DIPARTIMENTO PRESIDENZA AFFARI LEGISLATIVI E GIURIDICI': 10,
 'DIPARTIMENTO SALUTE E POLITICHE SOLIDARIETA           ': 11,
 'DIPARTIMENTO SVILUPPO ECONOMICO                       ': 12,
 'DIREZIONE AGRICOLTURA E SVILUPPO RURALE': 13,
 "DIREZIONE ATTIVITA' PRODUTTIVE": 14,
 'DIREZIONE DIFESA DEL SUOLO E PROTEZIONE CIVILE': 15,
 'DIREZIONE DIRITTI DI CITTADINANZA E COESIONE SOCIALE': 16,
 'DIREZIONE GENERALE BILANCIO 

In [88]:
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))

In [89]:
num_words

34731

In [90]:
x_train

array([list([1253, 1342, 460, 4, 397, 1405, 1221, 28, 1150, 824, 35, 54, 2, 20, 8769]),
       list([3563, 3, 33, 506, 2826, 112, 16, 2105, 4, 65, 20, 47, 84, 2, 3]),
       list([93, 7, 60, 8, 19, 814, 6, 990, 7, 620, 818, 110, 13492, 2486, 5]),
       ...,
       list([41, 21, 15, 3984, 17, 85, 18388, 3, 33, 66, 200, 4893, 4, 826, 3, 81, 371]),
       list([1253, 2, 2, 4, 397, 1405, 1221, 28, 1150, 824, 35, 54, 22340, 20, 8769]),
       list([109, 8, 397, 6, 140, 3642, 28, 6979, 2, 163, 9, 31, 42, 18, 2, 9, 80, 63, 9, 11, 7904, 9, 781, 9, 8, 275, 517, 11, 709, 507, 256, 248, 232, 15, 248, 1132, 4, 755, 6, 2, 110, 1973, 11, 3202, 7, 2486, 17, 2])],
      dtype=object)

In [91]:
x_transformer(x_train[0])().shape

(15, 34731)

In [6]:
from functools import partial

def x_transformer(x_data):
    return partial(dataset_utils.vectorize_sequences, x_data, num_words)

def y_tranformer(y_data):
    return partial(dataset_utils.to_one_hot, y_data, num_classes)

In [8]:
batch_size = 128
num_classes = 28

In [71]:
num_classes

28

In [7]:
def build_dropout_model(neurons, num_words, num_classes, dropout=0.5):
    input_l = keras.Input(shape=(num_words, ))
    l = keras.layers.Dense(neurons, activation='relu')(input_l)
    l = keras.layers.Dropout(dropout)(l)
    l = keras.layers.Dense(neurons, activation='relu')(l)
    l = keras.layers.Dropout(dropout)(l)
    l = keras.layers.Dense(neurons, activation='relu')(l)
    output_l = keras.layers.Dense(num_classes, activation='softmax')(l)
    model = keras.Model(inputs=input_l, outputs=output_l)
    
    model.compile(optimizer=keras.optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [8]:
train_steps = len(x_train) // batch_size + 1

train_generator = dataset_utils.dataset_generator_fun(x_train, y_train, x_transformer, y_tranformer, 
                                                      batch_size, 0, len(x_train), True)



In [41]:
model = build_dropout_model(512, num_words, num_classes)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 34731)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               17782784  
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_8 (Dense)              (None, 28)                14364     
Total para

In [42]:
model_path = '../checkpoints/weights.{epoch:02d}-{loss:.2f}.hdf5'

In [43]:
train_callbacks = [
    keras.callbacks.ModelCheckpoint(model_path, verbose=1, save_best_only=True, monitor='loss'),
    keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1, monitor='loss')
]

In [44]:
history = model.fit_generator(train_generator, train_steps, 3, callbacks=train_callbacks)

Epoch 1/3

Epoch 00001: loss improved from inf to 0.87973, saving model to ../checkpoints/weights.01-0.88.hdf5
Epoch 2/3

Epoch 00002: loss improved from 0.87973 to 0.44180, saving model to ../checkpoints/weights.02-0.44.hdf5
Epoch 3/3

Epoch 00003: loss improved from 0.44180 to 0.33017, saving model to ../checkpoints/weights.03-0.33.hdf5


In [114]:
model.save('../checkpoints/final_model.hdf5')

In [None]:
del model

In [4]:
model = tf.keras.models.load_model('../checkpoints/final_model.hdf5')

In [9]:
d = x_transformer(x_train[:10])()

In [15]:
r = model.predict_on_batch(d)

In [25]:
np.argmax(model.predict(d)[0])

4

In [10]:
y_train[:10]

array([ 4, 14,  9, 15, 16,  3, 22, 13, 16, 22])

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [112]:
punctuation = ['-', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}', '’', '”', '“', '``', "''"]
stop_words = set(stopwords.words('italian'))
stop_words.update(punctuation)

pad_char = 0
start_char=1
oov_char=2

In [115]:
with open('../data/dataset/id_word_dict.json', 'r') as f:
    word_id_dict = json.load(f)

with open('../data/dataset/label_index.json', 'r') as f:
    label_id_dict = json.load(f)
    id_label_dict = {v: k for k, v in label_id_dict.items()}

In [116]:
tokenizer = RegexpTokenizer(r'\w+')

In [21]:
def hasnumbers(value):
    return any(c.isdigit() for c in value)


def tokenize_sentence(sentence, remove_stopwords=False, tokenizer=tokenizer.tokenize):
    """
    Tokenize the sentence and remove stopwords if true
    :param sentence: the sentence to be tokenized
    :param remove_stopwords: True to remove stopwa
    :param tokenizer:
    :return:
    """
    sentence = sentence.replace('`', ' ')
    sentence = sentence.replace("'", " ")
    sentence = sentence.replace("”", ' ')
    sentence = sentence.replace("“", ' ')
    words = []

    for w in tokenizer(sentence):
        if not hasnumbers(w) and len(w) > 2:
            w = w.replace('_', '')
            if remove_stopwords:
                if w not in stop_words:
                    words.append(w.lower())
            elif w in stop_words or len(w) > 1:
                words.append(w.lower())
    yield words


def sentence_to_idxs(tokenized_sentence):
    """
    convert a tokenized sentence into a sequence of idx
    :param tokenized_sentence:
    :param max_idx:
    :return:
    """
    results = []
    for sample in tokenized_sentence:
        encoded_sample = []
        for w in sample:
            if w in word_id_dict:
                encoded_sample.append(word_id_dict[w])
            else:
                encoded_sample.append(oov_char)
        results.append(encoded_sample)
    return results


def vectorize_sequences(sequences, num_words):
    """

    :param sequences:
    :param dimension:
    :return: sequences encoded as indicator arrays
    """
    results = np.zeros((len(sequences), num_words))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

def sentence_pipeline(sentence):
    """
    :param sentence:
    :return: the sentence into its vectorized form
    """
    tokenized = list(tokenize_sentence(sentence))
    sequence = list(sentence_to_idxs(tokenized))
    vectorized = vectorize_sequences([sequence], num_words)
    return vectorized

NameError: name 'tokenizer' is not defined

In [22]:
sentence = "Attività riconosciuta. Corso  Adempimenti Amministrativi e contabili per la gestione aziendale matr. 2018GL0044 - Ag. CAT - ASCOM Maremma. Nomina Commissione di esame."

In [20]:
vectorized = sentence_pipeline(sentence)

NameError: name 'sentence_pipeline' is not defined

In [120]:
vectorized.shape

(1, 34731)

In [121]:
np.argmax(model.predict(vectorized)[0])

6