In [1]:
import numpy as np
import pandas as pd
import math
import random
import pickle
import json

# to make the experimens replicable
random.seed(123456)

In [2]:
df = pd.read_pickle('../data/atti-dirigenti-processed.pkl')

# Dataset Creation

for the analysis we are going to create a dataset where the `OGGETTO` is the independent variable while the `UFFICIO_DG` is the dependent variable.

In [3]:
dataset = df[['OGGETTO', 'UFFICIO_DG', 'DATA_ATTO']]
dataset.shape

(152455, 3)

### Groups the documents by office

In [4]:
documents_per_office = dataset.groupby(['UFFICIO_DG']).count()
documents_per_office.describe()

Unnamed: 0,OGGETTO,DATA_ATTO
count,36.0,36.0
mean,4234.861111,4234.861111
std,4010.579446,4010.579446
min,105.0,105.0
25%,1011.25,1011.25
50%,2870.0,2870.0
75%,7239.75,7239.75
max,13826.0,13826.0


###  Dataset Creation

Here, we can:
- select the documents with frequency greater that the 25 percentile
- or use all the dataset

In [5]:
value = 2000
sel_dataset = documents_per_office[documents_per_office.OGGETTO >= 2000]
sel_dataset.shape

(20, 2)

In [6]:
sel_dataset['UFFICIO_DG'] = sel_dataset.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
sel_dataset.describe()

Unnamed: 0,OGGETTO,DATA_ATTO
count,20.0,20.0
mean,6902.15,6902.15
std,3539.235857,3539.235857
min,2624.0,2624.0
25%,3719.25,3719.25
50%,6590.5,6590.5
75%,9389.5,9389.5
max,13826.0,13826.0


In [8]:
sel_dataset.head()

Unnamed: 0_level_0,OGGETTO,DATA_ATTO,UFFICIO_DG
UFFICIO_DG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1025,3030,3030,1025
1928,2710,2710,1928
1934,4923,4923,1934
1937,7160,7160,1937
1943,3833,3833,1943


### Join the dataset to select a subset of the data

In [9]:
final_ds = dataset.merge(sel_dataset, how='inner', on=['UFFICIO_DG'])

In [10]:
final_ds.shape

(138043, 5)

### Select the OGGETTO and UFFICIO

In [11]:
final_ds = final_ds[['OGGETTO_x',"UFFICIO_DG"]]
final_ds.head()

Unnamed: 0,OGGETTO_x,UFFICIO_DG
0,DGR 968/07 e s.m.i. Accreditamento degli organ...,DIREZIONE ISTRUZIONE E FORMAZIONE
1,Nomina Commissione d’esame per il percorso for...,DIREZIONE ISTRUZIONE E FORMAZIONE
2,Nomina della Commissione d'esame matricola 201...,DIREZIONE ISTRUZIONE E FORMAZIONE
3,Progetti formativi per drop-out a.s.f. 2015-20...,DIREZIONE ISTRUZIONE E FORMAZIONE
4,REG (CE) 1080/2006-Por Creo Fesr 2007-2013-Lin...,DIREZIONE ISTRUZIONE E FORMAZIONE


In [12]:
len(set(final_ds['UFFICIO_DG']))

20

### Transform it in a dataset

In [13]:
samples = []
labels = []

In [14]:
for text, label in final_ds.as_matrix():
    samples.append(text)
    labels.append(label)

In [15]:
samples[:5]

["DGR 968/07 e s.m.i. Accreditamento degli organismi formativi. Rilascio dell'accreditamento all'organismo formativo Bioscience Research Center - cod. GR1035.",
 'Nomina Commissione d’esame per il percorso formativo “Formazione obbligatoria per utilizzatori professionali di prodotti fitosanitari”, MATRICOLA N. 2016SI0046',
 "Nomina della Commissione d'esame matricola 2016PI0301. Agenzia Formativa Cescot",
 'Progetti formativi per drop-out a.s.f. 2015-2016. Integrazione impegno per progetto "Figaro - Operatore del benessere (acconciatura)"',
 'REG (CE) 1080/2006-Por Creo Fesr 2007-2013-Linea di intevento 5.1.d-Chiusura attività']

In [16]:
labels[:5]

['DIREZIONE ISTRUZIONE E FORMAZIONE',
 'DIREZIONE ISTRUZIONE E FORMAZIONE',
 'DIREZIONE ISTRUZIONE E FORMAZIONE',
 'DIREZIONE ISTRUZIONE E FORMAZIONE',
 'DIREZIONE ISTRUZIONE E FORMAZIONE']

In [17]:
samples = np.array(samples)
labels = np.array(labels)

In [18]:
with open('../data/dataset-dirigenti.pkl', 'wb') as o:
    pickle.dump((samples, labels), o)

In [19]:
dataset_path = '../data/dataset-dirigenti.pkl'

with open(dataset_path, 'rb') as f:
    samples, labels = pickle.load(f)

## Create Train and Test Set

- shuffle data
- split the dataset into 80-20

In [20]:
from sklearn.model_selection import StratifiedShuffleSplit

In [21]:
split_train_test = StratifiedShuffleSplit(1,test_size=0.2, random_state=123456)

In [22]:
for train, test in split_train_test.split(samples, labels):
    train_split_samples, test_split_samples = samples[train], samples[test]
    train_split_labels, test_split_labels = labels[train], labels[test]

## Create a label index vocabulary

In [23]:
index_label_dict = dict(enumerate(set(train_split_labels),0))
label_index_dict = {v:k for k,v in index_label_dict.items()}

In [24]:
label_index_dict

{'01943': 0,
 'DIREZIONE GENERALE SVILUPPO ECONOMICO                 ': 1,
 '01946': 2,
 '01025': 3,
 'D.G. PRESIDENZA                                       ': 4,
 '01934': 5,
 'DIREZIONE GENERALE DIRITTO ALLA SALUTE E POLITICHE DI ': 6,
 'DIREZIONE GENERALE POLITICHE TERRITORIALI E AMBIENTALI': 7,
 '01937': 8,
 'D.G.  AVVOCATURA                                      ': 9,
 '01928': 10,
 'POLITICHE AMBIENTALI, ENERGIA E CAMBIAMENTI CLIMATICI': 11,
 'DIREZIONE ORGANIZZAZIONE E SISTEMI INFORMATIVI': 12,
 'DIREZIONE GENERALE BILANCIO E FINANZE                 ': 13,
 'DIREZIONE DIFESA DEL SUOLO E PROTEZIONE CIVILE': 14,
 'DIREZIONE GENERALE POLITICHE FORMATIVE, BENI E ATTIVIT': 15,
 "D.G. COMPETITIVITA' DEL SISTEMA REGIONALE E SVILUPPO D": 16,
 'DIREZIONE DIRITTI DI CITTADINANZA E COESIONE SOCIALE': 17,
 'DIREZIONE ISTRUZIONE E FORMAZIONE': 18,
 'DIREZIONE AGRICOLTURA E SVILUPPO RURALE': 19}

In [25]:
with open('data_dirigenti_label_index.json', 'w') as f: 
    json.dump(label_index_dict, f)

## Convert Train and Test Labels to idx

In [26]:
train_labels = np.array([label_index_dict[l] for l in train_split_labels])
test_labels = np.array([label_index_dict[l] for l in test_split_labels])

In [27]:
print(train_labels)
print(test_labels)

[16 11  3 ...,  4  4 17]
[ 6  7  4 ...,  1  8 16]


## Create the word index vocabulary

we create the word index with all the tokens from the train sample (with punctuation and stop-words)

In [28]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
punctuation = ['-', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}', '’', '”', '“', '``', "''"]
stop_words = set(stopwords.words('italian'))
stop_words.update(punctuation)

In [30]:
def tokenize_sample(samples, remove_stopwords=True, tokenizer=word_tokenize):
    for sample in samples:
        words = []
        sample = sample.replace('`', ' ')
        sample = sample.replace("'", " ")
        for w in tokenizer(sample):
            if remove_stopwords:
                if w not in stop_words:
                    words.append(w.lower())
            else:
                words.append(w.lower())
        yield words

In [31]:
train_samples_tokenized = tokenize_sample(train_split_samples, remove_stopwords=False, tokenizer=word_tokenize)

In [32]:
from collections import Counter

In [33]:
counter = Counter()

In [34]:
for words in train_samples_tokenized:
    counter.update(words)

In [35]:
counter.most_common()[:20]

[('.', 256844),
 ('di', 181082),
 ('-', 86613),
 ('e', 82411),
 ('del', 73939),
 ('per', 63553),
 (',', 57757),
 ('n.', 40423),
 ('della', 39756),
 (')', 34981),
 ('``', 32901),
 ('a', 32144),
 ('(', 31821),
 ("''", 31650),
 ('in', 30318),
 ('dell', 25572),
 ('la', 23388),
 ('al', 21711),
 ('impegno', 20898),
 ('regionale', 20120)]

### Save the word_index_dict and the most frequent words

In [36]:
index_word_dict = dict(enumerate([ k for k,v in counter.most_common()],3))
word_index_dict = {v:k for k,v in index_word_dict.items()}

In [37]:
with open('data_dirigenti_word_index.json', 'w') as f: 
    json.dump(word_index_dict, f)

In [38]:
with open('data_dirigenti_most_common.json', 'w') as f:
    json.dump(counter.most_common(), f)

### Transform the samples from words to sequnce of index

we reserve some index for utility chars

In [39]:
pad_char = 0
start_char=1
oov_char=2

In [40]:
train_samples_tokenized = tokenize_sample(train_split_samples, remove_stopwords=False, tokenizer=word_tokenize)
test_samples_tokenized = tokenize_sample(test_split_samples, remove_stopwords=False, tokenizer=word_tokenize)

In [41]:
def samples_to_idx(tokenized_samples, word_index_dict):
    for sample in tokenized_samples:
        encoded_sample = []
        for w in sample:
            if w in word_index_dict:
                encoded_sample.append(word_index_dict[w])
            else:
                encoded_sample.append(oov_char)
        yield encoded_sample

In [42]:
train_sample = np.array(list(samples_to_idx(train_samples_tokenized, word_index_dict)))
test_data = np.array(list(samples_to_idx(test_samples_tokenized, word_index_dict)))

### Split train into train and validation set

In [43]:
split_train_val = StratifiedShuffleSplit(1,test_size=0.1, random_state=123456)

In [44]:
for train, val in split_train_val.split(train_sample, train_labels):
    train_data, val_data = train_sample[train], train_sample[val]
    train_labels_, val_labels = train_labels[train], train_labels[val]

In [45]:
print('labels training {}'.format(train_labels_.shape))
print('labels validation {}'.format(val_labels.shape))
print('labels test {}'.format(test_labels.shape))

labels training (99390,)
labels validation (11044,)
labels test (27609,)


In [46]:
print('samples training {}'.format(train_data.shape))
print('samples validation {}'.format(val_data.shape))
print('samples test {}'.format(test_data.shape))

samples training (99390,)
samples validation (11044,)
samples test (27609,)


In [47]:
train_labels = train_labels_

In [48]:
np.savez_compressed('data_dirigenti.npz', 
                    x_train=train_data, y_train=train_labels, 
                    x_val=val_data, y_val=val_labels, 
                    x_test=test_data, y_test=test_labels)

In [49]:
loaded = np.load('data_dirigenti.npz')

In [50]:
loaded['x_train'].shape

(99390,)