In [1]:
import numpy as np
import random
from dataset import atti_dirigenti

from keras import layers, models, optimizers, utils, metrics
from keras.callbacks import *
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import load_model
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from scipy import stats

[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using TensorFlow backend.


In [2]:
(x_train, y_train),(x_val, y_val), (x_test, y_test) = atti_dirigenti.load_data(num_words=50000, remove_stopwords=True)

In [3]:
x_train[:5]

array([ list([315, 188, 1180, 963, 250, 177, 19935, 1379, 180, 192, 131, 452, 398, 1128, 36, 191, 729]),
       list([34, 234, 164, 54, 35, 1677, 337, 1922, 175, 189, 105, 166, 493, 60, 4557, 109, 4403, 1855, 936, 142, 2723, 118, 173, 502, 21, 40]),
       list([674, 36, 273, 180, 192, 131, 43353, 452, 398, 22365, 1538]),
       list([93, 313, 561, 448, 639, 223, 266, 254, 1291, 2802, 36, 21, 40]),
       list([23, 612, 371, 770, 879, 371, 218, 2155, 1868, 68, 22, 217, 322, 451, 60, 1590, 109, 84, 149, 26, 43, 48, 48, 2903, 5544, 713, 1186, 621, 51, 822, 561, 29775, 3917, 12745, 621, 561, 340, 347, 399, 149])], dtype=object)

In [4]:
label_index = atti_dirigenti.get_labels()
len(label_index)

20

In [5]:
num_classes = len(label_index)

In [6]:
def max_index(data):
    return max(data.max())

In [7]:
max_idx = max([max_index(x_train), max_index(x_val), max_index(x_test)]) + 11

In [8]:
max_idx

50167

In [9]:
num_features = max_idx 

In [10]:
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

In [11]:
def vectorize_sequences_generator(sequences, dimension, batch_size):
    num_batches = len(sequences) // batch_size
    
    i=0
    while True:
        # to be sure don't go over the size of the dataset
        n = i % num_batches
        i +=1
        if (n+1) * batch_size < len(sequences):
            yield vectorize_sequences(sequences[n*batch_size : (n+1)*batch_size], dimension)
        else:
            yield vectorize_sequences(sequences[n*batch_size : len(sequences)], dimension)

#### Evaluate if works

In [12]:
batch_size = 256
steps_per_epoch = len(x_train) // batch_size

i = 0
for batch in vectorize_sequences_generator(x_train, num_features, batch_size):
    print(np.argmax(batch[-1]))
    i+=1
    
    if i == 10:
        break
    

26
28
26
23
24
21
22
28
10
41


In [13]:
batch_size = 256

In [14]:
def to_one_hot(labels, num_classes):
    results = np.zeros((len(labels), num_classes), dtype=np.float16)
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results

In [15]:
def to_one_hot_generator(labels, batch_size, num_classes):
    num_batches = len(labels) // batch_size
    num_classes = len(set(labels))
    
    i = 0
    while True:
        n = i % num_batches
        i += 1
        if (n+1) * batch_size < len(labels):
            yield to_one_hot(labels[n*batch_size : (n+1)*batch_size], num_classes)
        else:
            yield to_one_hot(labels[n*batch_size : len(labels)], num_classes)
        

In [16]:
batch_size = 256
steps_per_epoch = len(y_train) // batch_size

i = 0
for batch in to_one_hot_generator(y_train, batch_size, num_classes):
    for v in batch[:10]:
        print(np.argmax(v))
    break

    

19
5
19
5
16
5
16
16
8
11


In [17]:
y_train[:10]

array([19,  5, 19,  5, 16,  5, 16, 16,  8, 11])

In [18]:
def data_generator(data, labels, batch_size, num_features, num_classes):
    x_generator = vectorize_sequences_generator(data, num_features, batch_size)
    y_generator = to_one_hot_generator(labels, batch_size, num_classes)
    
    while True:
        yield next(x_generator), next(y_generator)

In [19]:
for x_batch, y_batch in data_generator(x_train, y_train, 256, num_features, num_classes):
    print(x_batch.shape, y_batch.shape)
    break

(256, 50167) (256, 20)


In [20]:
train_generator = data_generator(x_train, y_train, batch_size, num_features, num_classes)
val_generator = data_generator(x_val, y_val, batch_size, num_features, num_classes)
test_generator = data_generator(x_test, y_test, batch_size, num_features, num_classes)

In [21]:
x_all = np.concatenate([x_train, x_val, x_test])
y_all = np.concatenate([y_train, y_val, y_test])

In [22]:
all_generator = data_generator(x_all, y_all, batch_size, num_features, num_classes)

### Build the Model

In [23]:
def build_model(neurons, dropout, num_features, num_classes, activation='relu', init_mode='glorot_uniform'):
    input_tensor = layers.Input(shape=(num_features,))
    first_layer = True
    for n in neurons:
        if first_layer:
            first_layer = False
            l = layers.Dense(n, activation=activation, kernel_initializer=init_mode)(input_tensor)
            l = layers.Dropout(dropout)(l)
        else:
            l_next = layers.Dense(n, activation=activation, kernel_initializer=init_mode)(l)
            l_next = layers.Dropout(dropout)(l_next)
            l = l_next
    last_layer = layers.Dense(num_classes, activation='softmax')(l)
    model = models.Model(inputs = [input_tensor], outputs = [last_layer])
    
    model.compile(loss='categorical_crossentropy', optimizer= 'Adam', metrics=['accuracy'])
    return model

### Define the Callback

In [24]:
!mkdir checkpoints

mkdir: cannot create directory ‘checkpoints’: File exists


In [25]:
model_path = './checkpoints/weights.{epoch:02d}-{loss:.2f}.hdf5'

In [26]:
train_callbacks = [
    ModelCheckpoint(model_path, verbose=1, save_best_only=True, monitor='loss'),
    ReduceLROnPlateau(patience=2, verbose=1, monitor='loss')
]


callbacks = [
    ModelCheckpoint(model_path, save_best_only=True),
    ReduceLROnPlateau(patience=2, verbose=1),
    TensorBoard(log_dir='./logs', histogram_freq=1, batch_size=batch_size, write_graph=True, write_images=True)
]


Instructions for updating:
Use the retry module or similar alternatives.


In [27]:
test_model = build_model([128,128], 0.3, num_features, len(label_index), activation='tanh')
test_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50167)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               6421504   
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                2580      
Total params: 6,440,596
Trainable params: 6,440,596
Non-trainable params: 0
_________________________________________________________________


In [28]:
train_steps = len(x_train) // batch_size
val_steps = len(x_val) // batch_size

history = test_model.fit_generator(all_generator, steps_per_epoch=train_steps, epochs=3, callbacks=train_callbacks)

Epoch 1/3

Epoch 00001: loss improved from inf to 0.93425, saving model to ./checkpoints/weights.01-0.93.hdf5
Epoch 2/3

Epoch 00002: loss improved from 0.93425 to 0.45541, saving model to ./checkpoints/weights.02-0.46.hdf5
Epoch 3/3

Epoch 00003: loss improved from 0.45541 to 0.33473, saving model to ./checkpoints/weights.03-0.33.hdf5


## Load Model

In [29]:
del test_model

In [30]:
test_model = models.load_model('./checkpoints/weights.03-0.33.hdf5')

In [31]:
test_steps = len(x_test) // batch_size

loss, acc = test_model.evaluate_generator(test_generator, steps=val_steps)
print('loss {}, acc {}'.format(loss,acc))

loss 0.19035246379153672, acc 0.9348655523255814
