In [1]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import math
import json

from sklearn.model_selection import *
from sklearn import metrics

import random

  from ._conv import register_converters as _register_converters


In [2]:
from daf.datasets import atti_dataset
from daf.utils import dataset_utils
from daf.utils import keras_util

In [67]:
# Vocabulary size: forwarded to the dataset loader and later used as the
# model's input width (see the build_model call).
num_words = 10000

In [68]:
# Fetch the train/test splits of the "atti" dataset, forwarding the
# vocabulary size chosen above to the loader.
(x_train, y_train), (x_test, y_test) = atti_dataset.load_data(
    num_words=num_words,
)

Downloading data from  https://media.githubusercontent.com/media/teamdigitale/daf-models/master/daf-datasets/data/atti/atti_dataset.npz


In [69]:
# Label/index lookup shipped with the dataset; its length is used as the
# width of the model's softmax layer.
# NOTE(review): the mapping direction (label -> index or index -> label) is not visible here — confirm.
label_index_dict = atti_dataset.get_label_index()

Downloading data from  https://media.githubusercontent.com/media/teamdigitale/daf-models/master/daf-datasets/data/atti/label_index.json


In [70]:
# Largest value found in any training sequence (sanity check: it is
# expected to stay below num_words).
max_dimension = max(max(sequence) for sequence in x_train)
max_dimension

9999

## Data Preparation

In [71]:
from functools import partial

def x_transformer(x_data):
    """Return a deferred vectorization of ``x_data``.

    The result is a ``functools.partial`` that binds
    ``dataset_utils.vectorize_sequences`` to ``x_data`` and the module-level
    ``num_words``; nothing is vectorized until the partial is called.
    """
    vectorize = dataset_utils.vectorize_sequences
    return partial(vectorize, x_data, num_words)

In [72]:
# Index separating training samples from validation samples:
# the first 80% of x_train (rounded up) is used for training.
train_val_split = math.ceil(len(x_train) * 0.8)
# Number of distinct label values seen in the training targets.
num_classes = len(set(y_train))
# Samples per batch handed to the generators below.
batch_size = 128

In [73]:
# Display the train/validation boundary index computed above.
train_val_split

117777

In [74]:
# Training batches: bounds 0 and train_val_split are passed to the helper.
# NOTE(review): the trailing True/False is presumably a shuffle flag and the
# two integers a [start, end) range — confirm in daf.utils.dataset_utils.
train_generator = dataset_utils.dataset_generator_fun(
    x_train, y_train, x_transformer, batch_size,
    0, train_val_split,
    True,
)

# Validation batches: the remaining samples, from train_val_split to the end.
val_generator = dataset_utils.dataset_generator_fun(
    x_train, y_train, x_transformer, batch_size,
    train_val_split, len(x_train),
    False,
)

In [75]:
def build_model(neurons, num_features, num_classes=None):
    """Build and compile a fully connected softmax classifier.

    The network stacks three ``Dense`` hidden layers of ``neurons`` ReLU units
    and a softmax output layer. It is compiled with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.

    Args:
        neurons: number of units in each of the three hidden layers.
        num_features: length of one input vector (used as ``input_shape``).
        num_classes: number of units in the softmax layer. When ``None``
            (the default) the size of the module-level ``label_index_dict``
            is used, which matches the previous behaviour.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # NOTE(review): presumably resets the Keras/TF session so repeated builds
    # start from a clean graph — confirm in daf.utils.keras_util.
    keras_util.new_session()

    if num_classes is None:
        # Fall back to the global lookup so existing two-argument calls keep working.
        num_classes = len(label_index_dict)

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(neurons, activation='relu', input_shape=(num_features, )))
    model.add(tf.keras.layers.Dense(neurons, activation='relu'))
    model.add(tf.keras.layers.Dense(neurons, activation='relu'))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [76]:
# 64 units per hidden layer; each input vector has num_words entries.
model = build_model(neurons=64, num_features=num_words)

In [77]:
# Print the layer-by-layer overview and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                640064    
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 34)                2210      
Total params: 650,594
Trainable params: 650,594
Non-trainable params: 0
_________________________________________________________________


In [78]:
# Number of batches needed to see every sample of each split exactly once.
# math.ceil is used instead of `n // batch_size + 1`, which schedules one
# batch too many whenever n is an exact multiple of batch_size.
train_steps = math.ceil(train_val_split / batch_size)
val_steps = math.ceil((len(x_train) - train_val_split) / batch_size)

In [79]:
# TensorBoard logging for this run. The directory name reflects the hidden
# layer width actually used (build_model is called with 64 units), so runs
# are not mislabelled in the TensorBoard UI.
callbacks = [tf.keras.callbacks.TensorBoard(log_dir="logs/dense_64")]

In [80]:
# Train for 5 epochs from the training generator, validating after each
# epoch on the validation generator; TensorBoard logging via `callbacks`.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=5,
    validation_data=val_generator,
    validation_steps=val_steps,
    callbacks=callbacks,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Prediction on the test set

In [None]:
# Batches needed to cover the test set exactly once (ceil avoids an extra
# batch when len(x_test) is an exact multiple of batch_size).
test_steps = math.ceil(len(x_test) / batch_size)
# Built with the same helper and transformer as the train/validation
# generators; the previous call referenced the undefined names `data_utils`
# and `dimensions`.
# NOTE(review): the trailing False mirrors val_generator (presumably "no shuffle",
# which is required for predictions to line up with y_test) — confirm.
test_generator = dataset_utils.dataset_generator_fun(x_test, y_test, x_transformer, batch_size,
                                                     0, len(x_test), False)

In [None]:
# Evaluate on the test batches; the returned value is shown as the cell output.
model.evaluate_generator(test_generator, steps=test_steps)

In [None]:
# Per-class scores for the test batches.
# NOTE(review): test_generator was already consumed by the evaluation cell;
# row alignment with y_test depends on how the generator wraps around — confirm.
test_predictions = model.predict_generator(test_generator, steps=test_steps)

In [None]:
# Sanity check: the number of prediction rows should match the number of test labels.
print(test_predictions.shape, y_test.shape, sep="\n")

In [None]:
# Predicted class = column index of the highest score in each row.
predictions = np.argmax(test_predictions, axis=1)

In [None]:
# Precision / recall / F-score averaged with the 'weighted' option.
metrics.precision_recall_fscore_support(
    y_true=y_test,
    y_pred=predictions,
    average='weighted',
)

In [None]:
# Fraction of test samples whose predicted class equals the true class.
metrics.accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# One-hot encode the test labels for the ROC AUC computation.
# The width is taken from the prediction matrix so both arrays always have
# the same number of columns, even if some class never occurs in y_test.
# (The previous call used the undefined name `data_utils`.)
y_test_v = tf.keras.utils.to_categorical(y_test, num_classes=test_predictions.shape[1])

In [None]:
# ROC AUC computed from the one-hot labels and the predicted scores,
# averaged with the 'weighted' option.
metrics.roc_auc_score(
    y_true=y_test_v,
    y_score=test_predictions,
    average='weighted',
)

In [None]:
# Text report with per-class metrics for the test set.
report = metrics.classification_report(y_true=y_test, y_pred=predictions)
print(report)

### Confusion Matrix

In [None]:
# Confusion matrix of true test labels versus predicted classes.
# NOTE(review): without an explicit `labels=` argument, classes absent from both
# arrays are omitted, which could shift the tick labels in the plot — confirm.
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a short banner and plot the confusion matrix as an annotated heat map.

    Args:
        cm: 2-D numpy array; rows are summed for normalization and every
            cell is annotated with its value.
        classes: iterable of tick labels, one per row/column. A dict is
            accepted; its keys are used in iteration order.
        normalize: when True, each row is divided by its sum before plotting
            and cells are annotated with two decimals instead of integers.
        title: figure title.
        cmap: matplotlib colormap used for the heat map.

    Returns:
        None. The plot is drawn on the current matplotlib figure.
    """
    # Materialize once so dicts and other iterables behave like a list of labels.
    class_names = list(classes)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has an all-zero row; dividing by 1
        # keeps that row at zero instead of producing NaN cells.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=90)
    plt.yticks(tick_marks, class_names)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for readability.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()
    
# Show numpy floats with two decimals when arrays are printed.
np.set_printoptions(precision=2)

In [None]:
# Large canvas so every class label stays readable.
plt.rcParams['figure.figsize'] = (20, 20)
# NOTE(review): label_index_dict is passed as the tick labels; iterating a dict
# yields its keys, so this assumes key order matches the class indices — confirm.
plot_confusion_matrix(
    conf_matrix,
    classes=label_index_dict,
    title='Confusion matrix, without normalization',
)