In [1]:
import tensorflow as tf
import numpy as np
import h5py
import os
import pickle
import datetime
from random import shuffle
from functools import reduce
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive so that the '/content/gdrive/...' paths used by the
# configuration cell resolve. The call is interactive: it asks for an
# authorization code before the mount completes.
drive.mount('/content/gdrive')
# !pip install tensorflow-gpu

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# ----- Input dimensions -----
max_seg = 20          # segments kept per document (features are sliced to [:max_seg, :max_word])
max_word = 40         # words kept per segment
max_doc = 100         # length of the flat document vector used by the document-level model
level_class_cnt = 3   # number of sentiment levels produced by the label mapping

# ----- Fractions of the HDF5 mini batches held out for testing / validation -----
test_percentage = 0.1
validation_percentage = 0.1

# ----- Training schedule -----
batch_size = 512
epochs = 8


# ----- Input / output locations on the mounted Google Drive -----
input_path = '/content/gdrive/My Drive/data_source/milnet/model_inputs/gourmet.hdf5'
fine_tuning_input_path = '/content/gdrive/My Drive/data_source/milnet/model_inputs/organic.hdf5'
w2v_weights_path = '/content/gdrive/My Drive/data_source/milnet/model_inputs/ft_weights.npy'

model_out_path = '/content/gdrive/My Drive/data_source/milnet/results/gourmet_sentence_ft.h5'
log_out_dir = '/content/gdrive/My Drive/data_source/milnet/log/'


def _count_hdf5_batches(hdf5_path):
    ''' Count the mini batches and the samples stored in an input file.

    The mini batches are expected under the `label/` group, named by the
    consecutive integers `0 .. n-1`.

    Args:
        hdf5_path (str): The path of the .hdf5 file.

    Returns:
        (int, int): The number of mini batches and the total number of samples.
    '''
    # Open read-only: the file is only inspected, and an explicit mode avoids the
    # read/write-or-create default of older h5py versions (a mistyped path would
    # otherwise create an empty file).
    with h5py.File(hdf5_path, 'r') as in_file:
        batch_cnt = len(in_file['label/'].keys())
        sample_cnt = sum(len(in_file['label/' + str(index)]) for index in range(batch_cnt))
    return batch_cnt, sample_cnt


def _split_batches(indices):
    ''' Split mini batch indices into consecutive train / validation / test slices.

    Args:
        indices (list): The mini batch indices, in the order they should be split.

    Returns:
        (list, list, list): The train, validation and test mini batch indices.
    '''
    total = len(indices)
    train_end = int(total * (1 - test_percentage - validation_percentage))
    validation_end = int(total * (1 - test_percentage))
    return indices[0:train_end], indices[train_end:validation_end], indices[validation_end:]


mini_batch_cnt, sample_amount = _count_hdf5_batches(input_path)
batch_indices = [*range(mini_batch_cnt)]
# NOTE(review): no random seed is set, so the train / validation / test split
# differs between kernel sessions.
shuffle(batch_indices)

fine_tuning_mini_batch_cnt, fine_tuning_amount = _count_hdf5_batches(fine_tuning_input_path)
# Unlike `batch_indices`, the fine-tuning indices are kept in their stored order.
fine_tuning_batch_indices = [*range(fine_tuning_mini_batch_cnt)]

train_batches, validation_batches, test_batches = _split_batches(batch_indices)

fine_tuning_train_batches, fine_tuning_validation_batches, fine_tuning_test_batches = _split_batches(fine_tuning_batch_indices)

In [0]:
# Load the embedding weight matrix that initialises the frozen Embedding layers.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code. Only load this file from a trusted source, or re-save it as a plain
# numeric array so the flag can be dropped.
w2v = np.load(w2v_weights_path, allow_pickle=True)
# The second axis is used as the embedding dimension (`output_dim` of the Embedding layers).
w2v_len = w2v.shape[1]

In [0]:
def __label_map(raw_label):
    ''' Map a rating label to a sentiment level.

    Args:
        raw_label (int): The rating label in the raw dataset.

    Returns:
        (int): 0 for the ratings 1 and 2, 1 for the rating 3, and 2 for
            every other rating.
    '''
    is_low_rating = raw_label == 1 or raw_label == 2
    if is_low_rating:
        return 0
    return 1 if raw_label == 3 else 2

def __balance_data(feature_array, label_array):
    ''' Drop every sample whose rating label is 2 or 4.

    Removing the intermediate ratings helps to maximize the distance between
    the classes.

    Args:
        feature_array (np.array): The feature numpy array.
        label_array (np.array): The label numpy array, the labels should not be mapped yet.

    Returns:
        (np.array, np.array): The feature numpy array and the label numpy array
            with the matching rows removed along the first axis.
    '''
    is_dropped = (label_array == 2) | (label_array == 4)
    # Only the row positions matter; the features are deleted along axis 0.
    dropped_rows = np.where(is_dropped)[0]
    kept_features = np.delete(feature_array, dropped_rows, axis=0)
    kept_labels = np.delete(label_array, dropped_rows, axis=0)
    return kept_features, kept_labels

def __get_doc_vector(feature, max_doc):
    ''' Turn the segmented text input into a document vector input.

    This method is used when calculating the document level performance, it
    helps to prepare the input. All non-zero word indices are collected into
    one flat vector, which is then truncated or zero-padded at the end.

    Args:
        feature (np.array): The segmented word index matrix.
        max_doc (int): The maximum length of the document vector.

    Returns:
        (np.array): The flat word index vector of length `max_doc`.
    '''
    word_indices = feature[feature != 0]
    missing = max_doc - word_indices.shape[0]
    if missing < 0:
        return word_indices[:max_doc]
    if missing > 0:
        # Boolean-mask indexing always yields a 1-D array, so one (before, after) pair is enough.
        return np.pad(word_indices, [(0, missing)], 'constant', constant_values=0)
    return word_indices

def data_generator(input_path, batch_indices, batch_size=batch_size, max_seg=max_seg, max_word=max_word, epochs=epochs, use_balance=True):
    ''' Generate the batch data for `fit_generator`.

    The generator is finite: it stops after `epochs` passes over `batch_indices`.
    Samples that do not fill a complete batch at the end of a pass are carried
    over into the next pass, and are dropped after the last one.

    Args:
        input_path (str): The path of the .hdf5 file.
        batch_indices (list): The indices of the mini batches used in the .hdf5 file.
        batch_size (int): The batch size.
        max_seg (int): The maximum number of segmentation in a document.
        max_word (int): The maximum number of word in a segment.
        epochs (int): The expected epochs of the generator.
        use_balance (boolean): Whether removing the samples with rating 2 and 4.

    Yields:
        (np.array, np.array): The feature batch and the mapped sentiment labels.
            With `use_balance` the batch holds fewer than `batch_size` rows
            whenever samples with rating 2 or 4 were removed.
    '''
    # Shuffle a private copy so that the caller's index list is not reordered.
    batch_order = list(batch_indices)
    # Read-only: an explicit mode avoids the read/write-or-create default of older h5py versions.
    with h5py.File(input_path, 'r') as in_file:
        feature_array, label_array = np.zeros((batch_size, max_seg, max_word)), np.zeros((batch_size, 1))
        batch_index = 0
        for _ in range(epochs):
            shuffle(batch_order)
            for index in batch_order:
                doc, label = in_file['document/' + str(index)], in_file['label/' + str(index)]
                random_doc_order = [*range(len(doc))]
                shuffle(random_doc_order)
                for i in random_doc_order:
                    feature_array[batch_index] = doc[i][:max_seg, :max_word]
                    label_array[batch_index] = label[i]
                    batch_index += 1
                    if batch_index == batch_size:
                        if use_balance:
                            feature_array, label_array = __balance_data(feature_array, label_array)
                        # Labels are mapped only now because balancing works on the raw ratings.
                        yield feature_array, np.array([np.array([__label_map(l[0])]) for l in label_array])
                        batch_index = 0
                        # Fresh buffers: the arrays just yielded may still be in use by the consumer.
                        feature_array, label_array = np.zeros((batch_size, max_seg, max_word)), np.zeros((batch_size, 1))

# This is used for generating data for document level model
# def data_generator(input_path, batch_indices, batch_size=batch_size, max_doc=max_doc, epochs=epochs, use_balance=True):
#     with h5py.File(input_path) as in_file:
#         feature_array, label_array = np.zeros((batch_size, max_doc)), np.zeros((batch_size, 1))
#         batch_index = 0
#         for _ in range(epochs):
#             shuffle(batch_indices)
#             for index in batch_indices:
#                 doc, label = in_file['document/' + str(index)], in_file['label/' + str(index)]
#                 random_doc_order = [*range(len(doc))]
#                 shuffle(random_doc_order)
#                 for i in random_doc_order:
#                     feature_array[batch_index] = __get_doc_vector(doc[i], max_doc)
#                     label_array[batch_index] = label[i]
#                     batch_index += 1
#                     if batch_index == batch_size:
#                         if use_balance:
#                             feature_array, label_array = __balance_data(feature_array, label_array)
#                         yield feature_array, np.array([np.array([__label_map(l[0])]) for l in label_array])
#                         batch_index = 0
#                         feature_array, label_array = np.zeros((batch_size, max_doc)), np.zeros((batch_size, 1))

def get_data(batch_indices, max_seg=max_seg, max_word=max_word):
    ''' Load all samples of the given mini batches into memory.

    The data is read from the module level `input_path`.

    Args:
        batch_indices (list): The indices of the mini batches used in the .hdf5 file.
        max_seg (int): The maximum number of segmentation in a document.
        max_word (int): The maximum number of word in a segment.

    Returns:
        (np.array, np.array): The feature numpy array and the label numpy array,
            the labels are already mapped to sentiment levels.
    '''
    # Read-only: an explicit mode avoids the read/write-or-create default of older h5py versions.
    with h5py.File(input_path, 'r') as in_file:
        sample_amount = sum(len(in_file['document/' + str(i)]) for i in batch_indices)
        feature_array, label_array = np.zeros((sample_amount, max_seg, max_word)), np.zeros((sample_amount, 1))
        cnt = 0
        for index in batch_indices:
            doc, label = in_file['document/' + str(index)], in_file['label/' + str(index)]
            for i in range(len(doc)):
                feature_array[cnt] = doc[i][:max_seg, :max_word]
                label_array[cnt] = __label_map(label[i])
                cnt += 1
        return feature_array, label_array

In [0]:
# Container for layers, since multiple instance learning requires reusing classifier for segments.
# Maps a sublayer group name to a dict of already-created Keras layers; the `*_share`
# builder functions create their layers on the first call and look them up afterwards.
shared_sublayer_cache = {}

def branch_execute(layer_in, sublayer, args=None):
    ''' Execute `sublayer` for every slice of `layer_in` taken along axis 1.

    Axis 1 is the first axis after the batch axis. The results are stacked
    again along a new axis 1, in the original order.

    Args:
        layer_in (Tensor): The input tensor.
        sublayer (func): The function applied to all sliced tensors. It receives
            the slice as the keyword argument `layer_in`.
        args (dict): The additional keyword arguments of sublayer.

    Returns:
        (Tensor): The concatenation of all sublayer outputs along axis 1.
    '''
    sublayer_args = {} if args is None else args
    instance_cnt = layer_in.shape[1]
    # `i=i` freezes the index inside each lambda. A plain closure would share one
    # `i` across all lambdas, so every Lambda layer would slice the LAST instance
    # whenever its function runs again (e.g. after the model is saved and reloaded).
    sliced_inputs = [tf.keras.layers.Lambda(lambda x, i=i: x[:, i])(layer_in)
                     for i in range(instance_cnt)]
    branch_layers = [sublayer(**{**{'layer_in': sliced_input}, **sublayer_args})
                     for sliced_input in sliced_inputs]
    # One shared expand layer restores the axis that slicing removed.
    expand_layer = tf.keras.layers.Lambda(lambda x: tf.keras.backend.expand_dims(x, axis=1))
    expanded_layers = [expand_layer(branch_layer) for branch_layer in branch_layers]
    concated_layer = tf.keras.layers.Concatenate(axis=1)(expanded_layers)
    return concated_layer

def __sentence_encode_layer_share(layer_in, hidden_feature_dim, kernel_height, eta):
    ''' Encode a sequence with a shared Conv1D -> BatchNorm -> ReLU -> MaxPool1D stack.

    The layers are created once per `kernel_height` and reused by every later
    call, so `hidden_feature_dim` and `eta` only take effect on the first call
    for a given kernel height.

    Args:
        layer_in (Tensor): The input tensor, axis 1 is the sequence axis.
        hidden_feature_dim (int): The number of convolution filters.
        kernel_height (int): The kernel size of the convolution.
        eta (float): The L2 regularization factor of the convolution kernel.

    Returns:
        (Tensor): The max pooled feature tensor.
    '''
    global shared_sublayer_cache
    # Length of the convolution output; pooling over it reduces the sequence axis to 1.
    cnned_height = int(layer_in.shape[1]) - kernel_height + 1
    cache_key = 'shared_sentence_encode_sublayers' + str(kernel_height)
    if cache_key not in shared_sublayer_cache:
        shared_sublayer_cache[cache_key] = {
            'conv_layer': tf.keras.layers.Conv1D(
                filters=hidden_feature_dim,
                kernel_size=kernel_height,
                kernel_regularizer=tf.keras.regularizers.l2(eta)
            ),
            'batch_normalize_layer': tf.keras.layers.BatchNormalization(
            ),
            'relu_layer': tf.keras.layers.ReLU(
            )
        }
    shared_layers = shared_sublayer_cache[cache_key]
    # The pooling window depends on the input length, so the pooling layer is cached
    # per pooled length. Reusing a window sized for another input length (e.g. the
    # 40 word segments vs. the 100 word document vector) would no longer pool over
    # the whole sequence.
    pool_key = 'max_pool_layer_' + str(cnned_height)
    if pool_key not in shared_layers:
        shared_layers[pool_key] = tf.keras.layers.MaxPool1D(
            (cnned_height,)
        )
    conv_layer = shared_layers['conv_layer'](layer_in)
    batch_normalize_layer = shared_layers['batch_normalize_layer'](conv_layer)
    relu_layer = shared_layers['relu_layer'](batch_normalize_layer)
    max_pool_layer = shared_layers[pool_key](relu_layer)
    return max_pool_layer

def __multi_kernel_encode_layer(layer_in, hidden_feature_dim, kernel_heights, eta):
    ''' Encode the input with one shared convolution stack per kernel height.

    Args:
        layer_in (Tensor): The input tensor.
        hidden_feature_dim (int): The number of convolution filters per kernel height.
        kernel_heights (list): The kernel sizes of the convolutions.
        eta (float): The L2 regularization factor.

    Returns:
        (Tensor): The concatenated and flattened features of all kernel heights.
    '''
    per_kernel_features = []
    for kernel_height in kernel_heights:
        per_kernel_features.append(
            __sentence_encode_layer_share(layer_in, hidden_feature_dim, kernel_height, eta)
        )
    merged_features = tf.keras.layers.Concatenate()(per_kernel_features)
    return tf.keras.layers.Flatten()(merged_features)

def __seg_classifier_layer_share(layer_in, class_cnt, dropout_rate, eta):
    ''' Classify an encoded segment with a shared Dropout -> softmax Dense stack.

    The layers are created on the first call only; later calls reuse them and
    ignore their own `class_cnt`, `dropout_rate` and `eta`.

    Args:
        layer_in (Tensor): The encoded input tensor.
        class_cnt (int): The number of output classes.
        dropout_rate (float): The dropout rate applied before the dense layer.
        eta (float): The L2 regularization factor of the dense kernel and bias.

    Returns:
        (Tensor): The softmax class distribution.
    '''
    cache_key = 'shared_seg_classifier_sublayers'
    if cache_key not in shared_sublayer_cache:
        dropout = tf.keras.layers.Dropout(dropout_rate)
        classifier = tf.keras.layers.Dense(
            units=class_cnt,
            activation='softmax',
            kernel_regularizer=tf.keras.regularizers.l2(eta),
            bias_regularizer=tf.keras.regularizers.l2(eta)
        )
        shared_sublayer_cache[cache_key] = {
            'drop_out_layer': dropout,
            'dense_layer': classifier
        }
    sublayers = shared_sublayer_cache[cache_key]
    dropped = sublayers['drop_out_layer'](layer_in)
    return sublayers['dense_layer'](dropped)

def __attention_layer_share(layer_in, attention_key_dim, dropout_rate, eta):
    ''' Score an instance with a shared Dropout -> tanh Dense -> bias-free Dense stack.

    The layers are created on the first call only; later calls reuse them and
    ignore their own `attention_key_dim`, `dropout_rate` and `eta`.

    Args:
        layer_in (Tensor): The input tensor of one instance.
        attention_key_dim (int): The number of units of the tanh dense layer.
        dropout_rate (float): The dropout rate applied before the dense layers.
        eta (float): The L2 regularization factor.

    Returns:
        (Tensor): The unnormalized attention score (one unit).
    '''
    cache_key = 'shared_attention_sublayers'
    if cache_key not in shared_sublayer_cache:
        dropout = tf.keras.layers.Dropout(dropout_rate)
        key_projection = tf.keras.layers.Dense(
            units=attention_key_dim,
            activation='tanh',
            kernel_regularizer=tf.keras.regularizers.l2(eta),
            bias_regularizer=tf.keras.regularizers.l2(eta)
        )
        # NOTE(review): this layer has no bias, so `bias_regularizer` has nothing to
        # regularize; `kernel_regularizer` may have been intended - confirm.
        score_projection = tf.keras.layers.Dense(
            units=1,
            use_bias=False,
            bias_regularizer=tf.keras.regularizers.l2(eta)
        )
        shared_sublayer_cache[cache_key] = {
            'drop_out_layer': dropout,
            'dense_layer': key_projection,
            'nobias_dense_layer': score_projection
        }
    sublayers = shared_sublayer_cache[cache_key]
    dropped = sublayers['drop_out_layer'](layer_in)
    projected = sublayers['dense_layer'](dropped)
    return sublayers['nobias_dense_layer'](projected)

def bidirectional_gru_layer(layer_in, gru_feature_dim):
    ''' Run a bidirectional GRU over the input and return the whole output sequence.

    Args:
        layer_in (Tensor): The input sequence tensor.
        gru_feature_dim (int): The number of units of the wrapped GRU.

    Returns:
        (Tensor): The output of the bidirectional GRU for every step.
    '''
    gru = tf.keras.layers.GRU(gru_feature_dim, return_sequences=True)
    return tf.keras.layers.Bidirectional(gru)(layer_in)

def merge_layer(layer_in, class_cnt, eta):
    ''' Combine two tensors with a dot product and classify the result.

    Args:
        layer_in (list): The two tensors that are contracted along axis 1.
        class_cnt (int): The number of output classes.
        eta (float): The L2 regularization factor of the dense kernel and bias.

    Returns:
        (Tensor): The softmax class distribution.
    '''
    contracted = tf.keras.layers.Dot(axes=1)(layer_in)
    flattened = tf.keras.layers.Flatten()(contracted)
    output_dense = tf.keras.layers.Dense(
        units=class_cnt,
        activation='softmax',
        kernel_regularizer=tf.keras.regularizers.l2(eta),
        bias_regularizer=tf.keras.regularizers.l2(eta)
    )
    return output_dense(flattened)

def performance_judge(model, generator, class_cnt):
    ''' Print the accuracy and the per-class precision, recall and F1 score.

    Every metric is computed per batch and then averaged over the batches with
    equal weight. Batches without samples are skipped. If there is no batch to
    evaluate, a short notice is printed instead of the metrics.

    Args:
        model: The object whose `predict(features)` returns one score row per sample.
        generator (iterable): Yields `(features, labels)` batches; the class of a
            sample is read from `labels[index][0]`.
        class_cnt (int): The number of classes.
    '''
    # Keeps the divisions defined for classes that do not occur in a batch.
    eps = np.finfo(float).eps
    accuracy, precisions, recalls, f1s = [], [], [], []
    for features, labels in generator:
        sample_cnt = features.shape[0]
        if sample_cnt == 0:
            # An empty batch would turn the averaged accuracy into NaN.
            continue
        predicted = model.predict(features)
        true_classes = np.asarray(labels)[:sample_cnt, 0].astype(int)
        predicted_classes = np.argmax(np.asarray(predicted)[:sample_cnt], axis=1)
        # Rows are the true classes, columns the predicted classes.
        contingency_table = np.zeros((class_cnt, class_cnt))
        np.add.at(contingency_table, (true_classes, predicted_classes), 1)
        hits = np.diag(contingency_table)
        batch_precisions = hits / (contingency_table.sum(axis=0) + eps)
        batch_recalls = hits / (contingency_table.sum(axis=1) + eps)
        batch_f1s = 2 * batch_precisions * batch_recalls / ((batch_precisions + batch_recalls) + eps)
        accuracy.append(np.trace(contingency_table) / sample_cnt)
        precisions.append(batch_precisions)
        recalls.append(batch_recalls)
        f1s.append(batch_f1s)
    if not accuracy:
        print('No batches to evaluate.')
        return
    batch_cnt = len(accuracy)
    precisions = [float(sum(l)) / batch_cnt for l in zip(*precisions)]
    recalls = [float(sum(l)) / batch_cnt for l in zip(*recalls)]
    f1s = [float(sum(l)) / batch_cnt for l in zip(*f1s)]
    print('Accuracy:', round(sum(accuracy) / batch_cnt, 3))
    for index in range(class_cnt):
        print('_____ Class', index, '_____')
        print('Precision\t', round(precisions[index], 3))
        print('Recall\t\t', round(recalls[index], 3))
        print('F1 Score\t', round(f1s[index], 3))

In [7]:
# Sentence (segment) level model construction.
print('Constructing Model ...', end='')

# One document is a (max_seg, max_word) matrix of word indices.
model_input = tf.keras.Input((max_seg, max_word))

# Frozen embedding lookup initialised with the loaded weight matrix.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=w2v.shape[0], 
    output_dim=w2v_len, 
    weights=[w2v], 
    input_length=max_word, 
    trainable=False
)(model_input)

# Encode every segment with the shared multi-kernel convolution stack.
encoding_layer = branch_execute(
    embedding_layer, 
    sublayer=__multi_kernel_encode_layer, 
    args={
        'hidden_feature_dim': 100,
        'kernel_heights': [3, 4, 5],
        'eta': 1e-4
    }
)

biglu_layer = bidirectional_gru_layer(
    encoding_layer, 
    gru_feature_dim=50
)

# One attention score per segment, normalised over the segment axis below.
attention_layer = branch_execute(
    biglu_layer, 
    sublayer=__attention_layer_share, 
    args={
        'attention_key_dim': 100,
        'dropout_rate': 0.5,
        'eta': 1e-4
    }
)

softmaxed_attention_layer = tf.keras.layers.Softmax(
    axis=1
)(attention_layer)

# One class distribution per segment, taken from the segment encodings.
classification_layer = branch_execute(
    encoding_layer, 
    sublayer=__seg_classifier_layer_share, 
    args={
        'class_cnt': level_class_cnt,
        'dropout_rate': 0.5,
        'eta': 1e-4
    }
)

# The result gets its own name: assigning it to `merge_layer` would overwrite the
# builder function, and re-running this cell would then fail.
merged_output_layer = merge_layer(
    [softmaxed_attention_layer, classification_layer],
    class_cnt=level_class_cnt,
    eta=1e-4
)

model = tf.keras.Model(model_input, merged_output_layer)

print('\rModel Constructed. Compiling ...', end='')

model.compile(
    optimizer=tf.keras.optimizers.Adam(clipvalue=0.5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

print('\rModel Compiled.')

model.summary()

W0730 18:04:24.014059 139976480278400 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Constructing Model ...

W0730 18:04:27.501128 139976480278400 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0730 18:04:32.368327 139976480278400 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0730 18:04:32.370305 139976480278400 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with

Model Compiled.
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20, 40)]     0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 40, 300)  49505700    input_1[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 40, 300)      0           embedding[0][0]                  
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 40, 300)      0           embedding[0][0]                  
______________________________________________________________________________

In [8]:
# One TensorBoard log directory per run, named by the start time.
logdir = os.path.join(log_out_dir, datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=0)

# Fraction of the samples that belongs to the training split.
train_fraction = 1 - test_percentage - validation_percentage

# The generators are finite, so the step counts are derived from the sample amount.
# NOTE(review): the "- 1" presumably leaves slack so a generator is not exhausted early - confirm.
model.fit_generator(
    data_generator(input_path, train_batches, use_balance=True),
    validation_data=data_generator(input_path, validation_batches, use_balance=True),
    steps_per_epoch=(sample_amount * train_fraction // batch_size) - 1,
    validation_steps=(sample_amount * (validation_percentage) // batch_size) - 1,
    validation_freq=2,
    epochs=epochs,
    callbacks=[tensorboard_callback]
)

model.save(model_out_path)

# (banner, mini batches, use_balance) for every evaluation pass, in print order.
evaluation_runs = [
    ('########## Training Error ##########', train_batches, True),
    ('############ Test Error ############', test_batches, True),
    ('####### No Cheat Test Error ########', test_batches, False),
]
for run_index, (banner, eval_batches, balanced) in enumerate(evaluation_runs):
    if run_index > 0:
        print('')
    print(banner)
    performance_judge(model, data_generator(input_path, eval_batches, epochs=1, use_balance=balanced), level_class_cnt)

print(logdir)

Epoch 1/8


W0730 18:04:47.380562 139976480278400 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
############ Test Error ############
Accuracy: 0.761
_____ Class 0 _____
Precision	 0.807
Recall		 0.635
F1 Score	 0.71
_____ Class 1 _____
Precision	 0.793
Recall		 0.692
F1 Score	 0.738
_____ Class 2 _____
Precision	 0.716
Recall		 0.935
F1 Score	 0.811

####### No Cheat Test Error ########
Accuracy: 0.697
_____ Class 0 _____
Precision	 0.841
Recall		 0.505
F1 Score	 0.63
_____ Class 1 _____
Precision	 0.611
Recall		 0.689
F1 Score	 0.647
_____ Class 2 _____
Precision	 0.706
Recall		 0.896
F1 Score	 0.79
/content/gdrive/My Drive/data_source/milnet/log/20190730_180441


In [0]:
# Document level model construction
# This cell rebinds `model_input`, `embedding_layer`, `encoding_layer`,
# `classification_layer` and `model`, replacing the sentence level objects.

print('Constructing Model ...', end='')

# One document is a flat vector of `max_doc` word indices.
model_input = tf.keras.Input((max_doc,))

document_embedding = tf.keras.layers.Embedding(
    input_dim=w2v.shape[0],
    output_dim=w2v_len,
    weights=[w2v],
    input_length=max_doc,
    trainable=False
)
embedding_layer = document_embedding(model_input)

# The encoder and the classifier come from the shared sublayer cache.
encoding_layer = __multi_kernel_encode_layer(
    layer_in=embedding_layer,
    hidden_feature_dim=100,
    kernel_heights=[3, 4, 5],
    eta=1e-4
)

classification_layer = __seg_classifier_layer_share(
    layer_in=encoding_layer,
    class_cnt=level_class_cnt,
    dropout_rate=0.5,
    eta=1e-4
)

model = tf.keras.Model(model_input, classification_layer)

model.compile(
    optimizer=tf.keras.optimizers.Adam(clipvalue=0.5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

print('\rModel Compiled.')