In [1]:
import numpy as np
import gensim.downloader as api
from tensorflow.keras.datasets import imdb

In [3]:
class Hyperparameters(object):
    """ Training hyper-parameters of a single experiment.

    Declare every hyper-parameter as an ``__init__`` keyword with a default,
    so that settings read back from a json are completed with the fields
    introduced by the latest code.
    """
    def __init__(
            self,
            learning_rate=5e-2,
            architecture=None,
            epochs=500,
            batch_size=10,
            loss='cross_entropy',
            optimizer='sgd',
            lr_at_plateau=True,
            reduction_factor=None,
            validation_check=True):
        """ Store the hyper-parameters as instance attributes of the same name.

        :param learning_rate: float, initial learning rate
        :param architecture: str, type of architecture
        :param epochs: int, number of training epochs
        :param batch_size: int, size of each batch
        :param loss: str, type of loss (cross entropy or square loss)
        :param optimizer: str, type of optimizer
        :param lr_at_plateau: bool, protocol to decrease the learning rate
        :param reduction_factor: int, factor used to reduce the learning rate
        :param validation_check: bool, whether the validation loss is tracked
            as a stopping criterion
        """
        # Attributes are created in signature order (pairs are assigned left to right).
        self.learning_rate, self.architecture = learning_rate, architecture
        self.epochs, self.batch_size = epochs, batch_size
        self.loss, self.optimizer = loss, optimizer
        self.lr_at_plateau, self.reduction_factor = lr_at_plateau, reduction_factor
        self.validation_check = validation_check


class Dataset:
    """ Dataset-specific settings of an experiment: the dataset name, the
    scenario, whether (and how) the original dataset is modified, and the
    dimensions of the input.
    Written for the modified_MNIST_dataset; check that it still applies to
    other datasets. """
    # TODO: add output_dims
    def __init__(
            self,
            dataset_name='dataset_1',
            scenario=1,
            additional_dims=2,
            n_training=10,
            redundancy_amount=None):
        """ Store the dataset settings as instance attributes of the same name.

        :param dataset_name: str, name of the dataset
        :param scenario: int, learning paradigm
        :param additional_dims: int, additional noise
        :param n_training: int, number of training examples
        :param redundancy_amount: percentage of redundant features,
            scenario 4 only
        """
        # Attributes are created in signature order (pairs are assigned left to right).
        self.dataset_name, self.scenario = dataset_name, scenario
        self.additional_dims, self.n_training = additional_dims, n_training
        self.redundancy_amount = redundancy_amount


class Experiment(object):
    """
    Full description of one experiment.
    It bundles the configuration classes (Hyperparameters, Dataset) with
    general bookkeeping such as the experiment index and output location.
    IF YOU ADD ANOTHER CLASS, MAKE SURE TO INCLUDE IT HERE.
    """
    def __init__(
            self,
            id,
            output_path,
            train_completed=False,
            hyper=None,
            dataset=None):
        """
        :param id: index of output data folder
        :param output_path: output directory
        :param train_completed: bool, whether the experiment has already been trained
        :param hyper: instance of Hyperparameters class; a default-constructed
            one is used when None
        :param dataset: instance of Dataset class; a default-constructed one
            is used when None
        """
        self.id = id
        self.output_path = output_path
        self.train_completed = train_completed
        # Missing configuration objects fall back to their class defaults.
        self.hyper = Hyperparameters() if hyper is None else hyper
        self.dataset = Dataset() if dataset is None else dataset

In [13]:
# Pre-trained GloVe vectors ("glove-wiki-gigaword-100") loaded through gensim's downloader API.
word_vectors = api.load("glove-wiki-gigaword-100")
# Scenario id -> ``index_from`` offset that DatasetGenerator hands to ``imdb.load_data``.
idx_start_dct = {1: 0, 2: 30, 4: 15}

class DatasetGenerator:
    """ This class is meant to be the generator for different
    types of transformation to the IMDb datasets.

    On construction it loads the IMDb reviews with the ``index_from`` offset
    selected by the experiment's scenario, the word index needed to decode
    them, and the pre-trained GloVe vectors used to embed them.
    """
    # Placeholders returned by ``index2str`` for ids that do not encode a word.
    START_TOKEN = '<START>'
    OOV_TOKEN = '<OOV>'

    def __init__(self,
                 exp,
                 n_vl=5000,
                 glove_embedding=None):
        """ Initializer for the class. We pass the object Experiment to
        assess the transformation required.
        :param exp: Experiment object
        :param n_vl: int, number of validation samples per class
        :param glove_embedding: optional, already loaded word vectors
            (word -> vector lookup). When None, "glove-wiki-gigaword-100" is
            loaded through gensim's downloader, which is slow; pass a
            pre-loaded model to avoid loading it again.
        :raises KeyError: if the experiment's scenario has no entry in
            ``idx_start_dct``
        """
        self.exp = exp
        self.n_vl = n_vl
        # Remember the offset applied to the word ids: index2str has to undo it.
        self.index_from = idx_start_dct[self.exp.dataset.scenario]

        (train_data, train_labels), (test_data, test_labels) = imdb.load_data(
            num_words=5000,
            index_from=self.index_from)
        self.train_data = train_data
        self.train_labels = train_labels
        self.test_data = test_data
        self.test_labels = test_labels

        # get_word_index maps word -> id; invert it to decode ids into words.
        word_index = imdb.get_word_index(path='imdb_word_index.json')
        self.reverse_word_index = {value: key for key, value in word_index.items()}

        if glove_embedding is None:
            glove_embedding = api.load("glove-wiki-gigaword-100")
        self.glove_embedding = glove_embedding

    def split_train_validation(self, seed=None):
        """ Split of the training and validation set.

        For every class, ``exp.dataset.n_training`` training samples and
        ``n_vl`` validation samples are drawn without replacement; the two
        draws of a class never share a sample.
        :param seed: optional int. When given, the draw is made with a
            dedicated ``np.random.RandomState(seed)`` and is reproducible;
            when None the global ``np.random`` state is used.
        :returns: id_tr, y_tr, id_vl, y_vl -- integer arrays with the indices
            into the training data and the matching labels, grouped class by
            class
        :raises ValueError: (from ``choice``) if a class has fewer samples
            than requested
        """
        n_tr = self.exp.dataset.n_training
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Start from empty int arrays so the result is well defined (and
        # integer typed) even when there is no class at all.
        id_tr, id_vl = [np.array([], dtype=int)], [np.array([], dtype=int)]
        y_tr, y_vl = [np.array([], dtype=int)], [np.array([], dtype=int)]

        for y_ in np.unique(self.train_labels):
            id_class_y_ = np.where(self.train_labels == y_)[0]
            tmp_id_tr = rng.choice(id_class_y_,
                                   size=n_tr,
                                   replace=False)
            # Validation samples come from what is left after the training draw.
            tmp_id_vl = rng.choice(np.setdiff1d(id_class_y_, tmp_id_tr),
                                   size=self.n_vl,
                                   replace=False)

            id_tr.append(tmp_id_tr)
            id_vl.append(tmp_id_vl)
            # np.full with an int dtype keeps the labels integer
            # (appending np.ones would silently turn them into floats).
            y_tr.append(np.full(n_tr, y_, dtype=int))
            y_vl.append(np.full(self.n_vl, y_, dtype=int))

        return (np.concatenate(id_tr), np.concatenate(y_tr),
                np.concatenate(id_vl), np.concatenate(y_vl))

    def index2str(self, x):
        """ Transform in a list of string values.
        :param x: a data from imdb.load_data(),
        it contains the most used words.
        :returns: list of str with one entry per id, in the same order.

        imdb.load_data shifts every word id by ``index_from`` and, with its
        default arguments, uses 1 as start marker and 2 as out-of-vocabulary
        marker. The shift is undone before the lookup. Ids not larger than
        ``index_from`` cannot encode a word and are returned as START_TOKEN
        (id 1) or OOV_TOKEN. With ``index_from`` below 2 the markers are
        indistinguishable from the most frequent words and are decoded as
        those words.
        """
        words = []
        for id_ in x:
            if id_ <= self.index_from:
                words.append(self.START_TOKEN if id_ == 1 else self.OOV_TOKEN)
            else:
                words.append(self.reverse_word_index.get(id_ - self.index_from,
                                                         self.OOV_TOKEN))
        return words

    def embedding(self, words_lst):
        """ Here we generate the embedding for each sample.
        We use the pre-trained word-vectors from gensim-data
        :param words_lst: the list of words in a sample
        :returns: the mean of the vectors of the words known to the
            embedding. Words without a vector (placeholders, tokens missing
            from the pre-trained vocabulary) are skipped; if no word is
            known, a zero vector of size ``glove_embedding.vector_size``.
        """
        embedding_array = [self.glove_embedding[w_]
                           for w_ in words_lst
                           if w_ in self.glove_embedding]
        if not embedding_array:
            # Nothing to average: return a neutral vector instead of NaN.
            return np.zeros(self.glove_embedding.vector_size)
        return np.mean(np.array(embedding_array), axis=0)

In [14]:
# Experiment with id 0 writing to the current directory; hyper and dataset
# are left out, so the default Hyperparameters() and Dataset() are used.
exp = Experiment(0, '.')
# Display the default number of training examples (used per class by the split).
exp.dataset.n_training

10

In [15]:
# Building the generator loads the IMDb data, its word index and the GloVe
# vectors (all done inside DatasetGenerator.__init__).
dataset_generator = DatasetGenerator(exp)

In [18]:
# Keep only the sample indices; the two label arrays are discarded.
# NOTE(review): no random seed is set in the cells shown here, so the drawn
# indices presumably differ from run to run.
id_tr, _, id_vl, _ = dataset_generator.split_train_validation()

In [19]:
# Indices of the drawn training samples (n_training per class).
id_tr

array([22532,  7581,  4532,  8789, 19070,  1138,  1848, 10136, 23842,
       19887,  8170, 12955, 24824,  3255,  3007, 21461,  9576,  2369,
        6542, 16761])

In [26]:
# Decode training review 123 from word ids into a list of words.
lst_words = dataset_generator.index2str(dataset_generator.train_data[123])

# Number of tokens in the decoded review.
len(lst_words)

56

In [41]:
# Load IMDb directly with index_from=0 to inspect the raw encoding.
# These module-level names are separate from dataset_generator.train_data.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=5000,
                                                                      index_from=0)

In [42]:
# Word ids of training review 123 as loaded with index_from=0.
train_data[123]

[1,
 304,
 2,
 1298,
 17,
 1023,
 2508,
 84,
 2772,
 49,
 113,
 2,
 28,
 4,
 1,
 88,
 1217,
 99,
 10,
 25,
 107,
 8,
 3,
 134,
 10,
 112,
 216,
 138,
 32,
 218,
 953,
 51,
 10,
 13,
 8,
 2711,
 58,
 319,
 420,
 9,
 35,
 73,
 56,
 1800,
 69,
 5,
 2,
 20,
 2,
 964,
 9,
 35,
 82,
 59,
 355,
 96]

In [43]:
# Reload with index_from=100 to see the effect of the offset.
# NOTE: this rebinds train_data/train_labels/test_data/test_labels, replacing
# the index_from=0 version loaded in the previous cells.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=5000,
                                                                      index_from=100)

In [44]:
# Same review with index_from=100: compared with the index_from=0 output, the
# ids are larger by 100, except the leading 1 and one 2 that stay unchanged.
train_data[123]

[1,
 404,
 102,
 1398,
 117,
 1123,
 2608,
 184,
 2872,
 149,
 213,
 102,
 128,
 104,
 101,
 188,
 1317,
 199,
 110,
 125,
 207,
 108,
 103,
 234,
 110,
 212,
 316,
 238,
 132,
 318,
 1053,
 151,
 110,
 113,
 108,
 2811,
 158,
 419,
 520,
 109,
 135,
 173,
 156,
 1900,
 169,
 105,
 2,
 120,
 102,
 1064,
 109,
 135,
 182,
 159,
 455,
 196]

In [30]:
# NOTE(review): execution count 30 is out of order with the cells above; the
# output shown matches the index_from=100 load -- re-run the notebook top to
# bottom to confirm which train_data this cell displays.
train_data[123]

[1,
 404,
 102,
 1398,
 117,
 1123,
 2608,
 184,
 2872,
 149,
 213,
 102,
 128,
 104,
 101,
 188,
 1317,
 199,
 110,
 125,
 207,
 108,
 103,
 234,
 110,
 212,
 316,
 238,
 132,
 318,
 1053,
 151,
 110,
 113,
 108,
 2811,
 158,
 419,
 520,
 109,
 135,
 173,
 156,
 1900,
 169,
 105,
 2,
 120,
 102,
 1064,
 109,
 135,
 182,
 159,
 455,
 196]