In [31]:
# Interactive shell
# Display the result of every expression statement in a cell, not only the last
# one (cells further down rely on this to show several outputs from one cell)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Data wrangling and standard library
import os
from itertools import  islice
import numpy as np

# Machine learning
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical

## Data

There are 46 different topics in the Reuters dataset; some topics are more represented than others, but each topic has at least 10 examples in the training set.

In [3]:
# Load the Reuters data with 20% held out for testing and a fixed seed so the split is reproducible
# num_words=10000 caps the vocabulary size (presumably keeping the most frequent words — confirm against the Keras docs)
(train_X, train_y), (test_X, test_y) = reuters.load_data(num_words=10000, test_split=0.2, seed=1227)

### Shape

In [4]:
# Shapes of the training examples and of their labels (one label per example)
train_X.shape, train_y.shape

((8982,), (8982,))

In [5]:
# Shapes of the test examples and of their labels (one label per example)
test_X.shape, test_y.shape

((2246,), (2246,))

### Structure of Training Examples

Each training example is a list of integers (word indices), and there are 8,982 such training examples.

In [6]:
# First two training samples; each one is a variable-length list of integer word indices
train_X[0:2]

array([list([1, 599, 127, 262, 6474, 8548, 2, 1184, 7, 10, 2021, 1027, 10, 1633, 153, 6, 676, 6, 9794, 403, 3217, 162, 172, 4, 294, 517, 237, 676, 57, 85, 136, 583, 164, 4, 517, 9, 4, 155, 1700, 403, 1082, 590, 3884, 13, 109, 206, 2, 208, 483, 854, 22, 5382, 13, 271, 99, 179, 1355, 6, 4, 214, 1574, 2854, 2886, 118, 4, 2170, 179, 718, 1440, 2, 36, 34, 1845, 10, 2066, 41, 805, 30, 625, 268, 1648, 1845, 24, 692, 164, 4, 78, 1571, 708, 4, 3884, 9, 4, 237, 33, 1310, 10, 2066, 268, 33, 646, 6, 1133, 24, 2066, 36, 8, 7, 4, 1027, 3691, 7, 521, 42, 237, 534, 6, 1773, 725, 21, 403, 3217, 162, 117, 10, 306, 555, 40, 6922, 66, 1704, 164, 4, 78, 1930, 3884, 543, 660, 1048, 1306, 2978, 2346, 4, 2886, 9, 5382, 55, 5161, 6, 1773, 799, 2310, 2071, 21, 294, 162, 6, 4, 403, 107, 129, 2066, 23, 625, 2, 8, 676, 164, 4, 78, 1520, 1207, 10, 295, 216, 161, 144, 62, 119, 190, 1085, 51, 152, 216, 23, 189, 2, 9, 137, 2277, 55, 1982, 13, 532, 3686, 1292, 3051, 6, 4, 782, 1376, 913, 36, 8, 970, 209, 351, 6, 1310, 

In [7]:
# word_index maps each word (str) to its integer index, as the sample below shows
word_index = reuters.get_word_index(path="reuters_word_index.json")
# The items() method returns a view object over the dictionary's (key, value) pairs
# The islice() function returns an iterator that yields only the first 5 pairs of that view, so the whole dictionary is never copied
list(islice(word_index.items(), 5))

[('mdbl', 10996),
 ('fawc', 16260),
 ('degussa', 12089),
 ('woods', 8803),
 ('hanging', 13796)]

In [8]:
# Invert word_index so that integer indices map back to their words
# The indices are shifted by 3 because 0, 1, and 2 are reserved for "padding", "start of sequence", and "unknown"
reverse_word_index = dict(
    (word_id + 3, word) for word, word_id in word_index.items()
)
# Peek at the first five (index, word) pairs of the inverted mapping
list(islice(reverse_word_index.items(), 5))

[(10999, 'mdbl'),
 (16263, 'fawc'),
 (12092, 'degussa'),
 (8806, 'woods'),
 (13799, 'hanging')]

In [10]:
# Look up the integer indices 0 through 6 in the reversed mapping.
# Each iteration's result is displayed separately because ast_node_interactivity is "all";
# indices with no entry fall back to 'Key not found'
for i in range(7):
    reverse_word_index.get(i, 'Key not found')

'Key not found'

'Key not found'

'Key not found'

'Key not found'

'the'

'of'

'to'

In [11]:
# Translate the first training example back into text, one word per index;
# any index without an entry in reverse_word_index is rendered as '?'
decoded_first_train_sample = " ".join(
    reverse_word_index.get(token_id, '?') for token_id in train_X[0]
)
decoded_first_train_sample

'? french foreign minister jean bernard ? predicted in a published interview a successful end to negotiations to admit gulf petrochemical exports into the european community ec negotiations have been under way between the community and the six nation gulf cooperation council gcc for three years ? due here tomorrow from oman for his first official visit to the united arab emirates uae told the semi official daily al ? he was confident a solution would soon be reached i am confident that problems between the two big partners the gcc and the ec will find a solution i will work to reach that solution he said in the interview conducted in paris an ec decision to impose tariffs on gulf petrochemical exports over a set quota has strained trade relations between the two sides gcc members saudi arabia kuwait bahrain qatar the uae and oman are threatening to impose heavy customs duties on european exports to the gulf if no solution is reached ? said negotiations between the two groups took a lon

### Number of Unique Words

In [12]:
# Replace every training example (a list of integers) in place with an int32 numpy array
for position, token_ids in enumerate(train_X):
    train_X[position] = np.array(token_ids, dtype=np.int32)

In [13]:
# Spot-check two converted examples: each should now be an int32 array
train_X[2], train_X[3]

(array([  1,  53, 188,  26,  14, 188,  26, 255, 346, 219,  91, 142, 146,
         93, 102,  17,  12], dtype=int32),
 array([   1,  603, 1827, 2175,    7,  104,  138,  165,   47,   20,   22,
          10,   29,  157,    6, 1890, 3200,    4,    2, 4494,   29,   21,
          29,  276,    4, 1167,  379,    8,  104, 1827, 2175,   62,   84,
         158,   63,   20,   22, 1038, 5841,    7,  110,  185,   77,  202,
         318,   47,   20,   22,   10,  139,  157,   51,  138,   83,   12,
          20,   22,   10,   29,  157,  266, 1827, 2175,   62,  958,  208,
           6,   10,  365,   63,   20,   29,   21,   29,  154,    7,    2,
        1219, 2175,    6,  533, 6809,   13,    4,    2, 3242,  276, 6904,
        1867,   27,  246,  260,  128,  140,    2,   12], dtype=int32))

In [14]:
# Concatenate all training samples into one flat numpy array of word indices;
# its length is the total number of word occurrences across the training set
all_integers = np.concatenate(train_X)
all_integers.shape

(1307140,)

In [15]:
# Memory footprint of the flat index array in megabytes
# (nbytes is the element count multiplied by the per-element size in bytes)
all_integers.nbytes * 1e-6

5.22856

In [16]:
# Count the distinct word indices that actually occur in the training set
np.unique(all_integers).size

9977

In [17]:
# Largest word index in the training set; used later to size the one-hot vectors
all_integers.max()

9999

### Structure of Targets

The label associated with each training example is an integer between 0 and 45, each representing a topic:

In [18]:
# Distinct topic labels in the training targets, and how many of them there are
unique_labels = np.unique(train_y)
unique_labels, unique_labels.shape

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]),
 (46,))

### Check for Missingness

In [19]:
# Targets: True if any label is NaN
# NOTE(review): train_y appears to hold integers (see the unique values above), which cannot
# represent NaN — so this is a sanity check that is expected to be False
np.isnan(train_y).any()

False

In [20]:
# Training data
# Map the index of each example that contains NaN to the fraction of its entries
# that are NaN; examples without NaN are skipped, so an empty dict means that
# nothing is missing
missingness_dict = {}
for index, arr in np.ndenumerate(train_X):
    nan_mask = np.isnan(arr)
    if nan_mask.any():
        # len(arr) already is the number of entries; wrapping it in sum() would
        # raise TypeError because an int is not iterable
        missingness_dict[index] = nan_mask.sum() / len(arr)

In [21]:
# True when no training example was recorded as containing NaN
not missingness_dict

True

In [22]:
# The check is done; remove the helper dictionary from the notebook namespace
del missingness_dict

No missing values for the training data.

## Data Pre-processing

### Training Data

We one-hot encode the lists of integers to turn them into vectors of 0s and 1s. For instance, we turn the sequence [3, 5] into a 10,000-dimensional vector that would be all 0s except for indices 3 and 5, which would be 1s. Recall that the max word index is 9,999, so every word index fits within a 10,000-dimensional vector (valid positions 0 through 9,999):


In [23]:
def vectorize_sequences(sequences, dimension=10000) -> np.ndarray:
    """
    Multi-hot encode sequences of word indices

    Each sequence becomes one row of the result, with a 1 in every column whose
    position appears in the sequence and 0 everywhere else. Repeated indices
    within a sequence still produce a single 1.

    Parameters
    ----------
    sequences : np.ndarray or sequence of sequences of int
        Data where each of the m examples or rows is an array or list of integer
        word indices; rows may have different lengths. A 1-D object array, a
        plain list of lists, and a 2-D integer array are all accepted
    dimension : int, optional
        Number of possible unique words (width of each encoded row), by default 10,000

    Returns
    -------
    np.ndarray
        Float matrix of shape (len(sequences), dimension) where each row is the
        encoded vector of the corresponding example

    Raises
    ------
    IndexError
        If a word index is outside the range allowed by ``dimension``
    """
    # Initialize a matrix of zeros with shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    # Iterate row by row with enumerate: np.ndenumerate would walk over individual
    # scalars when given rectangular input and then mark the wrong rows
    for row, word_indices in enumerate(sequences):
        # For the ith example, if the word indices are [1, 2, 9, 1000, 983, 454],
        # then set results[i, [1, 2, 9, 1000, 983, 454]] = 1.
        results[row, word_indices] = 1.
    return results

In [24]:
# Vectorize training and test data
# NOTE: both names are rebound from the raw index sequences to the encoded matrices,
# so this cell is not meant to be run twice without reloading the data first
train_X = vectorize_sequences(train_X)
test_X = vectorize_sequences(test_X)

In [28]:
# Examine the first ten training examples (rows of the encoded matrix)
train_X[:10]

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [29]:
# Examine the first ten test examples (rows of the encoded matrix)
test_X[:10]

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]])

### Targets

We use one-hot encoding for the labels, creating a matrix with 46 columns in which each row is an all-zero vector with a single 1 at the position of the label index:

In [34]:
# One-hot encode the integer topic labels of both splits; each result has one row
# per example and one column per topic
# NOTE(review): the number of columns is presumably inferred from each array on its
# own — both come out as 46 here, but confirm against the Keras docs
one_hot_train_y = to_categorical(train_y)
one_hot_test_y = to_categorical(test_y)
one_hot_train_y.shape, one_hot_test_y.shape

((8982, 46), (2246, 46))

In [36]:
# Display the (truncated) one-hot label matrices for the training and test splits
one_hot_train_y, one_hot_test_y

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]], dtype=float32),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32))

We can compute the matrix sparsity for these matrices:

In [41]:
# Sparsity = 1 - (number of non-zero entries / total number of entries);
# axis=(0, 1) counts over both dimensions, i.e. over the whole matrix
1 - (np.count_nonzero(one_hot_train_y, axis=(0, 1)) / one_hot_train_y.size)

0.9782608695652174

In [42]:
# Sparsity = 1 - (number of non-zero entries / total number of entries);
# axis=(0, 1) counts over both dimensions, i.e. over the whole matrix
1 - (np.count_nonzero(one_hot_test_y, axis=(0, 1)) / one_hot_test_y.size)

0.9782608695652174

Both matrices are very sparse.

## Store Data to File

In [50]:
# Make sure the output directory exists; otherwise open() raises FileNotFoundError
# when the notebook is run in a fresh checkout
os.makedirs('data', exist_ok=True)
# The four arrays are written back-to-back into a single file. To read them back,
# open the file once in 'rb' mode and call np.load() four times in this same order
with open('data/processed_data.npy', 'wb') as f:
    np.save(f, train_X)
    np.save(f, one_hot_train_y)
    np.save(f, test_X)
    np.save(f, one_hot_test_y)