In [2]:
"""
author: William Darko (repurposed from original author Francois Chollet)
date: June, 2021
description: Two-class (binary) classification on the IMDB dataset, labelling each movie review as positive or negative. The original code sample
is provided by Francois Chollet in 'Deep Learning with Python' (1st Edition, Manning Publications).
"""


import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras.datasets import imdb


In [3]:
# loading the IMDB dataset

(training_data, training_labels), (testing_data, testing_labels) = imdb.load_data(num_words=10000)
# the num_words argument denotes taking the top 10000 most frequent words in the training data
# training_data and testing_data are lists of reviews, where each review is a list of integer word indices
# like [1, 14, 22, 16, 43, 530, ...] that refer to entries in a dictionary of words
# NOTE(review): with Keras' load_data defaults (start_char=1, oov_char=2, index_from=3) the leading 1 appears to be a
# start-of-review marker rather than a real word, and each 2 an out-of-vocabulary placeholder -- confirm against the Keras docs
# training_labels and testing_labels are lists of 1s and 0s classifying each review as positive or negative, respectively

# show the first encoded review as a sanity check
print(training_data[0])





[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [None]:
# preparing the data

# one way is to pad the lists to the same length, turn them into an integer tensor of shape (samples, word_indices), and use an Embedding layer as the first layer
# the other way is to 'one-hot' encode the lists, i.e. turn each one into a vector of 0s and 1s where 1 marks the presence of a word and 0 its absence
# with one-hot encoding each review becomes a vector of length 10000 that is 0 everywhere except at the indices of the words that appear in the review

def encode_sequences(sequences, dimesion=10000):
    """Turn lists of word indices into fixed-width rows of 0s and 1s.

    Args:
        sequences: sized collection of reviews; each review is a list of
            integer word indices that must be valid column positions.
        dimesion: width of every encoded row (the spelling is kept as-is so
            existing keyword callers keep working).

    Returns:
        A NumPy array of shape (len(sequences), dimesion) in which entry
        [i, j] is 1 when index j occurs in review i and 0 otherwise; an index
        repeated within a review is still just 1.
    """
    encoded = np.zeros(shape=(len(sequences), dimesion))
    for row, word_indices in zip(encoded, sequences):
        # `row` is a view into `encoded`, so this index-list assignment
        # writes straight into the result matrix
        row[word_indices] = 1
    return encoded

# encode both splits as 0/1 matrices with one row per review (default row width 10000, see encode_sequences)
x_train_data = encode_sequences(training_data)
x_test_data = encode_sequences(testing_data)

# convert the labels to float32 NumPy arrays
y_train_labels = np.asarray(training_labels, dtype=np.float32)
y_test_labels = np.asarray(testing_labels, dtype=np.float32)

# quick look at the encoded training inputs and their labels
print("Training Data: ", x_train_data)
print("Training Labels: ", y_train_labels)