# Obtain IMDB Dataset

The first half of this notebook (obtaining, preprocessing, and shaping the data) follows the Colab notebook provided by Google, which can be found [here](https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/keras/basic_text_classification.ipynb).

In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import os

In [2]:
# load dataset (the vocabulary size is passed to Keras through `num_words`)
NUM_WORDS = 10000
imdb = keras.datasets.imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=NUM_WORDS)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
# Report how many training reviews and labels were loaded.
n_entries, n_labels = len(train_data), len(train_labels)
print("Training entries: {}, labels: {}".format(n_entries, n_labels))

Training entries: 25000, labels: 25000


In [4]:
# Word -> integer index mapping shipped with the dataset
word_index = imdb.get_word_index()

# Shift every index up by three so that ids 0-3 stay free for the special tokens
word_index = {word: idx + 3 for word, idx in word_index.items()}
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,  # unknown
    "<UNUSED>": 3,
})

# Inverse mapping (integer index -> word), used to decode reviews back into text
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode_review(text):
    """Turn a sequence of word ids back into a space-separated string.

    Ids that are missing from ``reverse_word_index`` are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [5]:
# Inspect the first review: lengths of the first two reviews, the raw ids, and the decoded text.
first_review = train_data[0]
print(len(first_review), len(train_data[1]))
print("-----------------")
print(first_review)
print("----------------")
print(decode_review(first_review))

218 189
-----------------
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
----------------
<START> this film

In [6]:
MAX_LEN = 256

# Both splits share the same settings: the <PAD> id is appended after the
# review ('post') and MAX_LEN is passed as the target length.
pad_options = dict(value=word_index["<PAD>"], padding='post', maxlen=MAX_LEN)

train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)

test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

In [7]:
# Repeat the inspection on the padded data: lengths, ids, and decoded text of the first review.
first_review = train_data[0]
print(len(first_review), len(train_data[1]))
print("-----------------")
print(first_review)
print("----------------")
print(decode_review(first_review))

256 256
-----------------
[   1   14   22   16   43  530  973 1622 1385   65  458 4468   66 3941
    4  173   36  256    5   25  100   43  838  112   50  670    2    9
   35  480  284    5  150    4  172  112  167    2  336  385   39    4
  172 4536 1111   17  546   38   13  447    4  192   50   16    6  147
 2025   19   14   22    4 1920 4613  469    4   22   71   87   12   16
   43  530   38   76   15   13 1247    4   22   17  515   17   12   16
  626   18    2    5   62  386   12    8  316    8  106    5    4 2223
 5244   16  480   66 3785   33    4  130   12   16   38  619    5   25
  124   51   36  135   48   25 1415   33    6   22   12  215   28   77
   52    5   14  407   16   82    2    8    4  107  117 5952   15  256
    4    2    7 3766    5  723   36   71   43  530  476   26  400  317
   46    7    4    2 1029   13  104   88    4  381   15  297   98   32
 2071   56   26  141    6  194 7486   18    4  226   22   21  134  476
   26  480    5  144   30 5535   18   51   36   28 

In [8]:
# see what the labels look like (first four entries only)
for label in train_labels[:4]:
    print(label)

1
0
0
1


In [9]:
# The first VAL_SIZE training samples form the validation split; the rest stay for training.
VAL_SIZE = 10000

x_val, y_val = train_data[:VAL_SIZE], train_labels[:VAL_SIZE]
x_train, y_train = train_data[VAL_SIZE:], train_labels[VAL_SIZE:]

# The test split is used unchanged.
x_test, y_test = test_data, test_labels

## Convert to TFRecords

In [10]:
def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding a one-element ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _int64_features(value):
    """Wrap an iterable of integers in a ``tf.train.Feature`` backed by an ``Int64List``."""
    int64_list = tf.train.Int64List(value=list(value))
    return tf.train.Feature(int64_list=int64_list)

In [11]:
def generate_tfrecords(X, y, output_filename):
    """Serialize paired samples and labels into a TFRecord file.

    Every sample is written as one ``tf.train.Example`` with three int64
    features: '/iid' (position of the sample in ``X``), '/features' (the
    sample's values) and '/label' (the matching entry of ``y``).

    Args:
        X: iterable of integer sequences, one per sample.
        y: indexable collection of integer labels; ``y[i]`` is the label of
            the i-th element of ``X``.
        output_filename: path of the TFRecord file to write.
    """
    print("Start conversion to {}".format(output_filename))

    # The context manager closes the writer even when a sample fails to
    # convert or serialize, so the file handle is not leaked on errors.
    with tf.python_io.TFRecordWriter(output_filename) as writer:
        for i, x_single in enumerate(X):
            # ensure correct datatype
            features = [np.int64(val) for val in x_single]
            label = np.int64(y[i])

            # create features
            feature = {'/iid': _int64_feature(i),
                       '/features': _int64_features(features),
                       '/label': _int64_feature(label)}

            # create example protocol buffer
            example = tf.train.Example(features=tf.train.Features(feature=feature))

            writer.write(example.SerializeToString())

    # Only reached when every sample was written without raising.
    print("Successfully created {}".format(output_filename))

In [12]:
def maybe_create_dir(dir_path: str) -> None:
    """Create ``dir_path`` (including missing parents) unless it already exists.

    Prints whether the directory was created or was already present.

    Args:
        dir_path: path of the directory to ensure.

    Raises:
        FileExistsError: if ``dir_path`` exists but is not a directory.
    """
    try:
        # Attempt the creation directly instead of checking first: this avoids
        # the race between an existence check and the actual makedirs call.
        os.makedirs(dir_path)
    except FileExistsError:
        # A regular file at this path is not a usable directory; surface it
        # instead of reporting that the directory "already exists".
        if not os.path.isdir(dir_path):
            raise
        print("{} already exists".format(dir_path))
    else:
        print("{} created".format(dir_path))
        
# Make sure the ../data directory exists; the TFRecord files are written there.
maybe_create_dir("../data")

../data already exists


In [13]:
# Write one TFRecord file per split into ../data.
splits = (
    ("train.tfrecords", x_train, y_train),
    ("val.tfrecords", x_val, y_val),
    ("test.tfrecords", x_test, y_test),
)
for record_name, split_x, split_y in splits:
    generate_tfrecords(split_x, split_y, os.path.join("..", "data", record_name))

Start conversion to ../data/train.tfrecords
Successfully created ../data/train.tfrecords
Start conversion to ../data/val.tfrecords
Successfully created ../data/val.tfrecords
Start conversion to ../data/test.tfrecords
Successfully created ../data/test.tfrecords
