In [None]:
from collections import defaultdict
import requests
import os
import tarfile
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
import numpy as np

**Get Data**

In [2]:
URL = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
DATA_DIR = 'data'
ROOT_DIR = ''

In [3]:
def get_data(url=URL, root_dir=ROOT_DIR, data_dir=DATA_DIR):
    response = requests.get(url)
    
    with open('temp.tar.gz', 'wb') as f:
        f.write(response.content)
    
    with tarfile.open('temp.tar.gz', 'r:gz') as tar_file:
        tar_file.extractall(path=root_dir)
    
    os.remove('temp.tar.gz')
    os.rename('aclImdb', data_dir)

In [4]:
get_data()

In [3]:
filepath = DATA_DIR
path = Path(filepath)
path

PosixPath('data')

In [4]:
def review_paths(dirpath):
    return [str(path) for path in dirpath.glob("*.txt")]

train_pos = review_paths(path / "train" / "pos")
train_neg = review_paths(path / "train" / "neg")
test_pos = review_paths(path / "test" / "pos")
test_neg = review_paths(path / "test" / "neg")

In [5]:
len(train_pos)

12500

**Train-Test-Valid Split**

In [6]:
np.random.shuffle(test_pos)
np.random.shuffle(test_neg)

valid_pos = test_pos[5000:]
valid_neg = test_neg[5000:]
test_pos = test_pos[:5000]
test_neg = test_neg[:5000]

In [7]:
len(test_pos), len(valid_pos)

(5000, 7500)

**Preprocessing**

In [18]:
# Python way
def imdb_dataset(filepaths_pos, filepaths_neg):
    reviews=[]
    labels=[]
    for filepaths, label in ((filepaths_pos, 1), (filepaths_neg, 0)):
        for filepath in filepaths:
            with open(filepath, 'r') as fp:
                reviews.append(fp.read())
            labels.append(label)
    X = tf.constant(reviews)
    y = tf.constant(labels)
    return tf.data.Dataset.from_tensor_slices((X,y))

In [19]:
dataset = imdb_dataset(train_pos, train_neg)

In [30]:
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(), dtype=string, numpy=b"Despite Disney's best efforts, this is a rather enjoyable movie about following your dreams. I was surprised that it didn't strike me as over-sentimental; this movie played fair. Dennis Quaid was very, very good in the role, which is saying something for a sports movie. I can't recall how many sports movies have had little quirks that bother me; here, everybody looks the part. This movie is surprisingly good, and I predict that it will do surprising business as it is a G-rated movie that doesn't require the viewer to stop thinking. Ebert to the contrary, this movie is a success.">, <tf.Tensor: shape=(), dtype=int32, numpy=1>)


In [28]:
# Tensorflow way
def imdb_dataset_2(filepaths_pos, filepaths_neg, num_parallel_calls=tf.data.experimental.AUTOTUNE):
    dataset_pos = tf.data.TextLineDataset(filepaths_pos, num_parallel_reads=num_parallel_calls)
    dataset_pos = dataset_pos.map(lambda review: (review, 1))
    dataset_neg = tf.data.TextLineDataset(filepaths_neg, num_parallel_reads=num_parallel_calls)
    dataset_neg = dataset_neg.map(lambda review: (review, 0))
    dataset = tf.data.Dataset.concatenate(dataset_pos, dataset_neg)
    return dataset    

In [29]:
dataset = imdb_dataset_2(train_pos, train_neg)

**Saving the Dataset**

In [87]:
for i in dataset.take(1):
    print(type(tf.io.serialize_tensor(i[0]).numpy()))

<class 'bytes'>


In [106]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):  # If value is Tensor
        value = tf.compat.as_bytes(value.numpy().decode('utf-8'))  # Requires eager execution!
    else:
        value = tf.compat.as_bytes(value)  # Directly convert string to bytes
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [109]:
@tf.py_function(Tout=tf.string)
def serialize_example(review, label):
    features = {
        'review': _bytes_feature(review),
        'label': _int64_feature(label)
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=features))
    return example_proto.SerializeToString()

**Putting it all together**

In [114]:
def store_in_tfrecord(pos_path, neg_path, target_path):
    dataset = imdb_dataset_2(pos_path, neg_path)
    serialized_dataset = dataset.map(lambda review, label: tf.py_function(
        serialize_example, [review, label], tf.string),
                                     num_parallel_calls=tf.data.experimental.AUTOTUNE)

    writer = tf.data.experimental.TFRecordWriter(target_path)
    writer.write(serialized_dataset)

In [115]:
store_in_tfrecord(train_pos, train_neg, 'data/train.tfrecord')
store_in_tfrecord(valid_pos, valid_neg, 'data/valid.tfrecord')
store_in_tfrecord(test_pos, valid_neg, 'data/test.tfrecord')

Instructions for updating:
To write TFRecords to disk, use `tf.io.TFRecordWriter`. To save and load the contents of a dataset, use `tf.data.experimental.save` and `tf.data.experimental.load`


**Loading**