In [1]:
from collections import defaultdict
import requests
import os
import tarfile
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
import numpy as np

2024-02-18 09:55:00.043366: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-18 09:55:00.159133: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-18 09:55:00.159167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-18 09:55:00.177524: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-18 09:55:00.216573: I tensorflow/core/platform/cpu_feature_guar

**Get Data**

In [2]:
URL = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
DATA_DIR = 'data'
ROOT_DIR = ''

In [3]:
def get_data(url=URL, root_dir=ROOT_DIR, data_dir=DATA_DIR):
    response = requests.get(url)
    
    with open('temp.tar.gz', 'wb') as f:
        f.write(response.content)
    
    with tarfile.open('temp.tar.gz', 'r:gz') as tar_file:
        tar_file.extractall(path=root_dir)
    
    os.remove('temp.tar.gz')
    os.rename('aclImdb', data_dir)

In [4]:
get_data()

In [3]:
filepath = DATA_DIR
path = Path(filepath)
path

PosixPath('data')

In [157]:
def review_paths(dirpath):
    return [str(path) for path in dirpath.glob("*.txt")]

train_pos = review_paths(path / "train" / "pos")
train_neg = review_paths(path / "train" / "neg")
test_pos = review_paths(path / "test" / "pos")
test_neg = review_paths(path / "test" / "neg")

In [158]:
len(train_pos)

12500

**Train-Test-Valid Split**

In [159]:
np.random.shuffle(test_pos)
np.random.shuffle(test_neg)

valid_pos = test_pos[5000:]
valid_neg = test_neg[5000:]
test_pos = test_pos[:5000]
test_neg = test_neg[:5000]

In [160]:
len(test_pos), len(valid_pos)

(5000, 7500)

**Preprocessing**

In [161]:
# Python way
def imdb_dataset(filepaths_pos, filepaths_neg):
    reviews=[]
    labels=[]
    for filepaths, label in ((filepaths_pos, 1), (filepaths_neg, 0)):
        for filepath in filepaths:
            with open(filepath, 'r') as fp:
                reviews.append(fp.read())
            labels.append(label)
    X = tf.constant(reviews)
    y = tf.constant(labels)
    return tf.data.Dataset.from_tensor_slices((X,y))

In [162]:
dataset = imdb_dataset(train_pos, train_neg)

In [167]:
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(), dtype=string, numpy=b'I think it was Ebert who gave Stella four out of four stars but, other than his, I have never read a positive review of this sadly misunderstood drama about class divisions, love, and sacrifice (three themes most great romantic stories or films have in common).<br /><br />Here the major theme is class division. Stella is a story from depression era America. That said, it was translated to the screen then in such a memorable fashion that this remake (if you ask a Stanwyck fan or two) was not exactly appreciated. Fans of the original never gave it a chance. Furthermore, this version of Stella was made in the 1990s, not exactly a time of great financial trouble in America (as the depression was).<br /><br />Now is the time to remove the rosy-coloured glasses, in the midst of a new era of recession and poverty in America, and see that this powerful story still rings true, is as timely and relevant as ever, in its updated format.<br /><br />Yes, 

In [165]:
# Tensorflow way
def imdb_dataset_2(filepaths_pos, filepaths_neg, num_parallel_calls=tf.data.experimental.AUTOTUNE):
    dataset_pos = tf.data.TextLineDataset(filepaths_pos, num_parallel_reads=num_parallel_calls)
    dataset_pos = dataset_pos.map(lambda review: (review, 1))
    dataset_neg = tf.data.TextLineDataset(filepaths_neg, num_parallel_reads=num_parallel_calls)
    dataset_neg = dataset_neg.map(lambda review: (review, 0))
    dataset = tf.data.Dataset.concatenate(dataset_pos, dataset_neg)
    return dataset    

In [166]:
dataset = imdb_dataset_2(train_pos, train_neg)

**Saving the Dataset**

In [168]:
for i in dataset.take(1):
    print(type(tf.io.serialize_tensor(i[0]).numpy()))

<class 'bytes'>


In [169]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):  # If value is Tensor
        value = tf.compat.as_bytes(value.numpy().decode('utf-8'))  # Requires eager execution!
    else:
        value = tf.compat.as_bytes(value)  # Directly convert string to bytes
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [170]:
def serialize_example(review, label):
    features = {
        'review': _bytes_feature(review),
        'label': _int64_feature(label)
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=features))
    return example_proto.SerializeToString()

**Putting it all together**

In [171]:
def store_in_tfrecord(pos_path, neg_path, target_path):
    dataset = imdb_dataset(pos_path, neg_path)
    serialized_dataset = dataset.map(lambda review, label: tf.py_function(
        serialize_example, [review, label], tf.string),                                                        # dataset.map() will return SymbolicTensors, it wont get a real Tensor until graph has been run
                                     num_parallel_calls=tf.data.experimental.AUTOTUNE)

    writer = tf.data.experimental.TFRecordWriter(target_path)
    writer.write(serialized_dataset)

In [172]:
store_in_tfrecord(train_pos, train_neg, 'data/train.tfrecord')
store_in_tfrecord(valid_pos, valid_neg, 'data/valid.tfrecord')
store_in_tfrecord(test_pos, valid_neg, 'data/test.tfrecord')

**Loading**

In [2]:
def parse_example(serialized_example):
    feature_description = {
        'review': tf.io.VarLenFeature(tf.string),
        'label' : tf.io.VarLenFeature(tf.int64)
    }
    
    example = tf.io.parse_single_example(serialized_example, feature_description)
    
    review = tf.sparse.to_dense(example['review'])
    label = tf.sparse.to_dense(example['label'])
    
    return review, label    

In [3]:
def read_tfrecord(path='data/train.tfrecord', num_parallel_reads=tf.data.experimental.AUTOTUNE):
    dataset = tf.data.TFRecordDataset(path)
    dataset = dataset.map(parse_example, num_parallel_calls=num_parallel_reads)
    dataset = dataset.map(lambda x,y: (x[0], y[0]))
    return dataset

In [None]:
train_dataset = read_tfrecord('data/train.tfrecord')
valid_dataset = read_tfrecord('data/valid.tfrecord')
test_dataset = read_tfrecord('data/test.tfrecord')

**Preprocessing**

In [None]:
text_vec = keras.layers.TextVectorization(max_tokens=400, output_mode='int')
text_vec.adapt(train_dataset.map(lambda x,y: x))

In [ ]:
emb = keras.layers.Embedding(input_dim=400)