In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import numpy as np
import os
import re
import string
import random

In [7]:
# data 

vocab_size = 20000
maxlen = 80

batch_size = 6
filenames  = []
directories = ["aclImdb/train/pos",
                "aclImdb/train/neg",
                "aclImdb/test/pos",
                "aclImdb/test/neg"]
for dir in directories:
    for f in os.listdir(dir):
        filenames.append(os.path.join(dir, f))

print(f"{len(filenames)} files")

# Create a dataset from text files
random.shuffle(filenames)
text_ds = tf.data.TextLineDataset(filenames)
text_ds = text_ds.shuffle(buffer_size=256)
text_ds = text_ds.batch(batch_size)


def custom_standardization(input_string):
    lowercased = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
    return tf.strings.regex_replace(stripped_html, f"([{string.punctuation}])", r" \1")


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices

50000 files


In [8]:
def prepare_inputs_labels(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    
    return x, y


text_ds = text_ds.map(prepare_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

In [25]:
for data in text_ds:
    print(data[0][0])
    print(data[1][0])
    break

tf.Tensor(
[   13     9 10452     6    13   106  2191  1521    52   460   174     7
     2   663    10    74 15139   165   600     3    10 11502     5   194
  1890  2447     7   286   111     6   571   217  3256     8    33 18750
     3  1700     6   215   240    32  1190   133    67     4    21    11
     2   140    13     9     5    71    14    88   121  1664   101    63
  1213     1     3     6   307    13    18   134     2   304   194    79
     3     7   269    10     9   921    37   216], shape=(80,), dtype=int64)
tf.Tensor(
[    9 10452     6    13   106  2191  1521    52   460   174     7     2
   663    10    74 15139   165   600     3    10 11502     5   194  1890
  2447     7   286   111     6   571   217  3256     8    33 18750     3
  1700     6   215   240    32  1190   133    67     4    21    11     2
   140    13     9     5    71    14    88   121  1664   101    63  1213
     1     3     6   307    13    18   134     2   304   194    79     3
     7   269    10     9 

In [27]:
word_index = dict(zip(range(len(vocab)), vocab))
word_index

{0: '',
 1: '[UNK]',
 2: 'the',
 3: '.',
 4: ',',
 5: 'a',
 6: 'and',
 7: 'of',
 8: 'to',
 9: 'is',
 10: 'it',
 11: 'in',
 12: 'i',
 13: 'this',
 14: 'that',
 15: "'s",
 16: 'was',
 17: 'as',
 18: 'movie',
 19: 'with',
 20: 'for',
 21: 'but',
 22: 'film',
 23: ')',
 24: 'you',
 25: 'on',
 26: "'t",
 27: '"',
 28: 'not',
 29: 'are',
 30: 'he',
 31: 'his',
 32: 'have',
 33: 'be',
 34: 'one',
 35: '!',
 36: 'all',
 37: 'at',
 38: 'they',
 39: 'by',
 40: 'an',
 41: 'who',
 42: 'from',
 43: 'so',
 44: 'like',
 45: '-',
 46: 'there',
 47: 'just',
 48: 'her',
 49: 'about',
 50: 'or',
 51: 'out',
 52: 'has',
 53: 'if',
 54: '?',
 55: 'what',
 56: 'some',
 57: 'good',
 58: 'can',
 59: 'more',
 60: 'when',
 61: 'very',
 62: 'she',
 63: 'up',
 64: 'would',
 65: 'no',
 66: 'my',
 67: 'time',
 68: 'even',
 69: 'only',
 70: 'really',
 71: 'story',
 72: 'their',
 73: 'see',
 74: 'had',
 75: 'which',
 76: 'me',
 77: 'were',
 78: 'we',
 79: 'well',
 80: "'",
 81: 'much',
 82: 'than',
 83: ':',
 84: 'be

In [30]:
type(word_index[9])

numpy.str_

In [54]:
len(vocab)

19999

In [31]:
text1 = ' '.join([word_index[n] for n in data[0][0].numpy()])
print(text1)

this is 2009 and this way underrated gem has lost nothing of the power it had 31 years ago . it connects a pretty wide variety of different characters and stories without appearing to be cluttered . clothes and music might have changed over time , but in the end this is a story that will never lose its up -to [UNK] . and especially this movie does the job pretty well . of course it is cheesy at times


In [32]:
text1 = ' '.join([word_index[n] for n in data[1][0].numpy()])
print(text1)

is 2009 and this way underrated gem has lost nothing of the power it had 31 years ago . it connects a pretty wide variety of different characters and stories without appearing to be cluttered . clothes and music might have changed over time , but in the end this is a story that will never lose its up -to [UNK] . and especially this movie does the job pretty well . of course it is cheesy at times ,


In [None]:
# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index

start_prompt = "The plot of this moview"
start_tokens = [word_to_index.get(token, 1) for token in start_prompt.split()]
start_tokens