In [1]:
import tensorflow as tf
import numpy as np
batch_size = 32
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=42,
)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=42,
)
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)

print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


In [2]:
small_raw_train_ds = raw_train_ds.take(1)
for text_batch, label_batch in small_raw_train_ds.as_numpy_iterator():
    print(text_batch[:3])
    print(label_batch[:3])

[b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
 b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get into complicated situatio

---

Scout for potential html tags besides \<br /\> tags

In [3]:
import re

poss_html_tags = np.array([])
for ds in (raw_train_ds, raw_val_ds, raw_test_ds):
    if type(ds) is not tuple:
        ds = ds.as_numpy_iterator()
    for texts, lab in ds:
        for text in texts:
            #print(re.findall(r'<[^>/<]+>.*?|<[^>/<]+ />',str(t)))
            for tag in np.unique(np.array(re.findall(r'<[^<>]+>', str(text)))):
                poss_html_tags = np.unique(np.append(poss_html_tags, tag))
print('possible html tags:')
for tag in poss_html_tags:print(tag)

possible html tags:
< Cough , cough >
< Cough >
< Review posted at FilmDailies.com>
< YES >
<-----Minor Spoilers!----->
<-----Minor Spoilers!---->
<..>
</SPOILER>
</em>
</i>
<SPOILER>
<br />
<em>
<grin>
<hr>
<http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=ANSWERMAN>
<i>
<p>
<sigh>


In [4]:
from tensorflow.keras.layers import TextVectorization
import string

html_tags = "|".join(['<hr>','<i>', '<p>', '</SPOILER>', 
                      '</em>', '</i>', '<SPOILER>', '<br />','<em>',
                      '<http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=ANSWERMAN>'
                     ])

def standardizer(input):
    lc = tf.strings.lower(input)
    nohtml = tf.strings.regex_replace(lc, html_tags, " ")
    nopunct = tf.strings.regex_replace(
        nohtml, f"{re.escape(string.punctuation)}", "")
    return nopunct

max_features = 20000
embedding_dim = 128
sequence_length = 500

vectorize_layer = TextVectorization(
    standardize=standardizer,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# remove labels
text_ds = raw_train_ds.map(lambda x, y: x)

vectorize_layer.adapt(text_ds)

In [5]:
from tensorflow.keras import layers

text_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='text')
x = vectorize_layer(text_input)
x = layers.Embedding(max_features + 1, embedding_dim)(x)

x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Dropout(0.5)(x)
x = layers.Conv1D(56, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Dropout(0.2)(x)
x = layers.GlobalMaxPooling1D()(x)

x = layers.Dense(56, activation="relu")(x)

predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(text_input, predictions)

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [6]:
epochs = 5

history = model.fit(raw_train_ds, validation_data=raw_val_ds, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
model.evaluate(raw_test_ds)



[0.5485416054725647, 0.8444799780845642]

In [8]:
model.predict(['This movie is really bad and I hate it', 
               'Very good movie'])

array([[0.10528785],
       [0.8687991 ]], dtype=float32)