In [None]:
import tensorflow as tf
import numpy as np

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  18.2M      0  0:00:04  0:00:04 --:--:-- 18.2M


In [None]:
batch_size = 32

# Training and validation subsets are both read from aclImdb/train. The same
# validation_split and seed are passed to both calls; they are meant to select
# complementary 80% / 20% subsets of one split.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/train",
    subset="training",
    validation_split=0.2,
    seed=1337,
    batch_size=batch_size,
)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/train",
    subset="validation",
    validation_split=0.2,
    seed=1337,
    batch_size=batch_size,
)

# The held-out test reviews live in their own directory and are used in full.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/test",
    batch_size=batch_size,
)

Found 75000 files belonging to 3 classes.
Using 60000 files for training.
Found 75000 files belonging to 3 classes.
Using 15000 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
# Sanity check: print the first six reviews of one training batch, each
# followed by its integer label.
for text_batch, label_batch in raw_train_ds.take(1):
  reviews = text_batch.numpy()
  labels = label_batch.numpy()
  for idx in range(6):
    print(reviews[idx])
    print(labels[idx])

b'SPOILERS: We sit through ten minutes of AWFUL clich\xc3\xa9d dialog at the beginning from two completely unoriginal characters with bad twangs (ripped off from Kalifornia and Natural Born Killers - there isn\'t an original thing about these two) and you\'re going "either they\'re about to kill everyone in the diner or already have" and lo and behold guess what happens.<br /><br />I can\'t stand all the Tarantino wannabes out there and this guy is one of the worst. I got maybe 25-30 minutes into the thing when I just couldn\'t take it and stopped watching. Miner\'s really bad acting was unbearable - I couldn\'t take it. That, and the terrible script. After reading some of these comments I see there was a big twist - well guess what? No one cares. When you create completely uninteresting, unoriginal and unlikeable character like these two clich\xc3\xa9s, no one cares what big "twist" may have happened. I hope this is the end of these types of movies.'
2
b'This movie is horrible- in a \

Prepare the data

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

In [None]:
def custom_standardization(input_data):
  """Normalize raw review text before tokenization.

  Lowercases the input, replaces each "<br />" tag with a single space, and
  then deletes every character listed in `string.punctuation`.

  Args:
    input_data: The string tensor handed over by the TextVectorization layer.

  Returns:
    The tensor produced by the two `tf.strings.regex_replace` passes.
  """
  # Character class that matches any single punctuation character.
  punctuation_pattern = "[%s]" % re.escape(string.punctuation)
  without_breaks = tf.strings.regex_replace(
      tf.strings.lower(input_data), "<br />", " "
  )
  return tf.strings.regex_replace(without_breaks, punctuation_pattern, "")

In [None]:
# Vocabulary size: passed as max_tokens to TextVectorization and as the first
# argument of layers.Embedding.
max_features = 20000
# Second argument of layers.Embedding (size of each embedding vector).
embedding_dim = 128
# Passed as output_sequence_length to TextVectorization.
sequence_length = 500

In [None]:
# Turns raw strings into fixed-length sequences of integer token ids, using
# custom_standardization in place of the layer's default text cleanup.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [None]:
# adapt() only needs the review strings, so drop the labels from the training
# set before handing it to the vectorization layer.
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

In [None]:
def vectorize_text(text, label):
  """Convert a batch of raw strings to token ids, passing the label through.

  Args:
    text: Tensor of review strings taken from one of the raw datasets.
    label: The label tensor paired with `text`; returned unchanged.

  Returns:
    A `(token_ids, label)` tuple, where `token_ids` is the output of
    `vectorize_layer`.
  """
  # Add a trailing axis to the text before it goes through the layer.
  text_column = tf.expand_dims(text, axis=-1)
  token_ids = vectorize_layer(text_column)
  return token_ids, label

# Apply the vectorizer to every split so the model receives integer sequences.
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

Build a Model

In [None]:
from tensorflow.keras import layers

# Input: variable-length sequences of integer token ids.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Map each token id to an embedding_dim-wide vector, then regularize.
x = layers.Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
x = layers.Dropout(rate=0.5)(x)

# Two strided 1D convolutions followed by a max over the sequence axis.
x = layers.Conv1D(
    filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
)(x)
x = layers.Conv1D(
    filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
)(x)
x = layers.GlobalMaxPooling1D()(x)

# Dense hidden layer with dropout.
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)

# A single sigmoid unit produces the classification score.
predictions = layers.Dense(units=1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs=inputs, outputs=predictions)

In [None]:
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
epochs = 3

# Train on the vectorized training split, checking the validation split after
# each epoch.
model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f0667853750>

In [None]:
# Evaluate the trained model on the vectorized test split; the compiled
# metrics are the loss and accuracy.
model.evaluate(test_ds)

Build an end-to-end model that accepts raw strings as input

In [None]:
# Wrap the trained classifier so it can be fed raw strings directly: the
# vectorization layer becomes the first stage of the new model.
inputs = tf.keras.Input(shape=(1,), dtype="string")

# Raw strings -> integer token ids, using the already-adapted vocabulary.
indices = vectorize_layer(inputs)

# Token ids -> prediction, reusing the trained model (and its weights) as a layer.
outputs = model(indices)

end_to_end_model = tf.keras.Model(inputs, outputs)

# Compiled with the same loss/metrics as `model` so evaluate() reports
# comparable numbers. (The original line had a stray closing parenthesis,
# which made this cell a SyntaxError.)
end_to_end_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)

In [None]:
# Evaluate on the raw (un-vectorized) test strings; vectorization now happens
# inside end_to_end_model itself.
end_to_end_model.evaluate(raw_test_ds)