In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
print(tf.__version__)

2.8.0


In [3]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

In [4]:
dataset_list = tfds.list_builders()
dataset_list

['abstract_reasoning',
 'accentdb',
 'aeslc',
 'aflw2k3d',
 'ag_news_subset',
 'ai2_arc',
 'ai2_arc_with_ir',
 'amazon_us_reviews',
 'anli',
 'arc',
 'bair_robot_pushing_small',
 'bccd',
 'beans',
 'big_patent',
 'bigearthnet',
 'billsum',
 'binarized_mnist',
 'binary_alpha_digits',
 'blimp',
 'bool_q',
 'c4',
 'caltech101',
 'caltech_birds2010',
 'caltech_birds2011',
 'cars196',
 'cassava',
 'cats_vs_dogs',
 'celeb_a',
 'celeb_a_hq',
 'cfq',
 'cherry_blossoms',
 'chexpert',
 'cifar10',
 'cifar100',
 'cifar10_1',
 'cifar10_corrupted',
 'citrus_leaves',
 'cityscapes',
 'civil_comments',
 'clevr',
 'clic',
 'clinc_oos',
 'cmaterdb',
 'cnn_dailymail',
 'coco',
 'coco_captions',
 'coil100',
 'colorectal_histology',
 'colorectal_histology_large',
 'common_voice',
 'coqa',
 'cos_e',
 'cosmos_qa',
 'covid19',
 'covid19sum',
 'crema_d',
 'curated_breast_imaging_ddsm',
 'cycle_gan',
 'd4rl_adroit_door',
 'd4rl_adroit_hammer',
 'd4rl_adroit_pen',
 'd4rl_adroit_relocate',
 'd4rl_mujoco_ant',
 'd4

In [5]:
print("imdb_reviews" in dataset_list)

True


In [6]:
(training_data, testing_data), metadata = tfds.load("imdb_reviews",
                                                    as_supervised=True,
                                                    shuffle_files=False,
                                                    with_info=True,
                                                    split=["train", "test"])

In [7]:
training_data

<PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [8]:
metadata.features['label'].names

['neg', 'pos']

In [9]:
for i in training_data.take(1):
    text, label = i
    print(text, label)

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string) tf.Tensor(0, shape=(), dtype=int64)


In [10]:
for i in testing_data.take(1):
    text, label = i
    print(text, label)

tf.Tensor(b"There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of the aforementioned movies. I haven't laughed this hard since I saw THE FULL MONTY. (And, even then, I don't think I laughed quite this hard... So to speak.) Tukel's talent is considerable: DING-A-LING-LESS is so chock full of double entendres that one would have to sit down with a copy of this script and do a line-by-line examination of it to fully appreciate the, uh, breadth and width of it. Every shot is beautifully composed (a clear sign of a sure-handed director), and the performances all around are solid (there's none of the over-the-top scenery chewing one might've expected from a film like this). DING-A-LING-LESS is a film whose time has come.", shape=(), dtype=string) tf.Tensor(1, shape=(),

In [11]:
len(training_data)

25000

In [12]:
training_data_sentences = []
training_data_labels = []
for data in training_data.as_numpy_iterator():
    sentence, label = data
    training_data_sentences.append(sentence)
    training_data_labels.append(label)

In [13]:
len(training_data_sentences), len(training_data_labels)

(25000, 25000)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(np.array(training_data_sentences),
                                                                            np.array(training_data_labels),
                                                                            test_size=0.2,
                                                                            random_state=42)

In [16]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(20000, 5000, 20000, 5000)

In [17]:
train_sentences[0]

b'Zombi 3 has an interesting history in it\'s making. Firstly, it is a sequel to Fulci\'s hit Zombi 2, with Zombi 2 itself being of course a marketing ploy to trick people into thinking it was a sequel to George A. Romero\'s Dawn of the Dead aka Zombi. Confusing enough? Basically, none of the films have anything to do with one another, but who cares when they make money. I guess Fulci himself starting to not care about the production about half way through Zombi 3 when he decided to walk out. Bruno Mattei was brought on board to help pad the film with additional scenes to lengthen the running time.<br /><br />Zombi 3\'s plot is your typical zombie fare. Scientists develop a serum on an island in the Philippines, terrorists steal it unleashing a plague, and zombie run amok. The scientists want to create an antidote, while the military is set on mowing down everyone without prejudice. There are also brief inserts of a Radio DJ preaching about how we treat the planet. <br /><br />Overall,

In [18]:
import re
import regex
def keep_only_alphabet(sentence):
    original_text = sentence.decode('utf-8', errors='ignore')
    result_sentence = regex.sub(r'[^a-zA-Z ]', '', original_text)
    result_sentence = re.sub(r'\s+', ' ', result_sentence)
    return result_sentence

In [19]:
train_sentences_only_alphabet = [keep_only_alphabet(sentence) for sentence in train_sentences]
val_sentences_only_alphabet = [keep_only_alphabet(sentence) for sentence in val_sentences]

In [20]:
train_sentences_only_alphabet[0]

'Zombi has an interesting history in its making Firstly it is a sequel to Fulcis hit Zombi with Zombi itself being of course a marketing ploy to trick people into thinking it was a sequel to George A Romeros Dawn of the Dead aka Zombi Confusing enough Basically none of the films have anything to do with one another but who cares when they make money I guess Fulci himself starting to not care about the production about half way through Zombi when he decided to walk out Bruno Mattei was brought on board to help pad the film with additional scenes to lengthen the running timebr br Zombi s plot is your typical zombie fare Scientists develop a serum on an island in the Philippines terrorists steal it unleashing a plague and zombie run amok The scientists want to create an antidote while the military is set on mowing down everyone without prejudice There are also brief inserts of a Radio DJ preaching about how we treat the planet br br Overall I actually liked this film I heard horrible thin

In [21]:
# Text vetorization
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [22]:
max_length = 0
for seq in train_sentences_only_alphabet:
    words = seq.split()
    if len(words) > max_length:
        max_length = len(words)
max_length

1829

In [23]:
sentence_len = [len(sentence.split()) for sentence in train_sentences_only_alphabet]
mean_sentence_len = np.mean(sentence_len)
mean_sentence_len

231.8681

In [24]:
np.percentile(sentence_len, 95)

592.0499999999993

In [25]:
max_vocab_length = 10000
max_length = int(np.percentile(sentence_len, 95))

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [26]:
text_vectorizer.adapt(train_sentences_only_alphabet)

In [27]:
import random
sample_sentence = random.choice(train_sentences_only_alphabet)
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 592), dtype=int64, numpy=
array([[ 586,   10,   39,  916,    1,  119,   49,  916,   95,   25,  403,
           6, 3931,   18,   11,    7,   42,  388,   10,  239, 2760,    9,
         185,  134,    3,  516,  430,  598,    2,   62, 1551, 2357,    5,
          11,  512,   25,    2,  650,    1, 4228,  308,   90,   79,   70,
         356,  627,    1,    8,   11,  512,   18,  165,  308,   63, 4587,
          46,   31,   70,   39,    2,    1,    1,   14, 6133,   13,   10,
         254,   11,   19,   14,  777,   21,    3, 5986, 6941,  873,    3,
           1,   31,  217,  196,  109, 1483,   17,  125,  357, 1199,    4,
         125,  112,   15, 2296,   13,  112, 2057,   18,   22,    8,    2,
        6971,   41,  360, 1784,  428,    5, 2563,   13,   64, 2352,   18,
         175,   49,   24,   79,   37,   94,   39,  128,   53,    1,   13,
         706, 3983, 2057,   56,  322,  162,  588,  157,   52, 3869,  706,
          16,  253,  316,   21,    1,   13, 2984,    4,  402, 20

In [28]:
len(text_vectorizer.get_vocabulary())

10000

In [29]:
words_in_vocab = text_vectorizer.get_vocabulary()

In [30]:
words_in_vocab[:5]

['', '[UNK]', 'the', 'a', 'and']

In [31]:
words_in_vocab[-5:]

['robs', 'rewind', 'rewatch', 'retrospect', 'restoration']

In [59]:
# Embedding
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers
text_embedding = Embedding(input_dim=len(text_vectorizer.get_vocabulary()),
                           output_dim=128,
                           mask_zero=True)
text_embedding

<keras.layers.embeddings.Embedding at 0x260f87a1e50>

In [60]:
text_embedding(text_vectorizer([sample_sentence]))

<tf.Tensor: shape=(1, 592, 128), dtype=float32, numpy=
array([[[-0.04520159,  0.01508335, -0.02234305, ..., -0.02887383,
          0.00754311, -0.02291046],
        [-0.04130728,  0.01152085, -0.04503908, ...,  0.03870931,
          0.00176971,  0.0114314 ],
        [-0.01618984,  0.01445836, -0.01063599, ..., -0.03516191,
          0.03759159,  0.00547756],
        ...,
        [-0.02880355,  0.02532539, -0.04626453, ...,  0.02026334,
         -0.00040517, -0.01193007],
        [-0.02880355,  0.02532539, -0.04626453, ...,  0.02026334,
         -0.00040517, -0.01193007],
        [-0.02880355,  0.02532539, -0.04626453, ...,  0.02026334,
         -0.00040517, -0.01193007]]], dtype=float32)>

In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support

def model_evaluataion_metrics(y_true, y_preds):
    acc = accuracy_score(y_true, y_preds)
    pre, rec, f1, _ = precision_recall_fscore_support(y_true, y_preds, average="weighted")
    return {"acc": acc,
            "pre": pre,
            "rec": rec,
            "f1": f1}

In [35]:
train_sentences_only_alphabet = np.array(train_sentences_only_alphabet)
train_labels = np.array(train_labels)

val_sentences_only_alphabet = np.array(val_sentences_only_alphabet)
val_labels = np.array(val_labels)

In [36]:
# Create performant dataset
train_sentences_tensor = tf.data.Dataset.from_tensor_slices(train_sentences_only_alphabet)
train_labels_tensor = tf.data.Dataset.from_tensor_slices(train_labels)
train_dataset = tf.data.Dataset.zip((train_sentences_tensor, train_labels_tensor)).batch(32).prefetch(tf.data.AUTOTUNE)

val_sentences_tensor = tf.data.Dataset.from_tensor_slices(val_sentences_only_alphabet)
val_labels_tensor = tf.data.Dataset.from_tensor_slices(val_labels)
val_dataset = tf.data.Dataset.zip((val_sentences_tensor, val_labels_tensor)).batch(32).prefetch(tf.data.AUTOTUNE)

In [37]:
# Model 0
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(train_sentences_only_alphabet, train_labels)

In [38]:
model_0.score(val_sentences_only_alphabet, val_labels)

0.863

In [39]:
model_0_preds = model_0.predict(val_sentences_only_alphabet)
model_0_preds[:5]

array([1, 0, 1, 1, 0], dtype=int64)

In [40]:
model_0_results = model_evaluataion_metrics(val_labels, model_0_preds)
model_0_results

{'acc': 0.863,
 'pre': 0.8631259559351397,
 'rec': 0.863,
 'f1': 0.8629256359007806}

In [41]:
# Callbacks
import os
import datetime

LOGS_PATH = "model_logs/imdb_reviews"
CHECKPOINT_PATH = "model_experiments/imdb_reviews"

def tensorboard(model_name):
    return tf.keras.callbacks.TensorBoard(os.path.join(LOGS_PATH, 
                                                       model_name, 
                                                       datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))

def checkpoint(model_name):
    return tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(CHECKPOINT_PATH, 
                                                                    model_name), 
                                              monitor="val_loss", 
                                              verbose=1,
                                              save_best_only=True)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", 
                                                 factor=0.1, 
                                                 patience=3, 
                                                 min_lr=1e-5)

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-4 * 10 **(epoch/20))

early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", 
                                                  patience=5, 
                                                  restore_best_weights=True)

In [68]:
# Model 1
tf.keras.backend.clear_session()

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = text_embedding(x)
x = layers.LSTM(64, return_sequences=True)(x)
x = layers.LSTM(64)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model_1 = tf.keras.Model(inputs, outputs, name="model_1")

model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                metrics="accuracy")

model_1_history = model_1.fit(train_dataset,
                              validation_data=val_dataset,
                              epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
model_1.evaluate(train_dataset), model_1.evaluate(val_dataset)



([0.14720936119556427, 0.95660001039505],
 [0.31498533487319946, 0.8709999918937683])

In [44]:
model_1_preds_probs = model_1.predict(val_sentences_only_alphabet)
model_1_preds_probs

array([[0.97480935],
       [0.02142371],
       [0.9543825 ],
       ...,
       [0.97795314],
       [0.9902203 ],
       [0.89309305]], dtype=float32)

In [45]:
model_1_preds = tf.squeeze(tf.round(model_1_preds_probs))
model_1_preds

<tf.Tensor: shape=(5000,), dtype=float32, numpy=array([1., 0., 1., ..., 1., 1., 1.], dtype=float32)>

In [46]:
model_1_results = model_evaluataion_metrics(val_labels, model_1_preds)
model_1_results

{'acc': 0.871, 'pre': 0.873373775085252, 'rec': 0.871, 'f1': 0.870966226145081}

In [47]:
model_1.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 592)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 592, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 32)                20608     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1,300,641
Trainable params: 1,300,641
Non-trainable params: 0
_________________________________________________

In [99]:
tf.keras.backend.clear_session()

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = text_embedding(x)
x = layers.Conv1D(64, 3, activation="relu")(x)
x = layers.Conv1D(64, 3, activation="relu")(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Flatten()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model_2 = tf.keras.Model(inputs, outputs, name="model_2")

model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                metrics=["accuracy"])

model_2_history = model_2.fit(train_dataset,
                              validation_data=val_dataset,
                              epochs=50,
                              callbacks=[tensorboard(model_2.name),
                                         checkpoint(model_2.name),
                                         early_stopping])

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.29744, saving model to model_experiments/imdb_reviews\model_2
INFO:tensorflow:Assets written to: model_experiments/imdb_reviews\model_2\assets


INFO:tensorflow:Assets written to: model_experiments/imdb_reviews\model_2\assets


Epoch 2/50
Epoch 2: val_loss improved from 0.29744 to 0.27488, saving model to model_experiments/imdb_reviews\model_2
INFO:tensorflow:Assets written to: model_experiments/imdb_reviews\model_2\assets


INFO:tensorflow:Assets written to: model_experiments/imdb_reviews\model_2\assets


Epoch 3/50
Epoch 3: val_loss did not improve from 0.27488
Epoch 4/50
Epoch 4: val_loss did not improve from 0.27488
Epoch 5/50
Epoch 5: val_loss did not improve from 0.27488
Epoch 6/50
Epoch 6: val_loss did not improve from 0.27488
Epoch 7/50
Epoch 7: val_loss did not improve from 0.27488


In [96]:
model_2.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 592)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 592, 128)          1280000   
                                                                 
 conv1d (Conv1D)             (None, 590, 64)           24640     
                                                                 
 conv1d_1 (Conv1D)           (None, 588, 64)           12352     
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                    

In [50]:
testing_sentences = []
testing_labels = []
for data in testing_data.as_numpy_iterator():
    sentence, label = data
    testing_sentences.append(sentence)
    testing_labels.append(label)

In [51]:
testing_sentences_only_alphabet = [keep_only_alphabet(sentence) for sentence in testing_sentences]

In [52]:
testing_sentences_only_alphabet[0]

'There are films that make careers For George Romero it was NIGHT OF THE LIVING DEAD for Kevin Smith CLERKS for Robert Rodriguez EL MARIACHI Add to that list Onur Tukels absolutely amazing DINGALINGLESS Flawless filmmaking and as assured and as professional as any of the aforementioned movies I havent laughed this hard since I saw THE FULL MONTY And even then I dont think I laughed quite this hard So to speak Tukels talent is considerable DINGALINGLESS is so chock full of double entendres that one would have to sit down with a copy of this script and do a linebyline examination of it to fully appreciate the uh breadth and width of it Every shot is beautifully composed a clear sign of a surehanded director and the performances all around are solid theres none of the overthetop scenery chewing one mightve expected from a film like this DINGALINGLESS is a film whose time has come'

In [100]:
model_2_preds_probs = model_2.predict(tf.expand_dims(testing_sentences, axis=1))
model_2_preds = tf.squeeze(tf.round(model_2_preds_probs))
model_2_results = model_evaluataion_metrics(testing_labels, model_2_preds)
model_2_results

{'acc': 0.87952,
 'pre': 0.8796670459581003,
 'rec': 0.87952,
 'f1': 0.879508333322079}

In [54]:
testing_labels[:10], model_2_preds[:10]

([1, 1, 0, 0, 1, 1, 1, 1, 0, 1],
 <tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 0., 0., 1., 1., 1., 1., 1., 1.], dtype=float32)>)

In [101]:
model_2.evaluate(train_dataset), model_2.evaluate(val_dataset)



([0.1262417584657669, 0.9692999720573425],
 [0.2748786509037018, 0.8889999985694885])