<a href="https://colab.research.google.com/github/sumanyurosha/tensorflow-specialization/blob/master/Hands-on%20ML/chapter13/IMDB_Movie_Reviews_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import tensorflow as tf
from tensorflow import keras
import os

# **Downloading the dataset and inspecting its file structure**

In [22]:
from pathlib import Path

# Base URL of the IMDB sentiment archive and the name of the tarball to fetch.
DOWNLOAD_ROOT = "http://ai.stanford.edu/~amaas/data/sentiment/"
FILENAME = "aclImdb_v1.tar.gz"

# Fetch the tarball through Keras (extract=True asks it to unpack the archive too)
# and keep the path that get_file returns.
filepath = keras.utils.get_file(
    fname=FILENAME,
    origin=DOWNLOAD_ROOT + FILENAME,
    extract=True,
)


In [23]:
# Directory named "aclImdb" located next to the downloaded archive
# (assumes get_file extracted the tarball there; the tree printed below confirms it).
path = Path(filepath).parent / "aclImdb"
path

PosixPath('/root/.keras/datasets/aclImdb')

In [24]:
# Print an indented outline of the extracted tree: each directory, followed by at
# most its first three files (in sorted order) and "..." when there are more.
for name, subdirs, files in os.walk(path):
    current_parts = Path(name).parts
    indent = len(current_parts) - len(path.parts)
    child_prefix = "    " * (indent + 1)
    print("    " * indent + current_parts[-1] + os.sep)
    ordered_files = sorted(files)
    for filename in ordered_files[:3]:
        print(child_prefix + filename)
    if len(ordered_files) > 3:
        print(child_prefix + "...")

aclImdb/
    README
    imdb.vocab
    imdbEr.txt
    test/
        labeledBow.feat
        urls_neg.txt
        urls_pos.txt
        pos/
            0_10.txt
            10000_7.txt
            10001_9.txt
            ...
        neg/
            0_2.txt
            10000_4.txt
            10001_1.txt
            ...
    train/
        labeledBow.feat
        unsupBow.feat
        urls_neg.txt
        ...
        pos/
            0_9.txt
            10000_8.txt
            10001_10.txt
            ...
        unsup/
            0_0.txt
            10000_0.txt
            10001_0.txt
            ...
        neg/
            0_3.txt
            10000_4.txt
            10001_4.txt
            ...


In [25]:
def list_review_paths(dirpath):
    """Return the ``*.txt`` files directly inside ``dirpath`` as a list of strings.

    ``dirpath`` must offer a ``glob`` method (e.g. a ``pathlib.Path``). The paths
    are returned in whatever order ``glob`` yields them.
    """
    return list(map(str, dirpath.glob("*.txt")))

# Collect the review file paths for every (split, sentiment) folder, in the order
# train/pos, train/neg, test/pos, test/neg.
train_pos_reviews, train_neg_reviews, test_pos_reviews, test_neg_reviews = (
    list_review_paths(path / split / sentiment)
    for split in ("train", "test")
    for sentiment in ("pos", "neg")
)

# Show how many review files each folder contributed.
print(*(len(paths) for paths in (train_pos_reviews, train_neg_reviews,
                                 test_pos_reviews, test_neg_reviews)))

12500 12500 12500 12500


### **Splitting the test set into a validation set and a test set**

In [26]:
import numpy as np

# Make the validation/test split reproducible: sort first so the result does not
# depend on the order in which the paths were globbed, then shuffle in place with
# a dedicated, seeded generator so a fresh "Restart & Run All" yields the same split.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

test_pos_reviews.sort()
test_neg_reviews.sort()
split_rng.shuffle(test_pos_reviews)
split_rng.shuffle(test_neg_reviews)

# Number of reviews per sentiment that stay in the test set; the remainder of each
# shuffled list becomes the validation set.
SPLIT_INDEX = 5000

test_pos = test_pos_reviews[:SPLIT_INDEX]
test_neg = test_neg_reviews[:SPLIT_INDEX]

valid_pos = test_pos_reviews[SPLIT_INDEX:]
valid_neg = test_neg_reviews[SPLIT_INDEX:]

print(len(test_pos), len(test_neg), len(valid_pos), len(valid_neg))

5000 5000 7500 7500


In [27]:
# Read the raw contents of one training review with TensorFlow; the result is
# displayed as a scalar string tensor.
tf.io.read_file(train_pos_reviews[1])

<tf.Tensor: shape=(), dtype=string, numpy=b"Godard once said a way to criticize a movie is to just make one, and probably the strongest kind that could be made about Ralph Bakshi's take on Tolkien's magnum opus the Lord of the Rings, has actually been made by Peter Jackson. The recent trilogy, to me, aren't even total masterpieces, but they are given enough room with each book to breath in all the post-modern techniques crossed with classical storytelling to make them very good, sweeping entertainments. <br /><br />But as one who has not read the books, I end up now looking upon the two versions, live-action (albeit partly animated in its big visual effects way) and animated (albeit partly done with actual live action as the framework) in relation to just the basic story, not even complete faithfulness to the books. And with Bakshi's version, it's almost not fair in a way, as what we do see is really not the complete vision, not what Jackson really had (probably final cut). Robbed of R

In [28]:
# Read the same review with plain Python for comparison. The encoding is passed
# explicitly so the decoded text does not depend on the platform's default
# encoding (assumes the review files are UTF-8 -- TODO confirm against the dataset README).
with open(train_pos_reviews[1], encoding="utf-8") as file:
    x = file.read()

print(x)

Godard once said a way to criticize a movie is to just make one, and probably the strongest kind that could be made about Ralph Bakshi's take on Tolkien's magnum opus the Lord of the Rings, has actually been made by Peter Jackson. The recent trilogy, to me, aren't even total masterpieces, but they are given enough room with each book to breath in all the post-modern techniques crossed with classical storytelling to make them very good, sweeping entertainments. <br /><br />But as one who has not read the books, I end up now looking upon the two versions, live-action (albeit partly animated in its big visual effects way) and animated (albeit partly done with actual live action as the framework) in relation to just the basic story, not even complete faithfulness to the books. And with Bakshi's version, it's almost not fair in a way, as what we do see is really not the complete vision, not what Jackson really had (probably final cut). Robbed of Return of the King's big climactic rush of th

# **Create a Dataset from the reviews**

In [29]:
# We now have the paths of the positive and the negative reviews, so we build two parallel lists: one with the review texts and one with their labels.

def create_imdb_dataset(pos_reviews, neg_reviews):
    """Build a dataset of ``(review, label)`` pairs by reading every file up front.

    Args:
        pos_reviews: iterable of paths to positive reviews (labelled ``0``).
        neg_reviews: iterable of paths to negative reviews (labelled ``1``).

    Returns:
        A ``tf.data.Dataset`` sliced from the list of file contents and the label
        tensor, with all positive reviews first and all negative reviews after.
    """
    # Pair each path with its label, keeping positives ahead of negatives.
    labelled_paths = [(filepath, 0) for filepath in pos_reviews]
    labelled_paths += [(filepath, 1) for filepath in neg_reviews]

    # Every file is read here, before the dataset is created.
    reviews = [tf.io.read_file(filepath) for filepath, _ in labelled_paths]
    labels = [label for _, label in labelled_paths]

    return tf.data.Dataset.from_tensor_slices((reviews, tf.constant(labels)))


In [30]:
# Print the first three (review, label) pairs of the training dataset as a sanity check.
for review, label in create_imdb_dataset(train_pos_reviews, train_neg_reviews).take(3):
    print(review, label)

tf.Tensor(b'B Movie? Yes. DIY? Yes. First Movie? Yes. But Aestheically A+. This movie definitely had some bad sound/editing/lighting/acting/etc. etc. problems. However, this movie has many positive things about it. First off, the most annoying character dies first! Second, its made to be a parody/funny B Rated Horror movie. The comments our killer makes to his victims left me and my friends rolling around on the floor laughing.<br /><br />The problem is a lot of people try to take every independent movie and expect it to be a masterpiece. Take it for what it is, a bunch of kids right out of high school made their first movie. For what it is, just that, it is really good.', shape=(), dtype=string) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(b"Godard once said a way to criticize a movie is to just make one, and probably the strongest kind that could be made about Ralph Bakshi's take on Tolkien's magnum opus the Lord of the Rings, has actually been made by Peter Jackson. The recent tril

In [31]:
def create_imdb_dataset(pos_reviews, neg_reviews):
    """Build a dataset of ``(review, label)`` pairs from the review files.

    This redefinition replaces the earlier ``create_imdb_dataset`` and reads the
    files through ``tf.data.TextLineDataset`` rather than loading them up front.

    Args:
        pos_reviews: paths to positive reviews (labelled ``0``).
        neg_reviews: paths to negative reviews (labelled ``1``).

    Returns:
        The positive dataset concatenated with the negative dataset.

    Note:
        ``TextLineDataset`` yields one element per line, so this presumably relies
        on each review file holding a single line -- verify for other data.
    """
    def labelled_lines(filepaths, label):
        # Read up to 5 files in parallel and attach the constant label to each line.
        lines = tf.data.TextLineDataset(filepaths, num_parallel_reads=5)
        return lines.map(lambda review: (review, label))

    return labelled_lines(pos_reviews, 0).concatenate(labelled_lines(neg_reviews, 1))

In [32]:
# Same sanity check as before, now using the TextLineDataset-based
# create_imdb_dataset defined just above.
for review, label in create_imdb_dataset(train_pos_reviews, train_neg_reviews).take(3):
    print(review, label)

tf.Tensor(b'B Movie? Yes. DIY? Yes. First Movie? Yes. But Aestheically A+. This movie definitely had some bad sound/editing/lighting/acting/etc. etc. problems. However, this movie has many positive things about it. First off, the most annoying character dies first! Second, its made to be a parody/funny B Rated Horror movie. The comments our killer makes to his victims left me and my friends rolling around on the floor laughing.<br /><br />The problem is a lot of people try to take every independent movie and expect it to be a masterpiece. Take it for what it is, a bunch of kids right out of high school made their first movie. For what it is, just that, it is really good.', shape=(), dtype=string) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(b"Godard once said a way to criticize a movie is to just make one, and probably the strongest kind that could be made about Ralph Bakshi's take on Tolkien's magnum opus the Lord of the Rings, has actually been made by Peter Jackson. The recent tril

In [35]:
batch_size = 32

# Training data: shuffled with a 25000-element buffer, then batched and prefetched.
train_set = (
    create_imdb_dataset(train_pos_reviews, train_neg_reviews)
    .shuffle(25000)
    .batch(batch_size)
    .prefetch(1)
)

# Validation and test data keep their order; they are only batched and prefetched.
valid_set = (
    create_imdb_dataset(valid_pos, valid_neg)
    .batch(batch_size)
    .prefetch(1)
)
test_set = (
    create_imdb_dataset(test_pos, test_neg)
    .batch(batch_size)
    .prefetch(1)
)

In [40]:
# Text -> integer-id layer: lower-cases and strips punctuation, keeps at most 1000
# vocabulary entries, and pads/truncates every review to 50 ids.
text_vectorization = keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=1000,
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=50,
)

# Two toy sentences used to try the layer out.
review_example = tf.constant([
    "It's a great, great movie! I loved it.",
    "It was terrible, run away!!!",
])

In [41]:
# Fit the vocabulary on the two toy sentences only, then display their encoded ids.
text_vectorization.adapt(review_example)
text_vectorization(review_example)

<tf.Tensor: shape=(2, 50), dtype=int64, numpy=
array([[ 9, 12,  3,  3,  7, 10,  8,  2,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 2,  4,  5,  6, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]])>

In [43]:
# Drop the labels so that only the review text of each training batch remains.
sample_review_batches = train_set.map(lambda review, label: review)

# Materialise every batch as a NumPy array and join them into one flat array of reviews.
review_batch_arrays = list(sample_review_batches.as_numpy_iterator())
sample_reviews = np.concatenate(review_batch_arrays, axis=0)

# Re-fit the vocabulary, this time on the training reviews.
text_vectorization.adapt(sample_reviews)

In [44]:
# Encode the two toy sentences again, now that the layer has been adapted to the
# training reviews.
text_vectorization(review_example)

<tf.Tensor: shape=(2, 50), dtype=int64, numpy=
array([[ 30,   4,  85,  85,  18,  10, 434,   9,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  9,  14, 384, 530, 242,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>

In [46]:
# Peek at the first ten entries of the learned vocabulary.
text_vectorization.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

In [53]:
class BagOfWords(keras.layers.Layer):
    """Convert batches of token-id sequences into per-token count vectors.

    Each id is one-hot encoded with depth ``n_tokens``, the encodings are summed
    along the sequence axis, and the column for id 0 (padding) is dropped, so each
    output row has ``n_tokens - 1`` counts.

    Args:
        n_tokens: depth of the one-hot encoding.
        dtype: dtype forwarded to the base ``Layer`` (defaults to ``tf.int32``).
        **kwargs: any other keyword arguments accepted by ``keras.layers.Layer``.
    """

    def __init__(self, n_tokens, dtype=tf.int32, **kwargs):
        # Forward the caller's dtype; previously tf.int32 was hard-coded here, so
        # the `dtype` argument was accepted but silently ignored.
        super().__init__(dtype=dtype, **kwargs)
        self.n_tokens = n_tokens

    def call(self, inputs):
        """Return the token counts for a batch of id sequences."""
        one_hot = tf.one_hot(inputs, self.n_tokens)
        # Sum over the sequence axis, then skip column 0 to ignore the padding value.
        return tf.reduce_sum(one_hot, axis=1)[:, 1:]

    def get_config(self):
        """Include ``n_tokens`` in the config so the layer can be re-created from it."""
        config = super().get_config()
        config["n_tokens"] = self.n_tokens
        return config


In [54]:
# Tiny hand-made batch of token ids (0 is the padding id) to sanity-check the layer.
simple_example = tf.constant([[1, 3, 1, 0, 0], [2, 2, 0, 0, 0]])

# With n_tokens=4 and the padding column dropped, each row holds 3 counts.
bow = BagOfWords(n_tokens=4)
bow(simple_example)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[2., 0., 1.],
       [0., 2., 0.]], dtype=float32)>

In [55]:
# The one-hot depth must cover every id the vectorizer can emit. Derive it from the
# adapted vocabulary (whose listing includes the padding and [UNK] entries) instead
# of the previous hard-coded 1000 + 100 + 1, which did not correspond to the
# layer's max_tokens=1000 and only added input columns that no token id could reach.
n_tokens = len(text_vectorization.get_vocabulary())
bag_of_words = BagOfWords(n_tokens)

In [57]:
# Stack: raw strings -> token ids -> token counts -> small dense classifier.
model = keras.models.Sequential()
model.add(text_vectorization)
model.add(bag_of_words)
model.add(keras.layers.Dense(20, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='nadam',
    metrics=['accuracy'],
)

# Train for five epochs, evaluating on the validation set after each one.
model.fit(train_set, epochs=5, validation_data=valid_set)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f6beffd7e80>