In [1]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert
import pandas as pd

In [2]:
movie_reviews = pd.read_csv("BOOK1.csv",encoding="latin-1")

movie_reviews.isnull().values.any()

movie_reviews.shape

(499, 2)

In [3]:
import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

reviews = []
sentences = list(movie_reviews['review'])
for sen in sentences:
    reviews.append(preprocess_text(sen))
print(movie_reviews.columns.values)

['review' 'sentiment']


In [4]:
movie_reviews.sentiment.unique()
print(movie_reviews)

                                                review sentiment
0    One of the other reviewers has mentioned that ...  positive
1    A wonderful little production. <br /><br />The...  positive
2    I thought this was a wonderful way to spend ti...  positive
3    Basically there's a family where a little boy ...  negative
4    Petter Mattei's "Love in the Time of Money" is...  positive
..                                                 ...       ...
494  Despite some reviews being distinctly Luke-war...  positive
495  "American Nightmare" is officially tied, in my...  negative
496  First off, I have to say that I loved the book...  negative
497  This movie was extremely boring. I only laughe...  negative
498  I was disgusted by this movie. No it wasn't be...  negative

[499 rows x 2 columns]


In [5]:
import numpy as np
y = movie_reviews['sentiment']

y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

In [6]:
print(reviews[10])

Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines At first it was very odd and pretty funny but as the movie progressed didn find the jokes or oddness funny anymore Its low budget film thats never problem in itself there were some pretty interesting characters but eventually just lost interest imagine this film would appeal to stoner who is currently partaking For something similar but better try Brother from another planet 


In [7]:
print(y[2])

1


In [8]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [9]:
tokenizer.tokenize("don't be so judgmental")

['don', "'", 't', 'be', 'so', 'judgment', '##al']

In [10]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("dont be so judgmental"))

[2123, 2102, 2022, 2061, 8689, 2389]

In [11]:
def tokenize_reviews(text_reviews):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_reviews))

In [12]:
tokenized_reviews = [tokenize_reviews(review) for review in reviews]

In [13]:
reviews_with_len = [[review, y[i], len(review)]
                 for i, review in enumerate(tokenized_reviews)]

In [14]:
import random
random.shuffle(reviews_with_len)

In [15]:
reviews_with_len.sort(key=lambda x: x[2])

In [16]:
sorted_reviews_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len]

In [17]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.int32))

In [18]:
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [19]:
next(iter(batched_dataset))

(<tf.Tensor: shape=(32, 75), dtype=int32, numpy=
 array([[ 1037,  5790,  1997, ...,     0,     0,     0],
        [ 2065,  2017,  2066, ...,     0,     0,     0],
        [ 2023,  3185,  2838, ...,     0,     0,     0],
        ...,
        [ 1045,  2031,  3427, ...,     0,     0,     0],
        [ 2023,  3185,  2003, ...,  3085,     0,     0],
        [ 1045,  2064,  2102, ..., 21090,  4283, 28802]])>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 1])>)

In [20]:
import math
TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [21]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [22]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [23]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [24]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [25]:
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x273fbdfd888>

In [26]:
results = text_model.evaluate(test_data)
print(results)

      1/Unknown - 0s 308ms/step - loss: 0.6001 - accuracy: 0.6875[0.600087583065033, 0.6875]


In [27]:
revie=tokenize_reviews(preprocess_text("""Is nothing else on TV? Are you really bored? Well, then watch Phat Beach. However, don't rent it and definitely DO NOT buy it. That would be a big mistake.<br /><br />I watched this on TV and found myself laughing at certain points. I did not laugh long and I did not laugh hard. However, there were subtle jokes and comments I laughed at. If you are looking for an extremely funny "hood" movie then watch Friday. If you are looking for a powerful emotional movie (something that this movie tries at..kind of) watch something like hoop dreams or Jason's Lyric. If you are lookin for some good black "booty" go watch a Dominique Simone porn flick, because the nudity in this movie is nearly non-existent. However, if you have nothing better to do and this is on cable, go ahead and watch it. You will be slightly amused.<br /><br />***3 out of 10***"""))
print(revie)
p=text_model.predict([revie])
print(p)

[2003, 2498, 2842, 2006, 2694, 2024, 2017, 2428, 11471, 2092, 2059, 3422, 6887, 4017, 3509, 2174, 2123, 9278, 2009, 1998, 5791, 2079, 2025, 4965, 2009, 2008, 2052, 2022, 2502, 6707, 3427, 2023, 2006, 2694, 1998, 2179, 2870, 5870, 2012, 3056, 2685, 2106, 2025, 4756, 2146, 1998, 2106, 2025, 4756, 2524, 2174, 2045, 2020, 11259, 13198, 1998, 7928, 4191, 2012, 2065, 2017, 2024, 2559, 2005, 2019, 5186, 6057, 7415, 3185, 2059, 3422, 5958, 2065, 2017, 2024, 2559, 2005, 3928, 6832, 3185, 2242, 2008, 2023, 3185, 5363, 2012, 2785, 1997, 3422, 2242, 2066, 27669, 5544, 2030, 4463, 13677, 2065, 2017, 2024, 2298, 2378, 2005, 2070, 2204, 2304, 9573, 2100, 2175, 3422, 18165, 14072, 22555, 17312, 2138, 1996, 16371, 25469, 1999, 2023, 3185, 2003, 3053, 2512, 25953, 2174, 2065, 2017, 2031, 2498, 2488, 2000, 2079, 1998, 2023, 2003, 2006, 5830, 2175, 3805, 1998, 3422, 2009, 2017, 2097, 2022, 3621, 11770, 2041, 1997]
[[0.03535727]]
