# Stage 1: Importing dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [2]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[?25l[K     |████████                        | 10 kB 17.0 MB/s eta 0:00:01[K     |████████████████                | 20 kB 11.3 MB/s eta 0:00:01[K     |███████████████████████▉        | 30 kB 9.4 MB/s eta 0:00:01[K     |███████████████████████████████▉| 40 kB 8.6 MB/s eta 0:00:01[K     |████████████████████████████████| 41 kB 127 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30535 sha256=e0aa1beaf91eac3865a2339f63045b65b5996ac7ef999f26d171165b92a760db
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Buildi

In [3]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [4]:
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "/content/drive/MyDrive/Datasets/SentimentData/training.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [7]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [8]:
data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Preprocessing

### Cleaning

In [9]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Delete the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Delete URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Just keep letters and important punctuation
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Remove additional spaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [10]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [11]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [13]:
print(f'No of training examples : {len(data_clean)}')

for i in range(5):
  print(f'{i}: Original: {data.iloc[i]["text"]}')
  print(f'{i}: Cleaned: {data_clean[i]}')
  print(f'------')
  
  

No of training examples : 1600000
0: Original: @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
0: Cleaned:  Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D
------
1: Original: is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
1: Cleaned: is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
------
2: Original: @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
2: Cleaned:  I dived many times for the ball. Managed to save The rest go out of bounds
------
3: Original: my whole body feels itchy and like its on fire 
3: Cleaned: my whole body feels itchy and like its on fire 
------
4: Original: @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
4: Cleaned:  no it's not behaving at 

### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer (like vocab size).

In [14]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

We only use the first sentence for BERT inputs so we add the CLS token at the beginning and the SEP token at the end of each sentence.

In [15]:
def encode_sentence(sent):
    return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]

In [16]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

In [18]:
for i in range(5):
  print(f'{i} Data: {data_clean[i]}')
  print(f'{i} Data Encoded: {data_inputs[i]}')
  print(f'----')
    

0 Data:  Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D
0 Data Encoded: ['[CLS]', 'aw', '##w', '##w', 'that', "'", 's', 'a', 'bum', '##mer', '.', 'you', 'should', '##a', 'got', 'david', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', '.', 'd', '[SEP]']
----
1 Data: is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
1 Data Encoded: ['[CLS]', 'is', 'upset', 'that', 'he', 'can', "'", 't', 'update', 'his', 'facebook', 'by', 'text', '##ing', 'it', '.', '.', '.', 'and', 'might', 'cry', 'as', 'a', 'result', 'school', 'today', 'also', '.', 'blah', '!', '[SEP]']
----
2 Data:  I dived many times for the ball. Managed to save The rest go out of bounds
2 Data Encoded: ['[CLS]', 'i', 'dive', '##d', 'many', 'times', 'for', 'the', 'ball', '.', 'managed', 'to', 'save', 'the', 'rest', 'go', 'out', 'of', 'bounds', '[SEP]']
----
3 Data: my whole body feels itchy and like its on fire 
3 Data Encoded: ['[CLS]', 'my', 'wh

### Dataset creation

We need to create the 3 different inputs for each sentence.

In [19]:
def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # convert 1 into 0 and vice versa
    return seg_ids

In [21]:
test_tokens = data_inputs[3]
print(f'tokens : {test_tokens}')
print(f'get_ids : {get_ids(test_tokens)}')
print(f'get_masks : {get_mask(test_tokens)}')
print(f'get_segments: {get_segments(test_tokens)}')


tokens : ['[CLS]', 'my', 'whole', 'body', 'feels', 'it', '##chy', 'and', 'like', 'its', 'on', 'fire', '[SEP]']
get_ids : [101, 2026, 2878, 2303, 5683, 2009, 11714, 1998, 2066, 2049, 2006, 2543, 102]
get_masks : [1 1 1 1 1 1 1 1 1 1 1 1 1]
get_segments: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


We will create padded batches (so we pad sentences for each batch inpedendently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [22]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 7]

In [23]:
print(f'Training examples (after filtering): {len(sorted_all)}')
print(f'{0} Example: {sorted_all[0]}')


Training examples (after filtering): 1444341
0 Example: ([[101, 1045, 4299, 1045, 2001, 1999, 6278, 102], array([1, 1, 1, 1, 1, 1, 1, 1]), [0, 0, 0, 0, 0, 0, 0, 0]], 0)


In [24]:
# A list is a type of iterator so it can be used as generator for a dataset
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [25]:
next(iter(all_dataset))

(<tf.Tensor: shape=(3, 8), dtype=int32, numpy=
 array([[ 101, 1045, 4299, 1045, 2001, 1999, 6278,  102],
        [   1,    1,    1,    1,    1,    1,    1,    1],
        [   0,    0,    0,    0,    0,    0,    0,    0]], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [26]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((3, None), ()))

In [58]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 3, 8), dtype=int32, numpy=
 array([[[  101,  1045,  4299,  1045,  2001,  1999,  6278,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  2012,  2147,  2007, 12476, 25358,  2891,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  2074,  2288, 18666,  2011, 12082,  1012,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  2204,  2851,  2031,  1037,  3835,  2154,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  4394,  2026,  3336,  3016,  3016,   999,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],

In [59]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

In [64]:
print(f'No of batches: {NB_BATCHES}')


No of batches: 45136


# Stage 3: Model building

In [65]:
my_sent = ["[CLS]"] + tokenizer.tokenize("Roses are red.") + ["[SEP]"]
bert_layer([tf.expand_dims(tf.cast(get_ids(my_sent), tf.int32), 0),
            tf.expand_dims(tf.cast(get_mask(my_sent), tf.int32), 0),
            tf.expand_dims(tf.cast(get_segments(my_sent), tf.int32), 0)])

[<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[-9.27935421e-01, -4.10335243e-01, -9.65754986e-01,
          9.07317698e-01,  8.12913716e-01, -1.74174413e-01,
          9.11234379e-01,  3.41952085e-01, -8.74521196e-01,
         -9.99989390e-01, -7.78409779e-01,  9.69385147e-01,
          9.86160517e-01,  6.36963248e-01,  9.48631287e-01,
         -7.51192927e-01, -4.58339483e-01, -7.08104432e-01,
          4.62098330e-01, -6.57926798e-01,  7.60414362e-01,
          9.99994695e-01, -3.96861076e-01,  3.44166100e-01,
          6.16488576e-01,  9.94400024e-01, -7.76633620e-01,
          9.38316405e-01,  9.59452212e-01,  7.32879162e-01,
         -6.93436623e-01,  2.93080419e-01, -9.93785441e-01,
         -1.64551854e-01, -9.67019558e-01, -9.95549619e-01,
          5.32935262e-01, -6.88060999e-01,  1.34716183e-02,
          2.98195966e-02, -9.18356478e-01,  4.20526266e-01,
          9.99988914e-01,  2.52676159e-01,  6.06235325e-01,
         -3.50750089e-01, -1.00000000e+00,  4.975

In [66]:
class DCNNBERTEmbedding(tf.keras.Model):
    
    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="dcnn"):
        super(DCNNBERTEmbedding, self).__init__(name=name)
        
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)
        
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def embed_with_bert(self, all_tokens):
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs
    
    def call(self, inputs, training):
        x = self.embed_with_bert(inputs)

        x_1 = self.bigram(x) # (batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # (batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # (batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Stage 4: Training

In [67]:
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 2

In [68]:
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [69]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [70]:
checkpoint_path = "./content/drive/MyDrive/BERT_UDEMY/models/cnn_sentiment_bert_embedding"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest Checkpoint restored!")

In [71]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [72]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/2
  40460/Unknown - 4801s 117ms/step - loss: 0.3968 - accuracy: 0.8223

KeyboardInterrupt: ignored

# Stage 5: Evaluation

In [73]:
results = Dcnn.evaluate(test_dataset)
print(results)

[0.35029757022857666, 0.8509306311607361]


In [74]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)

    input_ids = get_ids(tokens)
    input_mask = get_mask(tokens)
    segment_ids = get_segments(tokens)

    inputs = tf.stack(
        [tf.cast(input_ids, dtype=tf.int32),
         tf.cast(input_mask, dtype=tf.int32),
         tf.cast(segment_ids, dtype=tf.int32)],
         axis=0)
    inputs = tf.expand_dims(inputs, 0)

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative.".format(
            output))
    elif sentiment == 1:
        print("Output of the model: {}\nPredicted sentiment: positive.".format(
            output))

In [75]:
get_prediction("This movie was pretty interesting.")

Output of the model: [[0.8897263]]
Predicted sentiment: positive.


In [76]:
get_prediction("I'd rather not do that again.")

Output of the model: [[0.28766114]]
Predicted sentiment: negative.


In [77]:
get_prediction("Wow. its working")

Output of the model: [[0.7819323]]
Predicted sentiment: positive.
