<a href="https://colab.research.google.com/github/gk19989/NLP/blob/main/bert_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Stage-1: Importing Dependencies**

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[?25l[K     |████████                        | 10 kB 24.1 MB/s eta 0:00:01[K     |████████████████                | 20 kB 26.4 MB/s eta 0:00:01[K     |███████████████████████▉        | 30 kB 11.2 MB/s eta 0:00:01[K     |███████████████████████████████▉| 40 kB 9.1 MB/s eta 0:00:01[K     |████████████████████████████████| 41 kB 110 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30534 sha256=592dca91cb3deb9d37b251f99bb106d96eb905268d6d64cf93f380717916accc
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Build

In [None]:
# Colab-specific line magic requesting TensorFlow 2.x. Any failure is
# deliberately ignored so this cell never stops the notebook
# (presumably for runtimes where the magic is unavailable — TODO confirm).
try:
    %tensorflow_version 2.x
except Exception: 
    pass

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

**Stage-2: Loading Data**

In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook live under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# NOTE(review): redundant cell — `drive` is already imported in the first
# import cell and Drive is already mounted by the previous cell, so this call
# only reports "already mounted". Candidate for removal.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# The training CSV is read with header=None, so the column names are
# supplied explicitly here.
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "/content/drive/MyDrive/Projects/BERT/sentiment_data/train.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1",
)

In [None]:
# Keep only the label and the tweet text. Rebinding `data` (instead of an
# in-place drop) together with errors="ignore" makes this cell safe to
# re-run: the in-place version raised KeyError on a second execution because
# the columns were already gone.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [None]:
# Preview the first rows after removing the unused columns.
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


**Cleaning Data**

In [None]:
def clean_tweet(tweet):
    """Strip markup, @mentions, URLs and non-letter noise from one tweet.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text: only ASCII letters and ``.``, ``!``, ``?`` are
        kept, every other character becomes a space, and runs of spaces are
        collapsed to a single space.
    """
    # Let BeautifulSoup parse the text and keep only its textual content.
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # "_" must be part of the handle class; without it a mention such as
    # "@some_user" left "_user" behind as a stray word.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Consume the URL up to the next whitespace so that paths containing
    # "-", "_", "?", "=", "%" ... do not leave fragments in the text.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Everything that is not a letter or sentence punctuation becomes a space.
    tweet = re.sub(r"[^A-Za-z.!?]", ' ', tweet)
    # Collapse the space runs created by the substitutions above.
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Apply the cleaning function to every tweet, keeping the original row order.
data_clean = list(map(clean_tweet, data["text"]))

In [None]:
# Work on an explicit copy of the label column: assigning through `.values`
# can silently modify `data` itself and fails when pandas hands out a
# read-only array (Copy-on-Write). Labels equal to 4 are remapped to 1;
# all other values are left untouched.
data_labels = data.sentiment.to_numpy(copy=True)
data_labels[data_labels == 4] = 1

**Tokenization**

In [None]:
# Load the uncased BERT-base layer from TF Hub (frozen). In this cell it is
# only used as the source of the vocabulary file and the lower-casing flag,
# so the tokenizer matches the pretrained model's vocabulary.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=False,
)
FullTokenizer = bert.bert_tokenization.FullTokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Quick look at how the tokenizer splits a sample sentence into sub-word tokens.
tokenizer.tokenize("My dog loves chicken.. haha..")

['my', 'dog', 'loves', 'chicken', '.', '.', 'ha', '##ha', '.', '.']

In [None]:
def encode_sentence(sent):
    """Tokenize `sent` with the module-level `tokenizer` and return its token ids."""
    tokens = tokenizer.tokenize(sent)
    return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Encode every cleaned tweet as a list of token ids (order preserved).
data_inputs = list(map(encode_sentence, data_clean))

**Dataset Creation**

In [None]:
# Pair each encoded sentence with its label and its length; the length is
# used below to sort and filter the examples.
data_with_len = [
    [token_ids, label, len(token_ids)]
    for token_ids, label in zip(data_inputs, data_labels)
]

In [None]:
# Shuffle before the (stable) length sort below so that sentences of equal
# length end up in random relative order. A dedicated seeded generator makes
# the resulting order — and therefore the train/test split — reproducible
# across runs without touching the global `random` state.
random.Random(42).shuffle(data_with_len)

In [None]:
# Order the examples by sequence length (third field) so that each batch
# groups sentences of similar length and needs little padding.
data_with_len.sort(key=lambda entry: entry[2])

In [None]:
# Keep (token_ids, label) pairs for sentences longer than 7 tokens only.
sorted_all = [
    (token_ids, label)
    for token_ids, label, length in data_with_len
    if length > 7
]

In [None]:
# Inspect the first (token_ids, label) pair, i.e. one of the shortest kept sentences.
sorted_all[0]

([2417, 28765, 2290, 5446, 2622, 2005, 4469, 4923], 1)

In [None]:
# Wrap the Python list in a tf.data pipeline. A generator is used because the
# token-id lists have different lengths; both fields are produced as int32.
all_dataset = tf.data.Dataset.from_generator(
    lambda: sorted_all,
    output_types=(tf.int32, tf.int32),
)

In [None]:
# Pull one element to check that the dataset yields a (token_ids, label) pair.
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([ 2417, 28765,  2290,  5446,  2622,  2005,  4469,  4923],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [None]:
BATCH_SIZE = 32
# Pad the token ids of each batch to the longest sequence in that batch;
# the labels are scalars and need no padding.
all_batched = all_dataset.padded_batch(
    BATCH_SIZE,
    padded_shapes=((None,), ()),
)

In [None]:
# Pull one batch to check the padded (batch, sequence) shape.
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 2417, 28765,  2290,  5446,  2622,  2005,  4469,  4923],
        [22091,  2860,  1045,  3246,  2017,  2514,  2488,  2574],
        [ 2350, 28516,  3291,  1012,  2054,  1037,  2166,  1012],
        [ 3407,  5798,  3566,  1045,  2293,  2017,   999,   999],
        [ 2053,  2057,  2024,  2012,  5726,  6770,  1012,  3374],
        [ 2045,  2024, 16111,  2006,  2026,  8840,  6894, 16340],
        [13132,  2038,  2000,  2022,  2028,  1997,  2026, 20672],
        [ 2026,  2406,  7861,  8237,  8180,  8583,  2033,  2823],
        [ 1999, 26353,  2007,  2026,  2567,  1998,  1996,  2137],
        [ 2054,  2015,  1996,  4957,  2000,  3789,  1029,  1060],
        [ 1061,  2099,  2053,  2721,  5391,  8134,  4826,  1012],
        [ 2018,  2379,  2331,  6322,  1999,  2847,  1012,  1060],
        [ 2021,  2009,  2001,  2126,  2205,  2172,  2833,  1012],
        [ 2748,  4757,  4757,  8692,  3185,  2982,  4826,  2305],
        [15775, 24860, 16523

In [None]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10  # hold out 10% of the batches for testing

# Dataset.shuffle returns a NEW dataset; the original call discarded it, so
# nothing was shuffled and the test split was simply the first (shortest)
# batches. reshuffle_each_iteration=False is required as well: otherwise
# every pass would draw a different order and take()/skip() would no longer
# describe disjoint train/test sets. The seed makes the split reproducible.
all_shuffled = all_batched.shuffle(NB_BATCHES,
                                   seed=42,
                                   reshuffle_each_iteration=False)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

**Model Building**

In [None]:
class DCNN(tf.keras.Model):
    """Text classifier built from parallel 2-, 3- and 4-gram Conv1D branches.

    Token ids are embedded, passed through three Conv1D layers (kernel sizes
    2, 3 and 4), each result is globally max-pooled, the pooled vectors are
    concatenated and fed through a dense layer, dropout and a final dense
    classification layer.

    Note: the convolutions use ``padding="valid"``, so inputs must contain at
    least 4 tokens for the 4-gram branch to produce an output.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        """Create the layers of the model.

        Args:
            vocab_size: Number of distinct token ids (embedding input size).
            emb_dim: Size of each embedding vector.
            nb_filters: Number of filters in each Conv1D branch.
            FFN_units: Units of the hidden dense layer.
            nb_classes: Number of target classes. For 2, the output is a
                single sigmoid unit; otherwise a softmax over `nb_classes`.
            dropout_rate: Rate of the dropout applied after the dense layer.
            training: Unused; kept only so existing callers that pass it
                keep working. The training mode is taken from `call`.
            name: Keras name of the model.
        """
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size, emb_dim)

        # One convolution per n-gram width; attribute names are kept stable
        # because checkpoints reference the layers through them.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # A single pooling layer is shared by the three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units,
                                    activation="relu")

        self.dropout = layers.Dropout(rate=dropout_rate)

        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences.
            training: Whether dropout is active. Defaults to False so the
                model can be called directly for inference.

        Returns:
            The output of the final dense layer (one sigmoid unit for the
            binary case, `nb_classes` softmax units otherwise).
        """
        x = self.embedding(inputs)

        # Same branch order as the layer definitions: 2-, 3-, then 4-gram.
        pooled = [self.pool(conv(x))
                  for conv in (self.bigram, self.trigram, self.fourgram)]

        merged = tf.concat(pooled, axis=-1)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        return self.last_dense(merged)


In [None]:
# Model hyperparameters (passed to the DCNN constructor).
# Vocabulary size comes from the tokenizer so the embedding covers every id.
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
# 2 selects the single-unit sigmoid head and the binary cross-entropy loss.
NB_CLASSES = 2

# Training settings.
DROPOUT_RATE = 0.2
NB_EPOCHS = 5


In [None]:
# Build the classifier from the hyperparameter constants.
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [None]:
# Binary problems use binary cross-entropy with plain accuracy; any other
# class count uses the sparse categorical loss and its matching accuracy.
Dcnn.compile(
    loss=("binary_crossentropy" if NB_CLASSES == 2
          else "sparse_categorical_crossentropy"),
    optimizer="adam",
    metrics=(["accuracy"] if NB_CLASSES == 2
             else ["sparse_categorical_accuracy"]),
)

In [None]:
# Directory on the mounted Drive where checkpoints are written.
checkpoint_path = "/content/drive/MyDrive/Projects/BERT/ckpt_bert_tok"

# Track the model under the key "Dcnn" in the checkpoint.
ckpt = tf.train.Checkpoint(Dcnn = Dcnn)

# max_to_keep=1: only the most recent checkpoint is kept.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the latest checkpoint if one exists in checkpoint_path.
# NOTE(review): training below still runs NB_EPOCHS more epochs after a restore.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("latest check_point has been restored")


In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint through `ckpt_manager` after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint and print the checkpoint directory.

        `epoch` and `logs` are accepted for the callback interface and are
        not used here.
        """
        ckpt_manager.save()
        print("Checkpoint saved at {}".format(checkpoint_path))

In [None]:
# Train on the training split for NB_EPOCHS epochs; the callback saves a
# checkpoint at the end of each epoch.
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
Checkpoint saved at /content/drive/MyDrive/Projects/BERT/ckpt_bert_tok
Epoch 2/5
Checkpoint saved at /content/drive/MyDrive/Projects/BERT/ckpt_bert_tok
Epoch 3/5
Checkpoint saved at /content/drive/MyDrive/Projects/BERT/ckpt_bert_tok
Epoch 4/5
Checkpoint saved at /content/drive/MyDrive/Projects/BERT/ckpt_bert_tok
Epoch 5/5
Checkpoint saved at /content/drive/MyDrive/Projects/BERT/ckpt_bert_tok


<keras.callbacks.History at 0x7f6808dc2750>

**Stage-5: Evaluation**

In [None]:
# Evaluate on the held-out batches; `results` holds the loss followed by the
# metrics configured in compile().
results = Dcnn.evaluate(test_dataset)
print(results)

[0.43745726346969604, 0.8321293592453003]


In [None]:
def get_prediction(sentence):
    """Print the model's sentiment prediction for one raw sentence.

    Only meaningful for the binary model (single sigmoid output).

    Args:
        sentence: Text to classify; it is tokenized with `encode_sentence`.

    Returns:
        None. The raw model output and the predicted label are printed.
    """
    tokens = encode_sentence(sentence)

    # The widest Conv1D kernel in DCNN is 4 with "valid" padding, so shorter
    # inputs would fail. Right-pad them with id 0, the value padded_batch
    # uses by default for the training batches.
    min_len = 4
    if len(tokens) < min_len:
        tokens = tokens + [0] * (min_len - len(tokens))

    inputs = tf.expand_dims(tokens, 0)  # add a batch axis of size 1

    output = Dcnn(inputs, training=False)

    # Explicit 0.5 threshold on the single score. The previous
    # math.floor(output * 2) produced 2 when the sigmoid saturated at exactly
    # 1.0, in which case nothing was printed at all.
    score = float(output[0][0])
    sentiment = "positive" if score >= 0.5 else "negative"

    print("Output of the model: {}\nPredicted sentiment: {}.".format(output, sentiment))


In [None]:
# Spot-check the model on a sentence with positive wording.
get_prediction("This movie was pretty interesting!")

Output of the model: [[0.9999138]]
Predicted sentiment: positive.


In [None]:
# Spot-check the model on a sentence with negative wording.
get_prediction("I'd rather not do it again")

Output of the model: [[0.2500885]]
Predicted sentimemt: negative.
