# Stage 1: Importing dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

import bert

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as hub

print("TensorFlow Version:", tf.__version__)
print("TensorFlow Hub Version:", hub.__version__)
print("bert-for-tf2 Version:", bert.__version__)

from utility import mask_busy_gpus
mask_busy_gpus(1)  # randomly select 1 unused GPU

TensorFlow Version: 2.1.0
TensorFlow Hub Version: 0.8.0
bert-for-tf2 Version: 0.14.4
Query free memories from all GPUs: nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits
Free memory list (MB): [11018, 11176, 11176, 11159]
Query names of processes running on the GPU index 0: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=0
Names of processes running on the GPU index 0: []
Query names of processes running on the GPU index 1: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=1
Names of processes running on the GPU index 1: []
Query names of processes running on the GPU index 2: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=2
Names of processes running on the GPU index 2: []
Query names of processes running on the GPU index 3: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=3
Names of processes running on the GPU index 3: []
Left next 1 GPU(s) unma

# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [2]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv("./data/train.csv",
                   header=None,
                   names=cols,
                   engine="python",
                   encoding="latin1")

In [3]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

## Preprocessing

### Cleaning

In [4]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [5]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [6]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer (like vocab size).

In [7]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

We only use the first sentence for BERT inputs so we add the CLS token at the beginning and the SEP token at the end of each sentence.

In [8]:
def encode_sentence(sent):
    return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]

In [9]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

# print first 10 tokenized tweets
for i, tw in enumerate(data_inputs[:10]):
    print("Tokenized tweet #%d: %s" % (i, tw))

Tokenized tweet #0: ['[CLS]', 'aw', '##w', '##w', 'that', "'", 's', 'a', 'bum', '##mer', '.', 'you', 'should', '##a', 'got', 'david', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', '.', 'd', '[SEP]']
Tokenized tweet #1: ['[CLS]', 'is', 'upset', 'that', 'he', 'can', "'", 't', 'update', 'his', 'facebook', 'by', 'text', '##ing', 'it', '.', '.', '.', 'and', 'might', 'cry', 'as', 'a', 'result', 'school', 'today', 'also', '.', 'blah', '!', '[SEP]']
Tokenized tweet #2: ['[CLS]', 'i', 'dive', '##d', 'many', 'times', 'for', 'the', 'ball', '.', 'managed', 'to', 'save', 'the', 'rest', 'go', 'out', 'of', 'bounds', '[SEP]']
Tokenized tweet #3: ['[CLS]', 'my', 'whole', 'body', 'feels', 'it', '##chy', 'and', 'like', 'its', 'on', 'fire', '[SEP]']
Tokenized tweet #4: ['[CLS]', 'no', 'it', "'", 's', 'not', 'be', '##ha', '##ving', 'at', 'all', '.', 'i', "'", 'm', 'mad', '.', 'why', 'am', 'i', 'here', '?', 'because', 'i', 'can', "'", 't', 'see', 'you', 'all', 'over', 'there', '.', '[SEP]']
Tokenized twee

### Dataset creation

We need to create the 3 different inputs for each sentence.

In [10]:
def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # turns 1 into 0 and vice versa
    return seg_ids

We will create padded batches (so we pad sentences for each batch independently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [11]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 7]

In [12]:
# A list is a type of iterator so it can be used as generator for a dataset
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [13]:
BATCH_SIZE = 256
all_batched = all_dataset.padded_batch(BATCH_SIZE,
                                       padded_shapes=((3, None), ()),
                                       padding_values=(0, 0))

In [14]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

In [15]:
next(iter(train_dataset))

(<tf.Tensor: shape=(256, 3, 10), dtype=int32, numpy=
 array([[[  101,  1045,  2123, ...,  4431,  1012,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2865, 11360, ...,  3980,  1012,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2821,  2158, ...,  2157,  2085,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        ...,
 
        [[  101,  1045,  4299, ...,  3861,  5653,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  4487,  9284, ...,  2006,  1012,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  1045,  2134, ...,  2156,  200

# Stage 3: Model building

In [16]:
class DCNNBERTEmbedding(tf.keras.Model):
    
    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="dcnn"):
        super(DCNNBERTEmbedding, self).__init__(name=name)
        
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def embed_with_bert(self, all_tokens):
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training):
        x = self.embed_with_bert(inputs)

        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Stage 4: Training

In [17]:
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [18]:
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [19]:
test_sentence = encode_sentence("This is a test sentence.")
print(test_sentence)

['[CLS]', 'this', 'is', 'a', 'test', 'sentence', '.', '[SEP]']


In [20]:
Dcnn.bert_layer([tf.expand_dims(tf.cast(get_ids(test_sentence), tf.int32), 0),
                 tf.expand_dims(tf.cast(get_mask(test_sentence), tf.int32), 0),
                 tf.expand_dims(tf.cast(get_segments(test_sentence), tf.int32), 0)])

[<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[-0.9315209 , -0.4680269 , -0.82146406,  0.8277694 ,  0.6000099 ,
         -0.16843943,  0.9150352 ,  0.30513808, -0.7012272 , -0.9999936 ,
         -0.15500334,  0.86598194,  0.98336756,  0.4282941 ,  0.9460871 ,
         -0.7464459 , -0.19611955, -0.61434007,  0.33048514, -0.7218173 ,
          0.6563666 ,  0.9999436 ,  0.40550685,  0.35627264,  0.46196997,
          0.9589823 , -0.68519145,  0.9341063 ,  0.96247625,  0.7443774 ,
         -0.7930337 ,  0.16870758, -0.98790234, -0.20499817, -0.86913955,
         -0.99292016,  0.42868108, -0.7316571 , -0.01420928, -0.00326651,
         -0.91100085,  0.33216992,  0.99997747, -0.53717005,  0.38309842,
         -0.34688258, -0.99999994,  0.2892068 , -0.9086961 ,  0.8077502 ,
          0.8028393 ,  0.7044046 ,  0.21862246,  0.5343277 ,  0.539189  ,
         -0.28988   , -0.07221564,  0.18363605, -0.3172046 , -0.6191995 ,
         -0.66389024,  0.38110268, -0.65375316, -0.9328118 , 

In [21]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [22]:
checkpoint_path = "./ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [23]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

## Result

In [24]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
   5078/Unknown - 987s 194ms/step - loss: 0.3957 - accuracy: 0.8213Checkpoint saved at ./ckpt/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f11dad912e8>

In [25]:
Dcnn.summary()

Model: "dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   multiple                  109482241 
_________________________________________________________________
conv1d (Conv1D)              multiple                  153700    
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  230500    
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  307300    
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  77056     
_________________________________________________________________
dropout (Dropout)            multiple                  0      

# Stage 5: Evaluation

In [26]:
results = Dcnn.evaluate(test_dataset)
print(results)

    564/Unknown - 46s 82ms/step - loss: 0.3412 - accuracy: 0.8526[0.34118841488099266, 0.85255986]


In [27]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)

    input_ids = get_ids(tokens)
    input_mask = get_mask(tokens)
    segment_ids = get_segments(tokens)

    inputs = tf.stack(
        [tf.cast(input_ids, dtype=tf.int32),
         tf.cast(input_mask, dtype=tf.int32),
         tf.cast(segment_ids, dtype=tf.int32)],
         axis=0)
    inputs = tf.expand_dims(inputs, 0) # simulates a batch

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative".format(
            output))
    elif sentiment == 1:
        print("Output of the model: {}\nPredicted sentiment: positive".format(
            output))

In [28]:
test_sentences = ["He is a good teacher!",
                  "This actor is a deception.",
                  "It's an awesome movie!",
                  "it's the worst movie I've ever seen",
                  "that's bullshit"]

for sentence in test_sentences:
    print("Input sentence: {}".format(sentence))
    get_prediction(sentence)

Input sentence: He is a good teacher!
Output of the model: [[0.95472294]]
Predicted sentiment: positive
Input sentence: This actor is a deception.
Output of the model: [[0.17139843]]
Predicted sentiment: negative
Input sentence: It's an awesome movie!
Output of the model: [[0.95668626]]
Predicted sentiment: positive
Input sentence: it's the worst movie I've ever seen




Output of the model: [[0.00916532]]
Predicted sentiment: negative
Input sentence: that's bullshit




Output of the model: [[0.0866849]]
Predicted sentiment: negative
