# Stage 1: Importing dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

import bert

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as hub

print("TensorFlow Version:", tf.__version__)
print("TensorFlow Hub Version:", hub.__version__)
print("bert-for-tf2 Version:", bert.__version__)

from utility import mask_busy_gpus
mask_busy_gpus(1)  # randomly select 1 unused GPU

TensorFlow Version: 2.1.0
TensorFlow Hub Version: 0.8.0
bert-for-tf2 Version: 0.14.4
Query free memories from all GPUs: nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits
Free memory list (MB): [10979, 10091, 11168, 10522]
Query names of processes running on the GPU index 0: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=0
Names of processes running on the GPU index 0: []
Query names of processes running on the GPU index 1: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=1
Names of processes running on the GPU index 1: ['/usr/bin/python3']
Query names of processes running on the GPU index 2: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=2
Names of processes running on the GPU index 2: []
Query names of processes running on the GPU index 3: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=3
Names of processes running on the GPU index 3: ['/usr/b

# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [2]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv("./data/train.csv",
                   header=None,
                   names=cols,
                   engine="python",
                   encoding="latin1")

In [3]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

## Preprocessing

### Cleaning

In [4]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [5]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

# print first 10 cleaned tweets
for i, tw in enumerate(data_clean[:10]):
    print("Cleaned tweet #%d: %s" % (i, tw))

Cleaned tweet #0:  Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D
Cleaned tweet #1: is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
Cleaned tweet #2:  I dived many times for the ball. Managed to save The rest go out of bounds
Cleaned tweet #3: my whole body feels itchy and like its on fire 
Cleaned tweet #4:  no it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
Cleaned tweet #5:  not the whole crew 
Cleaned tweet #6: Need a hug 
Cleaned tweet #7:  hey long time no see! Yes.. Rains a bit only a bit LOL I'm fine thanks how's you ?
Cleaned tweet #8:  K nope they didn't have it 
Cleaned tweet #9:  que me muera ? 


In [6]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer (like vocab size).

In [7]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [8]:
type(bert_layer)

tensorflow_hub.keras_layer.KerasLayer

In [9]:
type(bert_layer.resolved_object)

tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject

In [10]:
type(bert_layer.resolved_object.vocab_file)

tensorflow.python.training.tracking.tracking.Asset

In [11]:
type(bert_layer.resolved_object.vocab_file.asset_path)

tensorflow.python.framework.ops.EagerTensor

In [12]:
vocab_file, do_lower_case

(b'/tmp/tfhub_modules/03d6fb3ce1605ad9e5e9ed5346b2fb9623ef4d3d/assets/vocab.txt',
 True)

In [13]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [14]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

# print first 10 tokenized tweets
for i, tw in enumerate(data_inputs[:10]):
    print("Tokenized tweet #%d: %s" % (i, tw))

Tokenized tweet #0: [22091, 2860, 2860, 2008, 1005, 1055, 1037, 26352, 5017, 1012, 2017, 2323, 2050, 2288, 2585, 12385, 1997, 2353, 2154, 2000, 2079, 2009, 1012, 1040]
Tokenized tweet #1: [2003, 6314, 2008, 2002, 2064, 1005, 1056, 10651, 2010, 9130, 2011, 3793, 2075, 2009, 1012, 1012, 1012, 1998, 2453, 5390, 2004, 1037, 2765, 2082, 2651, 2036, 1012, 27984, 999]
Tokenized tweet #2: [1045, 11529, 2094, 2116, 2335, 2005, 1996, 3608, 1012, 3266, 2000, 3828, 1996, 2717, 2175, 2041, 1997, 19202]
Tokenized tweet #3: [2026, 2878, 2303, 5683, 2009, 11714, 1998, 2066, 2049, 2006, 2543]
Tokenized tweet #4: [2053, 2009, 1005, 1055, 2025, 2022, 3270, 6455, 2012, 2035, 1012, 1045, 1005, 1049, 5506, 1012, 2339, 2572, 1045, 2182, 1029, 2138, 1045, 2064, 1005, 1056, 2156, 2017, 2035, 2058, 2045, 1012]
Tokenized tweet #5: [2025, 1996, 2878, 3626]
Tokenized tweet #6: [2342, 1037, 8549]
Tokenized tweet #7: [4931, 2146, 2051, 2053, 2156, 999, 2748, 1012, 1012, 15811, 1037, 2978, 2069, 1037, 2978, 8840, 214

### Dataset creation

We will create padded batches (so we pad sentences for each batch independently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [15]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [(sent_lab[0], sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 7]

In [16]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [17]:
all_dataset_iterator = iter(all_dataset)

In [18]:
next(all_dataset_iterator)

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([ 2045,  2024,  2420,  2127,  2026, 10680,  3428,  4164],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [19]:
next(all_dataset_iterator)

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([1045, 1005, 1049, 2061, 6517, 2157, 2085, 1012], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [20]:
BATCH_SIZE = 64
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None,), ()))

In [21]:
all_batched_iterator = iter(all_batched)

In [22]:
next(all_batched_iterator)

(<tf.Tensor: shape=(64, 8), dtype=int32, numpy=
 array([[ 2045,  2024,  2420,  2127,  2026, 10680,  3428,  4164],
        [ 1045,  1005,  1049,  2061,  6517,  2157,  2085,  1012],
        [ 1045,  2514,  2066,  2021,  7880,  2379,  1047,  4481],
        [ 2339,  1051,  2339,  2572,  1045,  3110,  2023,  2126],
        [ 2108,  2023,  5112,  7840, 19237,  4757,  2015,   999],
        [22091,  2860,  2860,  2860,  9152,  6137,  3401,  4402],
        [ 4931,  3158, 15549,  4095,  2146,  2051,  2053,  2831],
        [ 7697,  1012,  1012,  1012,  1012,  1012,  1012,  1012],
        [ 6207,  3573,  1012,  2026,  6097,  2003,  5305,  1012],
        [ 2115,  2062, 12476,  1060,  2094,  4283, 22038, 20348],
        [ 4283,  2005, 21461,  5254,   999,  1037,  2843,   999],
        [ 3331,  2096,  5505,  1999,  4165,  2428,  6881,  1012],
        [ 1045,  2031,  1037,  2310, 21194,  2146,  2154,  4826],
        [20228,  3126,  2243,  1999,  1996,  7381,  1012,  1012],
        [ 1045,  8823,  2205

In [23]:
next(all_batched_iterator)

(<tf.Tensor: shape=(64, 8), dtype=int32, numpy=
 array([[ 2009,  1005,  1055,  2026,  2197,  2733,  1999,  5483],
        [ 2025,  3110,  2005,  2147,  2651,  1012,  1012,  1012],
        [ 5292,  2050,  2033,  2205,  2012,  2431, 17324,  2050],
        [ 4451,  3468,  2098,   999,  2024,  2017,  7929,  1029],
        [ 2339,  2876,  1005,  1056,  1045,  2156,  2151,  1029],
        [ 2197,  2305,  2001, 12476,  1045,  2293,  2026,  2814],
        [12476,  2609,  1012,  1012,  1012,  4283,   999,   999],
        [ 1045,  2106,  2025,  2130,  2131,  2000,  5510,  2009],
        [ 9779, 23644, 23644,  1996, 24067,  2100, 16373,  4633],
        [ 2069,  3426,  2017,  4694,  1005,  1056,  1999,  2009],
        [ 1045,  2572,  2061,  5305,  1012,  2793,  2525,  1012],
        [ 8915,  5369,  2008,  1005,  1055,  1996,  2391,  1012],
        [ 4931, 11317,  1012,  4283,  2153,   999,   999,   999],
        [ 2003,  2770,  2659,  1997,  8915,  2232,  9152,  2361],
        [ 3241,  2055,  1996

In [24]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Stage 3: Model building

In [25]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Stage 4: Training

In [26]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [27]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [28]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [29]:
checkpoint_path = "./ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [30]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [31]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
  18598/Unknown - 620s 33ms/step - loss: 0.4263 - accuracy: 0.8033Checkpoint saved at ./ckpt/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f17028881d0>

In [32]:
Dcnn.summary()

Model: "dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6104400   
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  77056     
_________________________________________________________________
dropout (Dropout)            multiple                  0      

# Stage 5: Evaluation

In [33]:
results = Dcnn.evaluate(test_dataset)
print(results)

   2066/Unknown - 18s 9ms/step - loss: 0.4937 - accuracy: 0.8281[0.49369014525517374, 0.82805693]


In [34]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Ouput of the model: {}\nPredicted sentiment: negative.".format(output))
    elif sentiment == 1:
        print("Ouput of the model: {}\nPredicted sentiment: positive.".format(output))

In [35]:
test_sentences = ["He is a good teacher!",
                  "It's an awesome movie!",
                  "it's the worst movie I've ever seen",
                  "that's bullshit"]

for sentence in test_sentences:
    print("Input sentence: {}".format(sentence))
    get_prediction(sentence)

Input sentence: He is a good teacher!
Ouput of the model: [[0.99985445]]
Predicted sentiment: positive.
Input sentence: It's an awesome movie!
Ouput of the model: [[0.99999595]]
Predicted sentiment: positive.
Input sentence: it's the worst movie I've ever seen
Ouput of the model: [[0.11321876]]
Predicted sentiment: negative.
Input sentence: that's bullshit
Ouput of the model: [[0.01931121]]
Predicted sentiment: negative.
