# Stage 1: Importing dependencies.

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
print("TensorFlow Version:", tf.__version__)
print("TensorFlow Dataset Version:", tfds.__version__)

from utility import mask_busy_gpus
mask_busy_gpus(1)  # randomly select 1 unused GPU

TensorFlow Version: 2.1.0
TensorFlow Dataset Version: 3.1.0
Query free memories from all GPUs: nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits
Free memory list (MB): [10220, 10094, 6555, 10994]
Query names of processes running on the GPU index 0: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=0
Names of processes running on the GPU index 0: ['python']
Query names of processes running on the GPU index 1: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=1
Names of processes running on the GPU index 1: ['/home/HDriss/miniconda3/envs/geoenv/bin/python']
Query names of processes running on the GPU index 2: nvidia-smi --query-compute-apps=process_name --format=csv,noheader,nounits --id=2
Names of processes running on the GPU index 2: ['/home/HDriss/miniconda3/envs/geoenv/bin/python', '/home/HDriss/miniconda3/envs/geoenv/bin/python']
Query names of processes running on the GPU index 3: nvidia-smi --query-compute-

# Stage 2: Data preprocessing

## Loading files

In [2]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
train_data = pd.read_csv("./data/train.csv",
                         header=None,
                         names=cols,
                         engine='python',
                         encoding='latin1')
test_data = pd.read_csv("./data/test.csv",
                        header=None,
                        names=cols,
                        engine='python',
                        encoding='latin1')
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (1600000, 6)
Test data shape: (498, 6)


The test dataset has 3 different labels (a negative, a positive and a neutral one) while the train dataset has only two so we will not use the test file, and split the train file later by ourselves.


In [3]:
data = train_data

## Preprocessing

### Cleaning

In [4]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [5]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [6]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

# print first 10 cleaned tweets
for i, tw in enumerate(data_clean[:10]):
    print("Cleaned tweet #%d: %s" % (i, tw))

Cleaned tweet #0:  Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D
Cleaned tweet #1: is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
Cleaned tweet #2:  I dived many times for the ball. Managed to save The rest go out of bounds
Cleaned tweet #3: my whole body feels itchy and like its on fire 
Cleaned tweet #4:  no it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
Cleaned tweet #5:  not the whole crew 
Cleaned tweet #6: Need a hug 
Cleaned tweet #7:  hey long time no see! Yes.. Rains a bit only a bit LOL I'm fine thanks how's you ?
Cleaned tweet #8:  K nope they didn't have it 
Cleaned tweet #9:  que me muera ? 


In [7]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

### Tokenization

In [8]:
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(data_clean,
                                                                    target_vocab_size=2**16)
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

# print first 10 tokenized tweets
for i, tw in enumerate(data_inputs[:10]):
    print("Tokenized tweet #%d: %s" % (i, tw))

Tokenized tweet #0: [65316, 1570, 113, 65323, 10, 6, 3553, 1, 135, 5262, 50, 1484, 38165, 16, 13337, 606, 2, 49, 33, 1, 65352]
Tokenized tweet #1: [11, 1090, 23, 122, 77, 65323, 15, 754, 195, 1841, 124, 2975, 33, 27, 8, 327, 818, 78, 6, 3642, 1830, 80, 3006, 1, 6353, 65317]
Tokenized tweet #2: [65316, 3, 41563, 117, 339, 524, 13, 4, 3798, 1, 11861, 2, 1194, 104, 610, 42, 41, 16, 10504, 65399]
Tokenized tweet #3: [7, 494, 1036, 597, 4898, 8, 37, 81, 18, 1767]
Tokenized tweet #4: [65316, 51, 33, 65323, 10, 32, 22118, 29, 426, 1, 65389, 65323, 19, 2819, 1, 158, 56, 9, 280, 25, 223, 3, 77, 65323, 15, 70, 12, 40, 144, 220, 1]
Tokenized tweet #5: [65316, 32, 4, 494, 3719]
Tokenized tweet #6: [980, 6, 1342]
Tokenized tweet #7: [65316, 313, 202, 71, 51, 1259, 5, 1693, 47, 60451, 65316, 6, 288, 121, 6, 288, 371, 65357, 65323, 19, 801, 157, 1404, 65323, 10, 55, 861]
Tokenized tweet #8: [65316, 1140, 1717, 96, 150, 65323, 15, 20, 17]
Tokenized tweet #9: [65316, 2651, 22, 46219, 5047, 331]


### Padding

In [9]:
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

# print first 10 padded & tokenized tweets
print('MAX_LEN = %d' % MAX_LEN)
for i, tw in enumerate(data_inputs[:10]):
    print("Padded & Tokenized tweet #%d:\n %s" % (i, tw))

MAX_LEN = 73
Padded & Tokenized tweet #0:
 [65316  1570   113 65323    10     6  3553     1   135  5262    50  1484
 38165    16 13337   606     2    49    33     1 65352     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]
Padded & Tokenized tweet #1:
 [   11  1090    23   122    77 65323    15   754   195  1841   124  2975
    33    27     8   327   818    78     6  3642  1830    80  3006     1
  6353 65317     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]
Padded & Tokenized tweet #2:
 [6531

### Spliting into training/testing set



In [10]:
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx+800000))

In [11]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

# Stage 3: Model building

In [12]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D() # no training variable so we can
                                             # use the same layer for each
                                             # pooling step
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Stage 4: Application

## Config

In [13]:
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2  #len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 64
NB_EPOCHS = 5

## Training

In [14]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [15]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [16]:
checkpoint_path = "./ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [17]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS,
         verbose=2)
ckpt_manager.save()

Train on 1584084 samples
Epoch 1/5
1584084/1584084 - 2886s - loss: 0.3962 - accuracy: 0.8202
Epoch 2/5
1584084/1584084 - 2924s - loss: 0.3221 - accuracy: 0.8616
Epoch 3/5
1584084/1584084 - 2892s - loss: 0.2556 - accuracy: 0.8944
Epoch 4/5
1584084/1584084 - 2889s - loss: 0.1929 - accuracy: 0.9225
Epoch 5/5
1584084/1584084 - 2905s - loss: 0.1471 - accuracy: 0.9418


'./ckpt/ckpt-1'

In [18]:
Dcnn.summary()

Model: "dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  13108000  
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  77056     
_________________________________________________________________
dropout (Dropout)            multiple                  0      

## Evaluation

In [19]:
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(results)

[0.5774225264787674, 0.8163125]


In [20]:
test_sentences = ["He is a good teacher!",
                  "It's an awesome movie!",
                  "it's the worst movie I've ever seen!",
                  "Bullshit"]
test_sentences = [clean_tweet(sentence) for sentence in test_sentences]
test_sentences = [tokenizer.encode(sentence) for sentence in test_sentences]
test_sentences = tf.keras.preprocessing.sequence.pad_sequences(test_sentences,
                                                               value=0,
                                                               padding="post",
                                                               maxlen=MAX_LEN)
for i, sentence in enumerate(test_sentences):
    print('test sentences #%d (cleaned, tokenized, padded):\n %s' % (i, sentence))
    
Dcnn.predict(test_sentences)

test sentences #0 (cleaned, tokenized, padded):
 [  541    11     6    43  7391 65317     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]
test sentences #1 (cleaned, tokenized, padded):
 [  128 65323    10    84   286   875 65317     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]
test sente

array([[0.78085494],
       [0.9981805 ],
       [0.004097  ],
       [0.02757967]], dtype=float32)