In [None]:
import numpy as np
import math
import re #for string operations like cleaning,preproceesing
import time #for epoch time
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers #for the model layers
import tensorflow_datasets as tfds #for tokenizing the sentences 
from bs4 import BeautifulSoup

In [None]:
#Load Data
# Column names to attach on load: the CSV files ship without a header row.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Both files are parsed with identical options: no header row, explicit column
# names, the Python parsing engine and latin1 decoding.
_read_options = dict(header=None, names=cols, engine="python", encoding="latin1")

train_data = pd.read_csv(
    "../input/training.1600000.processed.noemoticon.csv", **_read_options
)
test_data = pd.read_csv(
    "../input/testdata.manual.2009.06.14.csv", **_read_options
)

# The test file carries three labels (negative, positive and neutral) while the
# training file only has two, so the test file is not used any further; a
# held-out split is carved out of the training file later on.
data = train_data

In [None]:
# Peek at the first three raw rows to check the columns were parsed as expected
train_data.head(n=3)

In [None]:
#Pre-Processing
#Data Cleaning

# Keep only what the model needs: the sentiment label and the tweet text.
# A new frame is built instead of dropping in place, so `train_data` keeps all
# of its columns and this cell can be re-run without a KeyError caused by the
# columns having already been removed.
data = train_data.drop(columns=["id", "date", "query", "user"])

In [None]:
# Show the first rows of the reduced frame (head() defaults to five rows)
data.head()

In [None]:
#since data has been taken from the net there are certain elements in the string that come with it so we need to clean 
#we want to make it into a regular string from the xml format
def clean_tweet(tweet):
    """Normalise one raw tweet into plain text ready for tokenisation.

    Steps, in order:
      1. extract the text content with BeautifulSoup (lxml parser);
      2. replace @mentions with a space;
      3. replace http/https links with a space;
      4. replace every character that is not an ASCII letter or one of
         ``. ! ? '`` with a space;
      5. collapse runs of spaces into a single space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet as a string. Leading/trailing single spaces are not
        stripped.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Handles may contain underscores; without "_" in the class, "@some_user"
    # would leave "user" behind as if it were a word of the tweet.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Consume the link up to the next whitespace so that query strings, hyphens
    # and similar characters do not leave URL fragments in the text.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Keep only letters and basic punctuation
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Squeeze repeated spaces produced by the substitutions above
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Run the cleaning function over every tweet in the text column
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Cleaning the data labels
# The raw labels take the values 0, 2 and 4; label 4 is remapped to 1 so the
# two classes present in the training file become 0 and 1. Any other value is
# left as it is.
# np.where builds a fresh array instead of assigning through `.values`, which
# would silently rewrite the DataFrame column as well (and fails on a read-only
# array when pandas copy-on-write is active).
_raw_labels = data.sentiment.to_numpy()
data_labels = np.where(_raw_labels == 4, 1, _raw_labels)

In [None]:
#Tokenization
# Each cleaned tweet is converted into a list of integer ids by a subword
# encoder that is built from the corpus itself.
# target_vocab_size=2**16 asks for a vocabulary of roughly 64k entries. Text
# that has no vocabulary entry of its own is encoded as a sequence of smaller
# pieces that do exist (down to single characters if necessary), so rare words
# that appear only once in the corpus can still be represented.
#
# The encoder is exposed as tfds.deprecated.text in TFDS 4 and later, and as
# tfds.features.text in older releases; try the current location first.
try:
    _SubwordTextEncoder = tfds.deprecated.text.SubwordTextEncoder
except AttributeError:
    _SubwordTextEncoder = tfds.features.text.SubwordTextEncoder

tokenizer = _SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16
)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [None]:
# Longest encoded sentence. Length is now counted in token ids, whereas before
# tokenisation it was the number of characters.
# Padding appends zeros to the shorter sequences so that every input ends up
# with the same length.
MAX_LEN = max(len(token_ids) for token_ids in data_inputs)
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    data_inputs, maxlen=MAX_LEN, padding="post", value=0
)

In [None]:
#Splitting data into test and train sets
# The dataset has 1,600,000 elements, half negative and half positive; the
# split below relies on the first 800,000 rows being the negative tweets.
# Indices are drawn WITHOUT replacement: randint() may return the same row more
# than once, which duplicates test examples and removes fewer rows from the
# training set than intended. A seeded generator keeps the split identical
# across kernel restarts.
HALF_SIZE = 800000
TEST_PER_CLASS = 8000
split_rng = np.random.default_rng(42)
test_idx = split_rng.choice(HALF_SIZE, size=TEST_PER_CLASS, replace=False)  # negative tweets (first half)
test_idx = np.concatenate((test_idx, test_idx + HALF_SIZE))  # same offsets in the positive half

In [None]:
# Rows selected by test_idx form the test set ...
test_inputs, test_labels = data_inputs[test_idx], data_labels[test_idx]
# ... and the training set is whatever remains once those rows are removed.
train_inputs, train_labels = (
    np.delete(data_inputs, test_idx, axis=0),
    np.delete(data_labels, test_idx),
)

In [None]:
#CNN MODEL
class DCNN(tf.keras.Model):
    """Convolutional text classifier with parallel 2-, 3- and 4-token filters.

    Token ids are embedded, passed through three independent Conv1D layers
    (kernel sizes 2, 3 and 4), each followed by global max pooling. The pooled
    vectors are concatenated and fed to a dense layer, dropout and an output
    layer.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        """Create the layers.

        Args:
            vocab_size: Input dimension of the embedding layer.
            emb_dim: Size of each embedding vector.
            nb_filters: Number of filters in each of the three convolutions.
            FFN_units: Units in the hidden dense layer.
            nb_classes: Number of classes. With 2, the output is a single
                sigmoid unit; otherwise a softmax over ``nb_classes`` units.
            dropout_rate: Rate of the dropout applied after the dense layer.
            training: Unused; kept so existing callers that pass it keep
                working. The training flag is taken from ``call``.
            name: Model name forwarded to ``tf.keras.Model``.
        """
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # Pooling has no trainable variables, so one layer instance is shared
        # by all three branches.
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences. Because the convolutions use
                "valid" padding, sequences need at least 4 tokens.
            training: Whether dropout is active. Defaults to False so the
                model can be called directly for inference without the flag.

        Returns:
            The output of the last dense layer (sigmoid or softmax scores).
        """
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)

        # Concatenate along the last axis: (batch_size, 3 * nb_filters)
        merged = tf.concat([x_1, x_2, x_3], axis=-1)
        merged = self.dense_1(merged)
        # Pass the flag by keyword so it cannot be mistaken for another argument
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

In [None]:
#Configuration
# Model hyper-parameters
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
DROPOUT_RATE = 0.2
NB_CLASSES = 2 #len(set(train_labels))

# Vocabulary size is read from the fitted tokenizer
VOCAB_SIZE = tokenizer.vocab_size

# Training schedule
BATCH_SIZE = 32
NB_EPOCHS = 5

In [None]:
#Training
# Instantiate the network from the configuration constants
_model_config = dict(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)
Dcnn = DCNN(**_model_config)

In [None]:
#Compiling the model
# Pick the loss/metric pair that matches the output layer: a binary setup for
# two classes (labels 0 or 1), a sparse categorical setup for n classes.
if NB_CLASSES == 2:
    _loss_name, _metric_names = "binary_crossentropy", ["accuracy"]
else:
    _loss_name, _metric_names = ("sparse_categorical_crossentropy",
                                 ["sparse_categorical_accuracy"])

Dcnn.compile(loss=_loss_name, optimizer="adam", metrics=_metric_names)

In [None]:
#Storing the model weights
weights_path = "../input/sentiments140/"

# Track the model in a checkpoint; the manager writes under weights_path and
# keeps at most the five most recent checkpoints.
weights = tf.train.Checkpoint(Dcnn=Dcnn)
weights_manager = tf.train.CheckpointManager(weights, weights_path, max_to_keep=5)

# Resume from the newest checkpoint when one is available
_latest_checkpoint = weights_manager.latest_checkpoint
if _latest_checkpoint:
    weights.restore(_latest_checkpoint)
    print("Latest weights restored!!")

In [None]:
#calling the training function
# Fit on the training split, then write a checkpoint through the manager
Dcnn.fit(x=train_inputs, y=train_labels, epochs=NB_EPOCHS, batch_size=BATCH_SIZE)
weights_manager.save()

In [None]:
#Evaluation
# Score the trained model on the held-out rows and show what evaluate() returns
results = Dcnn.evaluate(x=test_inputs, y=test_labels, batch_size=BATCH_SIZE)
print(results)

In [None]:
# Testing on unseen data.
# The sentence goes through the same pipeline as the training data: cleaning,
# encoding, then zero-padding to MAX_LEN. Without padding, the model sees an
# input shape it was never trained on, and any sentence that encodes to fewer
# than 4 tokens would fail in the kernel-size-4 convolution ("valid" padding).
sample_text = "i dont like my job"
sample_ids = tokenizer.encode(clean_tweet(sample_text))
sample_batch = tf.keras.preprocessing.sequence.pad_sequences(
    [sample_ids], maxlen=MAX_LEN, padding="post", truncating="post", value=0
)
Dcnn(sample_batch, training=False).numpy()