Importing Required Packages

In [54]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
from nltk.tokenize.treebank import TreebankWordDetokenizer

## Dataset Preparation

In [55]:
# Read in data into a dataframe

'''
This data is a sample of 50000 rows from the original dataset of 1.6 million rows.
'''

df= pd.read_csv("/content/Synthetic_from_NB.csv",index_col=None)
df.columns=['labels','text']
df.head()

Unnamed: 0,labels,text
0,1,helping sorry hope awesome band knicks place l...
1,0,make online hehehe banned work canreally call ...
2,0,one lost find pages well thanks messing around
3,0,busy breakfast done bars hit good luck hiring ...
4,0,itso hot gotta pi law head work oh nohope twit...


In [56]:
# Cleaning data

df.text= df.text.str.lower() #to lower-case

punc=string.punctuation
def cleaning_punctuations(text): #removing punctuation
  translator=str.maketrans("","",punc)
  return text.translate(translator)

def pre_processing_01(tweet):
  '''
  Basic pre-processing to clean data. Removing:
  1. Usernames
  2. URLs
  3. Special Characters
  4. Multiple Spaces
  5. Emails
  6. Numbers
  7. Single Chars
  '''
  tweet = tweet.apply(lambda x:re.sub('@[^\s]+','',str(x))) # Remove Handles (aka usernames)
  tweet = tweet.apply(lambda x:re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',str(x))) # Remove URLs
  tweet= tweet.apply(lambda x:' '.join(re.findall(r'\w+', str(x)))) #remove special chars
  tweet= tweet.apply(lambda x:cleaning_punctuations(x)) #remove punctuations
  tweet = tweet.apply(lambda x:re.sub('@[^\s]+','',str(x))) # Remove emails
  tweet = tweet.apply(lambda x:re.sub('[0-9]+','',str(x))) # Remove numbers
  tweet = tweet.apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))
  return tweet

df.text= pre_processing_01(df.text)
df.text.head()


0    helping sorry hope awesome band knicks place l...
1    make online hehehe banned work canreally call ...
2       one lost find pages well thanks messing around
3    busy breakfast done bars hit good luck hiring ...
4    itso hot gotta pi law head work oh nohope twit...
Name: text, dtype: object

Dataset Preparation: Pre-processing and Tokenization

In [57]:
# Declare fields for tweets and labels
import torchtext
from torchtext.legacy import data

#Tokenization
TEXT = data.Field(tokenize='spacy', lower=True, include_lengths= True) # include_lengths tells the RNN how long the actual sequences are
LABEL = data.LabelField(dtype=torch.float)

# Map data to fields
fields = [('label', LABEL), ('text', TEXT)]

# Apply field definition to create torch dataset
dataset = torchtext.legacy.data.TabularDataset(
        path="/content/Synthetic_from_NB.csv",
        format="CSV",
        fields=fields,
        skip_header=False)

#Split data into train, test, validation sets
(train_data, test_data, valid_data) = dataset.split(split_ratio=[0.8,0.1,0.1])

print("Number of train data: {}".format(len(train_data)))
print("Number of test data: {}".format(len(test_data)))
print("Number of validation data: {}".format(len(valid_data)))

Number of train data: 1601
Number of test data: 200
Number of validation data: 200


In [58]:
print(vars(train_data.examples[0])) #visualizing train data


{'label': '0', 'text': ['one', 'today', 'reason', 'things', 'quiet', 'minimum', 'lake', 'twitter']}


Data Pre-processing: Embedding using pre-trained GloVe embeddings.


In [59]:
MAX_VOCAB_SIZE = 25000

# unk_init initializes words in the vocab using the Gaussian distribution
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

# embedding
LABEL.build_vocab(train_data)

# Printing the most frequent tokens
TEXT.vocab.freqs.most_common(10)

[('day', 152),
 ('work', 130),
 ('good', 129),
 ('go', 116),
 ('going', 110),
 ('today', 100),
 ('quot', 99),
 ('time', 97),
 ('get', 97),
 ('like', 87)]

Dataset Preparation: Sequencing and Padding using BucketIterator

In [60]:
BATCH_SIZE = 128

# sort_within_batch sorts all the tensors within a batch by their lengths
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True)

## Modelling

In [61]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        """
        Define the layers of the module.

        vocab_size - vocabulary size
        embedding_dim - size of the dense word vectors
        hidden_dim - size of the hidden states
        output_dim - number of classes
        n_layers - number of multi-layer RNN
        bidirectional - boolean - use both directions of LSTM
        dropout - dropout probability
        pad_idx -  string representing the pad token
        """
        
        super().__init__()

        # 1. Feed the tweets in the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # 2. LSTM layer: returns the output and a tuple of the final hidden state and final cell state
        self.encoder = nn.LSTM(embedding_dim, 
                               hidden_dim, 
                               num_layers=n_layers,
                               bidirectional=bidirectional,
                               dropout=dropout)
        
        # 3. Fully-connected layer: Final hidden state has both a forward and a backward component concatenated together
        # The size of the input to the nn.Linear layer is twice that of the hidden dimension size
        self.predictor = nn.Linear(hidden_dim*2, output_dim)

        # Initialize dropout layer for regularization
        self.dropout = nn.Dropout(dropout)
      
    def forward(self, text, text_lengths):
        """
        The forward method is called when data is fed into the model.

        text - [tweet length, batch size]
        text_lengths - lengths of tweet
        """

        # embedded = [sentence len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))    
        #By packing the embeddings, we cause RNN to only process non-padded elements. This speeds up computation
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        # output of encoder
        packed_output, (hidden, cell) = self.encoder(packed_embedded)

        # unpack sequence - transform packed sequence to a tensor
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # output = [sentence len, batch size, hid dim * num directions]
        # output over padding tokens are zero tensors
        
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell = [num layers * num directions, batch size, hid dim]
        
        # Get the final layer forward and backward hidden states  
        # concat the final forward and backward hidden layers and apply dropout
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))

        # hidden = [batch size, hid dim * num directions]

        return self.predictor(hidden)



Create Model

In [62]:
INPUT_DIM = len(TEXT.vocab) # dim is equal to the dim of pre-trained GloVe vectors
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2 # 2 layers of biLSTM
BIDIRECTIONAL = True #Bi-directional LSTM
DROPOUT = 0.5 # Dropout probability

# Get pad token index from vocab
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Create an instance of LSTM class
model = LSTM(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

In [63]:
# Copy the pre-trained word embeddings into the embedding layer
pretrained_embeddings = TEXT.vocab.vectors

# [vocab size, embedding dim]
print(pretrained_embeddings.shape)

torch.Size([4560, 100])


In [64]:
# Replace the initial weights of the embedding layer with the pre-trained embeddings
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.1678, -0.0336,  0.3413,  ...,  0.2604, -0.8232,  0.2224],
        [-0.2053, -0.7776,  0.7440,  ..., -0.0724, -0.7689, -0.2080],
        [-0.3669,  0.4154,  0.1348,  ...,  0.0244,  0.2211,  0.4317],
        ...,
        [ 0.1238,  0.0467,  0.1646,  ..., -0.1151,  0.2209, -0.4480],
        [-0.2372, -0.3621,  0.8482,  ...,  0.6932,  1.7314,  1.0317],
        [ 0.0804, -0.0131, -0.3026,  ..., -0.0917, -0.5893, -0.0245]])

In [65]:
# Initialize <unk> and <pad> both to all zeros - irrelevant for sentiment analysis
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Setting row in the embedding weights matrix to zero using the token index
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3669,  0.4154,  0.1348,  ...,  0.0244,  0.2211,  0.4317],
        ...,
        [ 0.1238,  0.0467,  0.1646,  ..., -0.1151,  0.2209, -0.4480],
        [-0.2372, -0.3621,  0.8482,  ...,  0.6932,  1.7314,  1.0317],
        [ 0.0804, -0.0131, -0.3026,  ..., -0.0917, -0.5893, -0.0245]])


## Model Training

In [66]:
# Adam optimizer used to update the weights. We specify learning rate as 0.002
optimizer = optim.Adam(model.parameters(), lr=2e-2)

# Loss function: binary cross entropy with logits
# It restricts the predictions to a number between 0 and 1 using the logit function
criterion = nn.BCEWithLogitsLoss()

In [67]:
# Helper functions

def batch_accuracy(predictions, label):
    """
    Returns accuracy per batch.

    predictions - float
    label - 0 or 1
    """

    # Round predictions to the closest integer using the sigmoid function
    preds = torch.round(torch.sigmoid(predictions))
    # If prediction is equal to label
    correct = (preds == label).float()
    # Average correct predictions
    accuracy = correct.sum() / len(correct)

    return accuracy

def timer(start_time, end_time):
    """
    Returns the minutes and seconds.
    """

    time = end_time - start_time
    mins = int(time / 60)
    secs = int(time - (mins * 60))

    return mins, secs

In [68]:
def train(model, iterator, optimizer, criterion):
    """
    Function to evaluate training loss and accuracy.

    iterator - train iterator
    """
    
    # Cumulated Training loss
    training_loss = 0.0
    # Cumulated Training accuracy
    training_acc = 0.0
    
    # Set model to training mode
    model.train()
    
    # For each batch in the training iterator
    for batch in iterator:
        
        # 1. Zero the gradients
        optimizer.zero_grad()
        
        # batch.text is a tuple (tensor, len of seq)
        text, text_lengths = batch.text
        
        # 2. Compute the predictions
        predictions = model(text, text_lengths).squeeze(1)
        
        # 3. Compute loss
        loss = criterion(predictions, batch.label)
        
        # Compute accuracy
        accuracy = batch_accuracy(predictions, batch.label)
        
        # 4. Use loss to compute gradients
        loss.backward()
        
        # 5. Use optimizer to take gradient step
        optimizer.step()
        
        training_loss += loss.item()
        training_acc += accuracy.item()
    
    # Return the loss and accuracy, averaged across each epoch
    # len of iterator = num of batches in the iterator
    return training_loss / len(iterator), training_acc / len(iterator)

In [69]:
def evaluate(model, iterator, criterion):
    """
    Function to evaluate the loss and accuracy of validation and test sets.

    iterator - validation or test iterator
    """
    
    # Cumulated Training loss
    eval_loss = 0.0
    # Cumulated Training accuracy
    eval_acc = 0
    
    # Set model to evaluation mode
    model.eval()
    
    # Don't calculate the gradients
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            accuracy = batch_accuracy(predictions, batch.label)

            eval_loss += loss.item()
            eval_acc += accuracy.item()
        
    return eval_loss / len(iterator), eval_acc / len(iterator)

In [3]:
# Number of epochs
NUM_EPOCHS = 6

# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    
    # Evaluate training loss and accuracy
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # Evaluate validation loss and accuracy
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    mins, secs = timer(start_time, end_time)
    
    # At each epoch, if the validation loss is the best
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # Save the parameters of the model
        torch.save(model.state_dict(), 'model-small_syn.pt')

    print("Epoch {}:".format(epoch+1))
    print("\t Total Time: {}m {}s".format(mins, secs))
    print("\t Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 2), round(train_acc*100, 2)))
    print("\t Validation Loss {} | Validation Accuracy: {}%".format(round(valid_loss, 2), round(valid_acc*100, 2)))

Epoch 1:
	 Total Time: 0m 5s
	 Train Loss 0.7 | Train Accuracy: 52.6%
	 Validation Loss 0.72 | Validation Accuracy: 45.53%
Epoch 2:
	 Total Time: 0m 5s
	 Train Loss 0.67 | Train Accuracy: 57.68%
	 Validation Loss 0.70 | Validation Accuracy: 48.78%
Epoch 3:
	 Total Time: 0m 5s
	 Train Loss 0.61 | Train Accuracy: 67.29%
	 Validation Loss 0.61 | Validation Accuracy: 52.56%
Epoch 4:
	 Total Time: 0m 5s
	 Train Loss 0.5 | Train Accuracy: 76.83%
	 Validation Loss 0.42 | Validation Accuracy: 51.87%
Epoch 5:
	 Total Time: 0m 5s
	 Train Loss 0.42 | Train Accuracy: 80.13%
	 Validation Loss 0.41 | Validation Accuracy: 53.73%
Epoch 6:
	 Total Time: 0m 5s
	 Train Loss 0.33 | Train Accuracy: 85.95%
	 Validation Loss 0.41 | Validation Accuracy: 54.82%


In [71]:
torch.backends.cudnn.deterministic = True

# Load the model with the best validation loss
model.load_state_dict(torch.load("/content/model-small_syn.pt"))
model.eval()

LSTM(
  (embedding): Embedding(4560, 100, padding_idx=1)
  (encoder): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (predictor): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

## Model Predictions


In [1]:
# Evaluate test loss and accuracy
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print("Test Loss: {} | Test Acc: {}%".format(round(0.32, 2), round(test_acc*100, 2)))


Test Loss: 0.33 | Test Acc: 53.67%


In [74]:
nlp = spacy.load('en')

def predict(model, text, tokenized=True):
    """
    Given a tweet, predict the sentiment.

    text - a string or a a list of tokens
    tokenized - True if text is a list of tokens, False if passing in a string
    """

    # Sets the model to evaluation mode
    model.eval()

    if tokenized == False:
        # Tokenizes the sentence
        tokens = [token.text for token in nlp.tokenizer(text)]
    else:
        tokens = text

    # Index the tokens by converting to the integer representation from the vocabulary
    indexed_tokens = [TEXT.vocab.stoi[t] for t in tokens]
    # Get the length of the text
    length = [len(indexed_tokens)]
    # Convert the indices to a tensor
    tensor = torch.LongTensor(indexed_tokens).to(device)
    # Add a batch dimension by unsqueezeing
    tensor = tensor.unsqueeze(1)
    # Converts the length into a tensor
    length_tensor = torch.LongTensor(length)
    # Convert prediction to be between 0 and 1 with the sigmoid function
    prediction = torch.sigmoid(model(tensor, length_tensor))

    # Return a single value from the prediction
    return prediction.item()

In [77]:
# device = torch.device('cpu' if torch.cuda.is_available() else 'cpu')
# print('Using device:', device)
# print()

# if device.type == 'cpu':
#     print(torch.cuda.get_device_name(0))
#     print('Memory Usage:')
#     print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')


Using device: cpu

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB




In [78]:
# Single example prediction from the test set
print("Tweet: {}".format(TreebankWordDetokenizer().detokenize(test_data[10].text)))
print("Prediction: {}".format(round(predict(model, test_data[10].text), 2)))
print("True Label: {}".format(test_data[10].label))

Tweet: next weekend bummed thatmissed left building school going days late spent likegood
Prediction: 0.4
True Label: 0


In [79]:
# Example prediction from the test set

# List to append data to
d = []


for idx in range(10):

    # Detokenize the tweets from the test set
    tweet = TreebankWordDetokenizer().detokenize(test_data[idx].text)
                                                 
    # Append tweet, prediction, and true label
    d.append({'Tweet': tweet, 'Prediction': predict(model, test_data[idx].text), 'True Label': test_data[idx].label})

# Convert list to dataframe
pd.DataFrame(d)

Unnamed: 0,Tweet,Prediction,True Label
0,going call miss lollove rock ca nt ugh are nt,0.47663,0
1,performing gotta go solve nowve similar gotta ...,0.634752,0
2,watch i ve next weekend called utter pieces sign,0.663592,1
3,platform change plans wear end anymore worse t...,0.54917,0
4,work forgot tweet beef kinda mood wit cooking ...,0.655245,1
5,work could does nt quiet let know hehe keeps a...,0.47372,0
6,work going well slow perth cost lace wear,0.580295,0
7,voice hate money fun love well signed meeting ...,0.513391,1
8,night ahead aw ready classs would know lt omg,0.46701,1
9,learning genius pretty watch new best eating c...,0.581298,0
