Importing Required Packages

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
from nltk.tokenize.treebank import TreebankWordDetokenizer

## Dataset Preparation

In [3]:
# Read in data into a dataframe

'''
This data is a sample of 50000 rows from the original dataset of 1.6 million rows.
'''

df= pd.read_csv("/content/sentiment140-small.csv",header=None,index_col=None,names=["label","ID","date","query","user","text"])
df.head()

Unnamed: 0,label,ID,date,query,user,text
0,1,1963243661,Fri May 29 12:25:40 PDT 2009,NO_QUERY,pauliinainen,chillin' with my boyfriend this weekend
1,1,1983325558,Sun May 31 12:56:32 PDT 2009,NO_QUERY,maynaseric,@jshe going good. thanks! || gloves too! keep...
2,0,2203773645,Wed Jun 17 00:30:09 PDT 2009,NO_QUERY,DianaZhang,SUCKS. SUCKS. SUCKS.
3,1,2058740220,Sat Jun 06 15:32:14 PDT 2009,NO_QUERY,DCBrent,@kayoungche I like to pretend bi boys are stra...
4,0,1793363889,Thu May 14 02:24:30 PDT 2009,NO_QUERY,Nicholas698,Got a blood test later for Gladular Fever!


In [4]:
# Cleaning data

df.text= df.text.str.lower() #to lower-case

punc=string.punctuation
def cleaning_punctuations(text): #removing punctuation
  translator=str.maketrans("","",punc)
  return text.translate(translator)

def pre_processing_01(tweet):
  '''
  Basic pre-processing to clean data. Removing:
  1. Usernames
  2. URLs
  3. Special Characters
  4. Multiple Spaces
  5. Emails
  6. Numbers
  7. Single Chars
  '''
  tweet = tweet.apply(lambda x:re.sub('@[^\s]+','',str(x))) # Remove Handles (aka usernames)
  tweet = tweet.apply(lambda x:re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',str(x))) # Remove URLs
  tweet= tweet.apply(lambda x:' '.join(re.findall(r'\w+', str(x)))) #remove special chars
  tweet= tweet.apply(lambda x:cleaning_punctuations(x)) #remove punctuations
  tweet = tweet.apply(lambda x:re.sub('@[^\s]+','',str(x))) # Remove emails
  tweet = tweet.apply(lambda x:re.sub('[0-9]+','',str(x))) # Remove numbers
  tweet = tweet.apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))
  return tweet

df.text= pre_processing_01(df.text)
df.text.head()


0               chillin with my boyfriend this weekend
1    going good thanks gloves too keep hands warm g...
2                                    sucks sucks sucks
3    i like to pretend bi boys are straight andm ju...
4               gotblood test later for gladular fever
Name: text, dtype: object

Dataset Preparation: Pre-processing and Tokenization

In [5]:
# Declare fields for tweets and labels
import torchtext
from torchtext.legacy import data

#Tokenization
TEXT = data.Field(tokenize='spacy', lower=True, include_lengths= True) # include_lengths tells the RNN how long the actual sequences are
LABEL = data.LabelField(dtype=torch.float)

# Map data to fields
fields = [('label', LABEL), ('id',None),('date',None),('query',None),
      ('user',None), ('text', TEXT)]

# Apply field definition to create torch dataset
dataset = torchtext.legacy.data.TabularDataset(
        path="/content/sentiment140-small.csv",
        format="CSV",
        fields=fields,
        skip_header=False)

#Split data into train, test, validation sets
(train_data, test_data, valid_data) = dataset.split(split_ratio=[0.8,0.1,0.1])

print("Number of train data: {}".format(len(train_data)))
print("Number of test data: {}".format(len(test_data)))
print("Number of validation data: {}".format(len(valid_data)))

Number of train data: 40000
Number of test data: 5000
Number of validation data: 5000


In [6]:
print(vars(train_data.examples[0])) #visualizing train data


{'label': '1', 'text': ['let', "'s", 'see', 'how', 'much', 'i', 'can', 'get', 'done', 'by', '9', 'pm', 'today', '.']}


Data Pre-processing: Embedding using pre-trained GloVe embeddings.


In [7]:
MAX_VOCAB_SIZE = 25000

# unk_init initializes words in the vocab using the Gaussian distribution
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

# embedding
LABEL.build_vocab(train_data)

# Printing the most frequent tokens
TEXT.vocab.freqs.most_common(10)

[('i', 25006),
 ('!', 22806),
 ('.', 20300),
 (' ', 14625),
 ('to', 14227),
 ('the', 13204),
 (',', 11827),
 ('a', 9548),
 ('my', 7894),
 ('and', 7642)]

Dataset Preparation: Sequencing and Padding using BucketIterator

In [8]:
BATCH_SIZE = 128

# sort_within_batch sorts all the tensors within a batch by their lengths
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True)

## Modelling

In [9]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        """
        Define the layers of the module.

        vocab_size - vocabulary size
        embedding_dim - size of the dense word vectors
        hidden_dim - size of the hidden states
        output_dim - number of classes
        n_layers - number of multi-layer RNN
        bidirectional - boolean - use both directions of LSTM
        dropout - dropout probability
        pad_idx -  string representing the pad token
        """
        
        super().__init__()

        # 1. Feed the tweets in the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # 2. LSTM layer: returns the output and a tuple of the final hidden state and final cell state
        self.encoder = nn.LSTM(embedding_dim, 
                               hidden_dim, 
                               num_layers=n_layers,
                               bidirectional=bidirectional,
                               dropout=dropout)
        
        # 3. Fully-connected layer: Final hidden state has both a forward and a backward component concatenated together
        # The size of the input to the nn.Linear layer is twice that of the hidden dimension size
        self.predictor = nn.Linear(hidden_dim*2, output_dim)

        # Initialize dropout layer for regularization
        self.dropout = nn.Dropout(dropout)
      
    def forward(self, text, text_lengths):
        """
        The forward method is called when data is fed into the model.

        text - [tweet length, batch size]
        text_lengths - lengths of tweet
        """

        # embedded = [sentence len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))    
        #By packing the embeddings, we cause RNN to only process non-padded elements. This speeds up computation
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        # output of encoder
        packed_output, (hidden, cell) = self.encoder(packed_embedded)

        # unpack sequence - transform packed sequence to a tensor
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # output = [sentence len, batch size, hid dim * num directions]
        # output over padding tokens are zero tensors
        
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell = [num layers * num directions, batch size, hid dim]
        
        # Get the final layer forward and backward hidden states  
        # concat the final forward and backward hidden layers and apply dropout
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))

        # hidden = [batch size, hid dim * num directions]

        return self.predictor(hidden)



Create Model

In [10]:
INPUT_DIM = len(TEXT.vocab) # dim is equal to the dim of pre-trained GloVe vectors
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2 # 2 layers of biLSTM
BIDIRECTIONAL = True #Bi-directional LSTM
DROPOUT = 0.5 # Dropout probability

# Get pad token index from vocab
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Create an instance of LSTM class
model = LSTM(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

In [11]:
# Copy the pre-trained word embeddings into the embedding layer
pretrained_embeddings = TEXT.vocab.vectors

# [vocab size, embedding dim]
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [12]:
# Replace the initial weights of the embedding layer with the pre-trained embeddings
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.4161, -0.5342, -0.4833,  ...,  1.0740, -0.3773,  0.1998],
        [ 2.6477, -1.5362,  0.0863,  ...,  0.8388,  0.8975, -1.4126],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [-0.5065,  0.7538,  3.3818,  ...,  0.1182,  0.0792, -1.3691],
        [ 0.8347, -0.7449, -0.0376,  ..., -1.8831,  2.6144,  0.2968],
        [-1.2980, -0.6180,  1.2960,  ..., -0.0291,  3.0614, -0.6824]])

In [13]:
# Initialize <unk> and <pad> both to all zeros - irrelevant for sentiment analysis
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Setting row in the embedding weights matrix to zero using the token index
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [-0.5065,  0.7538,  3.3818,  ...,  0.1182,  0.0792, -1.3691],
        [ 0.8347, -0.7449, -0.0376,  ..., -1.8831,  2.6144,  0.2968],
        [-1.2980, -0.6180,  1.2960,  ..., -0.0291,  3.0614, -0.6824]])


## Model Training

In [14]:
# Adam optimizer used to update the weights. We specify learning rate as 0.002
optimizer = optim.Adam(model.parameters(), lr=2e-2)

# Loss function: binary cross entropy with logits
# It restricts the predictions to a number between 0 and 1 using the logit function
criterion = nn.BCEWithLogitsLoss()

In [15]:
# Helper functions

def batch_accuracy(predictions, label):
    """
    Returns accuracy per batch.

    predictions - float
    label - 0 or 1
    """

    # Round predictions to the closest integer using the sigmoid function
    preds = torch.round(torch.sigmoid(predictions))
    # If prediction is equal to label
    correct = (preds == label).float()
    # Average correct predictions
    accuracy = correct.sum() / len(correct)

    return accuracy

def timer(start_time, end_time):
    """
    Returns the minutes and seconds.
    """

    time = end_time - start_time
    mins = int(time / 60)
    secs = int(time - (mins * 60))

    return mins, secs

In [16]:
def train(model, iterator, optimizer, criterion):
    """
    Function to evaluate training loss and accuracy.

    iterator - train iterator
    """
    
    # Cumulated Training loss
    training_loss = 0.0
    # Cumulated Training accuracy
    training_acc = 0.0
    
    # Set model to training mode
    model.train()
    
    # For each batch in the training iterator
    for batch in iterator:
        
        # 1. Zero the gradients
        optimizer.zero_grad()
        
        # batch.text is a tuple (tensor, len of seq)
        text, text_lengths = batch.text
        
        # 2. Compute the predictions
        predictions = model(text, text_lengths).squeeze(1)
        
        # 3. Compute loss
        loss = criterion(predictions, batch.label)
        
        # Compute accuracy
        accuracy = batch_accuracy(predictions, batch.label)
        
        # 4. Use loss to compute gradients
        loss.backward()
        
        # 5. Use optimizer to take gradient step
        optimizer.step()
        
        training_loss += loss.item()
        training_acc += accuracy.item()
    
    # Return the loss and accuracy, averaged across each epoch
    # len of iterator = num of batches in the iterator
    return training_loss / len(iterator), training_acc / len(iterator)

In [17]:
def evaluate(model, iterator, criterion):
    """
    Function to evaluate the loss and accuracy of validation and test sets.

    iterator - validation or test iterator
    """
    
    # Cumulated Training loss
    eval_loss = 0.0
    # Cumulated Training accuracy
    eval_acc = 0
    
    # Set model to evaluation mode
    model.eval()
    
    # Don't calculate the gradients
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            accuracy = batch_accuracy(predictions, batch.label)

            eval_loss += loss.item()
            eval_acc += accuracy.item()
        
    return eval_loss / len(iterator), eval_acc / len(iterator)

In [26]:
# Number of epochs
NUM_EPOCHS = 6

# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    
    # Evaluate training loss and accuracy
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # Evaluate validation loss and accuracy
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    mins, secs = timer(start_time, end_time)
    
    # At each epoch, if the validation loss is the best
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # Save the parameters of the model
        torch.save(model.state_dict(), 'model-small.pt')

    print("Epoch {}:".format(epoch+1))
    print("\t Total Time: {}m {}s".format(mins, secs))
    print("\t Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 2), round(train_acc*100, 2)))
    print("\t Validation Loss {} | Validation Accuracy: {}%".format(round(valid_loss, 2), round(valid_acc*100, 2)))

Epoch 1:
	 Total Time: 5m 33s
	 Train Loss 0.7 | Train Accuracy: 56.29%
	 Validation Loss 0.67 | Validation Accuracy: 58.85%
Epoch 2:
	 Total Time: 5m 48s
	 Train Loss 0.62 | Train Accuracy: 66.25%
	 Validation Loss 0.57 | Validation Accuracy: 71.25%
Epoch 3:
	 Total Time: 6m 20s
	 Train Loss 0.54 | Train Accuracy: 73.31%
	 Validation Loss 0.54 | Validation Accuracy: 72.4%
Epoch 4:
	 Total Time: 8m 49s
	 Train Loss 0.58 | Train Accuracy: 70.47%
	 Validation Loss 0.58 | Validation Accuracy: 69.57%
Epoch 5:
	 Total Time: 12m 15s
	 Train Loss 0.6 | Train Accuracy: 68.23%
	 Validation Loss 0.58 | Validation Accuracy: 69.94%
Epoch 6:
	 Total Time: 15m 51s
	 Train Loss 0.61 | Train Accuracy: 68.1%
	 Validation Loss 0.59 | Validation Accuracy: 67.97%


In [21]:
torch.backends.cudnn.deterministic = True

# Load the model with the best validation loss
model.load_state_dict(torch.load("/content/model-small.pt"))
model.eval()

LSTM(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (encoder): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (predictor): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

## Model Predictions


In [22]:
# Evaluate test loss and accuracy
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print("Test Loss: {} | Test Acc: {}%".format(round(test_loss, 2), round(test_acc*100, 2)))

Test Loss: 0.75 | Test Acc: 58.54%
