# NLP - Binary Text Classification using RNNs

By [Akshaj Verma](https://akshajverma.com)  

This notebook takes you through the implementation of binary text classification in the form of sentiment analysis on yelp reviews using RNNs in PyTorch.

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from collections import Counter 
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

torch.manual_seed(1)

<torch._C.Generator at 0x7f7176a8d9b0>

## Prepare Data

In [2]:
df = pd.read_csv("../../../data/nlp/text_classification/yelp_labelled.txt", sep="\t", header=None, names=['text', 'tag'])
df.head()

Unnamed: 0,text,tag
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Convert from dataframe to list

In [3]:
sentence_list = [t for t in df['text'].to_list()]
tag_list = [t for t in df['tag'].to_list()]

#### The input sentences.

In [4]:
sentence_list[1:10]

['Crust is not good.',
 'Not tasty and the texture was just nasty.',
 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
 'The selection on the menu was great and so were the prices.',
 'Now I am getting angry and I want my damn pho.',
 "Honeslty it didn't taste THAT fresh.)",
 'The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.',
 'The fries were great too.',
 'A great touch.']

#### The output tags.

In [5]:
tag_list[1:10]

[0, 0, 1, 1, 0, 0, 0, 1, 1]

### Clean the input data.

In [6]:
# Convert to lowercase
sentence_list = [s.lower() for s in sentence_list]

# Remove non alphavets
regex_remove_nonalphabets = re.compile('[^a-zA-Z]')
sentence_list = [regex_remove_nonalphabets.sub(' ', s) for s in sentence_list]

# Remove words with less than 2 letters
regex_remove_shortwords = re.compile(r'\b\w{1,2}\b')
sentence_list = [regex_remove_shortwords.sub("", s) for s in sentence_list]

# Remove words that appear only once
c = Counter(w for s in sentence_list for w in s.split())
sentence_list = [' '.join(y for y in x.split() if c[y] > 1) for x in sentence_list]

# Strip extra whitespaces
sentence_list = [" ".join(s.split()) for s in sentence_list]

In [7]:
sentence_list[1:10]

['crust not good',
 'not tasty and the texture was just nasty',
 'stopped during the late may off recommendation and loved',
 'the selection the menu was great and were the prices',
 'now getting and want damn pho',
 'didn taste that fresh',
 'the potatoes were like and you could tell they had been made time being kept under',
 'the fries were great too',
 'great touch']

### Create a vocab and dictionary for input.

#### Vocab for input.

In [8]:
words = []
for sentence in sentence_list:
    for w in sentence.split():
        words.append(w)
    
words = list(set(words))
print(f"Size of word-vocablury: {len(words)}\n")

Size of word-vocablury: 844



#### Input <=> ID.

In [9]:
word2idx = {word: i for i, word in enumerate(words)}

### Create a vocab and dictionary for output.

#### Vocab for output.

In [10]:
tags = []
for tag in tag_list:
    tags.append(tag)
tags = list(set(tags))
print(f"Size of tag-vocab: {len(tags)}\n")
print(tags)

Size of tag-vocab: 2

[0, 1]


#### Output <=> ID.

In [11]:
tag2idx = {word: i for i, word in enumerate(tags)}
print(tag2idx)

{0: 0, 1: 1}


### Encode the input and output to numbers.

#### Input

In [12]:
X = [[word2idx[w] for w in s.split()] for s in sentence_list]
X[:3]

[[178, 617, 319, 120],
 [548, 467, 512],
 [467, 506, 768, 289, 317, 834, 214, 174]]

#### Output

In [13]:
y = [tag2idx[t] for t in tag_list]
y[:3]

[1, 0, 0]

### Train-Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [15]:
print("X_train size: ", len(X_train))
print("X_test size: ", len(X_test))

X_train size:  700
X_test size:  300


## Sample Neural Network

### Sample Parameters.

In [16]:
BATCH_SIZE_SAMPLE = 2
EMBEDDING_SIZE_SAMPLE = 5
VOCAB_SIZE = len(word2idx)
TARGET_SIZE = len(tag2idx)
HIDDEN_SIZE_SAMPLE = 3
STACKED_LAYERS_SAMPLE = 4

### Sample Dataloader.

In [17]:
class SampleData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [18]:
sample_data = SampleData(X_train, y_train)
sample_loader = DataLoader(sample_data, batch_size=BATCH_SIZE_SAMPLE, collate_fn=lambda x:x)

In [19]:
tl = iter(sample_loader)

i,j = map(list, zip(*next(tl)))

print(i,"\n\n", j, "\n")

[[727, 592, 350, 360], [289, 834, 699, 243, 554, 221, 768, 834]] 

 [1, 0] 



### Sample RNN class.

In [20]:
class ModelGRUSample(nn.Module):
    
    def __init__(self, embedding_size, vocab_size, hidden_size, target_size, stacked_layers):
        super(ModelGRUSample, self).__init__()
        
        self.word_embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_size)
        self.gru = nn.GRU(input_size = embedding_size, hidden_size = hidden_size, batch_first = True, num_layers=stacked_layers)
        self.linear = nn.Linear(in_features = hidden_size, out_features=1)
        
    def forward(self, x_batch):
        print("\nList of tensor lengths in a batch: ")
        len_list = list(map(len, x_batch))
        print(len_list)
        
        padded_batch = pad_sequence(x_batch, batch_first=True)
        print("\nPadded X_batch: ", padded_batch.size(), "\n", padded_batch)

        
        embeds = self.word_embeddings(padded_batch)
        print("\nEmbeddings: ", embeds.size(), "\n", embeds)

        pack_embeds = pack_padded_sequence(embeds, lengths=len_list, batch_first=True, enforce_sorted=False)
        
        rnn_out, rnn_hidden = self.gru(pack_embeds)
        print("\nRNN hidden last layer: ", rnn_hidden.size(), "\n", rnn_hidden)
        
        linear_out = self.linear(rnn_hidden)
        print("\nLinear Output: ", linear_out.size(), "\n", linear_out)
        
        y_out = torch.sigmoid(linear_out)
        y_out = y_out[-1]
        print("\nSigmoid:\n", y_out)

        
        return y_out

In [21]:
gru_model_sample = ModelGRUSample(embedding_size=EMBEDDING_SIZE_SAMPLE, vocab_size=len(word2idx), hidden_size=HIDDEN_SIZE_SAMPLE, target_size=len(tag2idx), stacked_layers=STACKED_LAYERS_SAMPLE)
print(gru_model_sample)

ModelGRUSample(
  (word_embeddings): Embedding(844, 5)
  (gru): GRU(5, 3, num_layers=4, batch_first=True)
  (linear): Linear(in_features=3, out_features=1, bias=True)
)


### Sample Output.

output = [batch size, sent len, hid dim]  
hidden = [batch size, 1, hid dim]

In [22]:
with torch.no_grad():
    for batch in sample_loader:
        x_batch, y_batch = map(list, zip(*batch))
        x_batch = [torch.tensor(i) for i in x_batch]
        y_batch = [torch.tensor(i) for i in y_batch]
        
        
        print("X batch: ")
        pprint(x_batch)
        print("\ny batch: ")
        pprint(y_batch)
        
        y_out = gru_model_sample(x_batch)
                        
        y_out_tag = torch.round(y_out)
        print("\nY Output Tag: \n", y_out_tag)
        
        
        print("\nActual Output: ")
        print(y_batch)

        break

X batch: 
[tensor([727, 592, 350, 360]), tensor([289, 834, 699, 243, 554, 221, 768, 834])]

y batch: 
[tensor(1), tensor(0)]

List of tensor lengths in a batch: 
[4, 8]

Padded X_batch:  torch.Size([2, 8]) 
 tensor([[727, 592, 350, 360,   0,   0,   0,   0],
        [289, 834, 699, 243, 554, 221, 768, 834]])

Embeddings:  torch.Size([2, 8, 5]) 
 tensor([[[ 0.4587,  1.0819, -1.2467,  0.3633,  0.4429],
         [-0.4451, -0.6583,  1.6095,  0.3690, -0.2631],
         [ 1.7288, -0.4040, -0.8616,  1.7713,  0.8700],
         [ 1.1721, -1.4974,  1.0943, -1.3289,  1.2544],
         [-0.6540, -1.6095, -0.1002, -0.6092, -0.9798],
         [-0.6540, -1.6095, -0.1002, -0.6092, -0.9798],
         [-0.6540, -1.6095, -0.1002, -0.6092, -0.9798],
         [-0.6540, -1.6095, -0.1002, -0.6092, -0.9798]],

        [[-1.5452,  0.0326,  0.8171, -0.0246, -0.3599],
         [-1.0061, -1.1881,  0.4155, -0.4172, -2.2063],
         [ 0.8112, -0.4186, -0.1767, -0.5618,  0.2393],
         [ 0.3384, -1.1396, -0.3602

## Acutal Neural Network.

### Model parameters.

In [23]:
EPOCHS = 100
BATCH_SIZE = 128
EMBEDDING_SIZE = 256
VOCAB_SIZE = len(word2idx)
TARGET_SIZE = len(tag2idx)
HIDDEN_SIZE = 8
LEARNING_RATE = 0.001
STACKED_LAYERS = 8

### Data Loader.

#### Train Loader.

In [24]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [25]:
train_data = TrainData(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=lambda x:x)

#### Test Loader

In [26]:
class TestData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [27]:
test_data = TestData(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=1, collate_fn=lambda x:x)

### LSTM Model Class.

In [28]:
class ModelLSTM(nn.Module):
    
    def __init__(self, embedding_size, vocab_size, hidden_size, target_size, stacked_layers):
        super(ModelLSTM, self).__init__()
        
        self.word_embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_size)
        self.lstm = nn.LSTM(input_size = embedding_size, hidden_size = hidden_size, batch_first = True, num_layers = stacked_layers, dropout = 0.3)
        self.linear = nn.Linear(in_features = hidden_size, out_features=1)
        self.tanh = nn.Tanh()
        
    def forward(self, x_batch):
        len_list = list(map(len, x_batch))
        padded_batch = pad_sequence(x_batch, batch_first=True)
        embeds = self.word_embeddings(padded_batch)
        pack_embeds = pack_padded_sequence(embeds, lengths=len_list, batch_first=True, enforce_sorted=False)
        rnn_out, (rnn_h, rnn_c) = self.lstm(pack_embeds)
        linear_out = self.linear(self.tanh(rnn_h))
        y_out = linear_out[-1]
        
        return y_out

In [29]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [30]:
lstm_model = ModelLSTM(embedding_size=EMBEDDING_SIZE, vocab_size=len(word2idx), hidden_size=HIDDEN_SIZE, target_size=len(tag2idx), stacked_layers=STACKED_LAYERS)

lstm_model.to(device)
print(lstm_model)

criterion = nn.BCEWithLogitsLoss()

optimizer =  optim.Adam(lstm_model.parameters())

ModelLSTM(
  (word_embeddings): Embedding(844, 256)
  (lstm): LSTM(256, 8, num_layers=8, batch_first=True, dropout=0.3)
  (linear): Linear(in_features=8, out_features=1, bias=True)
  (tanh): Tanh()
)


## Train model.

In [31]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [32]:
lstm_model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for batch in train_loader:
        x_batch, y_batch = map(list, zip(*batch))
        x_batch = [torch.tensor(i).to(device) for i in x_batch]
        y_batch = torch.tensor(y_batch).long().to(device)
                
        optimizer.zero_grad()
        
        y_pred = lstm_model(x_batch)      
        
        loss = criterion(y_pred.squeeze(1), y_batch.float())
        acc = binary_acc(y_pred.squeeze(1), y_batch.float())
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {acc}')
    

Epoch 001: | Loss: 0.69650 | Acc: 58.0
Epoch 002: | Loss: 0.69541 | Acc: 58.0
Epoch 003: | Loss: 0.69487 | Acc: 58.0
Epoch 004: | Loss: 0.69375 | Acc: 58.0
Epoch 005: | Loss: 0.69320 | Acc: 58.0
Epoch 006: | Loss: 0.69265 | Acc: 58.0
Epoch 007: | Loss: 0.69314 | Acc: 58.0
Epoch 008: | Loss: 0.69256 | Acc: 58.0
Epoch 009: | Loss: 0.69163 | Acc: 58.0
Epoch 010: | Loss: 0.69232 | Acc: 58.0
Epoch 011: | Loss: 0.69193 | Acc: 58.0
Epoch 012: | Loss: 0.69257 | Acc: 58.0
Epoch 013: | Loss: 0.69203 | Acc: 57.0
Epoch 014: | Loss: 0.69166 | Acc: 58.0
Epoch 015: | Loss: 0.69106 | Acc: 62.0
Epoch 016: | Loss: 0.69195 | Acc: 60.0
Epoch 017: | Loss: 0.69239 | Acc: 57.0
Epoch 018: | Loss: 0.69153 | Acc: 58.0
Epoch 019: | Loss: 0.68823 | Acc: 67.0
Epoch 020: | Loss: 0.68475 | Acc: 70.0
Epoch 021: | Loss: 0.67692 | Acc: 78.0
Epoch 022: | Loss: 0.66500 | Acc: 80.0
Epoch 023: | Loss: 0.64755 | Acc: 83.0
Epoch 024: | Loss: 0.62325 | Acc: 80.0
Epoch 025: | Loss: 0.59756 | Acc: 82.0
Epoch 026: | Loss: 0.5656

## Test Model.

In [33]:
y_out_tags_list = []
with torch.no_grad():
    for batch in test_loader:
        x_batch, y_batch = map(list, zip(*batch))
        x_batch = [torch.tensor(i).to(device) for i in x_batch]
        y_batch = torch.tensor(y_batch).long().to(device)
        
        y_pred = lstm_model(x_batch)
        y_pred = torch.sigmoid(y_pred)
        y_pred_tag = torch.round(y_pred)

        y_out_tags_list.append(y_pred_tag.squeeze(0).cpu().numpy())

## Confusion Matrix.

In [34]:
y_out_tags_list = [a.squeeze().tolist() for a in y_out_tags_list]

In [35]:
print(confusion_matrix(y_test, y_out_tags_list))

[[107  52]
 [ 39 102]]


## Classification Report.

In [36]:
print(classification_report(y_test, y_out_tags_list))

              precision    recall  f1-score   support

           0       0.73      0.67      0.70       159
           1       0.66      0.72      0.69       141

    accuracy                           0.70       300
   macro avg       0.70      0.70      0.70       300
weighted avg       0.70      0.70      0.70       300



## View model output.

In [37]:
idx2word = {v: k for k, v in word2idx.items()}
idx2tag = {v: k for k, v in tag2idx.items()}

In [38]:
print('{:80}: {:15}\n'.format("Sentence", "Sentiment"))
for sentence, tag in zip(X_test[:10], y_out_tags_list[:10]):
    s = " ".join([idx2word[w] for w in sentence])
    print('{:80}: {:5}\n'.format(s, tag))


Sentence                                                                        : Sentiment      

omg the food was                                                                :   0.0

very filling meals                                                              :   1.0

too bad the food damn                                                           :   0.0

far the best have ever had                                                      :   1.0

think this restaurant from not trying hard enough                               :   1.0

awful service                                                                   :   1.0

the owners are super friendly and the staff                                     :   1.0

dont think will back for very long time                                         :   0.0

their chicken fried steak and eggs all time favorite                            :   1.0

after all the reviews couldn wait eat here what disappointment                  :   0.0

