# NLP - Binary Text Classification using CNN+RNN - Sample

By [Akshaj Verma](https://akshajverma.com)  

This notebook takes you through a sample implementation of binary text classification in the form of sentiment analysis on yelp reviews using CNN+RNN in PyTorch.

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


%matplotlib inline

torch.manual_seed(1)

<torch._C.Generator at 0x7f1bbfcb9470>

References:
    
1. https://towardsdatascience.com/nlp-learning-series-part-3-attention-cnn-and-what-not-for-text-classification-4313930ed566
        

## Prepare Data

In [2]:
training_data = [
    ("Ronaldo loves apples and mangoes.".split(), "Postive"),
    ("Rooney hates apple and bananas.".split(), "Negative")
]

sentence_list = [training_data[x][0] for x in range(len(training_data))]
tag_list = [training_data[x][1] for x in range(len(training_data))]

### The input sentences.

In [3]:
sentence_list

[['Ronaldo', 'loves', 'apples', 'and', 'mangoes.'],
 ['Rooney', 'hates', 'apple', 'and', 'bananas.']]

### The output tags.

In [4]:
tag_list

['Postive', 'Negative']

### Clean the input data by converting it into lower case.

In [5]:
data_clean_list = []
for sentence, tags in training_data:
    clean_sentence = [x.lower().split('.')[0] for x in sentence]
    data_clean_list += [(clean_sentence, tags)]

    
sentence_clean_list = [data_clean_list[x][0] for x in range(len(data_clean_list))]

In [6]:
sentence_clean_list

[['ronaldo', 'loves', 'apples', 'and', 'mangoes'],
 ['rooney', 'hates', 'apple', 'and', 'bananas']]

### Create a vocab for input words.

In [7]:
words = []
for sentence in sentence_clean_list:
    words += sentence
words = list(set(words))
print(f"Size of word-vocablury: {len(words)}\n")
print(words)

Size of word-vocablury: 9

['apples', 'apple', 'loves', 'and', 'ronaldo', 'bananas', 'mangoes', 'hates', 'rooney']


### Create a dictionary for input <=> ID.

In [8]:
word2idx = {word: i for i, word in enumerate(words)}
print(word2idx)

{'apples': 0, 'apple': 1, 'loves': 2, 'and': 3, 'ronaldo': 4, 'bananas': 5, 'mangoes': 6, 'hates': 7, 'rooney': 8}


### Create a vocab for output tags.

In [9]:
tags = []
for tag in tag_list:
    tags.append(tag)
tags = list(set(tags))
print(f"Size of tag-vocab: {len(tags)}\n")
print(tags)

Size of tag-vocab: 2

['Negative', 'Postive']


### Create a dictionary for output <=> ID.

In [10]:
tag2idx = {word: i for i, word in enumerate(tags)}
print(tag2idx)

{'Negative': 0, 'Postive': 1}


### Encode the words to numbers.

In [11]:
sentence_clean_list, tag_list

([['ronaldo', 'loves', 'apples', 'and', 'mangoes'],
  ['rooney', 'hates', 'apple', 'and', 'bananas']],
 ['Postive', 'Negative'])

In [12]:
X = [[word2idx[w] for w in s] for s in sentence_clean_list]
X

[[4, 2, 0, 3, 6], [8, 7, 1, 3, 5]]

In [13]:
y = [tag2idx[t] for t in tag_list]
y

[1, 0]

## Neural Network Params and Data Loader

Input -> CNN -> Linear -> Sigmoid

### Define the model parameters.

In [14]:
EMBEDDING_SIZE = 5
HIDDEN_SIZE = 2
LEARNING_RATE = 0.01
EPOCH = 10
BATCH_SIZE = 1

### Data Loader.

In [15]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [16]:
train_data = TrainData(torch.Tensor(X).to(torch.long), torch.Tensor(y).to(torch.float32))
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE)

In [17]:
for i, j in train_loader:
    print(i, j)

tensor([[4, 2, 0, 3, 6]]) tensor([1.])
tensor([[8, 7, 1, 3, 5]]) tensor([0.])


## CNN MODEL

In [23]:
class CnnRnnModel(nn.Module):
    
    def __init__(self, embedding_size, vocab_size, hidden_size, target_size):
        super(CnnRnnModel, self).__init__()
        
        self.word_embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_size)
        self.cnn = nn.Conv1d(in_channels=embedding_size, out_channels=3, kernel_size=3, stride=1, padding = 1)
        self.gru = nn.GRU(input_size = 3, hidden_size=hidden_size, batch_first = True)
        self.linear = nn.Linear(in_features = hidden_size, out_features=1)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        embeds_t = embeds.transpose(1, 2)
        cnn = torch.relu(self.cnn(embeds_t))
        gru_input = cnn.transpose(1, 2)
        _, gru_hidden = self.gru(gru_input)
        
#         cnn, _ = torch.max(cnn, dim = 2)
#         cnn = torch.relu(cnn)
        linear = self.linear(gru_hidden)
        
        return linear

In [24]:
cnn_rnn_model = CnnRnnModel(embedding_size=EMBEDDING_SIZE, vocab_size=len(word2idx), hidden_size=HIDDEN_SIZE, target_size=len(tag2idx))
print(cnn_rnn_model)

criterion = nn.BCEWithLogitsLoss()
optimizer =  optim.Adam(cnn_rnn_model.parameters())

CnnRnnModel(
  (word_embeddings): Embedding(9, 5)
  (cnn): Conv1d(5, 3, kernel_size=(3,), stride=(1,), padding=(1,))
  (gru): GRU(3, 2, batch_first=True)
  (linear): Linear(in_features=2, out_features=1, bias=True)
)


### See how the output from the CNN model looks.

In [25]:
with torch.no_grad():
    for x_batch, y_batch in train_loader:
        print("Input: \n", x_batch)
        y_out = cnn_rnn_model(x_batch)
        
        y_out_sigmoid = torch.sigmoid(y_out)
        y_out_tags = torch.round(y_out_sigmoid)

        
        print("\nLinear Output:", y_out.shape)
        print(y_out)
        
        print("\nSigmoid Output:")
        print(y_out_sigmoid)
        
        print("\nOutput Indices:")
        print(y_out_tags)
        
        print("\nActual Output:")
        print(y_batch)
        
        print("=" * 50)

Input: 
 tensor([[4, 2, 0, 3, 6]])

Linear Output: torch.Size([1, 1, 1])
tensor([[[-0.0270]]])

Sigmoid Output:
tensor([[[0.4933]]])

Output Indices:
tensor([[[0.]]])

Actual Output:
tensor([1.])
Input: 
 tensor([[8, 7, 1, 3, 5]])

Linear Output: torch.Size([1, 1, 1])
tensor([[[0.0384]]])

Sigmoid Output:
tensor([[[0.5096]]])

Output Indices:
tensor([[[1.]]])

Actual Output:
tensor([0.])


### Train the CNN model

In [26]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [27]:
cnn_rnn_model.train()
for e in range(1, EPOCH+1):
    epoch_loss = 0
    epoch_acc = 0
    for x_batch, y_batch in train_loader:
        
        optimizer.zero_grad()
        
        y_out = cnn_rnn_model(x_batch)
        
        loss = criterion(y_out.squeeze(0), y_batch.unsqueeze(0))
        acc = binary_acc(y_out.squeeze(0), y_batch.unsqueeze(0))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    print(f'Epoch: {e+0:02} | Loss: {epoch_loss/len(train_loader):.5f} | Accuracy: {acc}')

Epoch: 01 | Loss: 0.71090 | Accuracy: 0.0
Epoch: 02 | Loss: 0.70929 | Accuracy: 0.0
Epoch: 03 | Loss: 0.70835 | Accuracy: 0.0
Epoch: 04 | Loss: 0.70749 | Accuracy: 0.0
Epoch: 05 | Loss: 0.70666 | Accuracy: 0.0
Epoch: 06 | Loss: 0.70585 | Accuracy: 0.0
Epoch: 07 | Loss: 0.70505 | Accuracy: 0.0
Epoch: 08 | Loss: 0.70427 | Accuracy: 0.0
Epoch: 09 | Loss: 0.70349 | Accuracy: 0.0
Epoch: 10 | Loss: 0.70271 | Accuracy: 0.0
