# Sentiment Analysis Tutorial 1

In [1]:
import torch
import torchtext
from torchtext.legacy import data
from torch.utils.data import DataLoader, random_split
import pandas as pd
import random
from tqdm import tqdm
import torch.nn.functional as F

torchtext.__version__
# torchtext.__version__

'0.9.1'

#### Making the results deterministic

In [12]:
SEED = 42  # The answer to life, the universe, and everything

# Seed every random number generator the notebook relies on: Python's
# `random` module is used for the dataset splits, torch for the model weights.
random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which may
# otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# Pick the compute backend: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    available_device = "cuda"
else:
    available_device = "cpu"

device = torch.device(available_device)
print(device)

cuda


In [17]:
# !wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz

In [5]:
# Decompress the downloaded archive into movie_data.csv (-f forces overwriting an existing output file).
# NOTE(review): the archive is consumed by gunzip, so re-running this cell needs the download above to be run again.
!gunzip -f movie_data.csv.gz

### Creating the dataset and iterator

In [5]:
# Load the CSV once just to confirm its dimensions; the frame is dropped right
# away because the same file is read again through torchtext's TabularDataset.
imdb_df = pd.read_csv("movie_data.csv")
print(imdb_df.shape)
del imdb_df

(50000, 2)


In [98]:
import spacy
spacy_en = spacy.load('en_core_web_sm')

In [99]:
def tokenizer(text):
    """Split ``text`` into a list of token strings using the loaded spaCy tokenizer."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]


tokenizer("The quick fox can't jump over a lazy dog.")

['The', 'quick', 'fox', 'ca', "n't", 'jump', 'over', 'a', 'lazy', 'dog', '.']

In [24]:
# Review text is tokenised with spaCy's small English pipeline.
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')

# Labels are kept as floats because the loss used later (BCEWithLogitsLoss)
# is fed batch.LABEL directly as its target.
LABEL = data.LabelField(dtype=torch.float)

# (attribute name, field) pairs handed to TabularDataset.
fields = [
    ('REVIEWS', TEXT),
    ('LABEL', LABEL),
]

In [29]:
# Parse the CSV into torchtext examples; the first row is the header and is skipped.
dataset = data.TabularDataset(
    path='movie_data.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)
len(dataset)

50000

In [44]:
# Hold out 10% of the reviews as the test set.
# random.seed() returns None, so it cannot be passed as `random_state`
# directly; seed first, then hand the split the resulting generator state.
random.seed(SEED)
train_set, test_set = dataset.split(split_ratio=[0.9, 0.1],
                                    random_state=random.getstate())

In [45]:
# Carve 10% of the remaining training reviews out as the validation set.
# random.seed() returns None, so it cannot be passed as `random_state`
# directly; seed first, then hand the split the resulting generator state.
# NOTE: this cell rebinds `train_set`, so running it twice in a row shrinks the
# training set again -- re-run the train/test split cell before re-running it.
random.seed(SEED)
train_set, val_set = train_set.split(split_ratio=[0.9, 0.1],
                                     random_state=random.getstate())

In [46]:
# Upper bound on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 20000

# Both vocabularies are fitted on the training split only.
TEXT.build_vocab(train_set, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_set)

# The text vocabulary also holds the special '<unk>' and '<pad>' entries,
# which is why its size exceeds MAX_VOCAB_SIZE by two.
print(f"Vocab size : {len(TEXT.vocab)}")
print(f"Label size : {len(LABEL.vocab)}")

Vocab size : 20002
Label size : 2


**Why do we only build the vocabulary on the training set?**
- When testing any machine learning system, you do not want to look at the test set in any way; building the vocabulary from it would leak information about the test data into training.
  We also exclude the validation set, because we want it to reflect the test set as closely as possible.


In [47]:
# Ten most frequent tokens and their counts, taken from TEXT.vocab.freqs.
top_commom_words = TEXT.vocab.freqs.most_common(10)
top_commom_words

[('the', 464964),
 (',', 441024),
 ('.', 378556),
 ('a', 250866),
 ('and', 250751),
 ('of', 231374),
 ('to', 214406),
 ('is', 173465),
 ('in', 141132),
 ('I', 125873)]

In [48]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


**We can also check the label mapping, ensuring that the label '0' (negative review) maps to index 0 and the label '1' (positive review) maps to index 1.**

In [49]:
print(LABEL.vocab.stoi)

defaultdict(None, {'0': 0, '1': 1})


In [50]:
LABEL.vocab.freqs

Counter({'1': 20235, '0': 20265})

#### Creating the iterator

The final step of preparing the data is creating the iterators.
We iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.

We'll use a `BucketIterator`, a special type of iterator that returns batches in which the examples are of similar length, minimizing the amount of padding per example.

We also want to place the tensors returned by the iterator on the GPU (if you're using one). 
PyTorch handles this using torch.device, we then pass this device to the iterator.


In [81]:
BATCH_SIZE = 64

# https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator
# Reviews are grouped by token count (sort_key); examples inside a batch are
# left unsorted, and every batch tensor is created on `device`.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_set, val_set, test_set),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.REVIEWS),
    sort_within_batch=False,
    device=device,
)

In [82]:
# Print the tensor shapes of the first batch from the training iterator,
# then from the validation iterator.
for split_iterator in (train_iterator, val_iterator):
    for first_batch in split_iterator:
        print(first_batch.REVIEWS.size())
        print(first_batch.LABEL.size())
        break

torch.Size([1013, 64])
torch.Size([64])
torch.Size([52, 64])
torch.Size([64])


## Defining the model

In [57]:
from torch import nn

In [60]:
class SentimentAnalayzer(nn.Module):
    """Single-layer RNN classifier: embedding -> nn.RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_size: width of each token embedding.
        hidden_dim: size of the RNN hidden state.
        num_classes: number of output units of the final linear layer.

    NOTE(review): the final hidden state is taken after the whole (padded)
    sequence has been processed, so padding positions also pass through the RNN.
    """

    def __init__(self, input_dim=20002, embedding_size=128, hidden_dim=256, num_classes=1):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_size)
        self.rnn = nn.RNN(input_size=embedding_size, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        """Map token indices ``x`` of shape [sent len, batch size] to scores of shape [batch size, num_classes]."""
        # [sent len, batch size] -> [sent len, batch size, embedding dim]
        token_vectors = self.embedding(x)

        # Only the final hidden state is used: [1, batch size, hidden dim]
        _, last_hidden = self.rnn(token_vectors)

        # Drop the leading axis -> [batch size, hidden dim]
        last_hidden = last_hidden.squeeze(0)

        return self.fc(last_hidden)

### Initialize the model

In [89]:
# Hyper-parameters: the embedding table gets one row per vocabulary entry.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 128, 256, 1

model = SentimentAnalayzer(
    input_dim=INPUT_DIM,
    embedding_size=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=OUTPUT_DIM,
)
model = model.to(device)

### Define optimizer and criterion

In [90]:
from torch.optim import Adam

lr = 0.005
optimizer = Adam(model.parameters(), lr=lr)

# Next, we define the loss function; in PyTorch this is commonly called a
# criterion.
#
# The model outputs an unbounded real number (a logit). Because the labels are
# either 0 or 1, the prediction has to be squashed into the range (0, 1),
# which is what the sigmoid (logistic) function does.
#
# BCEWithLogitsLoss carries out both the sigmoid and the binary cross-entropy
# steps, so the model's raw outputs are passed to it without a sigmoid.
criterion = nn.BCEWithLogitsLoss()

### Accuracy measuring function 

In [84]:
def batch_accuracy(preds, y):
    """Return the fraction of correct binary predictions in a batch.

    Args:
        preds: raw model scores (logits), one per example.
        y: target labels (0. or 1.) holding as many elements as ``preds``.

    Returns:
        A 0-dim tensor holding the accuracy, a value between 0 and 1.

    Raises:
        ValueError: if ``preds`` and ``y`` hold a different number of elements.
    """
    if preds.numel() != y.numel():
        raise ValueError(
            f"preds has {preds.numel()} elements but y has {y.numel()}"
        )

    # Align the shapes explicitly: comparing [batch, 1] against [batch] would
    # otherwise broadcast to [batch, batch] and silently corrupt the accuracy.
    preds = preds.reshape(y.shape)

    # sigmoid -> probability, round -> hard 0/1 decision at the 0.5 threshold.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()

    return correct.sum() / correct.numel()

### Evaluation function

In [85]:
def evaluate(model, iterator, criterion):
    """Compute the average loss and accuracy of ``model`` over ``iterator``.

    Args:
        model: module called as ``model(batch.REVIEWS)``; its output is squeezed on dim 1.
        iterator: sized iterable of batches exposing ``REVIEWS`` and ``LABEL``.
        criterion: loss callable invoked as ``criterion(scores, batch.LABEL)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats. Both are averages of
        the per-batch values, so every batch carries the same weight
        regardless of how many examples it holds.
    """
    # Score in eval mode, then put the model back into whatever mode the
    # caller had it in (a training loop may call this between epochs).
    was_training = model.training
    model.eval()

    total_loss = 0.0
    total_acc = 0.0
    try:
        with torch.no_grad():
            for batch in iterator:
                scores = model(batch.REVIEWS).squeeze(1)

                total_loss += criterion(scores, batch.LABEL).item()
                total_acc += batch_accuracy(scores, batch.LABEL).item()
    finally:
        model.train(was_training)

    return total_loss/len(iterator), total_acc/len(iterator)

### Training loop

In [72]:
NUM_EPOCHS = 5

In [69]:
len(train_iterator)

633

In [None]:
for epoch in range(NUM_EPOCHS):
    # Make sure the model is in training mode at the start of every epoch.
    model.train()

    loop = tqdm(train_iterator,
                total=len(train_iterator),
                leave=True)
    # 1-based epoch counter so the last epoch reads "[N/N]".
    loop.set_description(f"Epoch [{epoch + 1}/{NUM_EPOCHS}]")

    for batch in loop:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        out = model(batch.REVIEWS)
        scores = out.squeeze(1)

        loss = criterion(scores, batch.LABEL)
        loss.backward()
        optimizer.step()

        train_acc = batch_accuracy(scores, batch.LABEL)
        loop.set_postfix(loss=loss.item(), train_acc=train_acc.item())

    val_loss, val_acc = evaluate(model, val_iterator, criterion)

    # The progress bar has finished by now, so the epoch summary is printed
    # instead of being attached to the bar. `loss` and `train_acc` are the
    # values of the last training batch of the epoch.
    print(f"Epoch [{epoch + 1}/{NUM_EPOCHS}] "
          f"loss={loss.item():.4f} train_acc={train_acc.item():.4f} "
          f"val_loss={val_loss:.4f} val_acc={val_acc:.4f}")

print("Done training")

#### Val loss

In [92]:
val_loss

0.7265335380191534

In [93]:
val_acc

0.4951584507042254

#### Test Loss

In [94]:
# Score the model on the held-out test iterator; accuracy is printed as a percentage.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.722 | Test Acc: 50.00%


In [96]:
# model.load_state_dict(torch.load('SA_model_1.pt'))

### Generate Results

In [106]:
def predict_sentiment(model, review):
    """Return the sigmoid of the model's score for a single review string.

    Args:
        model: trained classifier taking a [sent len, batch size] index tensor.
        review: raw review text.

    Returns:
        A Python float between 0 and 1; values near 1 correspond to label
        index 1 and values near 0 to label index 0.
    """
    model.eval()

    tokens = tokenizer(review)
    # NOTE(review): tokens missing from the vocabulary presumably fall back to
    # the '<unk>' index through the stoi default -- confirm.
    indexed = [TEXT.vocab.stoi[t] for t in tokens]

    # Shape [sent len, 1]: sequence-first input with a batch axis of size one.
    in_tensor = torch.tensor(indexed, dtype=torch.long, device=device).unsqueeze(1)

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        pred = torch.sigmoid(model(in_tensor))
    return pred.item()

In [117]:
predict_sentiment(model, "An awesome movie and no cheesy scences, love it! Must watch")

0.6613641381263733