In [2]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
nltk.download("all")
import matplotlib.pyplot as plt
import torch

%matplotlib inline

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

In [3]:
from nltk.tokenize import word_tokenize
from collections import defaultdict
from tqdm import tqdm_notebook #notebook.tqdm
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from torch.utils.data import SequentialSampler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import time

In [4]:
URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'
# website from where we get the Movie Review (MR) Datasets from (Pang and Lee, 2005)
!wget -P 'Data/' $URL
# Unzipping the data 
!tar xvzf 'Data/rotten_imdb.tar.gz' -C 'Data/'

--2021-05-19 18:11:30--  http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519599 (507K) [application/x-gzip]
Saving to: ‘Data/rotten_imdb.tar.gz’


2021-05-19 18:11:31 (1.44 MB/s) - ‘Data/rotten_imdb.tar.gz’ saved [519599/519599]

quote.tok.gt9.5000
plot.tok.gt9.5000
subjdata.README.1.0


In [5]:
def load_text(path):
  file = open(path,'rb')
  text = []
  for line in file:
    text.append(line.decode(errors='ignore').lower().strip())
  return text

subj_text = load_text('Data/quote.tok.gt9.5000')
obj_text = load_text('Data/plot.tok.gt9.5000')

texts = np.array(subj_text + obj_text)
labels = np.array([0]*len(subj_text) + [1]*len(obj_text))

In [6]:
print(len(texts))

10000


In [7]:
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"

if os.path.isdir(FILE):
    print("fastText exists.")
else:
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

--2021-05-19 18:11:31--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘fastText/crawl-300d-2M.vec.zip’


2021-05-19 18:12:06 (42.1 MB/s) - ‘fastText/crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

Archive:  fastText/crawl-300d-2M.vec.zip
  inflating: fastText/crawl-300d-2M.vec  


In [8]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("No. of GPU(s) available: {}".format(torch.cuda.device_count()))
    print('Device name:', torch.cuda.get_device_name(0))       
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No. of GPU(s) available: 1
Device name: Tesla T4


In [9]:
def tokenize(texts):

    max_len = 0
    tokenized_texts = []
    word2idx = {}
    lengths_of_sentences = []

    # <pad> refers to padding and <unk> refers to unknown word 
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1

    idx = 2
    for sent in texts:
        tokenized_sent = word_tokenize(sent)
        tokenized_texts.append(tokenized_sent)
        count = 0
        
        for token in tokenized_sent:
            count+=1
            if token not in word2idx:
                word2idx[token] = idx
                idx += 1
        lengths_of_sentences.append(count)
                
        max_len = max(max_len, len(tokenized_sent))
    lengths_of_sentences = np.array(lengths_of_sentences)
    print("Average Sentence Length = {}".format(np.mean(lengths_of_sentences)))

    return tokenized_texts, word2idx, max_len

In [10]:
def encode(tokenized_texts, word2idx, max_len):
    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Padding the sentences to max_len
        tokenized_sent += ['<pad>'] * (max_len - len(tokenized_sent))

        # Encode tokens to input_ids
        input_id = [word2idx.get(token) for token in tokenized_sent]
        input_ids.append(input_id)
    
    return np.array(input_ids)

In [11]:
def load_pretrained_vectors(word2idx, fname):
    print("Loading pretrained vectors...")
    fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())

    # Initilize random embeddings
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
    embeddings[word2idx['<pad>']] = np.zeros((d,))

    # Load pretrained vectors
    count = 0
    for line in tqdm_notebook(fin):
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2idx:
            count += 1
            embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

    print("There are {} / {} pretrained vectors found.".format(count,len(word2idx)))

    return embeddings


In [12]:
# Tokenize, build vocabulary, encode tokens
print("Tokenizing...\n")
tokenized_texts, word2idx, max_len = tokenize(texts)
input_ids = encode(tokenized_texts, word2idx, max_len)

# Load pretrained vectors
embeddings = load_pretrained_vectors(word2idx, "fastText/crawl-300d-2M.vec")
embeddings = torch.tensor(embeddings)

Tokenizing...

Average Sentence Length = 24.5983
Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


There are 20686 / 22603 pretrained vectors found.


In [13]:
def data_loader(train_inputs, val_inputs, train_labels, val_labels,batch_size=50):

    train_inputs = torch.tensor(train_inputs)
    train_labels = torch.tensor(train_labels)
    val_inputs = torch.tensor(val_inputs)
    val_labels = torch.tensor(val_labels)

    # Create DataLoader for training data
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=50)

    # Create DataLoader for validation data
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

In [14]:


# Train Test Split
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, test_size=0.1, random_state=42)

# Load data to PyTorch DataLoader
train_dataloader, val_dataloader = \
data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)

In [15]:
class CNN_NLP(nn.Module):
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):

        super(CNN_NLP, self).__init__()
        # Embedding layer
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):

        x_embed = self.embedding(input_ids).float()
        x_reshaped = x_embed.permute(0, 2, 1)
        #Applying CNN and Relu
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]
        
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        
        logits = self.fc(self.dropout(x_fc))

        return logits

In [16]:
def initilize_model(pretrained_embedding=None,
                    freeze_embedding=False,
                    vocab_size=None,
                    embed_dim=300,
                    filter_sizes=[3, 4, 5],
                    num_filters=[100, 100, 100],
                    num_classes=2,
                    dropout=0.5,
                    learning_rate=0.01):

    assert (len(filter_sizes) == len(num_filters)), "filter_sizes and \
    num_filters need to be of the same length."

    # Instantiate CNN model
    cnn_model = CNN_NLP(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=freeze_embedding,
                        vocab_size=vocab_size,
                        embed_dim=embed_dim,
                        filter_sizes=filter_sizes,
                        num_filters=num_filters,
                        num_classes=2,
                        dropout=0.5)
    
    cnn_model.to(device)

    # Instantiate Adadelta optimizer
    optimizer = optim.Adadelta(cnn_model.parameters(),
                               lr=learning_rate,
                               rho=0.95)

    return cnn_model, optimizer


In [17]:
# Specify loss function
#Here we are using cross entropy loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, optimizer, train_dataloader, val_dataloader=None, epochs=10):
    
    # Tracking best validation accuracy
    best_accuracy = 0

    # Start training loop
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {\
    'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*60)

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================

        # Tracking time and loss
        t0_epoch = time.time()
        total_loss = 0

        # Put the model into the training mode
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Load batch to GPU
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Update parameters
            optimizer.step()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        # =======================================
        #               Evaluation
        # =======================================
        if val_dataloader is not None:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Track the best accuracy
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | {\
            val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            
    print("\n")
    print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")

def evaluate(model, val_dataloader):
    # Put the model into the evaluation mode. The dropout layers are disabled
    # during the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [18]:
# CNN-rand: Word vectors are randomly initialized.
set_seed(42)
cnn_rand, optimizer = initilize_model(vocab_size=len(word2idx),
                                      embed_dim=300,
                                      learning_rate=0.25,
                                      dropout=0.5)
train(cnn_rand, optimizer, train_dataloader, val_dataloader, epochs=20)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.538103   |  0.411311  |   82.50   |   2.44   
   2    |   0.371911   |  0.353353  |   84.00   |   2.00   
   3    |   0.287519   |  0.322773  |   85.80   |   2.01   
   4    |   0.216464   |  0.307361  |   87.50   |   2.02   
   5    |   0.171799   |  0.294630  |   87.40   |   2.02   
   6    |   0.124279   |  0.298507  |   87.90   |   2.01   
   7    |   0.093694   |  0.318203  |   87.60   |   2.02   
   8    |   0.072563   |  0.295311  |   88.00   |   2.04   
   9    |   0.057001   |  0.298977  |   87.50   |   2.03   
  10    |   0.039542   |  0.304720  |   88.30   |   2.03   
  11    |   0.032479   |  0.317744  |   87.10   |   2.03   
  12    |   0.027298   |  0.325577  |   88.50   |   2.04   
  13    |   0.021505   |  0.324176  |   88.40   |   2.04   
  14    |   0.018217   |  0.335937  |   88.30   |   2.04   
  15    |   0.016200

In [19]:
# CNN-static: fastText pretrained word vectors are used and freezed during training.
set_seed(42)
cnn_static, optimizer = initilize_model(pretrained_embedding=embeddings,
                                        freeze_embedding=True,
                                        learning_rate=0.25,
                                        dropout=0.5)
train(cnn_static, optimizer, train_dataloader, val_dataloader, epochs=20)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.397957   |  0.262969  |   89.40   |   1.21   
   2    |   0.218151   |  0.219098  |   91.50   |   1.22   
   3    |   0.163483   |  0.206098  |   92.10   |   1.22   
   4    |   0.125811   |  0.196106  |   92.90   |   1.22   
   5    |   0.092236   |  0.194029  |   92.90   |   1.22   
   6    |   0.071394   |  0.191029  |   92.70   |   1.23   
   7    |   0.054524   |  0.197126  |   93.30   |   1.23   
   8    |   0.038503   |  0.205798  |   93.10   |   1.23   
   9    |   0.028741   |  0.206026  |   93.10   |   1.24   
  10    |   0.022989   |  0.213769  |   93.00   |   1.24   
  11    |   0.017188   |  0.217765  |   92.80   |   1.24   
  12    |   0.014800   |  0.223221  |   93.10   |   1.25   
  13    |   0.011720   |  0.221663  |   93.20   |   1.25   
  14    |   0.008708   |  0.230347  |   93.40   |   1.25   
  15    |   0.007514

In [20]:
# CNN-non-static: fastText pretrained word vectors are fine-tuned during training.
set_seed(42)
cnn_non_static, optimizer = initilize_model(pretrained_embedding=embeddings,
                                            freeze_embedding=False,
                                            learning_rate=0.25,
                                            dropout=0.5)
train(cnn_non_static, optimizer, train_dataloader, val_dataloader, epochs=20)


Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.395160   |  0.258008  |   89.80   |   3.06   
   2    |   0.210909   |  0.210636  |   92.20   |   3.09   
   3    |   0.154489   |  0.196019  |   92.60   |   3.10   
   4    |   0.116275   |  0.186867  |   92.90   |   3.10   
   5    |   0.082834   |  0.186100  |   93.30   |   3.11   
   6    |   0.061945   |  0.186704  |   93.00   |   3.10   
   7    |   0.045453   |  0.192097  |   93.50   |   3.09   
   8    |   0.031788   |  0.201407  |   93.10   |   3.08   
   9    |   0.022912   |  0.205841  |   93.00   |   3.08   
  10    |   0.017606   |  0.214181  |   93.30   |   3.07   
  11    |   0.013030   |  0.218625  |   92.90   |   3.07   
  12    |   0.011093   |  0.227802  |   93.40   |   3.05   
  13    |   0.008822   |  0.223588  |   93.40   |   3.05   
  14    |   0.006672   |  0.230831  |   93.60   |   3.05   
  15    |   0.005536

In [21]:
def predict(text, model=cnn_non_static.to("cpu"), max_len=62):
    """Predict probability that a sentence is objective."""

    # Tokenize, pad and encode text
    tokens = word_tokenize(text.lower())
    padded_tokens = tokens + ['<pad>'] * (max_len - len(tokens))
    input_id = [word2idx.get(token, word2idx['<unk>']) for token in padded_tokens]

    # Convert to PyTorch tensors
    input_id = torch.tensor(input_id).unsqueeze(dim=0)

    # Compute logits
    logits = model.forward(input_id)

    #  Compute probability
    probs = F.softmax(logits, dim=1).squeeze(dim=0)

    print(f"This review is {probs[1] * 100:.2f}% objective.")

In [22]:
predict("The battery life of this camera is very good")
predict("this is a story that zings all the way through with originality , humour and pathos . ")
predict("he makes all efforts to bring ghisu back but does not succeed .")
predict("beautifully shot sequences episodically shift back and forth from the past to the present .")
predict("- john quincy archibald's son michael collapses while playing baseball as a result of a heart failure .")
predict("an inexplicable crack in the pyrenees mountains provokes excitement and scientific curiosity .")
predict("about as big a crowdpleaser as they possibly come .")
predict("iwai creates yuichi's world as much through disembodied moments of sight and sound as through action , building to a surprising stab of melancholy .")
predict("what's next ? the porky's revenge : ultimate edition ?")

This review is 6.72% objective.
This review is 0.00% objective.
This review is 99.27% objective.
This review is 98.87% objective.
This review is 99.43% objective.
This review is 99.25% objective.
This review is 0.00% objective.
This review is 0.01% objective.
This review is 0.16% objective.
