In this project, we build machine learning models to detect the sentiment of movie reviews using the IMDb movie reviews dataset. Specifically, we implement a Convolutional Neural Network (CNN) model and a Recurrent Neural Network (RNN) model.

First, we select "GPU" as the runtime type and import all the libraries that we need.


In [None]:
from collections import defaultdict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
import torchtext 
import random

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

if __name__ == '__main__':
    print('Using device:', device)

Using device: cuda


# 1: Download the dataset
We download the dataset using torchtext, which is a package that supports NLP for PyTorch. The dataset we use is the IMDb movie reviews dataset.



In [None]:
def preprocess(review):
    """Split a raw review into word and punctuation tokens.

    The review is split on single spaces. From every chunk, at most one
    leading mark (``(``, ``"`` or ``'``) and at most one trailing mark
    (``. , ; : ? ! " ' )``) are peeled off and emitted as their own tokens.

    Empty pieces are never emitted: runs of spaces are skipped instead of
    raising ``IndexError``, and a chunk consisting only of punctuation yields
    just its punctuation characters.

    Args:
        review: Raw review text.

    Returns:
        List of token strings in their original order.
    """
    leading = {'(', '"', "'"}
    trailing = {'.', ',', ';', ':', '?', '!', '"', "'", ')'}

    tokens = []
    for chunk in review.split(' '):
        if not chunk:
            # Leading, trailing or consecutive spaces produce empty chunks.
            continue
        if len(chunk) == 1:
            # A lone character is already a single token; peeling it would
            # either duplicate it or leave an empty string behind.
            tokens.append(chunk)
            continue
        start = 1 if chunk[0] in leading else 0
        end = len(chunk) - 1 if chunk[-1] in trailing else len(chunk)
        pieces = (chunk[:start], chunk[start:end], chunk[end:])
        tokens.extend(piece for piece in pieces if piece)
    return tokens

if __name__ == '__main__':
    # Fetch the IMDb training split (cached under .data) and tokenize each review.
    train_data = [
        (example[0], preprocess(example[1]))
        for example in torchtext.datasets.IMDB(root='.data', split='train')
    ]

    # Items 0-9,999 and 12,500-22,499 form the training set; every other item
    # goes to the test set. The test slice is taken first because train_data
    # is rebound on the following line.
    test_data = train_data[10000:12500] + train_data[22500:]
    train_data = train_data[:10000] + train_data[12500:22500]

    print('Num. Train Examples:', len(train_data))
    print('Num. Test Examples:', len(test_data))

    print("\nSAMPLE DATA:")
    for example in random.sample(train_data, 5):
        print('Sample text:', example[1])
        print('Sample label:', example[0], '\n')

100%|██████████| 84.1M/84.1M [00:01<00:00, 42.8MB/s]


Num. Train Examples: 20000
Num. Test Examples: 5000

SAMPLE DATA:
Sample text: ['When', 'I', 'go', 'out', 'to', 'the', 'video', 'store', 'to', 'rent', 'a', 'flick', 'I', 'usually', 'trust', "IMDb's", 'views', 'on', 'a', 'film', 'and', ',', 'until', 'this', 'one', ',', 'had', 'never', 'seen', 'a', 'flick', 'rated', '7.0', 'or', 'above', 'on', 'the', 'site', 'I', 'did', 'not', 'enjoy.<br', '/><br', '/>Sidney', 'Lumet', ',', 'a', 'legendary', 'director', 'of', 'some', 'of', 'the', 'best', 'films', 'of', 'the', '20th', 'century', ',', 'really', 'misstepped', 'here', 'by', 'making', 'one', 'of', 'the', 'biggest', 'mistakes', 'a', 'filmmaker', 'can', ':', 'filling', 'a', "film's", 'cast', 'with', 'thoroughly', 'unlikeable', 'characters', 'with', 'no', 'real', 'redeeming', 'qualities', 'whatsoever.<br', '/><br', '/>I', 'like', 'films', 'with', 'flawed', 'characters', ',', 'but', 'no', 'matter', 'how', 'dark', "someone's", 'personality', 'is', 'we', 'all', 'have', 'a', 'bit', 'of', 'light', 'i

# 2: Data Processing



In [None]:
PAD = '<PAD>'
END = '<END>'
UNK = '<UNK>'


class TextDataset(data.Dataset):
    """Map tokenized ``(label, tokens)`` examples to fixed-length index tensors.

    Args:
        examples: Sequence of ``(label, tokens)`` pairs, where ``tokens`` is a
            list of strings.
        split: One of ``'train'``, ``'val'`` or ``'test'``. Only the train
            split builds a vocabulary; other splits must be given one.
        threshold: Minimum number of occurrences (after lowercasing) a word
            needs in ``examples`` to enter the vocabulary.
        max_len: Length every text is truncated or padded to in ``get_text``.
        idx2word: Index-to-word mapping to reuse (ignored for the train split).
        word2idx: Word-to-index mapping to reuse (ignored for the train split).

    Raises:
        ValueError: If a non-train split is created without ``word2idx``.
    """

    def __init__(self, examples, split, threshold, max_len, idx2word=None, word2idx=None):

        self.examples = examples
        assert split in {'train', 'val', 'test'}
        self.split = split
        self.threshold = threshold
        self.max_len = max_len

        # Dictionaries
        self.idx2word = idx2word
        self.word2idx = word2idx
        if split == 'train':
            self.build_dictionary()
        if self.word2idx is None:
            # Fail with a clear message rather than an opaque len(None) error.
            raise ValueError("word2idx must be provided when split is not 'train'")
        self.vocab_size = len(self.word2idx)

        # Convert text to indices
        self.textual_ids = []
        self.convert_text()

    def build_dictionary(self):
        """Build ``word2idx`` / ``idx2word`` from the training examples.

        Indices 0, 1 and 2 are reserved for PAD, END and UNK. Every lowercased
        word occurring at least ``threshold`` times gets the next free index,
        in order of first appearance.
        """
        assert self.split == 'train'

        self.idx2word = {0: PAD, 1: END, 2: UNK}
        self.word2idx = {PAD: 0, END: 1, UNK: 2}

        # Count all words first ...
        counter = {}
        for line in self.examples:
            for word in line[1]:
                word_lower = word.lower()
                counter[word_lower] = counter.get(word_lower, 0) + 1

        # ... then assign indices in a single pass once counting is complete
        # (dicts preserve insertion order, so this follows first appearance).
        index = 3
        for word, count in counter.items():
            if self.threshold <= count:
                self.idx2word[index] = word
                self.word2idx[word] = index
                index += 1

    def convert_text(self):
        """Fill ``textual_ids`` (untruncated ids ending in END) and ``labels``."""
        unk_idx = self.word2idx[UNK]
        end_idx = self.word2idx[END]

        self.textual_ids = []
        self.labels = []
        for i, line in enumerate(self.examples):
            # Look up the lowercased form; the vocabulary stores lowercase keys.
            ids = [self.word2idx.get(word.lower(), unk_idx) for word in line[1]]
            ids.append(end_idx)
            self.textual_ids.append(ids)
            self.labels.append(self.get_label(i))

    def get_text(self, idx):
        """Return the example's token ids, truncated or padded to ``max_len``.

        Returns:
            Tuple ``(text, text_len)``: ``text`` is a LongTensor of length
            ``max_len`` and ``text_len`` is a scalar LongTensor holding the
            number of tokens in the example before truncation or padding.
        """
        tokens = self.examples[idx][1]
        text_len = len(tokens)

        unk_idx = self.word2idx[UNK]
        text = [self.word2idx.get(word.lower(), unk_idx) for word in tokens[:self.max_len]]
        # Pad with the PAD index itself; lowercasing the '<PAD>' marker and
        # looking it up would miss the vocabulary and encode padding as UNK.
        text.extend([self.word2idx[PAD]] * (self.max_len - len(text)))

        return (torch.LongTensor(text), torch.LongTensor([text_len]).squeeze())

    def get_label(self, idx):
        """Return a scalar LongTensor: 1 if the label is ``"pos"``, else 0."""
        label = self.examples[idx][0]
        return torch.LongTensor([1 if label == "pos" else 0]).squeeze()

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``(text, label)`` for the example at ``idx``."""
        text, _ = self.get_text(idx)
        label = self.get_label(idx)

        return text, label

In [None]:
if __name__=='__main__':
    # Sample item
    Ds = TextDataset(train_data, 'train', threshold=10, max_len=150)
    print('Vocab size:', Ds.vocab_size)

    # randrange excludes its upper bound, so the index is always valid;
    # random.randint(0, len(Ds)) is inclusive and could pick len(Ds) itself.
    text, label = Ds[random.randrange(len(Ds))]
    print('Example text:', text)
    print('Example label:', label)

Vocab size: 19002
Example text: tensor([ 1008,    54,  1370,    41,    54,  1370,    41,    54,    24,  1550,
           70,   334,    11,     7,   157,  4585,   772,    41,   143,     2,
           24,   458,   167,     7,     2,    52,  5377,  1152,  2746,    28,
           12,    24,   267,   507,   846,   799,  3011,   359,    34,     2,
           64,    24,    13,   755,  1130,  1330,    27,    13,   971,  3006,
           50,    12,    34,   384,   100,   162,     2,    52,     2,    34,
           12,    24,    41,   768,    38,  6153,    22,  6154,  4201,   232,
           31,    17,   103,   155,  7171,    55,   329,  1954,  8809,     2,
           11,    13,  1103,    45,     2,    38,   180,   634,   148,  7369,
           38,   203,  2463,    52, 18723,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,    

# 3: Convolutional Neural Network (CNN)

In [None]:
class CNN(nn.Module):
    """Text classifier with parallel convolutions of different heights.

    Each filter height gets its own convolution spanning the full embedding
    width. Every branch is passed through ReLU and max-pooled over the
    sequence axis; the pooled features are concatenated, passed through
    dropout and fed to a linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension (also the width of every filter).
        out_channels: Number of output channels per convolution.
        filter_heights: Iterable of filter heights, one convolution per entry.
        stride: Stride passed to every convolution.
        dropout: Dropout probability applied before the classifier.
        num_classes: Number of output logits.
        pad_idx: Index of the padding token (used as ``padding_idx``).
    """

    def __init__(self, vocab_size, embed_size, out_channels, filter_heights, stride, dropout, num_classes, pad_idx):
        super(CNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)

        input_channels = 1
        # One convolution per requested height instead of a hard-coded three.
        self.convs = nn.ModuleList([
            nn.Conv2d(input_channels, out_channels, (height, embed_size), stride=stride)
            for height in filter_heights
        ])
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(len(self.convs) * out_channels, num_classes)

    def forward(self, texts):
        """Return class logits for a batch of token-id sequences.

        Args:
            texts: LongTensor of token ids, one row per example. Each row must
                be at least as long as the largest filter height.

        Returns:
            Tensor with one row per example and ``num_classes`` columns.
        """
        # Add a channel axis so Conv2d sees [batch, 1, seq_len, embed_size].
        emb = torch.unsqueeze(self.embedding(texts), 1)

        pooled = []
        for conv in self.convs:
            # The filter spans the whole embedding width, so the last axis of
            # the conv output has size 1 and can be squeezed away.
            features = F.relu(torch.squeeze(conv(emb), 3))
            # Max over every position of the sequence axis.
            pooled.append(F.max_pool1d(features, kernel_size=features.shape[2]).squeeze(2))

        concat = torch.cat(pooled, dim=1)

        # Classify the dropped-out features (dropout is a no-op in eval mode).
        output = self.linear(self.dropout(concat))

        return output

Initialize the train and test dataloaders.

In [None]:
if __name__ == '__main__':
    # Data hyperparameters: vocabulary threshold, sequence length, batch size.
    THRESHOLD = 5
    MAX_LEN = 100
    BATCH_SIZE = 32

    # The training split builds the vocabulary.
    train_Ds = TextDataset(examples=train_data, split='train', threshold=THRESHOLD, max_len=MAX_LEN)
    train_loader = torch.utils.data.DataLoader(
        train_Ds,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        drop_last=True,
    )

    # The test split reuses the training vocabulary and is read one example at a time.
    test_Ds = TextDataset(
        examples=test_data,
        split='test',
        threshold=THRESHOLD,
        max_len=MAX_LEN,
        idx2word=train_Ds.idx2word,
        word2idx=train_Ds.word2idx,
    )
    test_loader = torch.utils.data.DataLoader(
        test_Ds,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        drop_last=False,
    )



In [None]:
from tqdm.notebook import tqdm

def train_model(model, num_epochs, data_loader, optimizer, criterion):
    """Train ``model`` in place for ``num_epochs`` passes over ``data_loader``.

    Each batch is moved to the module-level ``device``. After every epoch the
    loss and accuracy, averaged over the batches of that epoch, are printed.

    Args:
        model: Module mapping a batch of texts to class logits.
        num_epochs: Number of passes over ``data_loader``.
        data_loader: Iterable yielding ``(texts, labels)`` batches.
        optimizer: Optimizer updating the parameters of ``model``.
        criterion: Loss taking ``(output, labels)``.
    """
    print('Training Model...')
    model.train()
    for epoch in tqdm(range(num_epochs)):
        running_loss, running_acc = 0, 0
        for texts, labels in data_loader:
            texts, labels = texts.to(device), labels.to(device)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            logits = model(texts)
            batch_acc = accuracy(logits, labels)

            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()
        num_batches = len(data_loader)
        print('[TRAIN]\t Epoch: {:2d}\t Loss: {:.4f}\t Train Accuracy: {:.2f}%'.format(epoch+1, running_loss/num_batches, 100*running_acc/num_batches))
    print('Model Trained!\n')

Some other helper functions.

In [None]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are not counted.
        if param.requires_grad:
            total += param.numel()
    return total


def accuracy(output, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        output: Tensor of per-class scores, one row per example.
        labels: Tensor of target class indices, one per example.

    Returns:
        Scalar float tensor: number of correct predictions / ``len(labels)``.
    """
    predicted = torch.argmax(output, dim=1)
    num_correct = predicted.eq(labels).sum().float()
    return num_correct / len(labels)

Now you can instantiate your model. We provide you with some recommended hyperparameters; you should be able to get the desired accuracy with these, but feel free to play around with them.

In [None]:
if __name__ == '__main__':
    # Build the CNN with the recommended hyperparameters and move it to `device`.
    cnn_model = CNN(
        vocab_size=train_Ds.vocab_size,
        embed_size=128,
        out_channels=64,
        filter_heights=[2, 3, 4],
        stride=1,
        dropout=0.5,
        num_classes=2,
        pad_idx=train_Ds.word2idx[PAD],
    ).to(device)

    print('The model has {:,d} trainable parameters'.format(count_parameters(cnn_model)))

The model has 3,879,746 trainable parameters


Create the criterion and define our optimizer.

In [None]:
if __name__ == '__main__':
    LEARNING_RATE = 5e-4

    # Adam over the CNN's parameters, with a cross-entropy loss on `device`.
    optimizer = optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss().to(device)

Finally, train the CNN model.

In [None]:
if __name__ == '__main__':
    N_EPOCHS = 25

    # Train the CNN for N_EPOCHS epochs.
    train_model(
        model=cnn_model,
        num_epochs=N_EPOCHS,
        data_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )

Training Model...


  0%|          | 0/25 [00:00<?, ?it/s]

[TRAIN]	 Epoch:  1	 Loss: 0.5790	 Train Accuracy: 68.72%
[TRAIN]	 Epoch:  2	 Loss: 0.4255	 Train Accuracy: 80.66%
[TRAIN]	 Epoch:  3	 Loss: 0.3159	 Train Accuracy: 87.33%
[TRAIN]	 Epoch:  4	 Loss: 0.2178	 Train Accuracy: 92.57%
[TRAIN]	 Epoch:  5	 Loss: 0.1322	 Train Accuracy: 97.02%
[TRAIN]	 Epoch:  6	 Loss: 0.0712	 Train Accuracy: 99.25%
[TRAIN]	 Epoch:  7	 Loss: 0.0349	 Train Accuracy: 99.91%
[TRAIN]	 Epoch:  8	 Loss: 0.0177	 Train Accuracy: 99.99%
[TRAIN]	 Epoch:  9	 Loss: 0.0096	 Train Accuracy: 100.00%
[TRAIN]	 Epoch: 10	 Loss: 0.0057	 Train Accuracy: 100.00%
[TRAIN]	 Epoch: 11	 Loss: 0.0038	 Train Accuracy: 100.00%
[TRAIN]	 Epoch: 12	 Loss: 0.0039	 Train Accuracy: 99.99%
[TRAIN]	 Epoch: 13	 Loss: 0.0019	 Train Accuracy: 100.00%
[TRAIN]	 Epoch: 14	 Loss: 0.0028	 Train Accuracy: 99.99%
[TRAIN]	 Epoch: 15	 Loss: 0.0007	 Train Accuracy: 100.00%
[TRAIN]	 Epoch: 16	 Loss: 0.0004	 Train Accuracy: 100.00%
[TRAIN]	 Epoch: 17	 Loss: 0.0003	 Train Accuracy: 100.00%
[TRAIN]	 Epoch: 18	 Loss

Then we can evaluate the model.

In [None]:
import random

def evaluate(model, data_loader, criterion):
    """Evaluate ``model`` on ``data_loader`` and print a few sample predictions.

    The model is switched to eval mode and run without gradient tracking.
    Loss and accuracy are averaged over batches. Roughly 0.15% of batches are
    chosen at random and their first example is printed alongside its
    prediction and label.

    Args:
        model: Module mapping a batch of texts to class logits.
        data_loader: Loader yielding ``(texts, labels)`` batches; its
            ``dataset`` must expose ``idx2word`` and ``word2idx``.
        criterion: Loss taking ``(output, labels)``.

    Returns:
        Tuple ``(predictions, full_acc, full_loss)``: the concatenated
        predicted class indices, the accuracy in percent and the mean loss.
    """
    print('Evaluating performance on the test dataset...')
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    all_predictions = []

    dataset = data_loader.dataset
    # Ids hidden when printing a sample; built once rather than once per token.
    hidden_ids = {dataset.word2idx[PAD], dataset.word2idx[END]}

    print("\nSOME PREDICTIONS FROM THE MODEL:")
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for texts, labels in tqdm(data_loader):
            texts = texts.to(device)
            labels = labels.to(device)

            output = model(texts)
            acc = accuracy(output, labels)
            pred = output.argmax(dim=1)
            all_predictions.append(pred)

            loss = criterion(output, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            if random.random() < 0.0015:
                print("Input: "+' '.join([dataset.idx2word[idx] for idx in texts[0].tolist() if idx not in hidden_ids]))
                # Index the first example so this also works for batch sizes > 1.
                print("Prediction:", pred[0].item(), '\tCorrect Output:', labels[0].item(), '\n')

    full_acc = 100*epoch_acc/len(data_loader)
    full_loss = epoch_loss/len(data_loader)
    print('[TEST]\t Loss: {:.4f}\t Accuracy: {:.2f}%'.format(full_loss, full_acc))
    predictions = torch.cat(all_predictions)
    return predictions, full_acc, full_loss

In [None]:
if __name__ == '__main__':
    # Report test loss/accuracy for the trained CNN.
    evaluate(model=cnn_model, data_loader=test_loader, criterion=criterion)

Evaluating performance on the test dataset...

SOME PREDICTIONS FROM THE MODEL:


  0%|          | 0/5000 [00:00<?, ?it/s]

Input: for months i've been hearing about this little movie and now i've seen it . i find it cute , cute how so many <UNK> directors make movies where they combine other people's creative ideas in order to make their own one-joke premise of a movie . troops , <UNK> , any of the million blair witch parodies come to mind . if all that these directors want is a foot inside hollywood's door then they're doing the right thing and they should keep it up because combining plot outlines is how hollywood makes films . how many times have
Prediction: 1 	Correct Output: 0 

Input: what i loved about the on-screen adaptation of the stone angel is that it stayed so true to the novel ! great film ! as an avid reader , i find the worst thing about film adaptations is that the book somehow gets lost in translation . you can tell the stone angel team was careful not to let this happen with this film.<br /><br <UNK> burstyn was an excellent casting choice for the role of hagar and she is definitely a mo

# 4: Recurrent Neural Network (RNN)

In [None]:
class RNN(nn.Module):
    """GRU-based text classifier.

    Token ids are embedded and fed through a (possibly multi-layer, possibly
    bidirectional) GRU. The final hidden state of the last layer is passed
    through dropout and a linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: GRU hidden size per direction.
        num_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability, used between GRU layers and before the
            classifier.
        num_classes: Number of output logits.
        pad_idx: Index of the padding token (used as ``padding_idx``).
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, bidirectional, dropout, num_classes, pad_idx):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.directions = 2 if bidirectional else 1

        self.embeddings = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)

        self.gru = nn.GRU(embed_size,
                          hidden_size,
                          num_layers=num_layers,
                          bidirectional=bidirectional,
                          # Inter-layer dropout has no effect on a single-layer
                          # GRU and only triggers a warning there.
                          dropout=dropout if num_layers > 1 else 0.0,
                          batch_first=True)

        self.dropout = nn.Dropout(dropout)

        # The classifier sees one final state per direction.
        self.fc = nn.Linear(hidden_size * self.directions, num_classes)

    def forward(self, texts):
        """Return class logits for a batch of token-id sequences.

        Args:
            texts: LongTensor of token ids, one row per example.

        Returns:
            Tensor with one row per example and ``num_classes`` columns.
        """
        embedded = self.embeddings(texts)

        _, hidden = self.gru(embedded)

        if self.directions == 2:
            # The last two entries are the forward and backward final states
            # of the top layer.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: the last entry is the top layer's final state.
            hidden = hidden[-1, :, :]

        out = self.dropout(hidden)

        out = self.fc(out)
        return out



In [None]:
if __name__ == '__main__':
    # Data hyperparameters: vocabulary threshold, sequence length, batch size.
    THRESHOLD = 5
    MAX_LEN = 100
    BATCH_SIZE = 32

    # The training split builds the vocabulary.
    train_Ds = TextDataset(examples=train_data, split='train', threshold=THRESHOLD, max_len=MAX_LEN)
    train_loader = torch.utils.data.DataLoader(
        train_Ds,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        drop_last=True,
    )

    # The test split reuses the training vocabulary and is read one example at a time.
    test_Ds = TextDataset(
        examples=test_data,
        split='test',
        threshold=THRESHOLD,
        max_len=MAX_LEN,
        idx2word=train_Ds.idx2word,
        word2idx=train_Ds.word2idx,
    )
    test_loader = torch.utils.data.DataLoader(
        test_Ds,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        drop_last=False,
    )



In [None]:
if __name__ == '__main__':
    # Build the two-layer bidirectional GRU classifier and move it to `device`.
    rnn_model = RNN(
        vocab_size=train_Ds.vocab_size,
        embed_size=128,
        hidden_size=128,
        num_layers=2,
        bidirectional=True,
        dropout=0.5,
        num_classes=2,
        pad_idx=train_Ds.word2idx[PAD],
    ).to(device)

    print('The model has {:,d} trainable parameters'.format(count_parameters(rnn_model)))

The model has 4,300,546 trainable parameters


In [None]:
if __name__ == '__main__':
    LEARNING_RATE = 5e-4

    # Adam over the RNN's parameters, with a cross-entropy loss on `device`.
    optimizer = optim.Adam(rnn_model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss().to(device)

In [None]:
if __name__ == '__main__':
    N_EPOCHS = 20

    # Train the RNN for N_EPOCHS epochs.
    train_model(
        model=rnn_model,
        num_epochs=N_EPOCHS,
        data_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )

Training Model...


  0%|          | 0/20 [00:00<?, ?it/s]

[TRAIN]	 Epoch:  1	 Loss: 0.6608	 Train Accuracy: 59.66%
[TRAIN]	 Epoch:  2	 Loss: 0.5120	 Train Accuracy: 75.26%
[TRAIN]	 Epoch:  3	 Loss: 0.3949	 Train Accuracy: 82.94%
[TRAIN]	 Epoch:  4	 Loss: 0.3119	 Train Accuracy: 87.14%
[TRAIN]	 Epoch:  5	 Loss: 0.2346	 Train Accuracy: 91.18%
[TRAIN]	 Epoch:  6	 Loss: 0.1618	 Train Accuracy: 94.12%
[TRAIN]	 Epoch:  7	 Loss: 0.1110	 Train Accuracy: 96.20%
[TRAIN]	 Epoch:  8	 Loss: 0.0677	 Train Accuracy: 97.62%
[TRAIN]	 Epoch:  9	 Loss: 0.0444	 Train Accuracy: 98.61%
[TRAIN]	 Epoch: 10	 Loss: 0.0313	 Train Accuracy: 98.92%
[TRAIN]	 Epoch: 11	 Loss: 0.0209	 Train Accuracy: 99.30%
[TRAIN]	 Epoch: 12	 Loss: 0.0251	 Train Accuracy: 99.14%
[TRAIN]	 Epoch: 13	 Loss: 0.0146	 Train Accuracy: 99.43%
[TRAIN]	 Epoch: 14	 Loss: 0.0128	 Train Accuracy: 99.53%
[TRAIN]	 Epoch: 15	 Loss: 0.0234	 Train Accuracy: 99.22%
[TRAIN]	 Epoch: 16	 Loss: 0.0116	 Train Accuracy: 99.61%
[TRAIN]	 Epoch: 17	 Loss: 0.0118	 Train Accuracy: 99.61%
[TRAIN]	 Epoch: 18	 Loss: 0.012

Finally, we evaluate the RNN. 


In [None]:
if __name__ == '__main__':
    # Report test loss/accuracy for the trained RNN.
    evaluate(model=rnn_model, data_loader=test_loader, criterion=criterion)

Evaluating performance on the test dataset...

SOME PREDICTIONS FROM THE MODEL:


  0%|          | 0/5000 [00:00<?, ?it/s]

Input: no wonder most of the cast wished they never made this movie . it's just plain ridiculous and embarrassing to watch . bad actors reading cheesy lines while shiny classic <UNK> cars continuously circle a diner that looks more like a disneyland attraction . students <UNK> with the deranged principal as he tries to stop them from setting fire to a bronze civil war statue . the watts riots with a cast <UNK> ? dermot mulroney tries not to gag while he makes out with a mary hartman look-alike with the most annoying smile since ' mr . <UNK> .
Prediction: 0 	Correct Output: 0 

Input: the imdb guidelines state that you have to declare if your comments contain ' <UNK> . <br /><br />well , this whole film is something of a <UNK> . a cautionary tale that glorifies what it <UNK> against , a tale of lost youth that doesn't know where it itself is going.<br /><br />i just saw this at the tribeca film festival . this film wasn't just bad , it was really bad.<br /><br />the acting is inconsiste