In [None]:
import torch
from torchtext import data
from torchtext import datasets
import torch.nn as nn
import numpy as np
import random

In [None]:
# Fix the seed of every RNG involved (Python, NumPy, PyTorch) so results can be reproduced
SEED = 1
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    # also seed the CUDA generator explicitly when a GPU is present
    torch.cuda.manual_seed(SEED)

# every tensor and model in the notebook is placed on this device
device = torch.device("cuda" if USE_CUDA else "cpu")

In [None]:
# A Field describes how one column of the dataset is parsed:
# labels become float tensors, review text is tokenized with spaCy.
LABEL = data.LabelField(dtype=torch.float)
TEXT = data.Field(tokenize='spacy')

In [None]:
# torchtext ships a built-in IMDB dataset; splits() yields the train and test sets,
# parsed with the TEXT and LABEL fields defined above
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# report the size of each split, one per line
print(
    f'Number of training examples: {len(train_data)}',
    f'Number of testing examples: {len(test_data)}',
    sep='\n',
)

In [None]:
# inspect the class of the dataset object returned by IMDB.splits
type(train_data)

In [None]:
# inspect one training example; vars() exposes the instance's attributes as a dict
print(vars(train_data.examples[0]))

In [None]:
# Split the training dataset into train and validation subsets.
# Dataset.split() expects `random_state` to be a state tuple as returned by
# random.getstate(). random.seed() returns None, so seed the generator first and
# then hand over the resulting state explicitly.
# NOTE: this cell rebinds `train_data`; re-running it without reloading the data
# splits the already-reduced training set again.
random.seed(1)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [None]:
# report the size of each split after carving out the validation set
print(
    f'Number of training examples: {len(train_data)}',
    f'Number of validation examples: {len(valid_data)}',
    f'Number of testing examples: {len(test_data)}',
    sep='\n',
)

In [None]:
# Vocabularies are built from the training split only.
# TEXT: capped at 25000 tokens, GloVe 100-d vectors attached,
# vectors for tokens missing from GloVe are drawn with torch.Tensor.normal_.
TEXT.build_vocab(
    train_data,
    max_size=25000,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

In [None]:
# report the size of both vocabularies, one per line
print(
    f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}",
    f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}",
    sep='\n',
)

In [None]:
# show the 20 highest-count entries of the vocabulary's frequency table
print(TEXT.vocab.freqs.most_common(20))

In [None]:
# first 10 entries of the index-to-string list (itos)
print(TEXT.vocab.itos[:10])

In [None]:
# string-to-index mapping (stoi) of the label vocabulary
print(LABEL.vocab.stoi)

In [None]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BucketIterator groups sentences of similar length into the same batch;
# one iterator is created per split, in the order the datasets are given.
dataset_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    dataset_splits,
    batch_size=BATCH_SIZE,
    device=device,
)

In [None]:
# grab one batch and map the token ids in column 0 of its text tensor back to strings
# (column 0 is one example, assuming text is (seq_length, batch_size) as the model docstrings state)
batch = next(iter(train_iterator))
[TEXT.vocab.itos[i] for i in batch.text[:, 0]]

In [None]:
# display the batch object's repr
batch

In [None]:
# word averaging model
import torch.nn as nn
import torch.nn.functional as F

class WordAVGModel(nn.Module):
    """Bag-of-embeddings classifier: average the word embeddings, then apply a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each word embedding.
        output_dim: number of output units of the final linear layer.
        pad_idx: index of the padding token, passed to nn.Embedding as padding_idx.
    """

    def __init__(self, vocab_size, embedding_size, output_dim, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_size, output_dim)

    def forward(self, text):
        '''
        text: (seq_length, batch_size)
        returns: (batch_size, output_dim)
        '''
        embedded = self.embedding(text)  # (seq_length, batch_size, embedding_size)
        embedded = embedded.transpose(1, 0)  # (batch_size, seq_length, embedding_size)
        # A (seq_length, 1) window averages over the whole sequence and leaves the
        # embedding axis untouched, giving (batch_size, 1, embedding_size).
        # Only that singleton axis is squeezed: a bare squeeze() would also drop the
        # batch axis when batch_size == 1.
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        return self.fc(pooled)

In [None]:
# model hyperparameters
EMBEDDING_DIM = 100
OUTPUT_DIM = 1

# sizes and indices taken from the TEXT vocabulary
INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = WordAVGModel(
    vocab_size=INPUT_DIM,
    embedding_size=EMBEDDING_DIM,
    output_dim=OUTPUT_DIM,
    pad_idx=PAD_IDX,
)

In [None]:
# display the module structure
model

In [None]:
# check the total number of parameters in the model
def count_parameters(model):
    """Return how many scalar values live in the trainable parameters of `model`.

    Parameters with requires_grad == False are not counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()  # numel() is the element count of one tensor
    return trainable_total

In [None]:
# number of trainable parameters of the current model
count_parameters(model)

In [None]:
pretrained_embedding = TEXT.vocab.vectors # vectors attached to the vocabulary by build_vocab (GloVe)
# display the shape of the pretrained vector table
pretrained_embedding.shape

In [None]:
# initialize the model's embedding weights with the pretrained vectors (in-place copy)
model.embedding.weight.data.copy_(pretrained_embedding)

In [None]:
# zero out the embedding rows of the special tokens <unk> and <pad>
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
# loss: binary cross-entropy computed directly on raw logits
criterion = nn.BCEWithLogitsLoss()
# optimizer: Adam over all model parameters, default hyperparameters
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# move the model to the selected device (GPU when available, otherwise CPU)
model = model.to(device)

In [None]:
def binary_accuracy(preds, y):
    '''
    Fraction of examples whose thresholded prediction equals the target.

    preds: (batch_size) raw logits
    y: (batch_size) targets
    returns: 0-dim tensor holding the accuracy
    '''
    # logits -> probabilities -> hard decisions via rounding
    decisions = torch.sigmoid(preds).round()
    hits = torch.eq(decisions, y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the example-weighted mean loss and accuracy.

    Args:
        model: module mapping batch.text to one logit per example.
        iterator: iterable of batches exposing `.text` and `.label`.
        optimizer: optimizer stepping the parameters of `model`.
        criterion: loss taking (logits, labels), averaged over the batch.

    Returns:
        (mean_loss, mean_accuracy) over all examples seen in the epoch.
    """
    model.train()

    epoch_loss = 0
    epoch_acc = 0
    total_len = 0

    for batch in iterator:
        # Flatten to (batch_size). A bare squeeze() would turn the output of a
        # single-example batch into a 0-dim tensor that no longer matches the
        # (1,) label tensor expected by the loss.
        preds = model(batch.text).reshape(-1)
        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss/acc are per-batch means, so weight them by the batch size before
        # averaging over the epoch (the last batch may be smaller)
        batch_len = len(batch.label)
        epoch_loss += loss.item() * batch_len
        epoch_acc += acc.item() * batch_len
        total_len += batch_len

    return epoch_loss / total_len, epoch_acc / total_len

In [None]:
def evaluate(model, iterator, optimizer, criterion):
    """Score `model` on `iterator` without updating it.

    Args:
        model: module mapping batch.text to one logit per example.
        iterator: iterable of batches exposing `.text` and `.label`.
        optimizer: unused; kept so existing calls keep working.
        criterion: loss taking (logits, labels), averaged over the batch.

    Returns:
        (mean_loss, mean_accuracy) over all examples in the iterator.
    """
    # remember the caller's mode so evaluation does not silently flip it
    was_training = model.training
    model.eval()

    epoch_loss = 0
    epoch_acc = 0
    total_len = 0

    try:
        with torch.no_grad():
            for batch in iterator:
                # Flatten to (batch_size). A bare squeeze() would turn the output
                # of a single-example batch into a 0-dim tensor that mismatches
                # the (1,) label tensor.
                preds = model(batch.text).reshape(-1)
                loss = criterion(preds, batch.label)
                acc = binary_accuracy(preds, batch.label)

                # weight the per-batch means by the batch size
                batch_len = len(batch.label)
                epoch_loss += loss.item() * batch_len
                epoch_acc += acc.item() * batch_len
                total_len += batch_len
    finally:
        # restore the mode the model was in on entry (train mode inside the training loop)
        model.train(was_training)

    return epoch_loss / total_len, epoch_acc / total_len

In [None]:
NUM_EPOCHS = 10
best_valid_acc = 0

for epoch in range(NUM_EPOCHS):
    # one optimisation pass over the training split, then score the validation split
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, optimizer, criterion)

    # track the best validation accuracy seen so far (checkpoint saving is disabled)
    best_valid_acc = max(best_valid_acc, valid_acc)

    print("epoch", epoch, "Train loss", train_loss, "Train Acc", train_acc)
    print("epoch", epoch, "Valid loss", valid_loss, "Valid Acc", valid_acc)
    

In [None]:
import spacy
# spaCy pipeline used to tokenize raw sentences at prediction time
# NOTE(review): the 'en' shortcut name is not available in spaCy v3+ — confirm the
# installed version, or load the full package name (e.g. 'en_core_web_sm') instead
nlp = spacy.load('en')

def predict_sentiment(sentence):
    """Return the model's probability for `sentence` as a Python float.

    The sentence is tokenized with the spaCy tokenizer, mapped to vocabulary
    indices and fed to the global `model` as a batch of one. Inference runs in
    eval mode with gradients disabled, so dropout layers do not perturb the
    prediction; the model's previous train/eval mode is restored afterwards.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # (seq_length, batch_size = 1)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = torch.sigmoid(model(tensor))  # logit -> probability
    finally:
        model.train(was_training)
    return prediction.item()

In [None]:
# sanity check on a clearly negative sentence
predict_sentiment("This film is terrible")

In [None]:
# sanity check on a clearly positive sentence
predict_sentiment("This film is great")

In [None]:
# RNN model
class RNNModel(nn.Module):
    """LSTM classifier that predicts from the final hidden state of the top layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: dimensionality of each word embedding.
        hidden_size: hidden units per LSTM direction.
        output_dim: number of output units of the final linear layer.
        nlayers: number of stacked LSTM layers.
        pad_idx: index of the padding token, passed to nn.Embedding as padding_idx.
        bidirectional: run the LSTM in both directions and concatenate the two
            final hidden states.
        dropout: dropout probability for embeddings, between LSTM layers and
            before the linear layer.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_dim, nlayers, pad_idx, bidirectional=True, dropout=0.3):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_size, hidden_size,
                            num_layers=nlayers,
                            bidirectional=bidirectional,
                            # inter-layer dropout is meaningless (and warned about) with one layer
                            dropout=dropout if nlayers > 1 else 0)
        # the classifier input is one hidden vector per direction
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.nlayers = nlayers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional

    def forward(self, text):
        '''
        text: (seq_length, batch_size)
        returns: (batch_size, output_dim)
        '''
        embed = self.dropout(self.embed(text))  # (seq_length, batch_size, embed_size)
        # no initial state is passed, so the LSTM starts from all zeros
        # output: (seq_length, batch, num_directions * hidden_size)
        # hidden, cell: (nlayers * num_directions, batch, hidden_size)
        output, (hidden, cell) = self.lstm(embed)

        if self.bidirectional:
            # the last two rows are the forward and backward states of the top layer
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # (batch, hidden_size * 2)
        else:
            # unidirectional: the last row is the top layer's final state
            hidden = hidden[-1]  # (batch, hidden_size)
        hidden = self.dropout(hidden)

        return self.fc(hidden)


In [None]:
# two-layer bidirectional LSTM classifier, created and placed on the selected device
# (.to(device) is a no-op when the device is the CPU)
model = RNNModel(
    vocab_size=INPUT_DIM,
    embed_size=EMBEDDING_DIM,
    hidden_size=100,
    output_dim=OUTPUT_DIM,
    nlayers=2,
    pad_idx=PAD_IDX,
    bidirectional=True,
    dropout=0.3,
).to(device)

In [None]:
pretrained_embedding = TEXT.vocab.vectors # vectors attached to the vocabulary by build_vocab (GloVe)
# display the shape of the pretrained vector table
pretrained_embedding.shape

In [None]:
# initialize the RNN model's embedding weights with the pretrained vectors (in-place copy)
model.embed.weight.data.copy_(pretrained_embedding)

In [None]:
# zero out the embedding rows of the special tokens <unk> and <pad>
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

for special_idx in (UNK_IDX, PAD_IDX):
    model.embed.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
# loss: binary cross-entropy computed directly on raw logits
criterion = nn.BCEWithLogitsLoss()
# a fresh Adam optimizer bound to the parameters of the newly created model
optimizer = torch.optim.Adam(model.parameters())

In [None]:
NUM_EPOCHS = 10
best_valid_acc = 0

for epoch in range(NUM_EPOCHS):
    # one optimisation pass over the training split, then score the validation split
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, optimizer, criterion)

    # track the best validation accuracy seen so far (checkpoint saving is disabled)
    best_valid_acc = max(best_valid_acc, valid_acc)

    print("epoch", epoch, "Train loss", train_loss, "Train Acc", train_acc)
    print("epoch", epoch, "Valid loss", valid_loss, "Valid Acc", valid_acc)

In [None]:
# CNN model

class CNNModel(nn.Module):
    """Convolutional text classifier with several filter widths and max-over-time pooling.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each word embedding.
        output_dim: number of output units of the final linear layer.
        pad_idx: index of the padding token, passed to nn.Embedding as padding_idx.
        num_filters: number of feature maps per filter width.
        filter_sizes: iterable of filter widths (in tokens); the input sequence
            must be at least as long as the largest one.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_size, output_dim, pad_idx, num_filters, filter_sizes, dropout = 0.3):
        super(CNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        # one convolution per filter width; each kernel spans the full embedding axis
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=num_filters,
                      kernel_size=(fs, embedding_size))
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        '''
        text: (seq_length, batch_size)
        returns: (batch_size, output_dim)
        '''
        embedded = self.embed(text)  # (seq_length, batch_size, embedding_size)
        embedded = embedded.transpose(1, 0)  # (batch_size, seq_length, embedding_size)
        embedded = embedded.unsqueeze(1)  # (batch_size, 1, seq_length, embedding_size)

        # Each conv yields (batch_size, num_filters, seq_length - fs + 1, 1).
        # Only the trailing width axis is squeezed: a bare squeeze() would also drop
        # the batch axis for batch_size == 1 and the length axis when seq_length == fs.
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]

        # max over time: (batch_size, num_filters, 1) -> (batch_size, num_filters)
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        cat = self.dropout(torch.cat(pooled, dim=1))  # (batch_size, len(filter_sizes) * num_filters)

        return self.fc(cat)

In [None]:
# CNN classifier with filter widths 3/4/5, created and placed on the selected device
# (.to(device) is a no-op when the device is the CPU)
model = CNNModel(
    vocab_size=INPUT_DIM,
    embedding_size=EMBEDDING_DIM,
    output_dim=OUTPUT_DIM,
    pad_idx=PAD_IDX,
    num_filters=100,
    filter_sizes=[3, 4, 5],
    dropout=0.3,
).to(device)

In [None]:
# copy the pretrained vectors attached to the vocabulary into the embedding layer
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

# zero out the embedding rows of the special tokens <unk> and <pad>
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
for special_idx in (UNK_IDX, PAD_IDX):
    model.embed.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
# loss: binary cross-entropy computed directly on raw logits
criterion = nn.BCEWithLogitsLoss()
# a fresh Adam optimizer bound to the parameters of the newly created model
optimizer = torch.optim.Adam(model.parameters())

In [None]:
NUM_EPOCHS = 10
best_valid_acc = 0

for epoch in range(NUM_EPOCHS):
    # one optimisation pass over the training split, then score the validation split
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, optimizer, criterion)

    # track the best validation accuracy seen so far (checkpoint saving is disabled)
    best_valid_acc = max(best_valid_acc, valid_acc)

    print("epoch", epoch, "Train loss", train_loss, "Train Acc", train_acc)
    print("epoch", epoch, "Valid loss", valid_loss, "Valid Acc", valid_acc)