In [None]:
import os
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
import en_core_web_sm
import pickle as pkl
from tqdm import tqdm_notebook
import pickle as pkl
from collections import Counter
import numpy as np
import torch
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
%matplotlib inline

In [None]:
# Number of token ids per review: ImdbDataset truncates to this length and
# _collate_func pads up to it.
MAX_SENTENCE_LENGTH = 500
# Maximum number of distinct tokens build_vocab keeps (most frequent first),
# not counting the two reserved entries below.
max_vocab_size = 10000
# Reserved vocabulary ids: padding and out-of-vocabulary tokens.
PAD_IDX = 0
UNK_IDX = 1

### Raw data load

In [None]:
def get_data(path):
    """
    Read every file of a directory and collect its lines together with the
    rating encoded in each file name.

    @param path: directory to read; each file name is joined to it with
        os.path.join, so a trailing separator is optional
    @return: (data, n, rate) where data is the list of lines read from all
        files, n is len(data), and rate holds one string per file: the part of
        the file name stem after its last "_" (e.g. "10" for "123_10.txt").
        NOTE(review): assumes names of the form <id>_<rating>.<ext>; a stem
        without "_" is returned whole - confirm against the dataset layout.
    """
    data = []
    rate = []
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore any seeded split downstream) repeatable.
    for name in sorted(os.listdir(path)):
        stem = name.split(".")[0]
        # Keep the whole rating field. Taking only the last character of the
        # stem truncated two-digit ratings ("10" became "0").
        rate.append(stem.rsplit("_", 1)[-1])
        # Explicit encoding so the result does not depend on the platform locale.
        with open(os.path.join(path, name), "r", encoding="utf-8") as f:
            # One entry per line: a file with several lines contributes several
            # samples, so len(data) can exceed len(rate).
            data.extend(f.readlines())
    return data, len(data), rate

In [None]:
# Dataset root; get_data is called on its train/test x pos/neg sub-folders.
path = "aclImdb/"
# Positive reviews are loaded first, so n_pos (the length returned by get_data)
# is the number of positive samples at the head of train_data.
train_data, n_pos, _ = get_data(path+"train/pos/")
train_data.extend(get_data(path+"train/neg/")[0])
n_neg = len(train_data) - n_pos
# Targets follow the same order as the data: 1 for pos/, 0 for neg/.
train_targets = [1] * n_pos + [0] * n_neg
# Same procedure for the test split; n_pos and n_neg are overwritten here.
test_data, n_pos, _ = get_data(path+"test/pos/")
test_data.extend(get_data(path+"test/neg/")[0])
n_neg = len(test_data) - n_pos
test_targets = [1] * n_pos + [0] * n_neg

In [None]:
# Hold out 5000 training samples for validation; random_state fixes the split.
# NOTE(review): train_data / train_targets are overwritten with the reduced
# split, so re-running this cell without re-running the loading cell splits the
# already reduced training set again.
train_data, val_data, train_targets, val_targets = train_test_split(train_data, train_targets, test_size=5000, random_state=42)
print ("Train dataset size is {}".format(len(train_data)))
print ("Val dataset size is {}".format(len(val_data)))
print ("Test dataset size is {}".format(len(test_data)))

In [None]:
# Cache the raw splits so that later cells can reload them without re-reading
# the corpus. The target directory is created when missing, and the file is
# closed deterministically instead of being left to the garbage collector.
os.makedirs("data", exist_ok=True)
with open("data/raw_data.pickle", "wb") as raw_file:
    pkl.dump([train_data, val_data, train_targets, val_targets, test_data, test_targets], raw_file)

### Utils and Functions

In [None]:
# spaCy English pipeline used by tokenize(); loaded once when this cell runs.
tokenizer = en_core_web_sm.load()
# ASCII punctuation characters as one string (membership below is a substring test).
punctuations = string.punctuation
def tokenize(sent):
    """
    Run the module-level spaCy pipeline on one string.

    @param sent: raw text of a single sample
    @return: list of lowercased token texts, without the tokens whose text
        occurs inside the `punctuations` string
    """
    kept = []
    for tok in tokenizer(sent):
        text = tok.text
        if text in punctuations:
            continue
        kept.append(text.lower())
    return kept

In [None]:
# Credit to Ilya Kulikov
def lower_case_remove_punc(parsed):
    """
    Lowercase the tokens of an already parsed sample and drop punctuation.

    @param parsed: iterable of objects exposing a `.text` string
    @return: list of lowercased texts, skipping every item whose text occurs
        inside the module-level `punctuations` string
    """
    lowered = []
    for token in parsed:
        if token.text in punctuations:
            continue
        lowered.append(token.text.lower())
    return lowered

def _append_ngrams(tokens, gram):
    """
    Extend `tokens` in place with every window of `gram` consecutive tokens,
    concatenated without a separator. Does nothing when gram <= 1.

    @param tokens: list of token strings (mutated)
    @param gram: window size
    @return: the same list object, for convenience
    """
    if gram > 1:
        # The "+ 1" keeps the final window, which the previous bound dropped.
        # The right-hand side is fully built before the list grows, so the
        # windows only ever cover the original tokens.
        # NOTE(review): joining with "" makes e.g. ("a", "bc") and ("ab", "c")
        # the same feature; kept as-is so existing token caches stay comparable.
        tokens += ["".join(tokens[i:i+gram]) for i in range(len(tokens)-gram+1)]
    return tokens

def tokenize_dataset(dataset, gram=1):
    """
    Tokenize a list of strings with spaCy's batched pipe.

    @param dataset: iterable of raw text samples
    @param gram: when > 1, every window of `gram` consecutive tokens is added
        to each sample's token list in addition to the single tokens
    @return: (token_dataset, all_tokens) - the per-sample token lists and the
        concatenation of all of them
    """
    token_dataset = []
    all_tokens = []

    # A fresh pipeline is loaded on every call (shadows the module-level one).
    tokenizer = en_core_web_sm.load()
    # n_threads is no longer passed: spaCy deprecated it in v2.1, where it has
    # no effect.
    for sample in tqdm_notebook(tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512)):
        tokens = _append_ngrams(lower_case_remove_punc(sample), gram)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

def tokenize_dataset_seq(dataset, gram=1):
    """
    Same contract as tokenize_dataset, but tokenizes one sample at a time
    through tokenize() instead of the batched pipe.

    @param dataset: iterable of raw text samples
    @param gram: n-gram window size, see tokenize_dataset (defaults to 1 so
        both functions can be called the same way)
    @return: (token_dataset, all_tokens)
    """
    token_dataset = []
    all_tokens = []

    for sample in tqdm_notebook(dataset):
        tokens = _append_ngrams(tokenize(sample), gram)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

def nltk_tokenize_dataset(dataset):
    """
    Tokenize a list of strings with NLTK's word_tokenize.

    @param dataset: iterable of raw text samples
    @return: (token_dataset, all_tokens) - per-sample lists of lowercased
        tokens (tokens occurring inside the `punctuations` string are dropped)
        and the concatenation of all those lists
    """
    per_sample = []
    flat = []
    for text in tqdm_notebook(dataset):
        words = [word.lower() for word in word_tokenize(text) if word not in punctuations]
        per_sample.append(words)
        flat.extend(words)

    return per_sample, flat

In [None]:
def build_vocab(all_tokens):
    """
    Build the vocabulary from the max_vocab_size most frequent tokens.

    @param all_tokens: iterable of token strings (may be empty)
    @return: (token2id, id2token)
        token2id: dictionary mapping each token to its index
        id2token: list of tokens, where id2token[i] is the token with index i;
            positions PAD_IDX and UNK_IDX hold '<pad>' and '<unk>'
    """
    token_counter = Counter(all_tokens)
    # most_common() yields (token, count) pairs; only the tokens are needed.
    # Unlike zip(*pairs), the comprehension also works for an empty token
    # list, which used to raise ValueError on unpacking.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    # Regular tokens start at index 2 because 0 and 1 are reserved below.
    token2id = dict(zip(vocab, range(2, 2+len(vocab))))
    id2token = ['<pad>', '<unk>'] + vocab
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

In [None]:
def token2index_dataset(tokens_data):
    """
    Convert token lists to index lists using the module-level token2id mapping.

    @param tokens_data: iterable of token lists
    @return: list with one index list per input list; tokens missing from
        token2id are mapped to UNK_IDX
    """
    return [
        [token2id.get(token, UNK_IDX) for token in tokens]
        for tokens in tokens_data
    ]

In [None]:
class ImdbDataset(Dataset):
    """
    torch.utils.data.Dataset wrapper around parallel lists of samples and
    targets, usable for the train, validation or test split.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of samples, each a sliceable sequence
            (the notebook passes lists of token indices)
        @param target_list: list of targets, same length as data_list
        """
        self.data_list = data_list
        self.target_list = target_list
        # Samples and targets must be aligned one to one.
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of samples."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Called for dataset[key].

        @return: [tokens, length, label] where tokens is the sample cut to at
            most MAX_SENTENCE_LENGTH entries and length is len(tokens)
        """
        truncated = self.data_list[key][:MAX_SENTENCE_LENGTH]
        return [truncated, len(truncated), self.target_list[key]]

def _collate_func(batch):
    """
    Customized collate function for DataLoader: pads every sample of the batch
    with zeros up to MAX_SENTENCE_LENGTH so they can be stacked in one tensor.

    @param batch: sequence of [token_idx, length, label] items as returned by
        ImdbDataset.__getitem__
    @return: [data, lengths, labels] where data is an int64 tensor of shape
        (len(batch), MAX_SENTENCE_LENGTH) and lengths / labels are LongTensors
    """
    length_list = [datum[1] for datum in batch]
    label_list = [datum[2] for datum in batch]
    # Start from an all-zero matrix (0 is the padding id, see PAD_IDX and the
    # embedding's padding_idx) with an explicit int64 dtype. Padding each row
    # through np.array(...) let numpy choose the dtype: an empty sample became
    # float64 and turned the whole batch into floats, and the default integer
    # width depends on the platform.
    padded = np.zeros((len(batch), MAX_SENTENCE_LENGTH), dtype=np.int64)
    for row, datum in enumerate(batch):
        padded[row, :datum[1]] = datum[0]
    return [torch.from_numpy(padded), torch.LongTensor(length_list), torch.LongTensor(label_list)]


In [None]:
class BagOfWords(nn.Module):
    """
    BagOfWords classification model: averages the embeddings of a sample's
    tokens and feeds the result to a linear layer with two outputs.
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        """
        super(BagOfWords, self).__init__()
        # padding_idx=0: index 0 always embeds to the zero vector, so padded
        # positions do not contribute to the sum in forward().
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,2)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, 2)
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # Clamp the divisor to at least 1: a sample with length 0 then yields
        # the zero vector instead of NaN from 0/0 (NaN would propagate into the
        # loss and the gradients of the whole batch).
        denom = length.view(-1, 1).float().clamp(min=1)
        out = out / denom

        # return logits
        return self.linear(out)

In [None]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data = data.cuda()
        lengths = lengths.cuda()
        labels = labels.cuda()
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [None]:
def train(learning_rate, num_epochs, optimizer, save=False, label="", schedule=False, tolerance=6):
    """
    Train the module-level `model` on `train_loader` with cross-entropy loss,
    validating on `val_loader` every 100 iterations.

    @param learning_rate: not used inside this function (the optimizer already
        carries its learning rate); kept for the existing call signature
    @param num_epochs: maximum number of passes over train_loader
    @param optimizer: optimizer stepping the parameters of `model`
    @param save: when True, the state dict is written to 'model' + label + '.ckpt'
        each time the validation accuracy improves
    @param label: suffix for the checkpoint file name
    @param schedule: when True, the module-level `scheduler` is stepped with
        the validation accuracy after every validation
    @param tolerance: number of consecutive validations without improvement
        after which training stops
    @return: (hist, best_val) - the loss of every training step and the best
        validation accuracy seen
    """
    criterion = torch.nn.CrossEntropyLoss()
    hist = []
    best_val = 0
    fail_cnt = 0
    # Train on the device the model lives on instead of assuming CUDA; a model
    # without parameters falls back to the previous behaviour (CUDA).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")
    for epoch in range(num_epochs):
        for i, (data, lengths, labels) in enumerate(train_loader):
            # test_model() switches to eval mode, so re-enable training mode.
            model.train()
            data_batch, length_batch, label_batch = data.to(device), lengths.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            hist.append(loss.item())
            loss.backward()
            optimizer.step()
            # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                # validate
                val_acc = test_model(val_loader, model)
                if schedule:
                    scheduler.step(val_acc)
                if val_acc > best_val:
                    fail_cnt = 0
                    if save:
                        torch.save(model.state_dict(), 'model' + label + '.ckpt')
                    best_val = val_acc
                    print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc))
                else:
                    fail_cnt += 1
                    print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, failed {} times'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc, fail_cnt))
            if fail_cnt == tolerance:
                # Stop training altogether. A plain `break` only left the inner
                # loop, so every remaining epoch still ran one training step
                # before hitting this check again.
                return hist, best_val

    return hist, best_val

### Run the experiments

In [None]:
# Experiment log: configuration label -> {"hist": ..., "best_val": ...}.
# Re-running this cell discards everything recorded so far.
results = {}

In [None]:
# Reload the raw splits cached in data/raw_data.pickle.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open("data/raw_data.pickle", "rb") as raw_file:
    [train_data, val_data, train_targets, val_targets, test_data, test_targets] = pkl.load(raw_file)

In [None]:
# n-gram size handed to the tokenizers below.
gram = 2
# Each token cache is written inside a `with` block so the file is flushed and
# closed as soon as the dump finishes. File names are unchanged.
print ("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(val_data, gram=gram)
# val_data_tokens, _ = nltk_tokenize_dataset(val_data)
# NOTE(review): this name has no "_" before "tokens", unlike the train files.
with open("val_data_"+str(gram)+"tokens.p", "wb") as token_file:
    pkl.dump(val_data_tokens, token_file)

print ("Tokenizing train data")
train_data_tokens, all_train_tokens = tokenize_dataset(train_data, gram=gram)
# train_data_tokens, all_train_tokens = nltk_tokenize_dataset(train_data)
with open("train_data_"+str(gram)+"_tokens.p", "wb") as token_file:
    pkl.dump(train_data_tokens, token_file)
with open("all_train_"+str(gram)+"_tokens.p", "wb") as token_file:
    pkl.dump(all_train_tokens, token_file)

print ("Tokenizing test data")
# The test split goes through the one-sample-at-a-time tokenizer.
test_data_tokens, _ = tokenize_dataset_seq(test_data, gram=gram)
# NOTE(review): the test file name carries no gram suffix.
with open("test_data_tokens.p", "wb") as token_file:
    pkl.dump(test_data_tokens, token_file)

In [None]:
path = "./"
# Reload cached token lists; each file is closed as soon as it has been read.
# NOTE(review): these names carry no gram suffix, so they are not the files
# written by the tokenization cell for gram = 2 - presumably caches from an
# earlier run; confirm which tokenization is meant to be trained on.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open("train_data_tokens.p", "rb") as token_file:
    train_data_tokens = pkl.load(token_file)
print ("Train dataset size is {}".format(len(train_data_tokens)))
with open("all_train_tokens.p", "rb") as token_file:
    all_train_tokens = pkl.load(token_file)
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))
with open("val_data_tokens.p", "rb") as token_file:
    val_data_tokens = pkl.load(token_file)
print ("Val dataset size is {}".format(len(val_data_tokens)))

In [None]:
# Vocabulary is built from the training tokens only; token2index_dataset reads
# token2id as a global.
token2id, id2token = build_vocab(all_train_tokens)

In [None]:
# Replace every token by its vocabulary index (UNK_IDX for unknown tokens).
train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
# test_data_indices = token2index_dataset(test_data_tokens)

# double checking
# Sizes must equal those of the token lists the indices were built from.
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
# print ("Test dataset size is {}".format(len(test_data_indices)))

In [None]:
# Number of samples per batch for every loader in this notebook.
BATCH_SIZE = 32
train_dataset = ImdbDataset(train_data_indices, train_targets)
# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=_collate_func,
                                           shuffle=True)

val_dataset = ImdbDataset(val_data_indices, val_targets)
# Validation only counts correct predictions over the whole set, so the order
# is irrelevant: no shuffling, which keeps the evaluation order fixed and
# consistent with the test loader.
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=_collate_func,
                                           shuffle=False)

In [None]:
# Hyper-parameters of this run.
emb_dim = 50
learning_rate = 0.005
num_epochs = 100
# Consecutive non-improving validations allowed before train() stops.
tolerance = 15
# train() reads `model` and `scheduler` as globals, so they must be defined here.
model = BagOfWords(len(id2token), emb_dim).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
# 'max' mode: the monitored value (validation accuracy) should increase; the
# learning rate is multiplied by 0.1 after 6 scheduler steps without improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=6, factor=0.1)
# NOTE(review): the checkpoint label says "unigram" while the tokenization cell
# sets gram = 2 - confirm which data this run is meant to describe.
hist, best_val = train(learning_rate, num_epochs, optimizer, schedule=True, tolerance=tolerance, 
                       save=True, label="LoPun-unigram-Adam-0.005-Ann-100")

In [None]:
# Record the run under its configuration label; the plotting cell splits the
# key on "-" into tokenizer / n-gram / optimizer / learning rate / schedule fields.
# NOTE(review): the prefix here is "nltkLoPun" while the checkpoint label and the
# plotting filter use "LoPun", so this entry is never plotted - confirm the intended key.
results["nltkLoPun-unigram-Adam-0.005-Ann-100"] = {"hist": hist, "best_val": best_val}

In [None]:
plt.figure(figsize=(12,6))
# (label, result) pairs ordered by best validation accuracy, lowest first.
sorted_results = sorted(results.items(), key=lambda x: x[1]['best_val'])
plotted_any = False
for config, res in sorted_results:
    # Label layout: tokenizer-ngram-optimizer-lr-schedule-...
    info = config.split("-")
    if (info[0] == "LoPun") and (info[1] == "unigram"):
        # One LaTeX table row per run.
        print("{} & {} & {} & {} \\\\".format(info[2], info[3], info[4], res["best_val"]))
        # Plot every 50th loss value to keep the curve readable.
        plt.plot(res["hist"][::50], label="{}, lr={}, {}".format(info[2], info[3], info[4]))
        plotted_any = True
# Decorate and save the figure once, after all curves are drawn, instead of on
# every iteration. As before, nothing is saved when no run matched the filter.
if plotted_any:
    plt.legend()
    plt.ylabel("Loss")
    plt.xlabel("steps/50")
    plt.title("Learning Curve of different optimizer configurations")
    plt.savefig("training_curve")

In [None]:
# Persist the experiment log; the `with` block closes the file once the dump
# has been written.
with open("results.pickle", "wb") as results_file:
    pkl.dump(results, results_file)

### Test

In [None]:
# Index the test tokens with the training vocabulary and wrap them in a loader.
# Requires test_data_tokens from the tokenization cell.
test_data_indices = token2index_dataset(test_data_tokens)
test_dataset = ImdbDataset(test_data_indices, test_targets)
# No shuffling: evaluation order is fixed.
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=_collate_func,
                                           shuffle=False)

In [None]:
# Accuracy (in percent) of the in-memory model on the validation and test sets.
# NOTE(review): this is the model as left by train(); no checkpoint is loaded in
# this notebook, so it is not necessarily the best-validation weights - confirm.
print ("Val Acc {}".format(test_model(val_loader, model)))
print ("Test Acc {}".format(test_model(test_loader, model)))

### Get examples

In [None]:
# One sample per batch so that every prediction can be inspected on its own.
# This rebinds test_loader (previously batched with BATCH_SIZE).
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=1,
                                           collate_fn=_collate_func,
                                           shuffle=False)
correct_cnt, wrong_cnt = 0, 0
correct_list = []
wrong_list = []
prediction = []
label = []
model.eval()
# Inference only: without no_grad an autograd graph would be built for every
# one of the single-sample forward passes. Inside no_grad the tensors carry no
# graph, so plain indexing replaces the legacy `.data` accessor.
with torch.no_grad():
    for data, lengths, labels in test_loader:
        data = data.cuda()
        lengths = lengths.cuda()
        labels = labels.cuda()
        outputs = F.softmax(model(data, lengths), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        # Kept as tensors: the confusion-table cell calls .item() on them.
        prediction.append(predicted[0])
        label.append(labels[0])

        # 1 when the single prediction matches its label, 0 otherwise.
        correct = predicted.eq(labels.view_as(predicted)).sum().item()
        # Keep up to three correctly and three wrongly classified samples; the
        # third slot of each list is only filled by a sample with label 0.
        if (correct == 1) and (correct_cnt < 3):
            if correct_cnt == 2:
                if labels[0].item() == 1:
                    continue
            correct_cnt  += 1
            correct_list.append((data[0], labels[0]))
        if (correct == 0) and (wrong_cnt < 3):
            if wrong_cnt == 2:
                if labels[0].item() == 1:
                    continue
            wrong_cnt += 1
            wrong_list.append((data[0], labels[0]))

In [None]:
# Confusion counts keyed by "<prediction><label>", e.g. "10" counts samples
# predicted as 1 whose label is 0.
truth_table = {"00":0, "01":0, "10":0, "11":0}
for i, j in zip(prediction, label):
    # i and j are single-element tensors; .item() turns them into Python ints.
    truth_table[str(i.item())+str(j.item())] += 1
# Bare expression: displayed as the cell output.
truth_table

In [None]:
def _print_example(ex):
    """
    Print one (token index tensor, label) pair as a LaTeX \\item line.

    Padding entries (index 0) are skipped, the remaining indices are mapped
    back to tokens through id2token, and "<" / ">" are wrapped in math mode.
    """
    text = " ".join([id2token[i] for i in ex[0] if i > 0])
    # "\\item" spells the backslash explicitly; "\i" is an invalid escape
    # sequence that newer Python versions warn about.
    print("\\item label:{}".format(ex[1]), text.replace("<", " $<$ ").replace(">", " $>$ "))

for ex in correct_list:
    _print_example(ex)
print("-" * 20)
for ex in wrong_list:
    _print_example(ex)