In [1]:
import numpy as np
import os
import torch
from collections import Counter
from sklearn.feature_extraction import stop_words
from torch.utils.data import Dataset
import random
import spacy
import string
from tqdm import tqdm_notebook
import pickle as pkl
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Root of the aclImdb data directory. Set the ACLIMDB_DIR environment variable
# to point at a local copy; the original hard-coded path is only the fallback.
data_dir = os.environ.get("ACLIMDB_DIR", "C:/Users/sherryyang/Desktop/dsga1011/aclImdb/")
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
# Number of reviews per split. Train and validation are both read from
# train_dir; validation starts where the training slice ends.
TRAIN_SIZE = 20000
VALIDATION_SIZE = 5000
TEST_SIZE = 25000
# Padding index (the vocabulary cell defines the same value as PAD_IDX).
PADDING_IDX = 0

## Load Data from disk

In [3]:
def read_files(file_name):
    """
    Read one review file and return its text, lowercased.

    @param file_name: path of a UTF-8 encoded text file
    @return: the decoded, lowercased file content as a str
    """
    # Read in binary mode so line endings are not translated by the platform.
    with open(file_name, "rb") as f:
        raw = f.read()
    # Decode first, then lowercase: bytes.lower() only touches ASCII letters,
    # so non-ASCII uppercase characters would otherwise survive unchanged.
    return raw.decode('utf_8').lower()
    
def construct_dataset(dataset_dir, dataset_size, start=0):
    """
    Load a balanced slice of reviews from `dataset_dir`.

    Reads the same index range from the "pos" and "neg" sub-directories and
    interleaves them (positive first), labelling positives 1 and negatives 0.

    @param dataset_dir: directory containing "pos" and "neg" sub-directories
    @param dataset_size: total number of reviews to load (half per label)
    @param start: index of the first file to take from each label directory
    @return: (texts, targets) - two parallel lists
    @raise IndexError: if either directory has fewer than
        start + dataset_size // 2 files
    """
    pos_dir = os.path.join(dataset_dir, "pos")
    neg_dir = os.path.join(dataset_dir, "neg")
    single_label_size = int(dataset_size) // 2
    # os.listdir returns entries in arbitrary order; sorting makes the
    # [start, stop) slices - and therefore the train/validation split -
    # reproducible across runs and machines.
    all_pos = sorted(os.listdir(pos_dir))
    all_neg = sorted(os.listdir(neg_dir))
    stop = start + single_label_size
    available = min(len(all_pos), len(all_neg))
    if stop > available:
        # Fail before reading any file instead of part-way through the loop.
        raise IndexError(
            "requested files [{}, {}) per label but only {} are available in {}".format(
                start, stop, available, dataset_dir))
    output = []
    target = []
    for i in range(start, stop):
        output.append(read_files(os.path.join(pos_dir, all_pos[i])))
        target.append(1)
        output.append(read_files(os.path.join(neg_dir, all_neg[i])))
        target.append(0)
    return output,target

In [4]:
# Load Dataset
# Each construct_dataset call reads every file of the split from disk, so call
# it once per split and unpack (texts, targets) instead of indexing two calls.
train_set, train_target = construct_dataset(train_dir, TRAIN_SIZE)
# Validation is carved out of the training directory, starting right after the
# TRAIN_SIZE // 2 files per label that the training split consumed.
validation_set, validation_target = construct_dataset(
    train_dir, VALIDATION_SIZE, start=TRAIN_SIZE // 2)
test_set, test_target = construct_dataset(test_dir, TEST_SIZE)

In [5]:
# Sanity check: number of texts and number of labels for each split.
len(train_set), len(train_target), len(validation_set), len(validation_target), len(test_set), len(test_target)

(20000, 20000, 5000, 5000, 25000, 25000)

## Tokenize - remove punctuation

In [None]:

# Load spaCy's small English pipeline; it is used as the tokenizer.
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation


def lower_case_remove_punc(parsed):
    """
    Lowercase the tokens of one parsed sample and drop punctuation.

    @param parsed: iterable of token objects exposing `.text`
    @return: list of lowercased token strings
    """
    kept = []
    for token in parsed:
        # `punctuations` is a plain string, so this is a substring test.
        if token.text in punctuations:
            continue
        kept.append(token.text.lower())
    return kept



# Tokenize - Remove Stopwords

In [None]:
from nltk.corpus import stopwords 

# Load spaCy's small English pipeline; it is used as the tokenizer.
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation
# NOTE: this rebinds `stop_words`, shadowing the sklearn import of the same
# name made in the first cell.
stop_words = set(stopwords.words('english'))  


def lower_case_remove_punc(parsed):
    """
    Lowercase the tokens of one parsed sample and drop English stop words.

    Despite its name this variant does not remove punctuation; the name is
    shared by all tokenizer variants so tokenize_dataset can call any of them.

    @param parsed: iterable of token objects exposing `.text`
    @return: list of lowercased, non-stop-word token strings
    """
    lowered_texts = (token.text.lower() for token in parsed)
    # Compare the lowercased text: testing the raw text would let
    # capitalised stop words through and then lowercase them.
    return [text for text in lowered_texts if text not in stop_words]



# Tokenize - Stem

In [6]:
from nltk.stem import PorterStemmer

# Load spaCy's small English pipeline; it is used as the tokenizer.
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation
# Keep the instance under its own name so the PorterStemmer class is not
# shadowed by one of its instances.
stemmer = PorterStemmer()


def lower_case_remove_punc(parsed):
    """
    Lowercase and stem the tokens of one parsed sample.

    Despite its name this variant keeps punctuation; the name is shared by all
    tokenizer variants so tokenize_dataset can call any of them.

    @param parsed: iterable of token objects exposing `.text`
    @return: list of stemmed token strings (same length as `parsed`)
    """
    stem_tokens = []
    for token in parsed:
        lowered = token.text.lower()
        # Stem once; fall back to the lowercased text if the stem is empty.
        stem_tokens.append(stemmer.stem(lowered) or lowered)
    return stem_tokens



## Tokenize - remove punctuation and stop words, then stem

In [None]:
# Imported here as well so this cell does not rely on an earlier cell having run.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load spaCy's small English pipeline; it is used as the tokenizer.
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation
# Keep the instance under its own name so the PorterStemmer class is not
# shadowed by one of its instances.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english')) 


def lower_case_remove_punc(parsed):
    """
    Lowercase the tokens of one parsed sample, drop punctuation and English
    stop words, and stem what remains.

    @param parsed: iterable of token objects exposing `.text`
    @return: list of stemmed token strings
    """
    stem_tokens = []
    for token in parsed:
        lowered = token.text.lower()
        # Both membership tests must be spelled out: `x not in a and b` only
        # checks that `b` is non-empty, so stop words were never filtered.
        if lowered in punctuations or lowered in stop_words:
            continue
        # Stem once; fall back to the lowercased text if the stem is empty.
        stem_tokens.append(stemmer.stem(lowered) or lowered)
    return stem_tokens

# Unigram

In [8]:


def tokenize_dataset(dataset):
    """
    Tokenize every sample of `dataset` with the module-level `tokenizer` and
    the currently defined `lower_case_remove_punc`.

    @param dataset: iterable of raw text samples
    @return: (token_dataset, all_tokens) - a list with one token list per
        sample, and one flat list of every token (used to build the vocabulary)
    """
    per_sample_tokens = []
    flat_tokens = []

    parsed_samples = tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512, n_threads=1)
    for parsed in tqdm_notebook(parsed_samples):
        sample_tokens = lower_case_remove_punc(parsed)
        per_sample_tokens.append(sample_tokens)
        flat_tokens.extend(sample_tokens)

    return per_sample_tokens, flat_tokens

In [9]:
# Tokenize each split and cache the result on disk. Every file is written
# inside a `with` block so the handle is flushed and closed deterministically.

#val set tokens
print ("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(validation_set)
with open("val_data_tokens.p", "wb") as token_file:
    pkl.dump(val_data_tokens, token_file)

#test set tokens
print ("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(test_set)
with open("test_data_tokens.p", "wb") as token_file:
    pkl.dump(test_data_tokens, token_file)

#train set tokens
print ("Tokenizing train data")
train_data_tokens, all_train_tokens = tokenize_dataset(train_set)
with open("train_data_tokens.p", "wb") as token_file:
    pkl.dump(train_data_tokens, token_file)
# The flat token list is only kept for the training split (vocabulary source).
with open("all_train_tokens.p", "wb") as token_file:
    pkl.dump(all_train_tokens, token_file)

Tokenizing val data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Tokenizing test data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Tokenizing train data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
import pickle as pkl
# Then, load preprocessed train, val and test datasets.
# These pickles are the ones written by the tokenizing cell; pickle must only
# be used on trusted files. `with` closes each handle after loading.
with open("train_data_tokens.p", "rb") as token_file:
    train_data_tokens = pkl.load(token_file)
with open("all_train_tokens.p", "rb") as token_file:
    all_train_tokens = pkl.load(token_file)

with open("val_data_tokens.p", "rb") as token_file:
    val_data_tokens = pkl.load(token_file)
with open("test_data_tokens.p", "rb") as token_file:
    test_data_tokens = pkl.load(token_file)

# double checking
print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))
print ("Test dataset size is {}".format(len(test_data_tokens)))

print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 5437103


## Vocab size = 10000

In [11]:
max_vocab_size = 10000
# index 0 is reserved for padding and index 1 for unknown tokens
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_size=None):
    """
    Build the vocabulary from the most frequent tokens.

    @param all_tokens: iterable of token strings (may be empty)
    @param max_size: number of most frequent tokens to keep; defaults to the
        module-level max_vocab_size
    @return: (token2id, id2token)
        token2id: dict mapping each token to its index
        id2token: list where id2token[i] is the token with index i; the first
            two entries are '<pad>' and '<unk>'
    """
    if max_size is None:
        max_size = max_vocab_size
    token_counter = Counter(all_tokens)
    # A comprehension instead of zip(*...) so an empty token list yields an
    # empty vocabulary rather than an unpacking error.
    vocab = [token for token, _ in token_counter.most_common(max_size)]
    # Real tokens start at index 2, after the two special entries.
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = dict(zip(vocab, range(2, 2 + len(vocab))))
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

# Build both vocabulary mappings from the training tokens only.
token2id, id2token = build_vocab(all_train_tokens)

In [12]:
# Sanity check: a randomly drawn id must round-trip through both mappings.

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, random_token))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 2926 ; token modesti
Token modesti; token id 2926


In [13]:
# Vocabulary size, including the '<pad>' and '<unk>' entries.
len(id2token)

10002

In [14]:
# Map token strings to vocabulary ids for a whole dataset.
def token2index_dataset(tokens_data):
    """
    Convert every sample from tokens to indices using `token2id`.

    @param tokens_data: iterable of token lists
    @return: list of index lists; tokens missing from `token2id` become UNK_IDX
    """
    return [
        [token2id.get(token, UNK_IDX) for token in sample_tokens]
        for sample_tokens in tokens_data
    ]

# Convert the three splits, in train / val / test order.
train_data_indices, val_data_indices, test_data_indices = (
    token2index_dataset(split_tokens)
    for split_tokens in (train_data_tokens, val_data_tokens, test_data_tokens)
)

# The number of samples per split must be unchanged by the conversion.
print("Train dataset size is {}".format(len(train_data_indices)))
print("Val dataset size is {}".format(len(val_data_indices)))
print("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


In [15]:
# Samples are truncated to this many token ids when read from the dataset.
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    PyTorch-readable wrapper around one split (train/validation/test):
    a list of token-id sequences plus a parallel list of labels.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of token-id lists, one per sample
        @param target_list: list of labels, parallel to data_list
        """
        assert (len(data_list) == len(target_list))
        self.data_list = data_list
        self.target_list = target_list

    def __len__(self):
        """Number of samples in the split."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Called for dataset[key].

        @return: [token ids truncated to MAX_SENTENCE_LENGTH,
                  length after truncation, label]
        """
        truncated = self.data_list[key][:MAX_SENTENCE_LENGTH]
        return [truncated, len(truncated), self.target_list[key]]

def newsgroup_collate_func(batch, max_length=None):
    """
    Collate function for DataLoader: right-pads every sample of the batch with
    zeros to a common length and stacks the batch into tensors.

    @param batch: list of [token_ids, length, label] items
    @param max_length: padded width; defaults to the module-level
        MAX_SENTENCE_LENGTH
    @return: [data, lengths, labels] where data is an int64 tensor of shape
        (len(batch), max_length) and lengths/labels are LongTensors
    @raise ValueError: if a sample is longer than max_length
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH
    length_list = [datum[1] for datum in batch]
    label_list = [datum[2] for datum in batch]
    # Pre-filled with 0, the padding index used by the embedding layer.
    # The explicit int64 dtype keeps the result independent of the platform's
    # default integer width and of empty samples, which numpy would otherwise
    # turn into float arrays.
    padded = np.zeros((len(batch), max_length), dtype=np.int64)
    for row, datum in enumerate(batch):
        tokens, length = datum[0], datum[1]
        if length > max_length:
            raise ValueError(
                "sample of length {} does not fit in max_length {}".format(length, max_length))
        if length:
            padded[row, :length] = tokens
    return [torch.from_numpy(padded), torch.LongTensor(length_list), torch.LongTensor(label_list)]

# Create the PyTorch datasets and loaders. Every loader pads its batches with
# newsgroup_collate_func; only the test loader keeps the original sample order.
BATCH_SIZE = 32

train_dataset = NewsGroupDataset(train_data_indices, train_target)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=newsgroup_collate_func,
)

val_dataset = NewsGroupDataset(val_data_indices, validation_target)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=newsgroup_collate_func,
)

test_dataset = NewsGroupDataset(test_data_indices, test_target)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=newsgroup_collate_func,
)



In [16]:
class BagOfWords(nn.Module):
    """
    BagOfWords classification model: averages the embeddings of a sample's
    tokens and feeds the mean through one linear layer.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        @param num_classes: number of output logits. The default of 20 is the
            previously hard-coded value, kept for backward compatibility; the
            labels built in this notebook are only 0 and 1, so 2 is sufficient.
        """
        super(BagOfWords, self).__init__()
        # padding_idx=0 keeps the embedding of the padding index at zero, so
        # padded positions do not contribute to the sum in forward().
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row
            in data represents a review as token indices, padded to a common
            length.
        @param length: an int tensor of size (batch_size), the number of
            non-padding tokens of each row in data.
        @return: logits of size (batch_size, num_classes)
        """
        data = data.long()
        summed = torch.sum(self.embed(data), dim=1)
        # Clamp to 1 so an empty sample (length 0) averages to a zero vector
        # instead of producing 0/0 = NaN logits.
        denom = length.clamp(min=1).view(-1, 1).float()
        mean = summed / denom

        # return logits
        return self.linear(mean.float())

# Embedding width. The vocabulary size passed below is len(id2token), which
# includes the '<pad>' and '<unk>' entries.
emb_dim = 100
model = BagOfWords(len(id2token), emb_dim)

In [17]:
# Vocabulary size handed to the model, including '<pad>' and '<unk>'.
len(id2token)

10002

# Optimizer : Adam

In [18]:
# Training hyper-parameters.
num_epochs = 6  # number of passes over the training loader
learning_rate = 0.001

# Loss on the raw logits, and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
   
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

def error_analysis(loader, model, id_to_token=None, max_per_batch=3):
    """
    Print misclassified samples and return the accuracy on `loader`.

    For every batch, up to `max_per_batch` misclassified samples are decoded
    back to tokens (padding excluded) and printed with the predicted and the
    true label. The model is put back in train mode before returning.

    @param loader: data loader yielding (data, lengths, labels) batches
    @param model: module called as model(data, lengths) and returning logits
    @param id_to_token: index -> token list used for decoding; defaults to the
        module-level id2token
    @param max_per_batch: maximum number of mistakes printed per batch
    @return: accuracy in percent; 0.0 if the loader yields no samples
    """
    if id_to_token is None:
        id_to_token = id2token
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for data, lengths, labels in loader:
            # The model returns one logit per class; the prediction is the
            # argmax over classes, not a 0.5 threshold on the logits.
            predicted = model(data, lengths).argmax(dim=1)
            labels_flat = labels.view(-1)
            total += labels_flat.size(0)
            correct += (predicted == labels_flat).sum().item()

            wrong_rows = (predicted != labels_flat).nonzero().view(-1).tolist()
            for loc in wrong_rows[:max_per_batch]:
                # Decode only the real tokens; the rest of the row is padding.
                n_tokens = int(lengths[loc])
                review = [id_to_token[index] for index in data[loc, :n_tokens].tolist()]
                print("review", review)
                print("predicted", predicted[loc].item())
                print("label", labels_flat[loc].item())
    model.train()
    if total == 0:
        return 0.0
    return (100 * correct / total)
        
        

def earily_stop(val_acc_history, t=5, required_progress=0.001):
    """
    Decide whether training should stop because validation accuracy stalled.

    @param val_acc_history: sequence of all validation accuracies so far
    @param t: number of most recent steps to inspect; more than t entries are
        needed before stopping is considered
    @param required_progress: an accuracy must exceed its predecessor by more
        than this amount (plus a 0.00001 tolerance) to count as progress
    @return: True only if none of the last t steps made progress
    """
    if t <= 0 or len(val_acc_history) <= t:
        return False
    # Walk backwards from the newest entry, comparing each value with the one
    # recorded just before it.
    stalled_steps = sum(
        1
        for back in range(1, t + 1)
        if (val_acc_history[-back] - val_acc_history[-back - 1]) - required_progress <= 0.00001
    )
    return stalled_steps == t

In [19]:
# Training the Model
# Runs up to num_epochs passes over train_loader. Every 300 steps it evaluates
# on the full training and validation loaders, logs the result, and checks the
# early-stopping rule on the validation history.
validation_acc_history = []
stop_training = False

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (data, lengths, labels) in enumerate(train_loader):
        # Re-enter train mode on every step: test_model() below switches the
        # model to eval mode and does not switch it back.
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        
        # validate every 300 iterations
        running_loss += loss.item()
        if i > 0 and i % 300 == 0:
            train_acc = test_model(train_loader, model)
            # validate
            val_acc = test_model(val_loader, model)
            # NOTE: running_loss is divided by 300, but the first window of an
            # epoch accumulates 301 batches (i = 0..300).
            print('Epoch: [{}/{}], Step: [{}/{}], training loss: {}, Train Acc: {}, Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), running_loss/300, train_acc, val_acc))
            running_loss =0.0
            validation_acc_history.append(val_acc)
            # check whether the early-stopping rule fires
            stop_training = earily_stop(validation_acc_history)
            
            if stop_training:
                print("earily stop triggered")
                break
    # the inner break only leaves the batch loop; leave the epoch loop as well
    if stop_training:
        break

Epoch: [1/6], Step: [301/625], training loss: 1.1868514891465505, Train Acc: 62.01, Validation Acc: 61.88
Epoch: [1/6], Step: [601/625], training loss: 0.6560425565640131, Train Acc: 74.53, Validation Acc: 72.74
Epoch: [2/6], Step: [301/625], training loss: 0.5572997372349103, Train Acc: 80.045, Validation Acc: 77.5
Epoch: [2/6], Step: [601/625], training loss: 0.47502958665291467, Train Acc: 83.78, Validation Acc: 80.5
Epoch: [3/6], Step: [301/625], training loss: 0.40431680078307786, Train Acc: 85.675, Validation Acc: 81.56
Epoch: [3/6], Step: [601/625], training loss: 0.35851009577512744, Train Acc: 87.025, Validation Acc: 82.5
Epoch: [4/6], Step: [301/625], training loss: 0.32457328905661903, Train Acc: 88.025, Validation Acc: 83.08
Epoch: [4/6], Step: [601/625], training loss: 0.3125615604221821, Train Acc: 88.485, Validation Acc: 83.64
Epoch: [5/6], Step: [301/625], training loss: 0.28971569404006003, Train Acc: 89.785, Validation Acc: 83.82
Epoch: [5/6], Step: [601/625], trainin

In [21]:
# Final report: accuracy on the validation loader, then on the test loader.
print ("After training for {} epochs".format(num_epochs))
for report_template, eval_loader in (("Val Acc {}", val_loader), ("Test Acc {}", test_loader)):
    print (report_template.format(test_model(eval_loader, model)))

After training for 6 epochs
Val Acc 83.7
Test Acc 84.432
