In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
import random
import spacy
import nltk
import string
import pdb
import concurrent.futures
import pickle
# Seed handed to random.Random(...) for each shuffle of the file-path lists further down.
SEED = 7

In [2]:
# Directories that are globbed for "*.txt" review files, one per split
# (train/test) and polarity (pos/neg). The paths are relative to the
# notebook's working directory.
TRAIN_POS_PATH = "./aclImdb/train/pos/"
TRAIN_NEG_PATH = "./aclImdb/train/neg/"
TEST_POS_PATH = "./aclImdb/test/pos/"
TEST_NEG_PATH = "./aclImdb/test/neg/"

In [3]:
# Number of reviews per polarity kept for training; whatever is left of each
# shuffled train list becomes the validation set.
train_split = 20000 // 2

# glob() returns matches in arbitrary, file-system-dependent order. Sorting
# first gives the seeded shuffles below a fixed starting order, so the
# train/validation split is the same on every machine and every run.
train_pos_paths = sorted(glob(os.path.join(TRAIN_POS_PATH, "*.txt")))
train_neg_paths = sorted(glob(os.path.join(TRAIN_NEG_PATH, "*.txt")))
test_pos_paths = sorted(glob(os.path.join(TEST_POS_PATH, "*.txt")))
test_neg_paths = sorted(glob(os.path.join(TEST_NEG_PATH, "*.txt")))

# A fresh Random(SEED) per call keeps every shuffle independent of the global
# random state and of the other shuffles.
random.Random(SEED).shuffle(train_pos_paths)
random.Random(SEED).shuffle(train_neg_paths)

# Tail of each shuffled list -> validation, head -> training.
val_pos_paths = train_pos_paths[train_split:]
val_neg_paths = train_neg_paths[train_split:]

train_pos_paths = train_pos_paths[:train_split]
train_neg_paths = train_neg_paths[:train_split]

train_paths = train_pos_paths + train_neg_paths
val_paths = val_pos_paths + val_neg_paths
test_paths = test_pos_paths + test_neg_paths

# Mix positive and negative reviews inside every split.
random.Random(SEED).shuffle(train_paths)
random.Random(SEED).shuffle(val_paths)
random.Random(SEED).shuffle(test_paths)

In [4]:
# Report how many file paths ended up in each split.
for _template, _split_paths in (
    ("length of the training data: {}", train_paths),
    ("length of the validation data: {}", val_paths),
    ("length of the test data: {}", test_paths),
):
    print (_template.format(len(_split_paths)))

length of the training data: 20000
length of the validation data: 5000
length of the test data: 25000


In [8]:
def dataset_collection(paths):
    """
    Read review files into a DataFrame with the columns ``ID``, ``text`` and ``label``.

    The file name (without directory and without everything after the first
    '.') is split on '_': the first piece becomes ``ID`` (kept as a string) and
    the second piece, converted to int and reduced by one, becomes ``label``.

    @param paths: iterable of paths to the review text files
    @return: pandas DataFrame with one row per path, in the order given
    @raise ValueError / IndexError: if a file name does not follow the
        ``<id>_<number>`` pattern
    """
    id_list = []
    label_list = []
    text_list = []

    for path in tqdm(paths):
        # basename() instead of split('/') so paths written with the platform's
        # own separator are handled as well.
        file_name = os.path.basename(path).split('.')[0]
        name_parts = file_name.split('_')
        id_list.append(name_parts[0])
        # NOTE(review): this is the 10-class label (number in the file name
        # minus one). The original carried a "For 2 classes" comment on this
        # very same expression, but no binary label is computed anywhere here.
        label_list.append(int(name_parts[1]) - 1)
        # Explicit encoding so the result does not depend on the machine's
        # locale; assumes the review files are UTF-8 -- TODO confirm.
        with open(path, 'r', encoding='utf-8') as f:
            text_list.append(f.read())

    # Build the frame in one go rather than adding columns to an empty frame.
    return pd.DataFrame({'ID': id_list, 'text': text_list, 'label': label_list})

In [9]:
# Load every split into its own DataFrame; each call gets a shallow copy of
# the corresponding path list.
training_dataset = dataset_collection(list(train_paths))
val_dataset = dataset_collection(list(val_paths))
test_dataset = dataset_collection(list(test_paths))

100%|██████████| 20000/20000 [00:07<00:00, 2548.83it/s]
100%|██████████| 5000/5000 [00:00<00:00, 10148.93it/s]
100%|██████████| 25000/25000 [00:01<00:00, 14526.28it/s]


In [7]:
# spaCy pipeline that tokenize_spacy() calls on each sentence.
tokenizer = spacy.load('en_core_web_sm')
# string.punctuation is a single str, so the `not in punctuations` filters in
# the tokenize_* helpers are substring tests against that string.
punctuations = string.punctuation

def tokenize_spacy(sent):
    """
    Tokenize `sent` with the module-level spaCy `tokenizer` and lowercase the result.

    A token is dropped when its text occurs inside the `punctuations` string
    (a substring test, so the empty string is dropped too).

    @param sent: text to tokenize
    @return: list of lowercased token strings
    """
    kept = []
    for tok in tokenizer(sent):
        if tok.text in punctuations:
            continue
        kept.append(tok.text.lower())
    return kept

def tokenize_nltk(sent):
    """
    Tokenize `sent` with nltk.word_tokenize and lowercase the result.

    A token is dropped when it occurs inside the `punctuations` string
    (a substring test, so the empty string is dropped too).

    @param sent: text to tokenize
    @return: list of lowercased token strings
    """
    words = nltk.word_tokenize(sent)
    kept = filter(lambda word: word not in punctuations, words)
    return [word.lower() for word in kept]

def tokenize_space(sent):
    """
    Tokenize `sent` by splitting on single space characters and lowercase the result.

    A piece is dropped when it occurs inside the `punctuations` string. That
    is a substring test, which is also what removes the empty strings that
    consecutive spaces produce.

    @param sent: text to tokenize
    @return: list of lowercased token strings
    """
    result = []
    for piece in sent.split(' '):
        if piece in punctuations:
            continue
        result.append(piece.lower())
    return result

def tokenize_dataset(dataset, mode='spacy'):
    """
    Tokenize the ``text`` column of `dataset` with the selected tokenizer.

    @param dataset: DataFrame with the columns ``ID``, ``text`` and ``label``
    @param mode: 'spacy', 'nltk' or 'space'; selects tokenize_spacy,
        tokenize_nltk or tokenize_space respectively
    @return: tuple (data_id, data_text, data_label, all_tokens) where
        data_text holds one token list per row and all_tokens is the
        concatenation of all those lists, in row order
    @raise ValueError: if `mode` is not one of the three names above
    """
    # 'nltk' used to be routed to tokenize_spacy by mistake; the table makes
    # the mode -> function mapping explicit.
    tokenizers = {
        'spacy': tokenize_spacy,
        'nltk': tokenize_nltk,
        'space': tokenize_space,
    }
    if mode not in tokenizers:
        raise ValueError("Unrecognized mode!")
    tokenize = tokenizers[mode]

    data_id = []
    data_text = []
    data_label = []
    all_tokens = []
    # Iterate the three columns in parallel instead of materialising a row
    # Series with dataset.iloc[idx] several times per review.
    rows = zip(dataset['ID'], dataset['text'], dataset['label'])
    for row_id, text, label in tqdm(rows, total=len(dataset)):
        # Replace the line-break markup with a space; deleting it outright
        # glued the word before the break to the word after it.
        tokens = tokenize(text.replace('<br /><br />', ' '))
        data_id.append(row_id)
        data_text.append(tokens)
        data_label.append(label)
        all_tokens.extend(tokens)
    return data_id, data_text, data_label, all_tokens

In [None]:
# Tokenize every split with the default mode ('spacy'); only the training call
# keeps the flat token list. The results are bound to names with a trailing
# underscore (val_tokens_, train_tokens_, test_tokens_, all_tokens_), which
# are different variables from the train_tokens / val_tokens / test_tokens /
# all_tokens that a later cell loads from pickle files. That later cell also
# rebinds train_label, val_label and test_label.
_, val_tokens_, val_label, _ = tokenize_dataset(val_dataset)
_, train_tokens_, train_label, all_tokens_ = tokenize_dataset(training_dataset)
_, test_tokens_, test_label, _ = tokenize_dataset(test_dataset)

In [None]:
# pickle.dump(train_tokens, open('./train_tokens_spacy.p', 'wb'))
# pickle.dump(train_label, open('./train_label_bi_spacy.p', 'wb'))
# pickle.dump(all_tokens, open('./all_tokens_spacy.p', 'wb'))

# pickle.dump(val_tokens, open('./val_tokens_spacy.p', 'wb'))
# pickle.dump(val_label, open('./val_label_bi_spacy.p', 'wb'))

# pickle.dump(test_tokens, open('./test_tokens_spacy.p', 'wb'))
# pickle.dump(test_label, open('./test_label_bi_spacy.p', 'wb'))

In [10]:
def _load_pickle(path):
    """
    Load a single pickled object from `path`.

    The file is opened in a `with` block so the handle is closed right after
    loading instead of being left open for the lifetime of the kernel.

    @param path: path of the pickle file
    @return: the unpickled object
    """
    # SECURITY: unpickling can execute arbitrary code. Only load cache files
    # that were produced by this notebook on a trusted machine.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Cached tokenization results; these replace whatever the label variables held
# from the tokenization cell.
train_tokens = _load_pickle('./train_tokens.p')
train_label = _load_pickle('./train_label.p')
all_tokens = _load_pickle('./all_tokens.p')

val_tokens = _load_pickle('./val_tokens.p')
val_label = _load_pickle('./val_label.p')

test_tokens = _load_pickle('./test_tokens.p')
test_label = _load_pickle('./test_label.p')

In [11]:
def to_N_gram(dataset, N):
    """
    Turn every token list in `dataset` into its list of N-grams.

    An N-gram is N consecutive tokens joined with a single space. A token list
    shorter than N produces an empty N-gram list.

    @param dataset: iterable of token lists
    @param N: number of consecutive tokens per gram
    @return: tuple (per_doc, flat) -- one N-gram list per input token list,
        and the concatenation of all of them in input order
    """
    per_doc = []
    flat = []
    for tokens in tqdm(dataset):
        n_starts = len(tokens) - N + 1
        grams = [' '.join(tokens[start:start + N]) for start in range(n_starts)]
        per_doc.append(grams)
        flat.extend(grams)
    return per_doc, flat

# Convert every split to bigrams (N=2). Only the training call keeps the flat
# bigram list, which is what build_vocab() is later called with.
train_tokens_bi, all_tokens_bi = to_N_gram(train_tokens, 2)
val_tokens_bi, _ = to_N_gram(val_tokens, 2)
test_tokens_bi, _ = to_N_gram(test_tokens, 2)

100%|██████████| 20000/20000 [00:04<00:00, 4881.91it/s]
100%|██████████| 5000/5000 [00:01<00:00, 4955.39it/s]
100%|██████████| 25000/25000 [00:04<00:00, 5387.58it/s]


In [12]:
from collections import Counter

# Maximum number of distinct tokens kept in the vocabulary (most frequent first).
VOCABULARY_SIZE = 40000
# Indices of the two special tokens; they must match the order of the
# ['<pad>', '<unk>'] prefix that build_vocab puts at the front of id2token.
PAD_IDX = 0
UNK_IDX = 1
# Examples are truncated / padded to this many tokens.
MAX_SENTENCE_LENGTH = 400

def build_vocab(all_tokens, vocab_size=None):
    """
    Build the index <-> token mappings from a flat list of tokens.

    @param all_tokens: iterable of hashable tokens (duplicates expected)
    @param vocab_size: keep at most this many of the most frequent tokens;
        None (the default) means VOCABULARY_SIZE
    @return: tuple (id2token, token2id) -- id2token is a list starting with
        '<pad>' and '<unk>' followed by the kept tokens in descending
        frequency; token2id maps each entry of id2token to its position
    """
    if vocab_size is None:
        vocab_size = VOCABULARY_SIZE
    token_counter = Counter(all_tokens)
    # The number printed is the count of distinct tokens, not of occurrences.
    print ("Total number of tokens: ", len(token_counter))
    # A plain comprehension also works for an empty counter, where unpacking
    # zip(*[]) into two names would raise ValueError.
    words = [token for token, _ in token_counter.most_common(vocab_size)]
    id2token = ['<pad>', '<unk>'] + words
    token2id = {token: idx for idx, token in enumerate(id2token)}
    return id2token, token2id

# The vocabulary is built from the training bigrams only.
id2token, token2id = build_vocab(all_tokens_bi)

Total number of tokens:  1203340


In [13]:
def check_tokens_id(idx):
    """
    Print which vocabulary entry the index `idx` stands for.

    @param idx: position in the module-level `id2token` list
    """
    word = id2token[idx]
    message = "Token {} corresponds to word {}".format(idx, word)
    print (message)

In [14]:
def token2id_dataset(text_dataset):
    """
    Replace every token by its vocabulary index.

    Tokens that are not keys of the module-level `token2id` mapping are
    encoded as UNK_IDX. No padding or truncation happens here.

    @param text_dataset: iterable of token lists
    @return: list with one list of integer indices per input token list
    """
    def encode(tokens):
        return [token2id.get(token, UNK_IDX) for token in tokens]

    return [encode(tokens) for tokens in text_dataset]

# Encode the bigram lists of all three splits with the vocabulary.
train_indices, val_indices, test_indices = map(
    token2id_dataset, (train_tokens_bi, val_tokens_bi, test_tokens_bi)
)


# Report the number of examples per split.
for _template, _encoded in (
    ("Train dataset size is {}", train_indices),
    ("Val dataset size is {}", val_indices),
    ("Test dataset size is {}", test_indices),
):
    print (_template.format(len(_encoded)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


In [15]:
# MAX_SENTENCE_LENGTH = 400

import numpy as np
import torch
from torch.utils.data import Dataset

class IMDB_Dataset(Dataset):
    """
    Dataset (subclass of torch.utils.data.Dataset) that pairs each example's
    list of token indices with its label.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of token-index lists, one per example
        @param target_list: list of labels, aligned with data_list
        """
        self.data_list, self.target_list = data_list, target_list
        n_examples, n_targets = len(self.data_list), len(self.target_list)
        assert n_examples == n_targets

    def __len__(self):
        """Number of examples."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        @return: [token_idx, length, label] where token_idx is the example
            cut to at most MAX_SENTENCE_LENGTH entries and length is the
            number of entries left after the cut
        """
        clipped = self.data_list[key][:MAX_SENTENCE_LENGTH]
        return [clipped, len(clipped), self.target_list[key]]

def newsgroup_collate_func(batch):
    """
    Collate function for DataLoader that pads every example of the batch to
    MAX_SENTENCE_LENGTH, so all rows have the same (fixed) length.

    @param batch: sequence of [token_idx, length, label] items, as returned by
        IMDB_Dataset.__getitem__
    @return: list [data, lengths, labels] -- data is an int64 tensor of shape
        (len(batch), MAX_SENTENCE_LENGTH) filled with PAD_IDX after each
        example's tokens; lengths and labels are LongTensors of shape
        (len(batch),)
    @raise ValueError: if an example holds more than MAX_SENTENCE_LENGTH tokens
    """
    # Allocate the padded matrix once, typed int64 from the start. Building it
    # from per-example arrays made the whole batch float64 as soon as one
    # example was empty (np.array([]) is float64), which nn.Embedding rejects.
    padded = np.full((len(batch), MAX_SENTENCE_LENGTH), PAD_IDX, dtype=np.int64)
    label_list = []
    length_list = []
    for row, datum in enumerate(batch):
        label_list.append(datum[2])
        length_list.append(datum[1])
        if datum[1]:
            # Left-aligned copy; the rest of the row keeps PAD_IDX.
            padded[row, :datum[1]] = datum[0]
    return [torch.from_numpy(padded), torch.LongTensor(length_list), torch.LongTensor(label_list)]

# Create one Dataset / DataLoader pair per split.
# Note: val_dataset and test_dataset are rebound here; before this cell they
# held the DataFrames returned by dataset_collection().
BATCH_SIZE = 32

def _make_loader(indices, labels, shuffle):
    """
    Wrap one split in an IMDB_Dataset and in a DataLoader that batches it with
    newsgroup_collate_func.

    @param indices: list of token-index lists
    @param labels: list of labels aligned with `indices`
    @param shuffle: forwarded to DataLoader(shuffle=...)
    @return: tuple (dataset, loader)
    """
    split_dataset = IMDB_Dataset(indices, labels)
    split_loader = torch.utils.data.DataLoader(dataset=split_dataset,
                                               batch_size=BATCH_SIZE,
                                               collate_fn=newsgroup_collate_func,
                                               shuffle=shuffle)
    return split_dataset, split_loader

train_dataset, train_loader = _make_loader(train_indices, train_label, shuffle=True)
val_dataset, val_loader = _make_loader(val_indices, val_label, shuffle=True)
test_dataset, test_loader = _make_loader(test_indices, test_label, shuffle=False)

In [None]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model: the embeddings of all positions of an
    example are averaged and passed through one linear layer.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=10):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        @param num_classes: number of output classes (defaults to 10, the
            value that used to be hard-coded)
        """
        super(BagOfWords, self).__init__()
        # padding_idx=0 keeps the embedding of index 0 at zero, so padded
        # positions contribute nothing to the sum in forward().
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: tensor of size (batch_size, num_classes) holding
            log-probabilities (log_softmax output), suitable for NLLLoss
        """
        summed = torch.sum(self.embed(data), dim=1)
        # An example with length 0 would divide 0 by 0 and turn the whole row
        # (and the loss) into NaN; clamping makes its average the zero vector.
        denom = length.clamp(min=1).view(-1, 1).float()
        averaged = summed / denom

        # log-probabilities, not raw logits
        return F.log_softmax(self.linear(averaged), dim=1)

# Embedding width, and the model instance. The vocabulary size passed in is
# len(id2token), i.e. it includes the '<pad>' and '<unk>' entries.
emb_dim = 200
model = BagOfWords(len(id2token), emb_dim)

In [None]:
from tqdm import tqdm
learning_rate = 0.001
# NOTE: the train(...) call at the bottom of this cell passes num_epochs=5
# explicitly, so the value below is not the one that call uses.
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
# NLLLoss takes log-probabilities, which is what BagOfWords.forward returns
# (it ends with F.log_softmax). Both objects are read as globals by train().
criterion = torch.nn.NLLLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = model(data_batch, length_batch)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
#     print (total)
    return (100 * correct / total)

def train(model, num_epochs, early_stopping_patience = 0):
    """
    Train `model` on the module-level train_loader with early stopping on the
    validation accuracy.

    Uses the module-level `optimizer`, `criterion`, `train_loader`,
    `val_loader` and `test_model`.

    @param model: the model to train; called as model(data, lengths)
    @param num_epochs: maximum number of passes over train_loader
    @param early_stopping_patience: training stops once the end-of-epoch
        validation accuracy has been below the best value seen so far for
        more than this many consecutive epochs
    """
    prev_best = 0
    count = 0
    for epoch in range(num_epochs):
        for i, (data_batch, length_batch, label_batch) in enumerate(train_loader):
            # test_model() leaves the model in eval mode, so training mode is
            # re-enabled before every step.
            model.train()
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
            # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                # validate
                val_acc = test_model(val_loader, model)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc))
        # Early stopping has to look at held-out data. This used to evaluate
        # train_loader, whose accuracy keeps rising while the model overfits.
        val_acc = test_model(val_loader, model)
        if val_acc >= prev_best:
            prev_best = val_acc
            count = 0
        else:
            count += 1
        if count > early_stopping_patience:
            # The single 'Finished!' below is printed for both exits.
            break
    print ('Finished!')
# Train for at most 5 epochs with the default early_stopping_patience of 0.
train(model, num_epochs=5)

Epoch: [1/5], Step: [101/625], Validation Acc: 20.38
Epoch: [1/5], Step: [201/625], Validation Acc: 26.8
Epoch: [1/5], Step: [301/625], Validation Acc: 27.26
Epoch: [1/5], Step: [401/625], Validation Acc: 23.9
Epoch: [1/5], Step: [501/625], Validation Acc: 31.16
Epoch: [1/5], Step: [601/625], Validation Acc: 31.56
Epoch: [2/5], Step: [101/625], Validation Acc: 30.7
Epoch: [2/5], Step: [201/625], Validation Acc: 32.76
Epoch: [2/5], Step: [301/625], Validation Acc: 33.92


In [None]:
# Accuracy (in percent) of the model on the test loader.
test_acc = test_model(test_loader, model)
print (test_acc)