We will start by downloading the 20 Newsgroups text dataset (note: the code cells below actually read the IMDB review folders under `aclImdb/`):

```http://scikit-learn.org/stable/datasets/index.html#the-20-newsgroups-text-dataset```

In [27]:
import os
import random

In [52]:
# import training and testing data from file
import pickle as pkl


def import_data(dir):
    """
    Read every file of a directory and return the contents as a list of strings.

    @param dir: path of the directory to read; every entry is opened as a text file
        (the name shadows the builtin, it is kept so existing callers keep working)
    @return: list with one string per file, ordered by file name
    """
    dataset = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample order
    # (and therefore any index list saved from it) reproducible between runs
    for file_name in sorted(os.listdir(dir)):
        # explicit encoding: the platform default may fail on non-ASCII text
        with open(os.path.join(dir, file_name), encoding="utf-8") as f:
            dataset.append(f.read())
    return dataset

pos_train_data = import_data("aclImdb/train/pos")
neg_train_data = import_data("aclImdb/train/neg")
pos_test_data = import_data("aclImdb/test/pos")
neg_test_data = import_data("aclImdb/test/neg")

# concatenate positive and negative training data, testing data
# (target 1 for the "pos" folders, 0 for the "neg" folders)
# the negative half of the training set must come from train/neg; taking it from
# test/neg would leak test samples into training
train_data = pos_train_data+neg_train_data
train_target = [1]*len(pos_train_data)+[0]*len(neg_train_data)
test_data = pos_test_data+neg_test_data
test_target = [1]*len(pos_test_data)+[0]*len(neg_test_data)

# split train into train and validation 
val_idx = random.sample(range(len(train_data)), 5000)
# persist the sampled indices so the same split can be rebuilt later
with open("val_idx.p", "wb") as f:
    pkl.dump(val_idx, f)

val_data = [train_data[i] for i in val_idx]
val_target = [train_target[i] for i in val_idx]

# set lookup keeps the filter linear instead of scanning the 5000-item list per sample
val_idx_set = set(val_idx)
train_sub_data = [train_data[i] for i in range(len(train_data)) if i not in val_idx_set]
train_sub_target = [train_target[i] for i in range(len(train_data)) if i not in val_idx_set]

In [None]:
#!pip install spacy
!python -m spacy download en_core_web_sm

In [53]:
# Let's write the tokenization function 

import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def tokenize(sent):
    """
    Run the module-level `tokenizer` on `sent` and return the lowercased token texts.

    A token is left out when its text is found in `punctuations`.
    """
    kept = []
    for piece in tokenizer(sent):
        text = piece.text
        if text in punctuations:
            continue
        kept.append(text.lower())
    return kept



In [54]:
# This is the code cell that tokenizes train/val/test datasets
# However it takes about 15-20 minutes to run it
# For convenience we have provided the preprocessed datasets
# Please see the next code cell

def tokenize_dataset(dataset):
    """
    Tokenize every sample of `dataset` with the module-level `tokenizer`.

    @param dataset: iterable of text samples
    @return: four lists, in this order
        - per sample, the lowercased token texts without tokens found in `punctuations`
        - the same cleaned tokens flattened over all samples (used to build the vocabulary)
        - per sample, the unprocessed tokenizer output
        - the unprocessed tokens flattened over all samples
    """
    cleaned_per_sample, cleaned_flat = [], []
    raw_per_sample, raw_flat = [], []

    for text in dataset:
        doc = tokenizer(text)
        cleaned = []
        for tok in doc:
            if tok.text not in punctuations:
                cleaned.append(tok.text.lower())

        cleaned_per_sample.append(cleaned)
        raw_per_sample.append(doc)

        cleaned_flat.extend(cleaned)
        raw_flat.extend(doc)

    return cleaned_per_sample, cleaned_flat, raw_per_sample, raw_flat

# tokenize_dataset returns four lists; only the cleaned ones are used in this cell,
# so the remaining values are collected by the starred target and dropped

# val set tokens
print ("Tokenizing val data")
val_data_tokens, *_ = tokenize_dataset(val_data)
with open("val_data_tokens.p", "wb") as f:
    pkl.dump(val_data_tokens, f)
with open("val_target.p", "wb") as f:
    pkl.dump(val_target, f)

# test set tokens
print ("Tokenizing test data")
test_data_tokens, *_ = tokenize_dataset(test_data)
with open("test_data_tokens.p", "wb") as f:
    pkl.dump(test_data_tokens, f)
with open("test_target.p", "wb") as f:
    pkl.dump(test_target, f)

# train set tokens
print ("Tokenizing train data")
train_data_tokens, all_train_tokens, *_ = tokenize_dataset(train_sub_data)
with open("train_data_tokens.p", "wb") as f:
    pkl.dump(train_data_tokens, f)
with open("all_train_tokens.p", "wb") as f:
    pkl.dump(all_train_tokens, f)
with open("train_target.p", "wb") as f:
    pkl.dump(train_sub_target, f)

Tokenizing val data
Tokenizing test data
Tokenizing train data


In [73]:
# without post-processing
def tokenize_dataset_raw(dataset):
    """
    Tokenize every sample with the module-level `tokenizer`, keeping the token
    texts exactly as produced (no lowercasing, no punctuation filtering).

    @param dataset: iterable of text samples
    @return: (per-sample lists of token texts, all token texts flattened into one list)
    """
    per_sample = [[piece.text for piece in tokenizer(text)] for text in dataset]
    # flat list of every token, kept to create the vocabulary later
    flat = [tok for sample_tokens in per_sample for tok in sample_tokens]
    return per_sample, flat

# every pickle is written inside a `with` block so the file is flushed and closed
# as soon as the dump is done

# val set tokens
print ("Tokenizing val data")
val_data_tokens_raw, _ = tokenize_dataset_raw(val_data)
with open("val_data_tokens_raw.p", "wb") as f:
    pkl.dump(val_data_tokens_raw, f)

# test set tokens
print ("Tokenizing test data")
test_data_tokens_raw, _ = tokenize_dataset_raw(test_data)
with open("test_data_tokens_raw.p", "wb") as f:
    pkl.dump(test_data_tokens_raw, f)

# train set tokens
print ("Tokenizing train data")
train_data_tokens_raw, all_train_tokens_raw = tokenize_dataset_raw(train_sub_data)
with open("train_data_tokens_raw.p", "wb") as f:
    pkl.dump(train_data_tokens_raw, f)
with open("all_train_tokens_raw.p", "wb") as f:
    pkl.dump(all_train_tokens_raw, f)


Tokenizing val data
Tokenizing test data
Tokenizing train data


In [83]:
import pickle as pkl
# First, download datasets from here
# Use your NYU account
#https://drive.google.com/open?id=1eR2LFI5MGliHlaL1S2nsX4ouIO1k_ip2
#https://drive.google.com/open?id=133QCWbiz_Xc7Qm4r6t-fJP1K669xjNlM
#https://drive.google.com/open?id=1SuUIUpJ1iznU707ktkpnEGSwt_XIqOYp
#https://drive.google.com/open?id=1UQsrZ2LVfcxdxxa47344fMs_qvya72KR

# Then, load preprocessed train, val and test datasets
def load_pickle(path):
    """Load one pickled object from `path` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files you created or trust
    with open(path, "rb") as f:
        return pkl.load(f)

train_data_tokens = load_pickle("train_data_tokens.p")
all_train_tokens = load_pickle("all_train_tokens.p")

train_data_tokens_raw = load_pickle("train_data_tokens_raw.p")
all_train_tokens_raw = load_pickle("all_train_tokens_raw.p")

train_target = load_pickle("train_target.p")

val_data_tokens = load_pickle("val_data_tokens.p")
val_data_tokens_raw = load_pickle("val_data_tokens_raw.p")
val_target = load_pickle("val_target.p")

# the size check below reads test_data_tokens, so the test split has to be loaded
# here; otherwise this cell only works when the tokenization cell ran in the same kernel
test_data_tokens = load_pickle("test_data_tokens.p")
#test_data_tokens_raw = pkl.load(open("test_data_tokens_raw.p", "rb"))
test_target = load_pickle("test_target.p")


# double checking
print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))
print ("Test dataset size is {}".format(len(test_data_tokens)))

print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 4817127


Now, we are going to create the vocabulary from the 10,000 most common tokens in the training set.

In [78]:
# build n gram tokens
def n_gram_token(tokens, n):
    """
    Build the n-grams of one token sequence.

    @param tokens: sequence of token strings
    @param n: number of consecutive tokens joined into one n-gram
    @return: list of n-grams, each the space-joined text of n consecutive tokens;
        empty when the sequence has fewer than n tokens
    """
    # number of positions that still have n-1 successors
    start_count = min(len(tokens), len(tokens) - (n - 1))
    grams = []
    for start in range(start_count):
        pieces = [tokens[start]]
        pieces.extend(tokens[start + offset] for offset in range(1, n))
        grams.append(" ".join(pieces))
    return grams

# return list of tokens that contains tokens for i<=n 
def get_n_gram_tokens(n, tokens):
    """
    Compute the 2-grams up to the n-grams of every sample.

    @param n: largest n-gram size to build
    @param tokens: list of samples, each a list of tokens
    @return: list with one entry per size 2..n (position size-2); each entry holds,
        per sample, the output of n_gram_token for that size
    """
    return [
        [n_gram_token(sample, size) for sample in tokens]
        for size in range(2, n + 1)
    ]

# get token list
# position k of each list holds, per sample, the (k+2)-grams: 2-, 3- and 4-grams here
train_token_list = get_n_gram_tokens(4, train_data_tokens)
val_token_list = get_n_gram_tokens(4, val_data_tokens)
#test_token_list = get_n_gram_tokens(4, test_data_tokens)

In [79]:
# build all training token based on n-gram
def build_all_token_list(n, original_tokens, train_tokens=None):
    """
    Collect the flat token list used to count the vocabulary for n-gram order n.

    @param n: largest n-gram size to include; 1 means single tokens only
    @param original_tokens: flat list of the single tokens
    @param train_tokens: n-gram lists indexed by size-2, each holding one list of
        n-grams per sample; defaults to the module-level train_token_list,
        looked up when the function is called
    @return: new list with original_tokens followed by every 2-gram .. n-gram
    """
    # resolve the default at call time: a default bound in the signature would keep
    # pointing at the old list after train_token_list is rebuilt
    if train_tokens is None:
        train_tokens = train_token_list
    all_tr_token = []
    for level in range(n - 1):
        for sample_grams in train_tokens[level]:
            all_tr_token.extend(sample_grams)
    return original_tokens + all_tr_token

In [9]:
from collections import Counter

max_vocab_size = 10000
# save index 0 for pad and 1 for unk
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(n, original_tokens,max_vocab_size):
    """
    Build the vocabulary from the most frequent tokens / n-grams.

    @param n: largest n-gram size counted (forwarded to build_all_token_list)
    @param original_tokens: flat list of single tokens
    @param max_vocab_size: number of most common entries kept; the two special
        tokens are added on top of this
    @return: (token2id, id2token)
        id2token: list of tokens, where id2token[i] returns token that corresponds to token i
        token2id: dictionary where keys represent tokens and corresponding values represent indices
    """
    all_tokens = build_all_token_list(n, original_tokens)
    token_counter = Counter(all_tokens)
    # a comprehension (rather than zip(*...)) also works when there is nothing to count
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    # ids 0 and 1 are reserved, regular entries start at 2
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

#token2id, id2token = build_vocab(all_train_tokens)
#token2id, id2token = build_vocab(all_tr_token)
#token2id, id2token = build_vocab(2)

In [15]:
# Lets check the dictionary by loading random token from it
import random
# build the single-token vocabulary here: no earlier cell assigns token2id / id2token,
# so without this the check only works with leftover kernel state
token2id, id2token = build_vocab(1, all_train_tokens, max_vocab_size)
random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

# both directions of the mapping should agree on the sampled entry
print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 3292 ; token mine
Token mine; token id 3292


In [None]:
import random

In [80]:
from collections import Counter

max_vocab_size = 10000
# save index 0 for pad and 1 for unk
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(n, original_tokens,max_vocab_size):
    """
    Build the vocabulary from the most frequent tokens / n-grams.

    @param n: largest n-gram size counted (forwarded to build_all_token_list)
    @param original_tokens: flat list of single tokens
    @param max_vocab_size: number of most common entries kept; the two special
        tokens are added on top of this
    @return: (token2id, id2token)
        id2token: list of tokens, where id2token[i] returns token that corresponds to token i
        token2id: dictionary where keys represent tokens and corresponding values represent indices
    """
    all_tokens = build_all_token_list(n, original_tokens)
    token_counter = Counter(all_tokens)
    # a comprehension (rather than zip(*...)) also works when there is nothing to count
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    # ids 0 and 1 are reserved, regular entries start at 2
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token



# convert token to id in the dataset
def token2index_dataset(tokens_data, token2id):
    """
    Replace every token of every sample by its vocabulary index.

    @param tokens_data: list of samples, each a list of tokens
    @param token2id: dictionary mapping token to index
    @return: list of index lists, aligned with tokens_data; tokens missing from
        token2id are mapped to UNK_IDX
    """
    return [
        [token2id.get(token, UNK_IDX) for token in sample_tokens]
        for sample_tokens in tokens_data
    ]

#train_data_indices = token2index_dataset(train_tokens)
#val_data_indices = token2index_dataset(val_tokens)
#test_data_indices = token2index_dataset(test_tokens)

# double checking
#print ("Train dataset size is {}".format(len(train_data_indices)))
#print ("Val dataset size is {}".format(len(val_data_indices)))
#print ("Test dataset size is {}".format(len(test_data_indices)))

Now we are going to create PyTorch DataLoader 

In [38]:
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Pairs the index sequences of a train/validation/test split with their targets
    so that a PyTorch DataLoader can read them (subclass of torch.utils.data.Dataset).
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of per-sample token index sequences
        @param target_list: list of targets, one per entry of data_list
        """
        self.data_list, self.target_list = data_list, target_list
        # both lists must describe the same samples
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of samples."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        @return: [token indices cut to MAX_SENTENCE_LENGTH, their count, target]
        """
        truncated = self.data_list[key][:MAX_SENTENCE_LENGTH]
        return [truncated, len(truncated), self.target_list[key]]

def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that pads every sample of the batch with
    zeros up to MAX_SENTENCE_LENGTH so that all data have the same length.

    @param batch: list of [token_idx, length, label] items, as returned by
        NewsGroupDataset.__getitem__
    @return: [data, lengths, labels]; data is an int64 tensor of shape
        (len(batch), MAX_SENTENCE_LENGTH), lengths and labels are LongTensors
    """
    data_list = []
    label_list = []
    length_list = []
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
        # padding
        # dtype is forced to int64: an empty sample would otherwise become a float64
        # array and turn the whole stacked batch into floats, which cannot be used
        # as embedding indices
        padded_vec = np.pad(np.array(datum[0], dtype=np.int64),
                            pad_width=(0, MAX_SENTENCE_LENGTH-datum[1]),
                            mode="constant", constant_values=0)
        data_list.append(padded_vec)
    # the reshape keeps the (batch, length) layout even for an empty batch
    data = np.array(data_list, dtype=np.int64).reshape(len(data_list), MAX_SENTENCE_LENGTH)
    return [torch.from_numpy(data), torch.LongTensor(length_list), torch.LongTensor(label_list)]

# create pytorch dataloader
#train_loader = NewsGroupDataset(train_data_indices, train_targets)
#val_loader = NewsGroupDataset(val_data_indices, val_targets)
#test_loader = NewsGroupDataset(test_data_indices, test_targets)

# BATCH_SIZE = 32
# train_dataset = NewsGroupDataset(train_data_indices, train_sub_target)
# train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=newsgroup_collate_func,
#                                            shuffle=True)

# val_dataset = NewsGroupDataset(val_data_indices, val_target)
# val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=newsgroup_collate_func,
#                                            shuffle=True)

# test_dataset = NewsGroupDataset(test_data_indices, test_target)
# test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=newsgroup_collate_func,
#                                            shuffle=False)

#for i, (data, lengths, labels) in enumerate(train_loader):
#    print (data)
#    print (labels)
#    break

Here we will define Bag-of-Words model in PyTorch

In [39]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model: averages the embeddings of a sample's
    indices and feeds the average to one linear layer.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding; 100 at least
        @param num_classes: number of output logits; the default of 20 keeps the
            previously hard-coded output size
        """
        super(BagOfWords, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, num_classes)
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # divide by the true length to get the mean embedding; the clamp makes an
        # empty sample (length 0) produce a zero vector instead of NaN
        out = out / length.clamp(min=1).view(-1, 1).float()
     
        # return logits
        out = self.linear(out.float())
        return out

#emb_dim =100
#model = BagOfWords(len(id2token), emb_dim)

In [40]:
# learning_rate = 0.01
# num_epochs = 10 # number epoch to train

# # Criterion and Optimizer
# criterion = torch.nn.CrossEntropyLoss()  
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# #optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

# for epoch in range(num_epochs):
#     for i, (data, lengths, labels) in enumerate(train_loader):
#         model.train()
#         data_batch, length_batch, label_batch = data, lengths, labels
#         optimizer.zero_grad()
#         outputs = model(data_batch, length_batch)
#         loss = criterion(outputs, label_batch)
#         loss.backward()
#         optimizer.step()
#         # validate every 100 iterations
#         if i > 0 and i % 100 == 0:
#             # validate
#             val_acc = test_model(val_loader, model)
#             print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
#                        epoch+1, num_epochs, i+1, len(train_loader), val_acc))


In [91]:
# tune parameters for n gram; will try all i such that 1<=i<=n grams
MAX_SENTENCE_LENGTH = 200

def run_model(emb_dim, learning_rate, num_epochs, train_loader, val_loader, id2token):
    """
    Train a fresh BagOfWords model and report its validation accuracy.

    @param emb_dim: embedding dimension of the model
    @param learning_rate: learning rate of the Adam optimizer
    @param num_epochs: number of passes over train_loader
    @param train_loader: yields (data, lengths, labels) training batches
    @param val_loader: loader handed to test_model for validation
    @param id2token: vocabulary list; its length is the model's vocabulary size
    @return: validation accuracy measured by test_model after the last epoch
    """
    net = BagOfWords(len(id2token), emb_dim)
    loss_fn = torch.nn.CrossEntropyLoss()
    adam = torch.optim.Adam(net.parameters(), lr=learning_rate)

    for epoch_no in range(1, num_epochs + 1):
        for step_no, batch in enumerate(train_loader, start=1):
            tokens, lengths, labels = batch
            # test_model switches to eval mode, so training mode is set on every step
            net.train()
            adam.zero_grad()
            batch_loss = loss_fn(net(tokens, lengths), labels)
            batch_loss.backward()
            adam.step()

            # validate every 100 iterations (never on the first batch of an epoch)
            batches_done = step_no - 1
            if batches_done > 0 and batches_done % 100 == 0:
                progress_acc = test_model(val_loader, net)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                    epoch_no, num_epochs, step_no, len(train_loader), progress_acc))

    return test_model(val_loader, net)
        
# set default
# values used by the tuning cells whenever a hyperparameter is not the one being varied
max_vocab_default = 20000  # vocabulary size handed to build_vocab
emb_dim_default = 200  # embedding dimension handed to run_model
lr_default = 0.01  # learning rate handed to run_model
num_epochs_default = 10  # number of training epochs handed to run_model
BATCH_SIZE = 64  # batch size of the train / validation DataLoaders

    
def tune_n_gram(n_size):    
    """
    Train one model per n-gram order 1..n_size and pick the best order.

    For order n every sample consists of its single tokens followed by its
    2-grams .. n-grams (taken from train_token_list / val_token_list).

    @param n_size: largest n-gram order to try
    @return: the order with the highest validation accuracy (-1 when n_size < 1)
    """
    # state of the search; set before the loop because the loop reads all of it
    max_val_acc = -1
    best_n = -1
    train_size = len(train_data_tokens)
    val_size = len(val_data_tokens)
    new_train = []
    new_val = []

    # tune n-gram
    print("Tuning n-gram:")
    for n_i in range(1,n_size+1):
        # build vocab
        token2id, id2token = build_vocab(n_i, all_train_tokens,max_vocab_default)
        if n_i == 1:
            # copy every sample, not just the outer list: the += below extends the
            # samples in place and must not alter the module-level token lists
            new_train = [list(sample) for sample in train_data_tokens]
            new_val = [list(sample) for sample in val_data_tokens]
        else:
            # append the n_i-grams to what was accumulated for the smaller orders
            for idx in range(train_size):
                new_train[idx] += train_token_list[n_i-2][idx]
            for idx in range(val_size):
                new_val[idx] += val_token_list[n_i -2][idx]
                
        # show the last entry of the first sample of each split
        print(new_train[0][len(new_train[0])-1])
        print(new_val[0][len(new_val[0])-1])
        # get indices
        train_data_indices = token2index_dataset(new_train,token2id)
        val_data_indices = token2index_dataset(new_val, token2id)
        
        # prepare data with dataloader for model
        train_dataset = NewsGroupDataset(train_data_indices, train_target)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=newsgroup_collate_func,
                                               shuffle=True)
        
        val_dataset = NewsGroupDataset(val_data_indices, val_target)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=newsgroup_collate_func,
                                                   shuffle=True)
        
        # run
        print("n-gram: ", n_i)
        val_acc_i = run_model(emb_dim_default, lr_default, num_epochs_default, train_loader, val_loader,id2token)
        print ("Val Acc {}".format(val_acc_i))
        # strict comparison: on a tie the smaller order is kept
        if val_acc_i > max_val_acc:
            max_val_acc = val_acc_i
            best_n = n_i
            
    return best_n
    


In [93]:
# tune tokenization
# the same single-token pipeline is run once per tokenization variant:
# (label, flat training tokens, training samples, validation samples)
print("Tune tokenization")
tokenization_variants = (
    ("with post-processing", all_train_tokens, train_data_tokens, val_data_tokens),
    ("without post-processing", all_train_tokens_raw, train_data_tokens_raw, val_data_tokens_raw),
)

for variant_label, variant_all_tokens, variant_train, variant_val in tokenization_variants:
    print(variant_label)
    token2id, id2token = build_vocab(1, variant_all_tokens, max_vocab_default)
    train_data_indices = token2index_dataset(variant_train, token2id)
    val_data_indices = token2index_dataset(variant_val, token2id)

    # prepare data with dataloader for model
    train_dataset = NewsGroupDataset(train_data_indices, train_target)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               collate_fn=newsgroup_collate_func,
                                               shuffle=True)

    val_dataset = NewsGroupDataset(val_data_indices, val_target)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=BATCH_SIZE,
                                             collate_fn=newsgroup_collate_func,
                                             shuffle=True)
    # run
    val_acc_i = run_model(emb_dim_default, lr_default, num_epochs_default, train_loader, val_loader, id2token)
    print ("Val Acc {}".format(val_acc_i))



Tune tokenization
with post-processing
Epoch: [1/10], Step: [101/313], Validation Acc: 84.52
Epoch: [1/10], Step: [201/313], Validation Acc: 87.22
Epoch: [1/10], Step: [301/313], Validation Acc: 88.26
Epoch: [2/10], Step: [101/313], Validation Acc: 87.72
Epoch: [2/10], Step: [201/313], Validation Acc: 87.96
Epoch: [2/10], Step: [301/313], Validation Acc: 87.74
Epoch: [3/10], Step: [101/313], Validation Acc: 87.1
Epoch: [3/10], Step: [201/313], Validation Acc: 87.08
Epoch: [3/10], Step: [301/313], Validation Acc: 86.74
Epoch: [4/10], Step: [101/313], Validation Acc: 86.7
Epoch: [4/10], Step: [201/313], Validation Acc: 86.68
Epoch: [4/10], Step: [301/313], Validation Acc: 86.62
Epoch: [5/10], Step: [101/313], Validation Acc: 86.26
Epoch: [5/10], Step: [201/313], Validation Acc: 86.52
Epoch: [5/10], Step: [301/313], Validation Acc: 85.88
Epoch: [6/10], Step: [101/313], Validation Acc: 85.98
Epoch: [6/10], Step: [201/313], Validation Acc: 86.24
Epoch: [6/10], Step: [301/313], Validation Ac

In [98]:
def tune_n_gram(n_size):   
    """
    Train one model per n-gram order 1..n_size and pick the best order.

    For order n every sample consists of its single tokens followed by its
    2-grams .. n-grams (taken from train_token_list / val_token_list).

    @param n_size: largest n-gram order to try
    @return: the order with the highest validation accuracy (-1 when n_size < 1)
    """
    new_train = []
    new_val = []
    max_val_acc = -1
    best_n = -1
    train_size = len(train_data_tokens)
    val_size = len(val_data_tokens)
    
    
    # tune n-gram
    print("Tuning n-gram:")
    for n_i in range(1,n_size+1):
        # build vocab
        token2id, id2token = build_vocab(n_i, all_train_tokens,max_vocab_default)
        if n_i == 1:
            # copy every sample, not just the outer list: the += below extends the
            # samples in place and must not alter the module-level token lists
            new_train = [list(sample) for sample in train_data_tokens]
            new_val = [list(sample) for sample in val_data_tokens]
        else:
            # append the n_i-grams to what was accumulated for the smaller orders
            for idx in range(train_size):
                new_train[idx] += train_token_list[n_i-2][idx]
            for idx in range(val_size):
                new_val[idx] += val_token_list[n_i -2][idx]
                
        # get indices
        train_data_indices = token2index_dataset(new_train,token2id)
        val_data_indices = token2index_dataset(new_val, token2id)
        
        # prepare data with dataloader for model
        train_dataset = NewsGroupDataset(train_data_indices, train_target)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=newsgroup_collate_func,
                                               shuffle=True)
        
        val_dataset = NewsGroupDataset(val_data_indices, val_target)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=newsgroup_collate_func,
                                                   shuffle=True)
                
        # run
        print("n-gram: ", n_i)        
        
        val_acc_i = run_model(emb_dim_default, lr_default, num_epochs_default, train_loader, val_loader,id2token)
        print ("Val Acc {}".format(val_acc_i))
        # strict comparison: on a tie the smaller order is kept
        if val_acc_i > max_val_acc:
            max_val_acc = val_acc_i
            best_n = n_i
            
    return best_n

best_n = tune_n_gram(4)

Tuning n-gram:
n-gram:  1
Epoch: [1/10], Step: [101/313], Validation Acc: 83.94
Epoch: [1/10], Step: [201/313], Validation Acc: 86.1
Epoch: [1/10], Step: [301/313], Validation Acc: 88.3
Epoch: [2/10], Step: [101/313], Validation Acc: 87.98
Epoch: [2/10], Step: [201/313], Validation Acc: 88.16
Epoch: [2/10], Step: [301/313], Validation Acc: 87.8
Epoch: [3/10], Step: [101/313], Validation Acc: 87.3
Epoch: [3/10], Step: [201/313], Validation Acc: 87.38
Epoch: [3/10], Step: [301/313], Validation Acc: 87.4
Epoch: [4/10], Step: [101/313], Validation Acc: 87.32
Epoch: [4/10], Step: [201/313], Validation Acc: 87.26
Epoch: [4/10], Step: [301/313], Validation Acc: 86.36
Epoch: [5/10], Step: [101/313], Validation Acc: 86.72
Epoch: [5/10], Step: [201/313], Validation Acc: 85.94
Epoch: [5/10], Step: [301/313], Validation Acc: 86.38
Epoch: [6/10], Step: [101/313], Validation Acc: 86.16
Epoch: [6/10], Step: [201/313], Validation Acc: 86.02
Epoch: [6/10], Step: [301/313], Validation Acc: 86.16
Epoch: 

KeyboardInterrupt: 

In [None]:
def tune_n_gram(n_size, vocab_size_list, emb_dim_list):
    """Grid-search n-gram order, maximum vocabulary size and embedding size.

    For every n in 1..n_size the token lists are the unigrams plus all n-grams
    up to order n. For each (n, vocab size) pair the vocabulary and the data
    loaders are rebuilt, and one model is trained per embedding size.

    Reads the notebook globals train_data_tokens, val_data_tokens,
    train_token_list, val_token_list, all_train_tokens, train_target,
    val_target, BATCH_SIZE, lr_default and num_epochs_default. None of them is
    modified.

    Args:
        n_size: largest n-gram order to try (orders 1..n_size are evaluated).
        vocab_size_list: iterable of maximum vocabulary sizes passed to build_vocab.
        emb_dim_list: iterable of embedding dimensions passed to run_model.

    Returns:
        [n, vocab_size, emb_dim] of the first configuration that reached the
        highest validation accuracy, or [] if no configuration was evaluated.
    """
    max_val_acc = -1
    best_config = []
    train_size = len(train_data_tokens)
    val_size = len(val_data_tokens)
    new_train = []
    new_val = []

    print("Tuning model hyperparameters:")
    for n_i in range(1, n_size + 1):
        # Build the corpus for this n exactly ONCE, outside the vocabulary loop.
        # Doing it per vocabulary size (as before) appended the same n-grams
        # len(vocab_size_list) times to every document.
        if n_i == 1:
            new_train = list(train_data_tokens)
            new_val = list(val_data_tokens)
        else:
            # train_token_list[n_i - 2][idx] is taken to hold the n_i-gram tokens of
            # document idx -- TODO confirm against the cell that builds it.
            # Fresh lists are created instead of `+=` so the global token lists
            # (shared by reference after a shallow copy) are never extended in place.
            new_train = [
                [*new_train[idx], *train_token_list[n_i - 2][idx]]
                for idx in range(train_size)
            ]
            new_val = [
                [*new_val[idx], *val_token_list[n_i - 2][idx]]
                for idx in range(val_size)
            ]

        for v_s in vocab_size_list:
            # build vocab
            token2id, id2token = build_vocab(n_i, all_train_tokens, v_s)

            # get indices
            train_data_indices = token2index_dataset(new_train, token2id)
            val_data_indices = token2index_dataset(new_val, token2id)

            # prepare data with dataloader for model
            train_dataset = NewsGroupDataset(train_data_indices, train_target)
            train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                       batch_size=BATCH_SIZE,
                                                       collate_fn=newsgroup_collate_func,
                                                       shuffle=True)

            val_dataset = NewsGroupDataset(val_data_indices, val_target)
            val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                     batch_size=BATCH_SIZE,
                                                     collate_fn=newsgroup_collate_func,
                                                     shuffle=True)

            # run: the loaders depend only on (n_i, v_s), so they are reused
            # for every embedding size
            for emb_i in emb_dim_list:
                print("current config: ")
                print("n-gram: ", n_i, " max vocab size: ", v_s, " embedding size: ", emb_i)
                val_acc_i = run_model(emb_i, lr_default, num_epochs_default, train_loader, val_loader, id2token)
                print ("Val Acc {}".format(val_acc_i))
                # strict '>' keeps the earliest configuration on ties
                if val_acc_i > max_val_acc:
                    max_val_acc = val_acc_i
                    best_config = [n_i, v_s, emb_i]
    return best_config

    
#     # update train data and val according to best n-gram size
#     for i in range(1,best_n+1):
#         for idx in range(train_size):
#             train_data_tokens[idx]+= train_token_list[i-2][idx]
#         for idx in range(val_size):
#             val_data_tokens[idx] += val_token_list[i-2][idx]
            
    
    
#     # tune vocabulary size
#     best_vocab_size = -1
#     max_val_acc = -1
    
#     print("Tuning max vocabulary size:")
#     for v_s in vocab_size_list:
#         print("max vocab size: ", v_s)
#         token2id, id2token = build_vocab(best_n, v_s)
        
#         # get indices
#         train_data_indices = token2index_dataset(train_data_tokens, token2id)
#         val_data_indices = token2index_dataset(val_data_tokens, token2id)
    
#         # prepare data with dataloader for model
#         train_dataset = NewsGroupDataset(train_data_indices, train_target)
#         train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
#                                                batch_size=BATCH_SIZE,
#                                                collate_fn=newsgroup_collate_func,
#                                                shuffle=True)
        
#         val_dataset = NewsGroupDataset(val_data_indices, val_target)
#         val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
#                                                    batch_size=BATCH_SIZE,
#                                                    collate_fn=newsgroup_collate_func,
#                                                    shuffle=True)
        
        
        
#         val_acc_i = run_model(emb_dim_default, lr_default, num_epochs_default, train_loader, val_loader, id2token)
#         print ("Val Acc {}".format(val_acc_i))
#         if val_acc_i > max_val_acc:
#             max_val_acc = val_acc_i
#             best_vocab_size = v_s
    
#     # update best vocab size
#     token2id, id2token = build_vocab(best_n, best_vocab_size)
#     # get indices
#     train_data_indices = token2index_dataset(train_data_tokens, token2id)
#     val_data_indices = token2index_dataset(val_data_tokens, token2id)
    
#     # prepare data with dataloader for model
#     train_dataset = NewsGroupDataset(train_data_indices, train_target)
#     train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
#                                                batch_size=BATCH_SIZE,
#                                                collate_fn=newsgroup_collate_func,
#                                                shuffle=True)
        
#     val_dataset = NewsGroupDataset(val_data_indices, val_target)
#     val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
#                                                    batch_size=BATCH_SIZE,
#                                                    collate_fn=newsgroup_collate_func,
#                                                    shuffle=True)
        
    
#     # tune embedding dimension
#     best_emb_dim = -1
#     max_val_acc = -1
    
#     print("Tuning embedding dimension:")
#     for emb_i in emb_dim_list:
#         print("embedding dim: ", emb_i)
#         val_acc_i = run_model(emb_i, lr_default, num_epochs_default, train_loader, val_loader, id2token)
#         print ("Val Acc {}".format(val_acc_i))
#         if val_acc_i > max_val_acc:
#             max_val_acc = val_acc_i
#             best_emb_dim = emb_i
    
#     return [best_n, best_vocab_size, best_emb_dim]
    

# Full grid search: n-gram orders 1..4 x 4 vocabulary caps x 4 embedding sizes
# = 64 calls to run_model, each trained for num_epochs_default epochs.
# The result is the [n, vocab_size, emb_dim] triple with the best validation accuracy.
best_model_params = tune_n_gram(4, [10000,20000,50000,80000], [100,200,500,800])

Tuning model hyperparameters:
current config: 
n-gram:  1  max vocab size:  10000  embedding size:  100
Epoch: [1/10], Step: [101/313], Validation Acc: 79.12
Epoch: [1/10], Step: [201/313], Validation Acc: 86.0
Epoch: [1/10], Step: [301/313], Validation Acc: 87.3
Epoch: [2/10], Step: [101/313], Validation Acc: 87.0
Epoch: [2/10], Step: [201/313], Validation Acc: 87.4
Epoch: [2/10], Step: [301/313], Validation Acc: 87.9
Epoch: [3/10], Step: [101/313], Validation Acc: 87.44
Epoch: [3/10], Step: [201/313], Validation Acc: 86.78
Epoch: [3/10], Step: [301/313], Validation Acc: 87.02
Epoch: [4/10], Step: [101/313], Validation Acc: 86.84
Epoch: [4/10], Step: [201/313], Validation Acc: 86.74
Epoch: [4/10], Step: [301/313], Validation Acc: 86.44
Epoch: [5/10], Step: [101/313], Validation Acc: 87.02
Epoch: [5/10], Step: [201/313], Validation Acc: 86.36
Epoch: [5/10], Step: [301/313], Validation Acc: 86.14
Epoch: [6/10], Step: [101/313], Validation Acc: 85.7
Epoch: [6/10], Step: [201/313], Valida

Epoch: [9/10], Step: [101/313], Validation Acc: 86.94
Epoch: [9/10], Step: [201/313], Validation Acc: 86.66
Epoch: [9/10], Step: [301/313], Validation Acc: 86.46
Epoch: [10/10], Step: [101/313], Validation Acc: 86.42
Epoch: [10/10], Step: [201/313], Validation Acc: 86.52
Epoch: [10/10], Step: [301/313], Validation Acc: 86.54
Val Acc 86.16
current config: 
n-gram:  1  max vocab size:  20000  embedding size:  200
Epoch: [1/10], Step: [101/313], Validation Acc: 82.72
Epoch: [1/10], Step: [201/313], Validation Acc: 83.96
Epoch: [1/10], Step: [301/313], Validation Acc: 84.8
Epoch: [2/10], Step: [101/313], Validation Acc: 87.78
Epoch: [2/10], Step: [201/313], Validation Acc: 88.08
Epoch: [2/10], Step: [301/313], Validation Acc: 88.46
Epoch: [3/10], Step: [101/313], Validation Acc: 88.36
Epoch: [3/10], Step: [201/313], Validation Acc: 88.1
Epoch: [3/10], Step: [301/313], Validation Acc: 86.86
Epoch: [4/10], Step: [101/313], Validation Acc: 87.94
Epoch: [4/10], Step: [201/313], Validation Acc:

Epoch: [7/10], Step: [101/313], Validation Acc: 87.3
Epoch: [7/10], Step: [201/313], Validation Acc: 87.1


## Exercise 1
### Try training the model with a larger embedding size and for a larger number of epochs
### Also plot the training curves of the model

## Exercise 2:
### Try downloading the IMDB Large Movie Review Dataset that is used for Assignment 1 (http://ai.stanford.edu/~amaas/data/sentiment/)
### and tokenize it

## Exercise 3:
### If you have time, after tokenizing the dataset, try training a Bag-of-Words model on it and report your initial results
### on the validation set.

In [97]:
# Sanity check: number of entries in train_data_tokens (presumably one per training
# document after the validation split -- confirm against the tokenization cell).
len(train_data_tokens)

20000