# Data Processing

In [35]:
#Load Documents 
import glob
import dill
import numpy as np
import string
from collections import Counter

def load_dill_file(path):
    with open(path, "rb") as dill_file:
        return dill.load(dill_file)


#Load Review Tokens 
x_train= load_dill_file("/Users/Taurean/Documents/NLP-HW1/tokens/train_tokens_clean")
x_val =  load_dill_file("/Users/Taurean/Documents/NLP-HW1/tokens/val_tokens_clean")
x_test = load_dill_file("/Users/Taurean/Documents/NLP-HW1/tokens/test_tokens_clean")


#Load Target Values
y_train = np.genfromtxt("/Users/Taurean/Documents/NLP-HW1/target/train_target.txt",dtype='int')
y_val = np.genfromtxt("/Users/Taurean/Documents/NLP-HW1/target/val_target.txt",dtype='int')
y_test = np.genfromtxt("/Users/Taurean/Documents/NLP-HW1/target/test_target.txt",dtype='int')


#Print Total Samples in the data
print("The number of reviews in x_train is {:,d}".format(len(x_train)))
print("The number of reviews in x_val is {:,d}".format(len(x_val)))
print("The number of reviews in x_test is {:,d}".format(len(x_test)))

The number of reviews in x_train is 25,000
The number of reviews in x_val is 5,000
The number of reviews in x_test is 20,000


In [47]:
#Maps Word to Id Number
def data_dictionary(reviews,vocab_size_limit):
    token_counter = Counter()
    PAD_IDX = 0
    UNK_IDX = 1
    for review in reviews:
        for words in set(review):
            token_counter[words] += 1
            

    vocab, count = zip(*token_counter.most_common(vocab_size_limit))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

#Convert Review tokens to Id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else 0 for token in tokens]
        indices_data.append(index_list)
    return indices_data



x= data_dictionary(x_train,200000)
token2id= x[0]
id2token = x[1]
train_data_indices = token2index_dataset(x_train)
val_data_indices = token2index_dataset(x_val)
test_data_indices = token2index_dataset(x_test)

In [11]:
train_data_indices

[[173,
  414,
  279,
  580,
  18,
  294,
  16,
  996,
  1201,
  27,
  83,
  0,
  35,
  16,
  1985,
  1183,
  42,
  35,
  74,
  149,
  0,
  14,
  9,
  414,
  27,
  83,
  0,
  0,
  154,
  164,
  16,
  34,
  1281,
  1562,
  4,
  10,
  61,
  16,
  3,
  1007,
  20,
  76,
  10,
  41,
  34,
  97,
  0,
  14,
  76,
  49,
  16,
  159,
  33,
  10,
  173,
  279,
  16,
  34,
  1146,
  12,
  1287,
  760,
  0,
  6,
  101,
  414,
  7,
  3,
  411,
  204,
  5,
  0,
  10,
  336,
  27,
  139,
  6,
  161,
  922,
  6,
  154,
  0,
  10,
  7,
  0,
  2,
  0,
  7,
  634,
  10,
  7,
  0,
  4,
  0,
  26,
  2,
  0,
  4,
  34,
  906,
  9,
  2,
  0,
  2,
  120,
  23,
  3,
  204,
  5,
  1758,
  154,
  25,
  5,
  117,
  252,
  2,
  824,
  430,
  18,
  2,
  0,
  2,
  0,
  23,
  34,
  953,
  264,
  91,
  57,
  98,
  0,
  65,
  0,
  76,
  38,
  21,
  34,
  69,
  50,
  6,
  80,
  4,
  34,
  69,
  50,
  6,
  138,
  4,
  38,
  21,
  0,
  0,
  2,
  72,
  0,
  11,
  1183,
  15,
  394],
 [8,
  7,
  3,
  200,
  1693,
  209,
  8

In [None]:
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))

list_ = []
data=list(x_train)
for x in range(len(data)):
    line =find_ngrams(data[x],3)
    list_.append(line)


# Hyperparameter Tuning

In [48]:
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of IMDB movie review tokens 
        @param target_list: list of IMDB movie review targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def IMBD_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

# create pytorch dataloader
#train_loader = NewsGroupDataset(train_data_indices, train_targets)
#val_loader = NewsGroupDataset(val_data_indices, val_targets)
#test_loader = NewsGroupDataset(test_data_indices, test_targets)

BATCH_SIZE = 32
train_dataset = IMDBDataset(train_data_indices, y_train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=IMBD_collate_func,
                                           shuffle=True)

val_dataset = IMDBDataset(val_data_indices, y_val)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                         batch_size=BATCH_SIZE,
                                         collate_fn=IMBD_collate_func,
                                         shuffle=True)

test_dataset = IMDBDataset(test_data_indices, y_test)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=BATCH_SIZE,
                                          collate_fn=IMBD_collate_func,
                                          shuffle=False)

for i, (data, lengths, labels) in enumerate(train_loader):
    print (data)
    print (labels)
    break

tensor([[ 5, 18,  4,  ..., 16,  9, 13],
        [ 5,  4,  2,  ...,  3,  9, 16],
        [ 5, 28, 13,  ..., 19, 12, 12],
        ...,
        [ 5,  2, 23,  ...,  6,  3, 13],
        [ 5, 20, 11,  ...,  8,  9, 14],
        [ 5, 25,  4,  ...,  2,  6,  3]])
tensor([1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
        0, 0, 0, 1, 0, 0, 1, 0])


In [49]:
#import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfWords, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,20)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

emb_dim = 500
model = BagOfWords(len(id2token), emb_dim)

In [None]:
learning_rate = 0.001
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))

Epoch: [1/10], Step: [101/782], Validation Acc: 48.9
Epoch: [1/10], Step: [201/782], Validation Acc: 50.94
Epoch: [1/10], Step: [301/782], Validation Acc: 49.54
Epoch: [1/10], Step: [401/782], Validation Acc: 50.68
Epoch: [1/10], Step: [501/782], Validation Acc: 48.84
Epoch: [1/10], Step: [601/782], Validation Acc: 50.82
Epoch: [1/10], Step: [701/782], Validation Acc: 49.04
Epoch: [2/10], Step: [101/782], Validation Acc: 47.54
Epoch: [2/10], Step: [201/782], Validation Acc: 50.74
Epoch: [2/10], Step: [301/782], Validation Acc: 48.86
Epoch: [2/10], Step: [401/782], Validation Acc: 48.84
Epoch: [2/10], Step: [501/782], Validation Acc: 50.9
Epoch: [2/10], Step: [601/782], Validation Acc: 49.04
Epoch: [2/10], Step: [701/782], Validation Acc: 48.36


In [20]:
print ("After training for {} epochs".format(num_epochs))
print ("Val Acc {}".format(test_model(val_loader, model)))
print ("Test Acc {}".format(test_model(test_loader, model)))

After training for 10 epochs
Val Acc 47.92
Test Acc 48.42


In [32]:
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))

list_ = []
data= x_train
for x in range(len(data)):
    line =find_ngrams(data[x],4)
    list_.append(line)

x= data_dictionary(list_,5)
token2id= x[0]
id2token = x[1]