In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pickle as pkl
from matplotlib import pyplot as plt
from collections import Counter
from torch.utils.data import Dataset


In [2]:
MAX_SENTENCE_LENGTH = 200  # number of token ids kept per review (longer ones are cut, shorter ones padded)
PAD_IDX = 0  # id reserved for the '<pad>' token
UNK_IDX = 1  # id reserved for the '<unk>' (out-of-vocabulary) token

def build_vocab(all_tokens, max_vocab_size):
    """
    Build the token <-> id lookup tables from the most frequent tokens.

    @param all_tokens: iterable of (hashable) tokens; may be empty
    @param max_vocab_size: maximum number of distinct corpus tokens to keep
        (the two special tokens are added on top of this)
    @return: (token2id, id2token) where
        token2id is a dict mapping token -> index, and
        id2token is a list such that id2token[i] is the token with index i.
        Indices 0 and 1 are always '<pad>' and '<unk>'.
    """
    special_tokens = ['<pad>', '<unk>']
    token_counter = Counter(all_tokens)
    # A comprehension copes with an empty corpus; unpacking zip(*...) of an
    # empty most_common() list into two names would raise ValueError.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = special_tokens + vocab
    # Corpus tokens are numbered after the special tokens.
    token2id = {token: idx for idx, token in enumerate(vocab, start=len(special_tokens))}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token


In [3]:
def token2index_dataset(tokens_data, token2id):
    """
    Convert every token sequence of a dataset into a sequence of vocabulary ids.

    @param tokens_data: iterable of token sequences (one per example)
    @param token2id: dict mapping token -> index
    @return: list with one list of ids per example; tokens missing from
        token2id are mapped to UNK_IDX
    """
    return [
        [token2id.get(token, UNK_IDX) for token in tokens]
        for tokens in tokens_data
    ]


In [4]:
class MovieReviewDataset(Dataset):
    """
    torch.utils.data.Dataset over two parallel lists: the examples (sequences of
    token ids) and their targets. Used for the train, validation and test splits.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of examples, each a sequence of token ids
        @param target_list: list of targets, one per example (same length as data_list)
        """
        self.data_list, self.target_list = data_list, target_list
        assert len(self.data_list) == len(self.target_list)

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        @return: [token ids cut to at most MAX_SENTENCE_LENGTH, number of ids kept, target]
        """
        clipped = self.data_list[key][:MAX_SENTENCE_LENGTH]
        target = self.target_list[key]
        return [clipped, len(clipped), target]
    

In [5]:
def moviereview_collate_func(batch):
    """
    Customized collate function for DataLoader: right-pads every example with
    zeros up to MAX_SENTENCE_LENGTH so the batch can be stacked into one tensor.

    @param batch: list of [token_ids, length, label] items as produced by
        MovieReviewDataset.__getitem__; `length` must equal len(token_ids) and
        must not exceed MAX_SENTENCE_LENGTH (np.pad rejects a negative width)
    @return: [data, lengths, labels] where data is an int64 tensor of shape
        (batch_size, MAX_SENTENCE_LENGTH) and lengths / labels are LongTensors
        of shape (batch_size,)
    """
    data_list = []
    label_list = []
    length_list = []
    for token_idx, length, label in batch:
        label_list.append(label)
        length_list.append(length)
        # Force an integer dtype: np.array([]) of an empty review is float64 and
        # would silently promote the whole batch to float, which nn.Embedding
        # cannot index with.
        padded_vec = np.pad(np.array(token_idx, dtype=np.int64),
                            pad_width=(0, MAX_SENTENCE_LENGTH - length),
                            mode="constant", constant_values=0)  # 0 is the padding id
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]


In [6]:
# Targets for the train / validation / test splits (later handed to MovieReviewDataset).
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE(review): pickle can execute arbitrary code while loading - only load files you trust.
with open("./pkl/train_targets.p", "rb") as targets_file:
    train_targets = pkl.load(targets_file)
with open("./pkl/val_targets.p", "rb") as targets_file:
    val_targets = pkl.load(targets_file)
with open("./pkl/test_targets.p", "rb") as targets_file:
    test_targets = pkl.load(targets_file)


In [7]:
def _load_wp_tokens(split, ngram):
    """
    Unpickle ./pkl/<split>_tokens_wp_<ngram>.p and close the file afterwards.

    @param split: file-name prefix, one of "train", "all_train", "val", "test"
    @param ngram: n-gram order used in the file name (1-4)
    NOTE(review): pickle can execute arbitrary code while loading - only load files you trust.
    """
    with open("./pkl/{}_tokens_wp_{}.p".format(split, ngram), "rb") as token_file:
        return pkl.load(token_file)


train_data_tokens_wp_1 = _load_wp_tokens("train", 1)
all_train_tokens_wp_1 = _load_wp_tokens("all_train", 1)
val_data_tokens_wp_1 = _load_wp_tokens("val", 1)
test_data_tokens_wp_1 = _load_wp_tokens("test", 1)

train_data_tokens_wp_2 = _load_wp_tokens("train", 2)
all_train_tokens_wp_2 = _load_wp_tokens("all_train", 2)
val_data_tokens_wp_2 = _load_wp_tokens("val", 2)
test_data_tokens_wp_2 = _load_wp_tokens("test", 2)

train_data_tokens_wp_3 = _load_wp_tokens("train", 3)
all_train_tokens_wp_3 = _load_wp_tokens("all_train", 3)
val_data_tokens_wp_3 = _load_wp_tokens("val", 3)
test_data_tokens_wp_3 = _load_wp_tokens("test", 3)

train_data_tokens_wp_4 = _load_wp_tokens("train", 4)
all_train_tokens_wp_4 = _load_wp_tokens("all_train", 4)
val_data_tokens_wp_4 = _load_wp_tokens("val", 4)
test_data_tokens_wp_4 = _load_wp_tokens("test", 4)


In [8]:
def _load_np_tokens(split, ngram):
    """
    Unpickle ./pkl/<split>_tokens_np_<ngram>.p and close the file afterwards.

    @param split: file-name prefix, one of "train", "all_train", "val", "test"
    @param ngram: n-gram order used in the file name (1-4)
    NOTE(review): pickle can execute arbitrary code while loading - only load files you trust.
    """
    with open("./pkl/{}_tokens_np_{}.p".format(split, ngram), "rb") as token_file:
        return pkl.load(token_file)


train_data_tokens_np_1 = _load_np_tokens("train", 1)
all_train_tokens_np_1 = _load_np_tokens("all_train", 1)
val_data_tokens_np_1 = _load_np_tokens("val", 1)
test_data_tokens_np_1 = _load_np_tokens("test", 1)

train_data_tokens_np_2 = _load_np_tokens("train", 2)
all_train_tokens_np_2 = _load_np_tokens("all_train", 2)
val_data_tokens_np_2 = _load_np_tokens("val", 2)
test_data_tokens_np_2 = _load_np_tokens("test", 2)

train_data_tokens_np_3 = _load_np_tokens("train", 3)
all_train_tokens_np_3 = _load_np_tokens("all_train", 3)
val_data_tokens_np_3 = _load_np_tokens("val", 3)
test_data_tokens_np_3 = _load_np_tokens("test", 3)

train_data_tokens_np_4 = _load_np_tokens("train", 4)
all_train_tokens_np_4 = _load_np_tokens("all_train", 4)
val_data_tokens_np_4 = _load_np_tokens("val", 4)
test_data_tokens_np_4 = _load_np_tokens("test", 4)


In [9]:
# Group the "wp" variants by n-gram order: position k holds the (k+1)-gram data,
# which is how token_select indexes these lists (ngrams - 1).
train_data_tokens_wp = [
    train_data_tokens_wp_1,
    train_data_tokens_wp_2,
    train_data_tokens_wp_3,
    train_data_tokens_wp_4,
]

val_data_tokens_wp = [
    val_data_tokens_wp_1,
    val_data_tokens_wp_2,
    val_data_tokens_wp_3,
    val_data_tokens_wp_4,
]

test_data_tokens_wp = [
    test_data_tokens_wp_1,
    test_data_tokens_wp_2,
    test_data_tokens_wp_3,
    test_data_tokens_wp_4,
]

all_train_tokens_wp = [
    all_train_tokens_wp_1,
    all_train_tokens_wp_2,
    all_train_tokens_wp_3,
    all_train_tokens_wp_4,
]


In [10]:
# Group the "np" variants by n-gram order: position k holds the (k+1)-gram data,
# which is how token_select indexes these lists (ngrams - 1).
train_data_tokens_np = [
    train_data_tokens_np_1,
    train_data_tokens_np_2,
    train_data_tokens_np_3,
    train_data_tokens_np_4,
]

val_data_tokens_np = [
    val_data_tokens_np_1,
    val_data_tokens_np_2,
    val_data_tokens_np_3,
    val_data_tokens_np_4,
]

test_data_tokens_np = [
    test_data_tokens_np_1,
    test_data_tokens_np_2,
    test_data_tokens_np_3,
    test_data_tokens_np_4,
]

all_train_tokens_np = [
    all_train_tokens_np_1,
    all_train_tokens_np_2,
    all_train_tokens_np_3,
    all_train_tokens_np_4,
]


In [11]:
def token_select(punc, ngrams):
    """
    Pick the pre-loaded token data for one preprocessing variant and n-gram order.

    @param punc: 'w' selects the *_wp lists, 'n' selects the *_np lists
        (presumably with / without punctuation - confirm against the preprocessing step)
    @param ngrams: n-gram order, starting at 1
    @return: tuple (all_train_tokens, train_data_tokens, val_data_tokens, test_data_tokens)
    @raise ValueError: if punc is neither 'w' nor 'n'
    @raise IndexError: if ngrams is below 1 or larger than the number of loaded orders
    """
    if punc == 'w':
        variants = (all_train_tokens_wp, train_data_tokens_wp,
                    val_data_tokens_wp, test_data_tokens_wp)
    elif punc == 'n':
        variants = (all_train_tokens_np, train_data_tokens_np,
                    val_data_tokens_np, test_data_tokens_np)
    else:
        # Previously fell through to the return and died with UnboundLocalError.
        raise ValueError("punc must be 'w' or 'n', got {!r}".format(punc))

    if ngrams < 1:
        # ngrams - 1 would become a negative index and silently return the
        # data of a different n-gram order.
        raise IndexError("ngrams must be >= 1, got {!r}".format(ngrams))

    position = ngrams - 1
    all_train_tokens, train_data_tokens, val_data_tokens, test_data_tokens = (
        variant[position] for variant in variants
    )
    return all_train_tokens, train_data_tokens, val_data_tokens, test_data_tokens


In [12]:
class BagOfWords(nn.Module):
    """
    BagOfWords classification model: averages the embeddings of the tokens in an
    example and feeds the average through a single linear layer.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        @param num_classes: number of output logits (defaults to 20, the value
            that used to be hard-coded)
        """
        super(BagOfWords, self).__init__()
        # padding_idx=0 keeps the embedding of the padding id at zero, so padded
        # positions do not contribute to the sum in forward().
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, num_classes)
        """
        summed = torch.sum(self.embed(data), dim=1)
        # An empty review has length 0 and an all-zero sum; clamping the divisor
        # yields a zero vector instead of 0/0 = NaN logits.
        divisor = length.clamp(min=1).view(-1, 1).to(summed.dtype)
        averaged = summed / divisor

        # return logits
        return self.linear(averaged)
    

In [13]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


In [14]:
# Hyper-parameter sweep: for every combination of preprocessing variant (pc),
# n-gram order (ng), vocabulary size, embedding size and optimizer, train a
# BagOfWords model and log validation / test accuracy to ./result/<config>.txt.
# The loop variables (pc, ng, max_vocab_size, emb_dim, opt) as well as
# step_numbers / val_acc_epochs intentionally stay at module level: the plotting
# cell further down reads them for the last configuration.
BATCH_SIZE = 32
learning_rate = 0.005
num_epochs = 4

for pc in ['w','n']:
    for ng in [1,2,3,4]:
        # The token lists depend only on (pc, ng), so fetch them once per pair.
        all_train_tokens, train_data_tokens, val_data_tokens, test_data_tokens = token_select(pc, ng)
        for max_vocab_size in [20000,30000]:
            token2id, id2token = build_vocab(all_train_tokens, max_vocab_size)

            train_data_indices = token2index_dataset(train_data_tokens, token2id)
            val_data_indices = token2index_dataset(val_data_tokens, token2id)
            test_data_indices = token2index_dataset(test_data_tokens, token2id)

            train_dataset = MovieReviewDataset(train_data_indices, train_targets)
            train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                       batch_size=BATCH_SIZE,
                                                       collate_fn=moviereview_collate_func,
                                                       shuffle=True)

            # Accuracy does not depend on batch order, so evaluation loaders are not shuffled.
            val_dataset = MovieReviewDataset(val_data_indices, val_targets)
            val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                     batch_size=BATCH_SIZE,
                                                     collate_fn=moviereview_collate_func,
                                                     shuffle=False)

            test_dataset = MovieReviewDataset(test_data_indices, test_targets)
            test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                      batch_size=BATCH_SIZE,
                                                      collate_fn=moviereview_collate_func,
                                                      shuffle=False)
            steps_per_epoch = len(train_loader)

            for emb_dim in [200,300]:
                criterion = torch.nn.CrossEntropyLoss()
                for opt, optimizer_class in (('Adam', torch.optim.Adam), ('SGD', torch.optim.SGD)):
                    file_path = './result/' + pc + '_' + str(ng) + '_' + str(max_vocab_size) + '_' + str(emb_dim) + '_' + opt + '.txt'

                    # A fresh model per optimizer: sharing one model would let the
                    # second optimizer start from weights the first already trained,
                    # making the two result files incomparable.
                    model = BagOfWords(len(id2token), emb_dim)
                    optimizer = optimizer_class(model.parameters(), lr=learning_rate)

                    val_acc_epochs = []
                    step_numbers = []

                    # One handle for the whole run, closed (and flushed) by the
                    # with-block even if training raises.
                    with open(file_path, "w") as log_file:
                        for epoch in range(num_epochs):
                            for i, (data, lengths, labels) in enumerate(train_loader):
                                # test_model() switches to eval mode, so re-enable
                                # training mode on every step.
                                model.train()
                                optimizer.zero_grad()
                                outputs = model(data, lengths)
                                loss = criterion(outputs, labels)
                                loss.backward()
                                optimizer.step()
                                # validate every 100 iterations
                                if i > 0 and i % 100 == 0:
                                    val_acc = test_model(val_loader, model)
                                    val_acc_epochs.append(val_acc)
                                    # Global step index; uses the real number of
                                    # batches per epoch rather than a fixed 600.
                                    step_numbers.append(i + epoch * steps_per_epoch)
                                    print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                                               epoch+1, num_epochs, i+1, steps_per_epoch, val_acc),
                                          file=log_file, flush=True)

                        print ("After training for {} epochs".format(num_epochs), file=log_file)
                        print ("Val Acc {}".format(test_model(val_loader, model)), file=log_file)
                        print ("Test Acc {}".format(test_model(test_loader, model)), file=log_file)

                    #plt.plot(step_numbers, val_acc_epochs)
                    #plt.xlabel("Step")
                    #plt.ylabel("Validation Accuracy")
                    #plt.title("Training curve")
                    #plot_path = './plot/' + pc + '_' + str(ng) + '_' + str(max_vocab_size) + '_' + str(emb_dim) + '_' + opt + '.png'
                    #plt.savefig(plot_path)  
                    #plt.close()

In [None]:
# Disabled code: the whole cell body is one bare string literal, so none of it executes.
# It holds a single-configuration run (token_select('w', 1), 20000-token vocabulary,
# 200-dim embeddings, Adam, 5 epochs) that prints progress to stdout and plots the
# validation-accuracy curve to ./plot/curve_w_1.png.
# NOTE(review): the step axis inside assumes 600 batches per epoch (`epoch * 600`).
'''
max_vocab_size = 20000
emb_dim = 200

BATCH_SIZE = 32
learning_rate = 0.005
num_epochs = 5

all_train_tokens, train_data_tokens, val_data_tokens, test_data_tokens = token_select('w', 1)

token2id, id2token = build_vocab(all_train_tokens, max_vocab_size)
train_data_indices = token2index_dataset(train_data_tokens, token2id)
val_data_indices = token2index_dataset(val_data_tokens, token2id)
test_data_indices = token2index_dataset(test_data_tokens, token2id)


train_dataset = MovieReviewDataset(train_data_indices, train_targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=moviereview_collate_func,
                                           shuffle=True)

val_dataset = MovieReviewDataset(val_data_indices, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=moviereview_collate_func,
                                           shuffle=True)

test_dataset = MovieReviewDataset(test_data_indices, test_targets)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=moviereview_collate_func,
                                           shuffle=False)

model = BagOfWords(len(id2token), emb_dim)
criterion = torch.nn.CrossEntropyLoss()  

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


val_acc_epochs = []
step_numbers = []

for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            val_acc_epochs.append(val_acc)
            step_numbers.append(i + epoch * 600)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))
            
            
            

print ("After training for {} epochs".format(num_epochs))
print ("Val Acc {}".format(test_model(val_loader, model)))
print ("Test Acc {}".format(test_model(test_loader, model)))


plt.plot(step_numbers, val_acc_epochs)
plt.xlabel("Step")
plt.ylabel("Validation Accuracy")
plt.title("Training curve")
plt.savefig("./plot/curve_w_1.png")
plt.show()
'''


In [30]:
# Disabled code kept as a bare string literal (nothing inside executes): a dry run
# that creates one file per configuration under ./result/ and appends a placeholder
# line to each, using the same file-name scheme as the training sweep.
'''
for pc in ['w', 'n']:
    for ng in [1,2,3,4]:
        for max_vocab_size in [10000, 20000, 30000]:
            for emb_dim in [50, 100, 200]:
                for opt in ['SGD', 'Adam']:
                    file_path = './result/' + pc + '_' + str(ng) + '_' + str(max_vocab_size) + '_' + str(emb_dim) + '_' + opt + '.txt'
                    f= open(file_path,"w+")
                    print("Hello stackoverflow!", file=open(file_path, "a"))
'''


In [15]:
# Validation-accuracy curve of the most recent training run. The configuration
# values (pc, ng, max_vocab_size, emb_dim, opt) are whatever the sweep's loop
# variables were left at, so run this cell only after the sweep cell.
plt.plot(step_numbers, val_acc_epochs)
plt.title("Training curve")
plt.xlabel("Step")
plt.ylabel("Validation Accuracy")
run_tag = '_'.join([pc, str(ng), str(max_vocab_size), str(emb_dim), opt])
plot_path = './plot/' + run_tag + '.png'
plt.savefig(plot_path)
#plt.close()