In [1]:
import torch
import numpy as np
import sklearn
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
import string
import nltk
import tqdm
import pickle as pkl

In [2]:
import matplotlib.pyplot as plt

In [5]:
# Project root: later cells chdir here to read the cached "*_tokens_*.p" pickles,
# and the aclImdb/ review folders are addressed underneath this same directory.
# NOTE(review): machine-specific absolute path — must be edited to run anywhere else.
path = '/Users/williamgodel/Google Drive/Grad School/Year Three/NLP/HW1'

In [15]:
# Read the raw IMDB training reviews and build the labelled train/validation split.
# NOTE(review): os.listdir() order is deliberately left untouched (not sorted) —
# the cached token pickles loaded later presumably rely on this exact ordering; confirm
# before changing it.

#loading train negative reviews
os.chdir('/Users/williamgodel/Google Drive/Grad School/Year Three/NLP/HW1/aclImdb/train/neg')

neg_rev = []
for x in os.listdir():
    # Context manager closes each handle right away instead of leaking thousands of them.
    with open(x, 'r') as review_file:
        neg_rev.append(review_file.read())


#loading train positive reviews
os.chdir('/Users/williamgodel/Google Drive/Grad School/Year Three/NLP/HW1/aclImdb/train/pos')

pos_rev = []
for x in os.listdir():
    with open(x, 'r') as review_file:
        pos_rev.append(review_file.read())

# Build a new list so neg_rev is not silently extended with the positive reviews.
all_rev = neg_rev + pos_rev
train_data = pd.DataFrame({"reviews":all_rev})
# Derive the labels from the actual list sizes rather than a hard-coded 12500 boundary,
# so an extra or missing file cannot shift every label after it.
train_data['positive'] = [0] * len(neg_rev) + [1] * len(pos_rev)

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(train_data['reviews'], train_data['positive'], test_size=0.2, random_state=42)

In [27]:
# Read the raw IMDB test reviews, build the labelled test frame, and normalise the
# label indices so they can be addressed positionally.
# NOTE(review): os.listdir() order is deliberately left untouched (not sorted) —
# the cached token pickles loaded later presumably rely on this exact ordering; confirm
# before changing it.

#loading test negative reviews
os.chdir('/Users/williamgodel/Google Drive/Grad School/Year Three/NLP/HW1/aclImdb/test/neg')

neg_rev = []
for x in os.listdir():
    # Context manager closes each handle right away instead of leaking thousands of them.
    with open(x, 'r') as review_file:
        neg_rev.append(review_file.read())


#loading test positive reviews
os.chdir('/Users/williamgodel/Google Drive/Grad School/Year Three/NLP/HW1/aclImdb/test/pos')

pos_rev = []
for x in os.listdir():
    with open(x, 'r') as review_file:
        pos_rev.append(review_file.read())

# Build a new list so neg_rev is not silently extended with the positive reviews.
all_rev = neg_rev + pos_rev
test_data = pd.DataFrame({"reviews":all_rev})
# Derive the labels from the actual list sizes rather than a hard-coded 12500 boundary.
test_data['positive'] = [0] * len(neg_rev) + [1] * len(pos_rev)
X_test, y_test = test_data.iloc[:,0], test_data.iloc[:,1]

# train_test_split leaves shuffled index labels on y_train / y_val; reset them to 0..n-1
# so that integer lookups (y[i]) line up with the position of the i-th example.
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

os.chdir(path)

In [28]:
# Drop the raw review text and intermediate frames to free kernel memory; the cells
# below read the cached token pickles instead and only keep y_train / y_val / y_test.
del pos_rev, neg_rev, all_rev, train_data, test_data
del X_train, X_test, X_val

In [6]:
os.chdir(path)

# Load the cached "_2" token files. Each file is opened in a context manager so the
# handle is closed as soon as it has been unpickled.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced.
with open("train_data_tokens_2.p", "rb") as fh:
    train_data_tokens_2 = pkl.load(fh)
with open("all_train_tokens_2.p", "rb") as fh:
    all_train_tokens_2 = pkl.load(fh)

with open("val_data_tokens_2.p", "rb") as fh:
    val_data_tokens_2 = pkl.load(fh)
with open("test_data_tokens_2.p", "rb") as fh:
    test_data_tokens_2 = pkl.load(fh)

In [8]:
from collections import Counter

# Default number of most-frequent tokens kept in the vocabulary (specials not counted).
max_vocab_size = 10000
# Index 0 is reserved for the padding token and index 1 for the unknown token.
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, vocab_size=None):
    """
    Build the token <-> index mappings from a flat iterable of tokens.

    @param all_tokens: iterable of hashable tokens (every token occurrence of the corpus)
    @param vocab_size: number of most frequent tokens to keep; when None the
        module-level max_vocab_size is used
    @return: (token2id, id2token) where
        token2id is a dict mapping token -> index, with '<pad>' -> PAD_IDX and
            '<unk>' -> UNK_IDX and real tokens numbered from 2 by decreasing frequency;
        id2token is a list such that id2token[i] is the token with index i.
    """
    if vocab_size is None:
        vocab_size = max_vocab_size
    token_counter = Counter(all_tokens)
    # A comprehension (instead of zip(*...)) also works when the corpus is empty.
    vocab = [token for token, _ in token_counter.most_common(vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = {token: index for index, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

# Module-level vocabulary read by token2index_dataset(); later cells rebind both names
# for the other token sets, so re-run this cell before re-indexing the "_2" data.
token2id, id2token = build_vocab(all_train_tokens_2)

In [9]:
def token2index_dataset(tokens_data, vocab=None, unk_idx=None):
    """
    Convert tokenised documents into lists of vocabulary indices.

    @param tokens_data: iterable of documents, each an iterable of tokens
    @param vocab: dict mapping token -> index; when None the module-level token2id
        (as bound at call time) is used
    @param unk_idx: index substituted for tokens missing from vocab; when None the
        module-level UNK_IDX is used
    @return: list with one list of int indices per document, in input order
    """
    # Resolve the defaults lazily so the function follows whatever vocabulary the
    # notebook has most recently built, exactly as the original global lookup did.
    if vocab is None:
        vocab = token2id
    if unk_idx is None:
        unk_idx = UNK_IDX
    return [[vocab.get(token, unk_idx) for token in tokens] for tokens in tokens_data]

# Convert each "_2" split from tokens to vocabulary indices in one pass.
train_data_indices, val_data_indices, test_data_indices = [
    token2index_dataset(split_tokens)
    for split_tokens in (train_data_tokens_2, val_data_tokens_2, test_data_tokens_2)
]

# double checking
for size_message, indexed_split in (
    ("Train dataset size is {}", train_data_indices),
    ("Val dataset size is {}", val_data_indices),
    ("Test dataset size is {}", test_data_indices),
):
    print (size_message.format(len(indexed_split)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


In [10]:
# Maximum number of token indices kept per example; longer examples are truncated.
MAX_SENTENCE_LENGTH = 200

from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Map-style PyTorch dataset pairing index-encoded documents with their labels.

    The class keeps its historical name; in this notebook it is fed the indexed
    review data and the 0/1 sentiment labels.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: sequence of documents, each a list of token indices
        @param target_list: sequence of labels aligned position-by-position with
            data_list (a list, array or pandas Series)
        """
        self.data_list = data_list
        # Materialise the labels as a plain list so that lookup is always positional.
        # Indexing a pandas Series with an integer is label-based, which silently
        # returns the wrong label when the Series still carries a shuffled index.
        self.target_list = list(target_list)
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        @return: [token_idx, length, label] where token_idx is the document truncated
            to MAX_SENTENCE_LENGTH and length is the number of indices kept
        """
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def newsgroup_collate_func(batch, max_length=None, pad_value=0):
    """
    Collate function for DataLoader: right-pads every example to one fixed width so
    the batch can be stacked into a single tensor.

    @param batch: iterable of [token_idx, length, label] items as produced by the
        dataset's __getitem__
    @param max_length: width every row is padded (or truncated) to; when None the
        module-level MAX_SENTENCE_LENGTH is used
    @param pad_value: index written into the padded positions (0 by default)
    @return: [data, lengths, labels] where data is an int64 tensor of shape
        (batch_size, max_length) and lengths / labels are LongTensors of shape
        (batch_size,)
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH

    data_list = []
    label_list = []
    length_list = []
    for tokens, length, label in batch:
        # Guard against rows longer than the target width (np.pad rejects negative pads).
        tokens = tokens[:max_length]
        length = min(length, max_length)
        # Force int64: np.array([]) would otherwise default to float64 and turn the
        # whole batch into a DoubleTensor that nn.Embedding cannot index with.
        padded_vec = np.pad(np.asarray(tokens, dtype=np.int64),
                            pad_width=(0, max_length - length),
                            mode="constant", constant_values=pad_value)
        data_list.append(padded_vec)
        length_list.append(length)
        label_list.append(label)
    return [torch.from_numpy(np.array(data_list, dtype=np.int64)),
            torch.LongTensor(length_list),
            torch.LongTensor(label_list)]


In [31]:
BATCH_SIZE = 32

# Wrap each indexed split together with its labels.
train_dataset = NewsGroupDataset(train_data_indices, y_train)
val_dataset = NewsGroupDataset(val_data_indices, y_val)
test_dataset = NewsGroupDataset(test_data_indices, y_test)

# Batch iterators: train and validation are shuffled, test keeps its original order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=newsgroup_collate_func,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=newsgroup_collate_func,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=newsgroup_collate_func,
    shuffle=False,
)

In [33]:
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model: averages the token embeddings of each example
    and applies a single linear layer to produce class logits.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        @param num_classes: number of output logits. Defaults to 20 to keep the
            original behaviour; NOTE(review): the labels used in this notebook are
            0/1, so 2 would be sufficient.
        """
        super(BagOfWords, self).__init__()
        # padding_idx=0 keeps the embedding of index 0 fixed at zero, so padded
        # positions contribute nothing to the sum in forward().
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """
        @param data: integer matrix of size (batch_size, max_sentence_length). Each row
            holds the token indices of one example, padded to a common length.
        @param length: an int tensor of size (batch_size), the number of non-padding
            positions in each row of data.
        @return: logits of size (batch_size, num_classes)
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # Clamp to at least 1 so an empty example yields a zero vector rather than
        # 0/0 = NaN, which would poison the loss for the whole batch.
        denom = length.clamp(min=1).view(-1, 1).to(out.dtype)
        out = out / denom

        # return logits
        return self.linear(out)



In [35]:
# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

def Model_train(emb_dim, model, learning_rate, \
                num_epochs, criterion, optimizer, \
                train_loader, validation_loader=None):
    """
    Train a model and periodically report validation accuracy.

    @param emb_dim: not used by the training loop; kept for call compatibility
    @param model: module called as model(data, lengths)
    @param learning_rate: not used by the training loop (the optimizer already
        carries it); kept for call compatibility
    @param num_epochs: number of passes over train_loader
    @param criterion: loss callable taking (outputs, labels)
    @param optimizer: optimizer whose zero_grad()/step() drive the updates
    @param train_loader: iterable yielding (data, lengths, labels) batches
    @param validation_loader: loader used for the periodic validation; when None the
        module-level val_loader is used
    @return: (loss_vals, acc_est) where loss_vals holds one Python float per training
        step (the criterion value divided by the batch size) and acc_est holds the
        validation accuracy recorded at each checkpoint
    """
    loss_vals = []
    acc_est = []
    for epoch in range(num_epochs):
        for i, (data, lengths, labels) in enumerate(train_loader):
            # test_model() leaves the model in eval mode, so re-enter train mode
            # at the start of every step.
            model.train()
            optimizer.zero_grad()
            outputs = model(data, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Store a plain float: appending the tensor itself would keep a reference
            # to autograd history for every single training step.
            loss_vals.append(loss.item() / labels.size(0))

            # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                # Resolved lazily so the module-level loader is only required when
                # no explicit validation loader was supplied.
                eval_loader = validation_loader if validation_loader is not None else val_loader
                val_acc = test_model(eval_loader, model)
                acc_est.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc))

    return loss_vals, acc_est

In [48]:

# Sweep settings: every embedding size is trained from scratch with the same
# number of epochs and the same Adam learning rate.
num_epochs = 5# number epoch to train
learning_rates = .001
embedd_dim = [100,200,500,1000]

# One entry per embedding size, in the order of embedd_dim.
loss_performance = []
acc_performance = []

for emb_dim in embedd_dim:
    # Fresh model, loss and optimizer for each embedding size.
    model = BagOfWords(len(id2token), emb_dim)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rates)

    loss_vals, acc_est = Model_train(
        emb_dim=emb_dim,
        model=model,
        learning_rate=learning_rates,
        num_epochs=num_epochs,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
    )

    # Keep the per-step losses and the validation accuracy after the last epoch.
    loss_performance.append(loss_vals)
    final_val_acc = test_model(val_loader, model)
    acc_performance.append(final_val_acc)

Epoch: [1/5], Step: [101/625], Validation Acc: 53.96
Epoch: [1/5], Step: [201/625], Validation Acc: 54.02
Epoch: [1/5], Step: [301/625], Validation Acc: 62.76
Epoch: [1/5], Step: [401/625], Validation Acc: 66.62
Epoch: [1/5], Step: [501/625], Validation Acc: 65.96
Epoch: [1/5], Step: [601/625], Validation Acc: 69.68
Epoch: [2/5], Step: [101/625], Validation Acc: 74.4
Epoch: [2/5], Step: [201/625], Validation Acc: 76.08
Epoch: [2/5], Step: [301/625], Validation Acc: 77.24
Epoch: [2/5], Step: [401/625], Validation Acc: 79.26
Epoch: [2/5], Step: [501/625], Validation Acc: 79.88
Epoch: [2/5], Step: [601/625], Validation Acc: 79.62
Epoch: [3/5], Step: [101/625], Validation Acc: 81.2
Epoch: [3/5], Step: [201/625], Validation Acc: 82.06
Epoch: [3/5], Step: [301/625], Validation Acc: 82.58
Epoch: [3/5], Step: [401/625], Validation Acc: 83.14
Epoch: [3/5], Step: [501/625], Validation Acc: 83.14
Epoch: [3/5], Step: [601/625], Validation Acc: 83.5
Epoch: [4/5], Step: [101/625], Validation Acc: 83

In [49]:
# Validation accuracy (%) after the final epoch, one value per entry of embedd_dim.
acc_performance

[85.3, 86.06, 86.22, 86.12]

In [39]:
os.chdir(path)

# Load the cached "_3" token files. Each file is opened in a context manager so the
# handle is closed as soon as it has been unpickled.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced.
with open("train_data_tokens_3.p", "rb") as fh:
    train_data_tokens_3 = pkl.load(fh)
with open("all_train_tokens_3.p", "rb") as fh:
    all_train_tokens_3 = pkl.load(fh)

with open("val_data_tokens_3.p", "rb") as fh:
    val_data_tokens_3 = pkl.load(fh)
with open("test_data_tokens_3.p", "rb") as fh:
    test_data_tokens_3 = pkl.load(fh)

# Rebinds the module-level vocabulary that token2index_dataset() reads, so the
# three conversions below use the "_3" vocabulary.
token2id, id2token = build_vocab(all_train_tokens_3)

train_data_indices = token2index_dataset(train_data_tokens_3)
val_data_indices = token2index_dataset(val_data_tokens_3)
test_data_indices = token2index_dataset(test_data_tokens_3)

BATCH_SIZE = 32
train_dataset = NewsGroupDataset(train_data_indices, y_train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

# val_loader is also the loader Model_train() validates against by default.
val_dataset = NewsGroupDataset(val_data_indices, y_val)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset = NewsGroupDataset(test_data_indices, y_test)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

num_epochs = 5# number epoch to train
learning_rates = .001
embedd_dim = [100,200,500,1000]

# One entry per embedding size, in the order of embedd_dim.
loss_performance1 = []
acc_performance1 = []

for emb_dim in embedd_dim:
    # Fresh model, loss and optimizer for each embedding size.
    model = BagOfWords(len(id2token), emb_dim)

    criterion = torch.nn.CrossEntropyLoss()  
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rates)

    loss_vals, acc_est = Model_train(emb_dim = emb_dim, model = model, learning_rate = learning_rates, \
                                     num_epochs = num_epochs, criterion = criterion, optimizer = optimizer, \
                                     train_loader = train_loader)

    loss_performance1.append(loss_vals)
    acc_performance1.append(test_model(val_loader, model))

Epoch: [1/5], Step: [101/625], Validation Acc: 51.42
Epoch: [1/5], Step: [201/625], Validation Acc: 58.44
Epoch: [1/5], Step: [301/625], Validation Acc: 57.76
Epoch: [1/5], Step: [401/625], Validation Acc: 63.82
Epoch: [1/5], Step: [501/625], Validation Acc: 67.5
Epoch: [1/5], Step: [601/625], Validation Acc: 69.7
Epoch: [2/5], Step: [101/625], Validation Acc: 73.08
Epoch: [2/5], Step: [201/625], Validation Acc: 74.34
Epoch: [2/5], Step: [301/625], Validation Acc: 77.04
Epoch: [2/5], Step: [401/625], Validation Acc: 78.4
Epoch: [2/5], Step: [501/625], Validation Acc: 79.8
Epoch: [2/5], Step: [601/625], Validation Acc: 80.72
Epoch: [3/5], Step: [101/625], Validation Acc: 81.78
Epoch: [3/5], Step: [201/625], Validation Acc: 82.44
Epoch: [3/5], Step: [301/625], Validation Acc: 82.7
Epoch: [3/5], Step: [401/625], Validation Acc: 83.2
Epoch: [3/5], Step: [501/625], Validation Acc: 82.66
Epoch: [3/5], Step: [601/625], Validation Acc: 83.54
Epoch: [4/5], Step: [101/625], Validation Acc: 83.22

In [41]:
os.chdir(path)

# Load the cached "_4" token files. Each file is opened in a context manager so the
# handle is closed as soon as it has been unpickled.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced.
with open("train_data_tokens_4.p", "rb") as fh:
    train_data_tokens_4 = pkl.load(fh)
with open("all_train_tokens_4.p", "rb") as fh:
    all_train_tokens_4 = pkl.load(fh)

with open("val_data_tokens_4.p", "rb") as fh:
    val_data_tokens_4 = pkl.load(fh)
with open("test_data_tokens_4.p", "rb") as fh:
    test_data_tokens_4 = pkl.load(fh)

# Rebinds the module-level vocabulary that token2index_dataset() reads, so the
# three conversions below use the "_4" vocabulary.
token2id, id2token = build_vocab(all_train_tokens_4)

train_data_indices = token2index_dataset(train_data_tokens_4)
val_data_indices = token2index_dataset(val_data_tokens_4)
test_data_indices = token2index_dataset(test_data_tokens_4)

BATCH_SIZE = 32
train_dataset = NewsGroupDataset(train_data_indices, y_train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

# val_loader is also the loader Model_train() validates against by default.
val_dataset = NewsGroupDataset(val_data_indices, y_val)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset = NewsGroupDataset(test_data_indices, y_test)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

num_epochs = 5# number epoch to train
learning_rates = .001
embedd_dim = [100,200,500,1000]

# One entry per embedding size, in the order of embedd_dim.
loss_performance2 = []
acc_performance2 = []

for emb_dim in embedd_dim:
    # Fresh model, loss and optimizer for each embedding size.
    model = BagOfWords(len(id2token), emb_dim)

    criterion = torch.nn.CrossEntropyLoss()  
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rates)

    loss_vals, acc_est = Model_train(emb_dim = emb_dim, model = model, learning_rate = learning_rates, \
                                     num_epochs = num_epochs, criterion = criterion, optimizer = optimizer, \
                                     train_loader = train_loader)

    loss_performance2.append(loss_vals)
    acc_performance2.append(test_model(val_loader, model))

Epoch: [1/5], Step: [101/625], Validation Acc: 49.84
Epoch: [1/5], Step: [201/625], Validation Acc: 59.1
Epoch: [1/5], Step: [301/625], Validation Acc: 61.48
Epoch: [1/5], Step: [401/625], Validation Acc: 63.98
Epoch: [1/5], Step: [501/625], Validation Acc: 67.3
Epoch: [1/5], Step: [601/625], Validation Acc: 71.14
Epoch: [2/5], Step: [101/625], Validation Acc: 71.52
Epoch: [2/5], Step: [201/625], Validation Acc: 74.52
Epoch: [2/5], Step: [301/625], Validation Acc: 76.18
Epoch: [2/5], Step: [401/625], Validation Acc: 78.02
Epoch: [2/5], Step: [501/625], Validation Acc: 79.24
Epoch: [2/5], Step: [601/625], Validation Acc: 80.44
Epoch: [3/5], Step: [101/625], Validation Acc: 81.02
Epoch: [3/5], Step: [201/625], Validation Acc: 81.86
Epoch: [3/5], Step: [301/625], Validation Acc: 82.18
Epoch: [3/5], Step: [401/625], Validation Acc: 82.78
Epoch: [3/5], Step: [501/625], Validation Acc: 83.12
Epoch: [3/5], Step: [601/625], Validation Acc: 83.28
Epoch: [4/5], Step: [101/625], Validation Acc: 8

In [51]:
# Stack the final validation accuracies into one table: one row per experiment
# (acc_performance, acc_performance1, acc_performance2), one column per embedding size.
perf_table = np.vstack((acc_performance, acc_performance1, acc_performance2))

In [52]:
# Emit the accuracy table as LaTeX source (print is intentional: the raw markup is
# what gets copied out). Rows and columns carry default 0..n integer labels.
print(pd.DataFrame(perf_table).to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &      0 &      1 &      2 &      3 \\
\midrule
0 &  85.30 &  86.06 &  86.22 &  86.12 \\
1 &  85.94 &  86.22 &  86.56 &  86.02 \\
2 &  85.32 &  85.40 &  85.96 &  85.98 \\
\bottomrule
\end{tabular}

