In [1]:
import os
import spacy
import string
import torch
import copy
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pickle as pkl
from nltk import ngrams
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
# Root folders of the train and test splits; readfiles() expects each of them
# to contain a 'pos' and a 'neg' sub-directory.
# NOTE(review): these are absolute, machine-specific paths -- the notebook will
# not run elsewhere without editing them.
train_path = '/Users/yi/Documents/DS1011/hw1/data/aclImdb/train'
test_path = '/Users/yi/Documents/DS1011/hw1/data/aclImdb/test'

def readfiles(path):
    """
    Read every file under ``path/pos`` and ``path/neg`` into memory.

    @param path: directory that contains a 'pos' and a 'neg' sub-directory
    @return: two-element list ``[dataset, targets]`` where ``dataset`` is a list
        with the full text of each file and ``targets`` is the parallel list of
        labels (1 for files under 'pos', 0 for files under 'neg'). All 'pos'
        files come first, followed by all 'neg' files.
    @raise OSError: if a sub-directory or file cannot be listed or opened
    """
    dataset = []
    targets = []
    for sign, target in (('pos', 1), ('neg', 0)):
        folder = os.path.join(path, sign)
        # The listing order is intentionally left exactly as os.listdir returns
        # it, so the output order matches what earlier runs of this function
        # produced on the same machine.
        for file in os.listdir(folder):
            # The context manager closes each handle right away; the dataset
            # has many files and leaving them open leaks descriptors.
            with open(os.path.join(folder, file)) as f:
                dataset.append(f.read())
            targets.append(target)
    return [dataset, targets]


In [3]:
# Load the raw training reviews and labels, then hold out 20% of them as a
# validation set (random_state is fixed so the split is repeatable for a given
# input order). The test split is loaded as-is.
train_x,train_y = readfiles(train_path)
train_x,val_x,train_y,val_y = train_test_split(train_x, train_y, test_size=0.2,random_state = 1)
test_x,test_y = readfiles(test_path)

In [4]:
# spaCy English pipeline; tokenize() calls it on each raw review string.
tokenizer = spacy.load('en_core_web_sm')
# A single string holding the ASCII punctuation characters (not a set/list),
# so `x in punctuations` is a substring test.
punctuations = string.punctuation
# to delete common words
def isCommon(ngram):
    """
    Tell whether ``ngram`` is one of a fixed list of very common English words.

    The comparison is exact (case-sensitive, no stripping), so only values equal
    to one of the lowercase entries below match.

    @param ngram: value to look up
    @return: True if ``ngram`` equals one of the listed words, otherwise False
    """
    stop_words = (
        "the", "be", "and", "of", "a", "in", "to", "have", "it", "i",
        "that", "for", "you", "he", "with", "on", "do", "say", "this", "they",
        "is", "an", "at", "but", "we", "his", "from", "that", "by", "she",
        "or", "as", "what", "go", "their", "can", "who", "get", "if", "would",
        "her", "all", "my", "make", "about", "know", "will", "as", "up", "one",
        "time", "has", "been", "there", "year", "so", "think", "when", "which", "them",
        "some", "me", "people", "take", "out", "into", "just", "see", "him", "your",
        "come", "could", "now", "than", "like", "other", "how", "then", "its", "our",
        "two", "more", "these", "want", "way", "look", "first", "also", "new", "because",
        "day", "more", "use", "man", "find", "here", "thing", "give", "many", "well",
    )
    return ngram in stop_words

# lowercase and remove punctuation 
def tokenize(sent):
    """
    Split ``sent`` into lowercase token strings with the module-level spaCy
    ``tokenizer``, skipping tokens that are found in ``punctuations``.

    ``punctuations`` is a plain string, so the membership test is a substring
    test: a token is skipped only when its text occurs inside that string.

    @param sent: raw text to tokenize
    @return: list of lowercased token texts, in document order
    """
    kept = []
    for tok in tokenizer(sent):
        text = tok.text
        if text in punctuations:
            continue
        kept.append(text.lower())
    return kept

In [5]:
def tokenize_ngram_common(dataset,n):
    """
    Tokenize every sample and turn it into a sequence of n-grams.

    @param dataset: iterable of raw text samples
    @param n: n-gram order; with n == 1 the plain token strings are used,
        otherwise each sample becomes the list of items produced by
        ``ngrams(tokens, n)``
    @return: ``(token_dataset, all_tokens)`` -- the per-sample n-gram lists and
        one flat list with the n-grams of all samples concatenated in order
    """
    per_sample = []
    flat = []
    for sample in dataset:
        words = tokenize(sample)
        grams = words if n == 1 else list(ngrams(words, n))
        per_sample.append(grams)
        flat.extend(grams)
    return per_sample, flat

In [6]:
# Tokenize the training reviews as unigrams (n=1); the second value is the flat
# list of all training tokens.
# NOTE(review): later cells read token lists from pickle files rather than from
# these variables -- confirm which source is intended.
train_data_tokens_common, all_train_tokens_common = tokenize_ngram_common(train_x,1)

In [7]:
# Tokenize the validation and test reviews as unigrams; the flat token list is
# discarded for both.
val_data_tokens_common, _ = tokenize_ngram_common(val_x,1)
test_data_tokens_common, _ = tokenize_ngram_common(test_x,1)

In [13]:
def build_vocab(all_tokens,max_vocab_size = 10000, PAD_IDX = 0,UNK_IDX = 1):
    """
    Build the token <-> index mappings from the most frequent tokens.

    @param all_tokens: iterable of hashable tokens (may be empty)
    @param max_vocab_size: maximum number of distinct corpus tokens to keep,
        chosen by descending frequency
    @param PAD_IDX: index stored in token2id for '<pad>'
    @param UNK_IDX: index stored in token2id for '<unk>'
    @return: ``(token2id, id2token)``
        id2token: list where id2token[i] is the token with index i; positions
            0 and 1 hold '<pad>' and '<unk>', corpus tokens start at 2
        token2id: dict mapping each token to its index

    Note: id2token always places '<pad>' at 0 and '<unk>' at 1, so the two
    mappings only agree when PAD_IDX and UNK_IDX keep their default values.
    """
    token_counter = Counter(all_tokens)
    # A comprehension instead of zip(*...) so that an empty token list yields
    # an empty vocabulary rather than a ValueError on unpacking.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    token2id = dict(zip(vocab, range(2,2+len(vocab))))
    id2token = ['<pad>', '<unk>'] + vocab
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token
def token2index_dataset(tokens_data, vocab=None, unk_idx=1):
    """
    Convert every sample from a list of tokens to a list of vocabulary indices.

    @param tokens_data: iterable of samples, each an iterable of tokens
    @param vocab: token -> index dict; when None the module-level ``token2id``
        is used (the original behaviour)
    @param unk_idx: index substituted for tokens missing from the vocabulary
    @return: list with one index list per sample, same order and same lengths as
        the input (an empty sample yields an empty list)
    """
    mapping = token2id if vocab is None else vocab
    # A fresh list is built for every sample. Previously the list was re-created
    # for every token, so each sample ended up with only its last token.
    return [[mapping.get(token, unk_idx) for token in tokens]
            for tokens in tokens_data]

In [15]:
# Number of token indices kept per sample: NewsGroupDataset.__getitem__ truncates
# to this length and newsgroup_collate_func pads up to it.
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Map-style dataset pairing each tokenized sample with its label.

    Subclasses torch.utils.data.Dataset so it can be handed to a DataLoader.
    Samples are truncated to the module-level MAX_SENTENCE_LENGTH on access.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: sequence of samples, each a sequence of token indices
        @param target_list: sequence of labels, parallel to data_list
        """
        self.data_list, self.target_list = data_list, target_list
        assert len(self.data_list) == len(self.target_list)

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Invoked by ``dataset[key]``.

        @return: list ``[token_idx, length, label]`` where token_idx is the
            sample cut to at most MAX_SENTENCE_LENGTH items and length is the
            size after cutting
        """
        sample = self.data_list[key]
        clipped = sample[:MAX_SENTENCE_LENGTH]
        return [clipped, len(clipped), self.target_list[key]]

def newsgroup_collate_func(batch, max_length=None):
    """
    Collate function for DataLoader: right-pads every sample with zeros to a
    fixed length so the batch can be stacked into one tensor.

    @param batch: list of ``[token_idx, length, label]`` items as produced by
        NewsGroupDataset.__getitem__
    @param max_length: padded length of every row; when None the module-level
        MAX_SENTENCE_LENGTH is used (the original behaviour)
    @return: list ``[data, lengths, labels]`` -- an int64 tensor of shape
        (len(batch), max_length), and two LongTensors of shape (len(batch),)
    @raise ValueError: if a sample's length exceeds max_length (negative pad)
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH
    data_list = []
    label_list = []
    length_list = []
    for datum in batch:
        tokens, length, label = datum[0], datum[1], datum[2]
        # dtype is forced to int64: an empty sample would otherwise become a
        # float64 array and turn the whole stacked batch into floats.
        padded_vec = np.pad(np.asarray(tokens, dtype=np.int64),
                            pad_width=(0, max_length - length),
                            mode="constant", constant_values=0)
        data_list.append(padded_vec)
        length_list.append(length)
        label_list.append(label)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [10]:
class Ngrams(nn.Module):
    """
    Bag-of-n-grams classifier with two output classes: the embeddings of a
    sample's tokens are averaged and passed through one linear layer.
    """

    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        """
        super(Ngrams, self).__init__()
        # padding_idx=0 keeps the embedding of index 0 (the pad index) at zero,
        # so padded positions add nothing to the sum in forward().
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,2)

    def forward(self, data, length):
        """
        @param data: integer tensor of size (batch_size, max_sentence_length);
            each row holds the n-gram indices of one sample, padded with 0.
        @param length: int tensor of size (batch_size) with the number of
            non-padding positions of each row.
        @return: logits of size (batch_size, 2)
        """
        summed = torch.sum(self.embed(data), dim=1)
        # Clamp to 1 so a zero-length sample divides an all-zero sum by 1
        # instead of by 0 (which would produce NaN logits).
        denom = length.clamp(min=1).view(length.size(0), 1).to(summed.dtype)
        averaged = summed / denom

        # return logits
        return self.linear(averaged)

In [11]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]       
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [16]:
# Load cached token lists from disk instead of re-tokenizing.
# NOTE(review): presumably these were produced from the same train/val/test
# split that yields train_y / val_y / test_y -- verify the sample order matches,
# otherwise labels and samples are misaligned.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with open("train_data_tokens_1.p", "rb") as f:
    train_data_tokens = pkl.load(f)
with open("all_train_tokens_1.p", "rb") as f:
    all_train_tokens = pkl.load(f)

with open("val_data_tokens_1.p", "rb") as f:
    val_data_tokens = pkl.load(f)
with open("test_data_tokens_1.p", "rb") as f:
    test_data_tokens = pkl.load(f)




Epoch: [1/5], Step: [101/625], Validation Acc: 56.38
Epoch: [1/5], Step: [201/625], Validation Acc: 56.22
Epoch: [1/5], Step: [301/625], Validation Acc: 56.72
Epoch: [1/5], Step: [401/625], Validation Acc: 58.24
Epoch: [1/5], Step: [501/625], Validation Acc: 58.06
Epoch: [1/5], Step: [601/625], Validation Acc: 58.46
Epoch: [2/5], Step: [101/625], Validation Acc: 58.82
Epoch: [2/5], Step: [201/625], Validation Acc: 58.68
Epoch: [2/5], Step: [301/625], Validation Acc: 59.54
Epoch: [2/5], Step: [401/625], Validation Acc: 60.34
Epoch: [2/5], Step: [501/625], Validation Acc: 60.92
Epoch: [2/5], Step: [601/625], Validation Acc: 58.16
Epoch: [3/5], Step: [101/625], Validation Acc: 59.92
Epoch: [3/5], Step: [201/625], Validation Acc: 59.28
Epoch: [3/5], Step: [301/625], Validation Acc: 60.08
Epoch: [3/5], Step: [401/625], Validation Acc: 60.98
Epoch: [3/5], Step: [501/625], Validation Acc: 60.58
Epoch: [3/5], Step: [601/625], Validation Acc: 59.98
Epoch: [4/5], Step: [101/625], Validation Acc:

In [19]:
# --- Vocabulary and index conversion -------------------------------------
max_vocab_size = 10000
# token2id is read as a module-level name by token2index_dataset, so it must be
# assigned before the three conversions below.
token2id, id2token = build_vocab(all_train_tokens,max_vocab_size = max_vocab_size )
train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)
# --- Data loaders ----------------------------------------------------------
BATCH_SIZE = 32
# Re-assigns the module-level constant read by NewsGroupDataset.__getitem__ and
# newsgroup_collate_func (same value as its earlier definition).
MAX_SENTENCE_LENGTH = 200
train_dataset = NewsGroupDataset(train_data_indices, train_y)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

# NOTE(review): the validation loader is shuffled too; accuracy does not depend
# on batch order, so shuffle=True here only costs time -- confirm it is intended.
val_dataset = NewsGroupDataset(val_data_indices, val_y)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset = NewsGroupDataset(test_data_indices, test_y)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

# --- Model and hyper-parameters --------------------------------------------
emb_dim = 400
# len(id2token) counts the two special tokens plus the kept corpus tokens.
model = Ngrams(len(id2token), emb_dim)
learning_rate = 0.005
num_epochs = 5 # number of passes over the training set

# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# --- Training loop -----------------------------------------------------------
# One entry is appended to each list at every validation point.
val_acclist = []
# NOTE: despite the name, this stores the training *error rate* in percent
# (100 - training accuracy), not the value of `criterion`.
train_losslist = []
for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        # test_model() switches the model to eval mode, so training mode is
        # re-enabled at the start of every step.
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations (step 0 is skipped)
        if i > 0 and i % 100 == 0:
            # accuracy on the validation set
            val_acc = test_model(val_loader, model)
            val_acclist.append(val_acc)
            # This evaluates the model on the whole training set each time,
            # which is a full extra pass over train_loader per validation point.
            train_losslist.append((100.0 - test_model(train_loader, model)))
# Last expression of the cell: displays both histories as a tuple.
val_acclist,train_losslist

([84.5,
  85.86,
  88.74000000000001,
  87.4,
  88.03999999999999,
  88.25999999999999,
  88.52000000000001,
  88.6,
  87.92,
  87.7,
  88.53999999999999,
  88.9,
  88.22,
  88.46000000000001,
  88.08,
  88.86,
  89.06,
  88.34,
  88.6,
  88.68,
  88.82,
  89.52000000000001,
  88.4,
  88.4,
  89.34,
  87.86,
  88.88,
  88.42,
  86.82,
  87.64],
 [8.424,
  8.067,
  7.707,
  7.827,
  7.694,
  7.3260000000000005,
  7.013999999999999,
  6.994999999999999,
  6.827000000000002,
  6.811000000000002,
  6.7940000000000005,
  6.541,
  6.66,
  6.6530000000000005,
  6.5840000000000005,
  6.505000000000002,
  6.316999999999999,
  6.594999999999999,
  6.187000000000001,
  6.297999999999999,
  6.404,
  6.313,
  6.395,
  6.388,
  6.263999999999999,
  6.3610000000000015,
  6.2620000000000005,
  6.515000000000001,
  6.6530000000000005,
  6.511000000000002])