# Data Processing

In [41]:
#Load Documents 
import glob
import numpy as np
import spacy
import string
from collections import Counter
from progressbar import ProgressBar
from nltk.corpus import stopwords
import time
from tqdm import tqdm
import dill


#Create File Index for Train and Val Data
def create_index(path,num_val_samples):
    pos = glob.glob(path+ "pos/*")
    neg = glob.glob(path+ "neg/*")
    pos = np.column_stack((pos,(np.ones(len(pos),dtype=int))))  # Label 1
    neg = np.column_stack((neg,(np.zeros(len(neg),dtype=int))))  # Label 0
    data= np.vstack((pos,neg))
    
    #Create Files Names For Val and Train Set
    np.random.shuffle(data)
    val, test = data[:num_val_samples,:], data[num_val_samples:,:]
    
    #Save files
    np.savetxt("val_path.txt",val[:,0], delimiter=",", fmt="%s") 
    np.savetxt("val_target.txt",val[:,1], delimiter=",", fmt= "%s")
    np.savetxt("test_path.txt",test[:,0], delimiter=",", fmt = "%s") 
    np.savetxt("test_target.txt",test[:,1], delimiter=",", fmt= "%s") 


def read_data(files):
    lines= []
    for f in files:
        with open(f) as file:
            for line in file: 
                line = line.strip().lower().split()
                lines.append(line) #storing everything in memory!
                
    return np.array(lines)
        

def load_data(path):
    if isinstance(path, list):
        data = read_data(path)
        
    else:
        pos = read_data(glob.glob(path+ "pos/*"))
        neg = read_data(glob.glob(path+ "neg/*"))
            
        #Create Labels for Pos and Negative Reviews
        pos = np.column_stack((pos,(np.ones(len(pos),dtype=int))))  # Label 1
        neg = np.column_stack((neg,(np.zeros(len(neg),dtype=int))))  # Label 0
        data= np.vstack((pos,neg))
        np.random.shuffle(data)
    return data
    
    
#Training Data
train_data = load_data("/Users/Taurean/Documents/NLP-HW1/data/train/")
x_train, y_train = train_data[:,0], train_data[:,1]

#np.savetxt("train_target.txt",y_train, delimiter=",", fmt= "%s")

#Validation Data
x_val = load_data(list(np.genfromtxt("/Users/Taurean/Documents/NLP-HW1/val_path.txt",dtype='str')))
y_val = np.genfromtxt("/Users/Taurean/Documents/NLP-HW1/target/val_target.txt",dtype='int')

#Test Data
x_test = load_data(list(np.genfromtxt("/Users/Taurean/Documents/NLP-HW1/test_path.txt",dtype='str')))
y_test = np.genfromtxt("/Users/Taurean/Documents/NLP-HW1/target/test_target.txt",dtype='int')



In [22]:
#Print Total Samples in the data
print("The number of samples in x_train is {:,d}".format(len(x_train)))
print("The number of samples in x_val is {:,d}".format(len(x_val)))
print("The number of samples in x_test is {:,d}".format(len(x_test)))

The number of samples in x_train is 25,000
The number of samples in x_val is 5,000
The number of samples in x_test is 20,000


In [3]:
#Tokenization Schemes
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

#Function to Process Review 
def clean_text(review,punctuation= False,stop_words= False, stem = False):
    review= tokenizer(review)
    if punctuation is True:
        review = [token for token in review if (token.text not in punctuations)]
        
    if stop_words is True:
        review = [token for token in review if (token.text not in stopwords)]
        
    return str(review)
    

#Clean Training Data   
x_train_clean = []
for x in tqdm(range(len(x_train))):
    x_bow_rep =clean_text(str(x_train[x]),stop_words= True,punctuation= True, stem = False)
    x_train_clean.append(x_bow_rep)
    
with open("train_tokens_clean", "wb") as dill_file:
    dill.dump(x_train_clean, dill_file)        
    

#Clean Val Data   
x_val_clean = []
for x in tqdm(range(len(x_val))):
    x_bow_rep =clean_text(str(x_val[x]),stop_words= True,punctuation= True, stem = False)
    x_val_clean.append(x_bow_rep)
    
with open("val_tokens_clean_test", "wb") as dill_file:
    dill.dump(x_val_clean, dill_file) 
    
    
    
#Clean Text Data
x_test_clean = []
for x in tqdm(range(len(x_test))):
    x_bow_rep =clean_text(str(x_test[x]),stop_words= True,punctuation= True, stem = False)
    x_test_clean.append(x_bow_rep)

with open("test_tokens_clean", "wb") as dill_file:
    dill.dump(x_test_clean, dill_file)        
    
    
    
    
    
    

In [19]:
with open("/Users/Taurean/Documents/NLP-HW1/tokens/test_tokens", "rb") as dill_file:
    y=dill.load(dill_file)
    
y[0]

['better',
 'than',
 'the',
 'typical',
 'made-for-tv',
 'movie,',
 'invitation',
 'to',
 'hell',
 'is',
 'blessed',
 'with',
 'excellent',
 'casting',
 '(urich,',
 'lucci,',
 'cassidy,',
 'mccarthy,',
 'pre-murphy',
 'brown',
 'joe',
 'regalbuto,',
 'soleil',
 'moon-frye)',
 'and',
 'a',
 'high',
 'concept',
 'update',
 'to',
 'the',
 'familiar',
 'faustian',
 'plot.',
 'urich',
 'is',
 'likable',
 'as',
 'always',
 'and',
 'lucci',
 'is',
 'particularly',
 'fetching',
 'and',
 'devilishly',
 'over',
 'the',
 'top',
 'in',
 'the',
 'mother',
 'of',
 'all',
 'femme',
 'fatale',
 'roles.',
 'kind',
 'of',
 'a',
 'hybrid',
 'version',
 'of',
 'stepford',
 'wives',
 'and',
 'they',
 'live,',
 'the',
 'movie',
 'commits',
 'early',
 'to',
 'its',
 'apocalyptic',
 'miltonesque',
 'vision',
 'and',
 'horror',
 'fans',
 'will',
 'likely',
 'not',
 'have',
 'many',
 'complaints',
 'until',
 'the',
 'soppy,',
 'maudlin',
 'denoument.',
 '7/10']

In [51]:
#Maps Word to Id Number
def data_dictionary(reviews,vocab_size_limit):
    token_counter = Counter()
    PAD_IDX = 0
    UNK_IDX = 1
    for review in reviews:
        for words in set(review):
            token_counter[words] += 1
            

    vocab, count = zip(*token_counter.most_common(vocab_size_limit))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

#Convert Review tokens to Id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else 0 for token in tokens]
        indices_data.append(index_list)
    return indices_data



x= data_dictionary(all_tags,2000)
token2id= x[0]
id2token = x[1]
train_data_indices = token2index_dataset(x_train)
val_data_indices = token2index_dataset(x_val)
test_data_indices = token2index_dataset(x_test)

In [50]:
#np.stack((x_train,x_val,x_test))
all_tags=np.concatenate((x_train, x_val, x_test), axis=None)
len(all_tags)

50000

In [52]:
train_data_indices

[[8,
  24,
  16,
  100,
  32,
  2,
  0,
  2,
  52,
  1586,
  5,
  2,
  24,
  16,
  11,
  0,
  401,
  6,
  67,
  3,
  317,
  758,
  0,
  2,
  118,
  16,
  0,
  2,
  0,
  66,
  1180,
  44,
  51,
  0,
  33,
  47,
  0,
  171,
  0,
  209,
  11,
  0,
  12,
  452,
  10,
  16,
  9,
  2,
  0,
  0,
  14,
  47,
  2,
  163,
  5,
  2,
  24,
  230,
  4,
  0,
  4,
  2,
  72,
  277,
  23,
  9,
  2,
  0,
  27,
  58,
  0,
  34,
  0,
  199,
  21,
  859,
  9,
  6,
  67,
  730,
  2,
  184,
  11,
  61,
  171,
  85,
  16,
  47,
  2,
  378,
  296,
  4,
  0,
  66,
  9,
  2,
  0,
  4,
  0,
  465,
  2,
  0,
  0,
  4,
  0,
  730,
  25,
  10,
  16,
  656,
  118,
  4,
  107,
  146,
  489,
  27,
  37,
  429,
  197,
  4,
  0,
  2,
  0,
  96,
  47,
  0,
  7,
  337,
  0,
  31,
  0,
  40,
  27,
  21,
  2,
  0,
  146,
  20,
  37,
  429,
  197,
  4,
  0,
  0,
  34,
  0,
  8,
  24,
  0],
 [12,
  0,
  278,
  141,
  32,
  30,
  124,
  4,
  12,
  21,
  109,
  2,
  53,
  4,
  2,
  51,
  100,
  129,
  1628,
  0,
  7,
  3,
  457

In [None]:
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))

list_ = []
data=list(x_train)
for x in range(len(data)):
    line =find_ngrams(data[x],3)
    list_.append(line)


# Hyperparameter Tuning

In [53]:
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of IMDB movie review tokens 
        @param target_list: list of IMDB movie review targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def IMBD_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

# create pytorch dataloader
#train_loader = NewsGroupDataset(train_data_indices, train_targets)
#val_loader = NewsGroupDataset(val_data_indices, val_targets)
#test_loader = NewsGroupDataset(test_data_indices, test_targets)

BATCH_SIZE = 32
train_dataset = IMDBDataset(train_data_indices, y_train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=IMBD_collate_func,
                                           shuffle=True)

val_dataset = IMDBDataset(val_data_indices, y_val)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                         batch_size=BATCH_SIZE,
                                         collate_fn=IMBD_collate_func,
                                         shuffle=True)

test_dataset = IMDBDataset(test_data_indices, y_test)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=BATCH_SIZE,
                                          collate_fn=IMBD_collate_func,
                                          shuffle=False)

for i, (data, lengths, labels) in enumerate(train_loader):
    print (data)
    print (labels)
    break

tensor([[   0,    5,    3,  ...,   17,    2,  128],
        [  40,   49,   16,  ...,    0,    0,    0],
        [   8,    0, 1637,  ...,    0,    0,    0],
        ...,
        [   0,  125,   22,  ...,    0,    0,    0],
        [   0,    3,    0,  ...,    5,    2,  611],
        [ 147,    9,    0,  ...,    0,    0,    0]])
tensor([0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 0, 0, 0, 1])


In [54]:
#import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfWords, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,20)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

emb_dim = 100
model = BagOfWords(len(id2token), emb_dim)

In [59]:
learning_rate = 0.01
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 700 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))

Epoch: [1/10], Step: [701/782], Validation Acc: 87.24
Epoch: [2/10], Step: [701/782], Validation Acc: 86.64
Epoch: [3/10], Step: [701/782], Validation Acc: 87.38
Epoch: [4/10], Step: [701/782], Validation Acc: 87.2
Epoch: [5/10], Step: [701/782], Validation Acc: 87.58
Epoch: [6/10], Step: [701/782], Validation Acc: 87.4
Epoch: [7/10], Step: [701/782], Validation Acc: 87.24
Epoch: [8/10], Step: [701/782], Validation Acc: 87.24
Epoch: [9/10], Step: [701/782], Validation Acc: 87.34
Epoch: [10/10], Step: [701/782], Validation Acc: 87.24


In [56]:
print ("After training for {} epochs".format(num_epochs))
print ("Val Acc {}".format(test_model(val_loader, model)))
print ("Test Acc {}".format(test_model(test_loader, model)))

After training for 10 epochs
Val Acc 87.1
Test Acc 87.23


In [None]:
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))

list_ = []
data=list(x_train)
for x in range(len(data)):
    line =find_ngrams(data[x],3)
    list_.append(line)

782