In [1]:
import glob
import numpy as np
import pandas as pd
import random
import spacy
import string
from nltk.util import ngrams
from tqdm import tqdm_notebook
import torch
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
train_pos_txt = glob.glob("/Users/xinyangqiu/Downloads/aclImdb/train/pos/*.txt")
train_neg_txt = glob.glob("/Users/xinyangqiu/Downloads/aclImdb/train/neg/*.txt")
test_pos_txt = glob.glob("/Users/xinyangqiu/Downloads/aclImdb/test/pos/*.txt")
test_neg_txt = glob.glob("/Users/xinyangqiu/Downloads/aclImdb/test/neg/*.txt")

In [3]:
def read_txt(txt,ls):
    for fle in txt:
        with open(fle) as f:
            ls.append(f.read())
    return len(txt)
    
train_x = []
num_train_pos = read_txt(train_pos_txt,train_x)
num_train_neg = read_txt(train_neg_txt,train_x)
train_y = [1] * num_train_pos + [0] * num_train_neg

test_x = []
num_test_pos = read_txt(test_pos_txt,test_x)
num_test_neg = read_txt(test_neg_txt,test_x)
test_y = [1] * num_test_pos + [0] * num_test_neg

In [4]:
random.Random(4).shuffle(train_x)
random.Random(4).shuffle(train_y)

val_x = train_x[:5000]
val_y =train_y[:5000]
train_x = train_x[5000:]
train_y = train_y[5000:]

In [5]:
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation
def tokenize(sent):
  tokens = tokenizer(sent)
  return [token.text.lower() for token in tokens if (token.text not in punctuations)]

def tokenizer_dt(dataset):
    token_dataset = []
    all_tokens = []
    for sample in tqdm_notebook(tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512, n_threads=1)):
        tokens = [token.text.lower() for token in sample if (token.text not in punctuations)]
        token_dataset.append(tokens)
        all_tokens += tokens
    return token_dataset, all_tokens

def tokenizer_gram(dataset,n):
    gram_dataset = []
    all_grams = [] 
    for i in tqdm_notebook(dataset):
        token_gram = list(ngrams(i,n))
        gram_dataset.append(token_gram)
        all_grams += token_gram
    return gram_dataset, all_grams

In [6]:
#words
token_train, all_train_tokens = tokenizer_dt(train_x)
token_test, all_test_tokens = tokenizer_dt(test_x)
token_val, all_val_tokens = tokenizer_dt(val_x)











In [7]:
#creat 2-grams
bigram_val, all_bigrams_val = tokenizer_gram(token_val,2)
bigram_train, all_bigrams_train = tokenizer_gram(token_train,2)
bigram_test, all_bigrams_test = tokenizer_gram(token_test,2)
#creat 3-grams

trigram_val, all_trigrams_val = tokenizer_gram(token_val,3)
trigram_train, all_trigrams_train = tokenizer_gram(token_train,3)
trigram_test, all_trigrams_test = tokenizer_gram(token_test,3)
#creat 4-grams
quagram_val, all_quagrams_val = tokenizer_gram(token_val,4)
quagram_train, all_quagrams_train = tokenizer_gram(token_train,4)
quagram_test, all_quagrams_test = tokenizer_gram(token_test,4)




























In [55]:
max_vocab_size = 10000
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

def token2index_dataset(tokens_data,token_id):
    indices_data = []
    for tokens in tqdm_notebook(tokens_data):
        index_list = [token_id[token] if token in token_id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

def get_dt_indices(all_train_dt,token_train_dt,token_test_dt,token_val_dt):
    token2id, id2token = build_vocab(all_train_dt)
    train_data_indices = token2index_dataset(token_train_dt,token2id)
    test_data_indices = token2index_dataset(token_test_dt,token2id)
    val_data_indices = token2index_dataset(token_val_dt,token2id)
    return train_data_indices,test_data_indices,val_data_indices, id2token

In [56]:
train_data_indices1,test_data_indices1,val_data_indices1, id2token1 = get_dt_indices(all_train_tokens,token_train,token_test,token_val)
train_data_indices2,test_data_indices2,val_data_indices2,id2token2= get_dt_indices(all_bigrams_train,bigram_train,bigram_test,bigram_val)
train_data_indices3,test_data_indices3,val_data_indices3,id2token3 = get_dt_indices(all_trigrams_train,trigram_train,trigram_test,trigram_val)
train_data_indices4,test_data_indices4,val_data_indices4,id2token4 = get_dt_indices(all_quagrams_train,quagram_train,quagram_test,quagram_val)

In [10]:
#MAX_SENTENCE_LENGTH = 400
class NewsGroupDataset(Dataset):
    def __init__(self, data_list, target_list):
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def newsgroup_collate_func(batch):
    data_list = []
    label_list = []
    length_list = []
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]
def data_loader(BATCH_SIZE,indices,y,shuffle_ind):
    new_set = NewsGroupDataset(indices,y)
    return torch.utils.data.DataLoader(dataset=new_set, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=shuffle_ind)

In [11]:
class BagOfWords(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super(BagOfWords, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,20)
    
    def forward(self, data, length):
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
        out = self.linear(out.float())
        return out

In [12]:
def test_model(loader, model):
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

def output_model(emb_dim,learning_rate,num_epochs,id_token_dt,train_loader_dt,test_loader_dt,val_loader_dt):
    model = BagOfWords(len(id_token_dt), emb_dim)
    criterion = torch.nn.CrossEntropyLoss()  
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        for i, (data, lengths, labels) in enumerate(train_loader_dt):
            model.train()
            data_batch, length_batch, label_batch = data, lengths, labels
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
        # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                # validate
                train_acc = test_model(train_loader_dt, model)
                val_acc = test_model(val_loader_dt, model)
                #print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                #           epoch+1, num_epochs, i+1, len(train_loader_dt), val_acc))
    #return [emb_dim,learning_rate,num_epochs,test_model(val_loader_dt, model),test_model(test_loader_dt, model)]
    return [emb_dim,learning_rate,num_epochs,train_acc,val_acc]#,test_model(test_loader_dt, model)]

In [127]:
#sgd optimizer
def output_model_sgd(emb_dim,learning_rate,num_epochs,id_token_dt,train_loader_dt,test_loader_dt,val_loader_dt):
    model = BagOfWords(len(id_token_dt), emb_dim)
    criterion = torch.nn.CrossEntropyLoss()  
    #optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    print(model.parameters())
    for epoch in tqdm_notebook(range(num_epochs)):
        for i, (data, lengths, labels) in enumerate(train_loader_dt):
            model.train()
            data_batch, length_batch, label_batch = data, lengths, labels
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
        # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                # validate
                train_acc = test_model(train_loader_dt, model)
                val_acc = test_model(val_loader_dt, model)
                #print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                #           epoch+1, num_epochs, i+1, len(train_loader_dt), val_acc))
    #return [emb_dim,learning_rate,num_epochs,test_model(val_loader_dt, model),test_model(test_loader_dt, model)]
    return [emb_dim,learning_rate,num_epochs,train_acc,val_acc]#,test_model(test_loader_dt, model)]


In [73]:
#sgd_1_gram_lr
learning_rate = [0.025, 0.035,0.045]

for i in learning_rate:
    temp = output_model_sgd(emb_dim = 100,learning_rate = i,num_epochs = 150,
                            id_token_dt = id2token1,train_loader_dt = train_loader1,
                            test_loader_dt = test_loader1,val_loader_dt=val_loader1)
    print(temp)

<generator object Module.parameters at 0x1b21a29620>
[100, 0.025, 150, 81.135, 78.7]
<generator object Module.parameters at 0x1b21a29620>
[100, 0.035, 150, 83.185, 81.44]
<generator object Module.parameters at 0x1b17522db0>
[100, 0.045, 150, 84.99, 82.48]


In [74]:
#sgd_2_gram_lr
learning_rate = [0.025, 0.035,0.045]

for i in learning_rate:
    temp = output_model_sgd(emb_dim = 100,learning_rate = i,num_epochs = 150,
                            id_token_dt = id2token2,train_loader_dt = train_loader2,
                            test_loader_dt = test_loader2,val_loader_dt=val_loader2)
    print(temp)

<generator object Module.parameters at 0x1b1c0dc2b0>
[100, 0.025, 150, 70.695, 70.34]
<generator object Module.parameters at 0x1b174065c8>
[100, 0.035, 150, 75.06, 74.02]
<generator object Module.parameters at 0x1b17406728>
[100, 0.045, 150, 74.3, 72.4]


In [182]:
#sgd_3_gram_lr
learning_rate = [0.025, 0.035,0.045]

for i in learning_rate:
    temp = output_model_sgd(emb_dim = 100,learning_rate = i,num_epochs = 250,
                            id_token_dt = id2token3,train_loader_dt = train_loader3,
                            test_loader_dt = test_loader3,val_loader_dt=val_loader3)
    print(temp)

<generator object Module.parameters at 0x1b17908728>


[100, 0.025, 250, 65.1, 64.78]
<generator object Module.parameters at 0x1b1b7b46d0>


[100, 0.035, 250, 65.825, 64.56]
<generator object Module.parameters at 0x1b1b7b4a98>


[100, 0.045, 250, 63.135, 62.24]


In [185]:
#sgd_4_gram_lr
learning_rate = [0.025, 0.035,0.045]

for i in learning_rate:
    temp = output_model_sgd(emb_dim = 100,learning_rate = i,num_epochs = 250,
                            id_token_dt = id2token4,train_loader_dt = train_loader4,
                            test_loader_dt = test_loader4,val_loader_dt=val_loader4)
    print(temp)

<generator object Module.parameters at 0x1b1746df10>


[100, 0.025, 250, 56.005, 54.94]
<generator object Module.parameters at 0x1b17008fc0>


[100, 0.035, 250, 54.77, 54.4]
<generator object Module.parameters at 0x1b17008fc0>


[100, 0.045, 250, 50.6, 50.22]


In [13]:
#get general understanding of 1gram_10000
MAX_SENTENCE_LENGTH = [50,100,200,300,400,500]
learning_rate = [0.005,0.01,0.015,0.02,0.025,0.03,0.035,0.04,0.045,0.05,0.1 ]
dim = [10,50,100,150,200]
r_sen_rate = []
for i in tqdm_notebook(MAX_SENTENCE_LENGTH):
    MAX_SENTENCE_LENGTH = i
    train_loader1 = data_loader(BATCH_SIZE=32,indices = train_data_indices1,y = train_y,shuffle_ind=True)
    test_loader1 = data_loader(BATCH_SIZE=32,indices = test_data_indices1,y = test_y,shuffle_ind=True)
    val_loader1 = data_loader(BATCH_SIZE=32,indices = val_data_indices1,y = val_y,shuffle_ind=False)
    for j in learning_rate:
        for k in dim:
            temp = output_model(emb_dim = k,learning_rate = j,num_epochs = 2,
                                id_token_dt = id2token1,train_loader_dt = train_loader1,
                                test_loader_dt = test_loader1,val_loader_dt=val_loader1)
            temp
            temp.insert(0,i)
            r_sen_rate.append(temp)
a= r_sen_rate
dt = pd.DataFrame(a, columns=['sen_len','emb_dim',"learning_rate","epochs",'train_acc','val_acc'])
dt.to_csv('1gram1_10000.csv')




In [187]:
#test max sentence length for 4-gram
MAX_SENTENCE_LENGTH = 50
train_loader4 = data_loader(BATCH_SIZE=32,indices = train_data_indices4,y = train_y,shuffle_ind=True)
test_loader4 = data_loader(BATCH_SIZE=32,indices = test_data_indices4,y = test_y,shuffle_ind=True)
val_loader4= data_loader(BATCH_SIZE=32,indices = val_data_indices4,y = val_y,shuffle_ind=False)
a = output_model(emb_dim = 200,learning_rate = 0.01,num_epochs = 2,
            id_token_dt = id2token4,train_loader_dt = train_loader4,
            test_loader_dt = test_loader4,val_loader_dt=val_loader4)

MAX_SENTENCE_LENGTH = 100
train_loade4 = data_loader(BATCH_SIZE=32,indices = train_data_indices4,y = train_y,shuffle_ind=True)
test_loader4 = data_loader(BATCH_SIZE=32,indices = test_data_indices4,y = test_y,shuffle_ind=True)
val_loader4 = data_loader(BATCH_SIZE=32,indices = val_data_indices4,y = val_y,shuffle_ind=False)
b = output_model(emb_dim = 200,learning_rate = 0.05,num_epochs = 2,
            id_token_dt = id2token4,train_loader_dt = train_loader4,
            test_loader_dt = test_loader4,val_loader_dt=val_loader4)

MAX_SENTENCE_LENGTH = 150
train_loader4 = data_loader(BATCH_SIZE=32,indices = train_data_indices4,y = train_y,shuffle_ind=True)
test_loader4 = data_loader(BATCH_SIZE=32,indices = test_data_indices4,y = test_y,shuffle_ind=True)
val_loader4 = data_loader(BATCH_SIZE=32,indices = val_data_indices4,y = val_y,shuffle_ind=False)
c = output_model(emb_dim = 100,learning_rate = 0.05,num_epochs = 2,
            id_token_dt = id2token4,train_loader_dt = train_loader4,
            test_loader_dt = test_loader4,val_loader_dt=val_loader4)

MAX_SENTENCE_LENGTH = 200
train_loader4 = data_loader(BATCH_SIZE=32,indices = train_data_indices4,y = train_y,shuffle_ind=True)
test_loader4 = data_loader(BATCH_SIZE=32,indices = test_data_indices4,y = test_y,shuffle_ind=True)
val_loader4 = data_loader(BATCH_SIZE=32,indices = val_data_indices4,y = val_y,shuffle_ind=False)
d = output_model(emb_dim = 200,learning_rate = 0.05,num_epochs = 2,
            id_token_dt = id2token4,train_loader_dt = train_loader4,
            test_loader_dt = test_loader4,val_loader_dt=val_loader4)

dt = pd.DataFrame([a,b,c,d], columns=['emb_dim',"learning_rate","epochs",'train_acc','val_acc'])
dt.to_csv('msl_4gram.csv')

In [59]:
#test learning rate
#3gram_10000_lrate
learning_rate = [0.005,0.01,0.02, 0.025,0.03,0.035,0.04,0.045,0.05,0.1]
r3_lr = []

MAX_SENTENCE_LENGTH = 500
train_loader3 = data_loader(BATCH_SIZE=32,indices = train_data_indices3,y = train_y,shuffle_ind=True)
test_loader3 = data_loader(BATCH_SIZE=32,indices = test_data_indices3,y = test_y,shuffle_ind=True)
val_loader3 = data_loader(BATCH_SIZE=32,indices = val_data_indices3,y = val_y,shuffle_ind=False)
for j in tqdm_notebook(learning_rate):
    temp = output_model(emb_dim = 100,learning_rate = j,num_epochs = 2,
                        id_token_dt = id2token3,train_loader_dt = train_loader3,
                        test_loader_dt = test_loader3,val_loader_dt=val_loader3)
    temp
    temp.insert(0,i)
    r3_lr.append(temp)

dt = pd.DataFrame(r3_lr, columns=['max_voc_len','emb_dim',"learning_rate","epochs",'train_acc','val_acc'])
dt.to_csv('r3_lr.csv')

In [60]:
#4gram_10000_lrate
r4_lr = []

MAX_SENTENCE_LENGTH = 500
train_loader4 = data_loader(BATCH_SIZE=32,indices = train_data_indices4,y = train_y,shuffle_ind=True)
test_loader4 = data_loader(BATCH_SIZE=32,indices = test_data_indices4,y = test_y,shuffle_ind=True)
val_loader4 = data_loader(BATCH_SIZE=32,indices = val_data_indices4,y = val_y,shuffle_ind=False)
for j in tqdm_notebook(learning_rate):
    temp = output_model(emb_dim = 100,learning_rate = j,num_epochs = 2,
                        id_token_dt = id2token4,train_loader_dt = train_loader4,
                        test_loader_dt = test_loader4,val_loader_dt=val_loader4)
    temp
    temp.insert(0,i)
    r4_lr.append(temp)

dt = pd.DataFrame(r4_lr, columns=['max_voc_len','emb_dim',"learning_rate","epochs",'train_acc','val_acc'])
dt.to_csv('r4_lr.csv')

In [61]:
#2gram_10000_lrate
learning_rate = [0.005,0.01,0.02,0.03,0.035,0.04,0.045,0.05,0.1]
r2_lr = []

MAX_SENTENCE_LENGTH = 500
train_loader2 = data_loader(BATCH_SIZE=32,indices = train_data_indices2,y = train_y,shuffle_ind=True)
test_loader2 = data_loader(BATCH_SIZE=32,indices = test_data_indices2,y = test_y,shuffle_ind=True)
val_loader2 = data_loader(BATCH_SIZE=32,indices = val_data_indices2,y = val_y,shuffle_ind=False)
for j in tqdm_notebook(learning_rate):
    temp = output_model(emb_dim = 100,learning_rate = j,num_epochs = 2,
                        id_token_dt = id2token2,train_loader_dt = train_loader2,
                        test_loader_dt = test_loader2,val_loader_dt=val_loader2)
    temp
    temp.insert(0,i)
    r2_lr.append(temp)

dt = pd.DataFrame(r2_lr, columns=['max_voc_len','emb_dim',"learning_rate","epochs",'train_acc','val_acc'])
dt.to_csv('r2_lr.csv')

In [41]:
#test vocabulary size
PAD_IDX = 0
UNK_IDX = 1

def build_vocab_size(all_tokens,max_size):
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

def get_dt_indices_voca_size(all_train_dt,token_train_dt,token_test_dt,token_val_dt,max_size):
    token2id, id2token = build_vocab_size(all_train_dt,max_size)
    train_data_indices = token2index_dataset(token_train_dt,token2id)
    test_data_indices = token2index_dataset(token_test_dt,token2id)
    val_data_indices = token2index_dataset(token_val_dt,token2id)
    return train_data_indices,test_data_indices,val_data_indices, id2token


[100, 0.045, 2, 90.33, 83.44]

In [45]:
#_1 gram_lr = 0.045
voca_size = [5000,10000,15000,20000,25000]
r_vac1 = []
for i in voca_size:
    train_data_indices1,test_data_indices1,val_data_indices1,id2token1= get_dt_indices_voca_size(all_train_tokens,token_train,token_test,token_val,voca_size[0])
    MAX_SENTENCE_LENGTH = 500
    train_loader1 = data_loader(BATCH_SIZE=32,indices = train_data_indices1,y = train_y,shuffle_ind=True)
    test_loader1 = data_loader(BATCH_SIZE=32,indices = test_data_indices1,y = test_y,shuffle_ind=True)
    val_loader1 = data_loader(BATCH_SIZE=32,indices = val_data_indices1,y = val_y,shuffle_ind=False)
    temp = output_model(emb_dim = 100,learning_rate = 0.045,num_epochs = 2,
                     id_token_dt = id2token1,train_loader_dt = train_loader1,
                     test_loader_dt = test_loader1,val_loader_dt=val_loader1)
    temp.insert(0,i)
    r_vac1.append(temp)
r_vac1

[[5000, 100, 0.045, 2, 92.345, 84.62],
 [10000, 100, 0.045, 2, 93.47, 87.26],
 [15000, 100, 0.045, 2, 91.285, 84.7],
 [20000, 100, 0.045, 2, 93.105, 86.42],
 [25000, 100, 0.045, 2, 93.69, 87.06]]

In [48]:
#_1 gram_lr = 0.04
r_vac1_1 = []
for i in voca_size:
    train_data_indices1,test_data_indices1,val_data_indices1,id2token1= get_dt_indices_voca_size(all_train_tokens,token_train,token_test,token_val,voca_size[0])
    MAX_SENTENCE_LENGTH = 500
    train_loader1 = data_loader(BATCH_SIZE=32,indices = train_data_indices1,y = train_y,shuffle_ind=True)
    test_loader1 = data_loader(BATCH_SIZE=32,indices = test_data_indices1,y = test_y,shuffle_ind=True)
    val_loader1 = data_loader(BATCH_SIZE=32,indices = val_data_indices1,y = val_y,shuffle_ind=False)
    temp = output_model(emb_dim = 100,learning_rate = 0.04,num_epochs = 2,
                     id_token_dt = id2token1,train_loader_dt = train_loader1,
                     test_loader_dt = test_loader1,val_loader_dt=val_loader1)
    temp.insert(0,i)
    r_vac1_1.append(temp)
r_vac1_1

[[5000, 100, 0.04, 2, 90.34, 84.44],
 [10000, 100, 0.04, 2, 93.7, 87.06],
 [15000, 100, 0.04, 2, 92.705, 85.74],
 [20000, 100, 0.04, 2, 93.245, 86.54],
 [25000, 100, 0.04, 2, 93.895, 87.28]]

In [49]:
#_1 gram_lr = 0.05
r_vac1_2 = []
for i in voca_size:
    train_data_indices1,test_data_indices1,val_data_indices1,id2token1= get_dt_indices_voca_size(all_train_tokens,token_train,token_test,token_val,voca_size[0])
    MAX_SENTENCE_LENGTH = 500
    train_loader1 = data_loader(BATCH_SIZE=32,indices = train_data_indices1,y = train_y,shuffle_ind=True)
    test_loader1 = data_loader(BATCH_SIZE=32,indices = test_data_indices1,y = test_y,shuffle_ind=True)
    val_loader1 = data_loader(BATCH_SIZE=32,indices = val_data_indices1,y = val_y,shuffle_ind=False)
    temp = output_model(emb_dim = 200,learning_rate = 0.05,num_epochs = 2,
                     id_token_dt = id2token1,train_loader_dt = train_loader1,
                     test_loader_dt = test_loader1,val_loader_dt=val_loader1)
    temp.insert(0,i)
    r_vac1_2.append(temp)
r_vac1_2

[[5000, 200, 0.05, 2, 88.945, 83.7],
 [10000, 200, 0.05, 2, 92.825, 86.7],
 [15000, 200, 0.05, 2, 91.88, 85.42],
 [20000, 200, 0.05, 2, 91.575, 85.02],
 [25000, 200, 0.05, 2, 92.645, 86.12]]

In [51]:
#_2 gram_lr = 0.045
voca_size = [5000,10000,15000,20000,25000]
r_vac2= []
for i in voca_size:
    train_data_indices2,test_data_indices2,val_data_indices2,id2token2= get_dt_indices_voca_size(all_bigrams_train,bigram_train,bigram_test,bigram_val,voca_size[0])
    MAX_SENTENCE_LENGTH = 500
    train_loader2 = data_loader(BATCH_SIZE=32,indices = train_data_indices2,y = train_y,shuffle_ind=True)
    test_loader2 = data_loader(BATCH_SIZE=32,indices = test_data_indices2,y = test_y,shuffle_ind=True)
    val_loader2 = data_loader(BATCH_SIZE=32,indices = val_data_indices2,y = val_y,shuffle_ind=False)
    temp = output_model(emb_dim = 100,learning_rate = 0.045,num_epochs = 2,
                     id_token_dt = id2token2,train_loader_dt = train_loader2,
                     test_loader_dt = test_loader2,val_loader_dt=val_loader2)
    temp.insert(0,i)
    r_vac2.append(temp)
r_vac2

[[5000, 100, 0.045, 2, 91.42, 84.2],
 [10000, 100, 0.045, 2, 90.205, 83.72],
 [15000, 100, 0.045, 2, 90.28, 83.5],
 [20000, 100, 0.045, 2, 91.165, 83.46],
 [25000, 100, 0.045, 2, 86.525, 80.58]]

In [142]:
#_2 gram_Adam_lr = 0.025,0.035.0.045
lr2 = [0.025,0.035,0.045]
r_lr_adam2= []
for i in lr2:
    train_data_indices2,test_data_indices2,val_data_indices2,id2token2= get_dt_indices_voca_size(all_bigrams_train,bigram_train,bigram_test,bigram_val,10000)
    MAX_SENTENCE_LENGTH = 500
    train_loader2 = data_loader(BATCH_SIZE=32,indices = train_data_indices2,y = train_y,shuffle_ind=True)
    test_loader2 = data_loader(BATCH_SIZE=32,indices = test_data_indices2,y = test_y,shuffle_ind=True)
    val_loader2 = data_loader(BATCH_SIZE=32,indices = val_data_indices2,y = val_y,shuffle_ind=False)
    temp = output_model(emb_dim = 100,learning_rate = i,num_epochs = 2,
                     id_token_dt = id2token2,train_loader_dt = train_loader2,
                     test_loader_dt = test_loader2,val_loader_dt=val_loader2)
    r_lr_adam2.append(temp)
r_lr_adam2

[[100, 0.025, 2, 94.515, 84.78],
 [100, 0.035, 2, 95.85, 84.8],
 [100, 0.045, 2, 93.685, 83.32]]

In [144]:
#_3 gram_Adam_lr = 0.025,0.035,0.045
lr3 = [0.025,0.035,0.045]
r_lr_adam3= []
for i in lr3:
    train_data_indices3,test_data_indices3,val_data_indices3,id2token3= get_dt_indices_voca_size(all_trigrams_train,trigram_train,trigram_test,trigram_val,10000)
    MAX_SENTENCE_LENGTH = 500
    train_loader3 = data_loader(BATCH_SIZE=32,indices = train_data_indices3,y = train_y,shuffle_ind=True)
    test_loader3 = data_loader(BATCH_SIZE=32,indices = test_data_indices3,y = test_y,shuffle_ind=True)
    val_loader3 = data_loader(BATCH_SIZE=32,indices = val_data_indices3,y = val_y,shuffle_ind=False)
    temp = output_model(emb_dim = 100,learning_rate = i,num_epochs = 2,
                     id_token_dt = id2token3,train_loader_dt = train_loader3,
                     test_loader_dt = test_loader3,val_loader_dt=val_loader3)
    r_lr_adam3.append(temp)
r_lr_adam3

[[100, 0.025, 2, 92.45, 80.0],
 [100, 0.035, 2, 85.64, 74.22],
 [100, 0.045, 2, 93.09, 79.38]]

In [145]:
#_4 gram_Adam_lr = 0.025,0.035,0.045
lr4 = [0.025,0.035,0.045]
r_lr_adam4= []
for i in lr4:
    train_data_indices3,test_data_indices3,val_data_indices3,id2token3= get_dt_indices_voca_size(all_quagrams_train,quagram_train,quagram_test,quagram_val,10000)
    MAX_SENTENCE_LENGTH = 500
    train_loader4 = data_loader(BATCH_SIZE=32,indices = train_data_indices4,y = train_y,shuffle_ind=True)
    test_loader4 = data_loader(BATCH_SIZE=32,indices = test_data_indices4,y = test_y,shuffle_ind=True)
    val_loader4 = data_loader(BATCH_SIZE=32,indices = val_data_indices4,y = val_y,shuffle_ind=False)
    temp = output_model(emb_dim = 100,learning_rate = i,num_epochs = 2,
                     id_token_dt = id2token4,train_loader_dt = train_loader4,
                     test_loader_dt = test_loader4,val_loader_dt=val_loader4)
    r_lr_adam4.append(temp)
r_lr_adam4

[[100, 0.025, 2, 88.4, 74.74],
 [100, 0.035, 2, 82.23, 71.96],
 [100, 0.045, 2, 84.47, 73.16]]

In [161]:
#print test set
def test_model_print_example(loader, model):
    correct = 0
    total = 0
    model.eval()
    pred_val =  []
    labellist = []
    predictlist = []
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        predictlist.append(predicted)
        labellist.append(labels)
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
        pred_val.append(predicted)
    return (100 * correct / total),pred_val
    #return labellist,predictlist

def output_model_test(emb_dim,learning_rate,num_epochs,id_token_dt,train_loader_dt,test_loader_dt,val_loader_dt):
    model = BagOfWords(len(id_token_dt), emb_dim)
    criterion = torch.nn.CrossEntropyLoss()  
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        for i, (data, lengths, labels) in enumerate(train_loader_dt):
            model.train()
            data_batch, length_batch, label_batch = data, lengths, labels
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
        # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                # validate
                train_acc = test_model(train_loader_dt, model)
                val_acc,val_r = test_model_print_example(val_loader_dt, model)
                test_acc = test_model(test_loader_dt, model)
                #print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                #           epoch+1, num_epochs, i+1, len(train_loader_dt), val_acc))
    #return [emb_dim,learning_rate,num_epochs,test_model(val_loader_dt, model),test_model(test_loader_dt, model)]
    #return [emb_dim,learning_rate,num_epochs,train_acc,val_acc,test_model(test_loader_dt, model)]
    return model

In [162]:
model_trained = output_model_test(emb_dim = 200,learning_rate = 0.045,num_epochs = 2,
                    id_token_dt = id2token1,train_loader_dt = train_loader1,
                    test_loader_dt = test_loader1,val_loader_dt=val_loader1)


In [163]:
def test_model_print(loader, model):
    correct = 0
    total = 0
    model.eval()
    pred_val =  []
    labellist = []
    predictlist = []
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        predictlist.append(predicted)
        labellist.append(labels)
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
        pred_val.append(predicted)
    #return (100 * correct / total),pred_val
    return labellist,predictlist

In [164]:
val_loader1 = data_loader(BATCH_SIZE=1,indices = val_data_indices1,y = val_y,shuffle_ind=False)
lablls, predls = test_model_print(val_loader1, model_trained)

In [166]:
#find final result
lablls[0:5], predls[0:5]

([tensor([1]), tensor([1]), tensor([0]), tensor([0]), tensor([0])],
 [tensor([[1]]), tensor([[0]]), tensor([[0]]), tensor([[1]]), tensor([[0]])])

In [175]:
#find final result
lablls[5:10], predls[5:10]

([tensor([1]), tensor([1]), tensor([0]), tensor([0]), tensor([0])],
 [tensor([[1]]), tensor([[0]]), tensor([[0]]), tensor([[0]]), tensor([[0]])])

In [184]:
#true pred:
val_x[7],val_y[7],val_x[9],val_y[9],val_x[6],val_y[6]

('This is surely one of the worst films ever made. Each scene is painful. You will groan at the flimsy attempts at humor, the awkward camera work, the sexism and racism, the ridiculous story line, the wooden acting. Poor Joan Bennett; she is the only one in the movie who is not an embarrassment. In all, dreadful.',
 0,
 "I caught this movie on Sci-Fi before heading into work. If you've any interest in seeing Dean Cain dive and avoid being enveloped in flames at least a dozen times, this movie is for you. If that doesn't peak your interest, well, I'm afraid you'll wish that YOU were the one about to be enveloped in flames, because this movie is pretty bad. The acting, to begin with, is awful, awful, awful. The characters are all completely obnoxious, and the dialogue is worse than your typical Z-grade, Sci-Fi movie. Towards the end, the movie began to remind me of 'Hollow Man' (complete with escape via elevator shaft), except with a Dragon, not a naked, invisible man. Unlike other simil

In [181]:
#false pred:
val_x[1],val_y[1],val_x[3],val_y[3],val_x[5],val_y[5]

 1,
 'The Tooth Fairy is about the ghost of an old deformed witch that lures children to her house to get a prize for their loose tooth and then takes their lives. The first few minutes introduce you to the 1949 beginning of the legend of the tooth fairy and then switches to present day. The worn out horror plot is pretty much saved by the solid acting. They could have done without the Hammond brothers and a few other scenes, but overall the gore scenes were bloody but quick which had a minimizing effect. The eye candy is pretty good for both genders. Camera work is good. Dialog is fair but cheesy. I expected the film to be a bare bones, low budget, slasher with very few redeeming factors. I was surprised by the quality of the film.',
 0,
 'While a 9 might seem like an unusually high score for such a slight film, however, compared to the hundreds and hundreds of series detective films from the 1930s and 40s, this is among the very best and also compares very favorably to Powell\'s late

In [120]:
#test_1_gram_lr = 0.05
train_data_indices1,test_data_indices1,val_data_indices1,id2token1= get_dt_indices_voca_size(all_train_tokens,token_train,token_test,token_val,voca_size[0])
MAX_SENTENCE_LENGTH = 500
train_loader1 = data_loader(BATCH_SIZE=32,indices = train_data_indices1,y = train_y,shuffle_ind=True)
test_loader1 = data_loader(BATCH_SIZE=32,indices = test_data_indices1,y = test_y,shuffle_ind=True)
val_loader1 = data_loader(BATCH_SIZE=32,indices = val_data_indices1,y = val_y,shuffle_ind=False)
temp = output_model_test(emb_dim = 200,learning_rate = 0.05,num_epochs = 2,
                    id_token_dt = id2token1,train_loader_dt = train_loader1,
                    test_loader_dt = test_loader1,val_loader_dt=val_loader1)

In [130]:
#test_2_gram_lr = 0.05
train_data_indices2,test_data_indices2,val_data_indices2,id2token2= get_dt_indices_voca_size(all_bigrams_train,bigram_train,bigram_test,bigram_val,voca_size[0])
MAX_SENTENCE_LENGTH = 500
train_loader2 = data_loader(BATCH_SIZE=32,indices = train_data_indices2,y = train_y,shuffle_ind=True)
test_loader2 = data_loader(BATCH_SIZE=32,indices = test_data_indices2,y = test_y,shuffle_ind=True)
val_loader2 = data_loader(BATCH_SIZE=32,indices = val_data_indices2,y = val_y,shuffle_ind=False)
temp = output_model_test(emb_dim = 200,learning_rate = 0.05,num_epochs = 2,
                    id_token_dt = id2token2,train_loader_dt = train_loader2,
                    test_loader_dt = test_loader2,val_loader_dt=val_loader2)

In [131]:
temp

[200, 0.05, 2, 90.575, 82.8, 83.476]

In [133]:
#test_3_gram_lr = 0.05
train_data_indices3,test_data_indices3,val_data_indices3,id2token3= get_dt_indices_voca_size(all_trigrams_train,trigram_train,trigram_test,trigram_val,voca_size[0])
MAX_SENTENCE_LENGTH = 500
train_loader3 = data_loader(BATCH_SIZE=32,indices = train_data_indices3,y = train_y,shuffle_ind=True)
test_loader3 = data_loader(BATCH_SIZE=32,indices = test_data_indices3,y = test_y,shuffle_ind=True)
val_loader3 = data_loader(BATCH_SIZE=32,indices = val_data_indices3,y = val_y,shuffle_ind=False)
temp = output_model_test(emb_dim = 200,learning_rate = 0.05,num_epochs = 2,
                    id_token_dt = id2token3,train_loader_dt = train_loader3,
                    test_loader_dt = test_loader3,val_loader_dt=val_loader3)
temp

[200, 0.05, 2, 76.915, 70.68, 78.612]

In [136]:
#test_4_gram_lr = 0.05
train_data_indices3,test_data_indices3,val_data_indices3,id2token3= get_dt_indices_voca_size(all_quagrams_train,quagram_train,quagram_test,quagram_val,voca_size[0])
MAX_SENTENCE_LENGTH = 500
train_loader4 = data_loader(BATCH_SIZE=32,indices = train_data_indices4,y = train_y,shuffle_ind=True)
test_loader4 = data_loader(BATCH_SIZE=32,indices = test_data_indices4,y = test_y,shuffle_ind=True)
val_loader4 = data_loader(BATCH_SIZE=32,indices = val_data_indices4,y = val_y,shuffle_ind=False)
temp = output_model_test(emb_dim = 200,learning_rate = 0.05,num_epochs = 2,
                    id_token_dt = id2token4,train_loader_dt = train_loader4,
                    test_loader_dt = test_loader4,val_loader_dt=val_loader4)
temp

[200, 0.05, 2, 83.965, 71.54, 73.144]

In [137]:
#test_1_gram_lr = 0.045
train_data_indices1,test_data_indices1,val_data_indices1,id2token1= get_dt_indices_voca_size(all_train_tokens,token_train,token_test,token_val,voca_size[0])
MAX_SENTENCE_LENGTH = 500
train_loader1 = data_loader(BATCH_SIZE=32,indices = train_data_indices1,y = train_y,shuffle_ind=True)
test_loader1 = data_loader(BATCH_SIZE=32,indices = test_data_indices1,y = test_y,shuffle_ind=True)
val_loader1 = data_loader(BATCH_SIZE=32,indices = val_data_indices1,y = val_y,shuffle_ind=False)
temp = output_model_test(emb_dim = 200,learning_rate = 0.045,num_epochs = 2,
                    id_token_dt = id2token1,train_loader_dt = train_loader1,
                    test_loader_dt = test_loader1,val_loader_dt=val_loader1)
temp

[200, 0.045, 2, 92.615, 86.54, 86.364]

In [138]:
#test_2_gram_lr = 0.045
train_data_indices2,test_data_indices2,val_data_indices2,id2token2= get_dt_indices_voca_size(all_bigrams_train,bigram_train,bigram_test,bigram_val,voca_size[0])
MAX_SENTENCE_LENGTH = 500
train_loader2 = data_loader(BATCH_SIZE=32,indices = train_data_indices2,y = train_y,shuffle_ind=True)
test_loader2 = data_loader(BATCH_SIZE=32,indices = test_data_indices2,y = test_y,shuffle_ind=True)
val_loader2 = data_loader(BATCH_SIZE=32,indices = val_data_indices2,y = val_y,shuffle_ind=False)
temp = output_model_test(emb_dim = 200,learning_rate = 0.05,num_epochs = 2,
                    id_token_dt = id2token2,train_loader_dt = train_loader2,
                    test_loader_dt = test_loader2,val_loader_dt=val_loader2)
temp

[200, 0.05, 2, 81.41, 77.08, 83.764]

In [139]:
#test_3_gram_lr = 0.045
train_data_indices3,test_data_indices3,val_data_indices3,id2token3= get_dt_indices_voca_size(all_trigrams_train,trigram_train,trigram_test,trigram_val,voca_size[0])
MAX_SENTENCE_LENGTH = 500
train_loader3 = data_loader(BATCH_SIZE=32,indices = train_data_indices3,y = train_y,shuffle_ind=True)
test_loader3 = data_loader(BATCH_SIZE=32,indices = test_data_indices3,y = test_y,shuffle_ind=True)
val_loader3 = data_loader(BATCH_SIZE=32,indices = val_data_indices3,y = val_y,shuffle_ind=False)
temp = output_model_test(emb_dim = 200,learning_rate = 0.045,num_epochs = 2,
                    id_token_dt = id2token3,train_loader_dt = train_loader3,
                    test_loader_dt = test_loader3,val_loader_dt=val_loader3)
temp

[200, 0.045, 2, 76.14, 69.88, 76.356]

In [143]:
#test_4_gram_lr = 0.045
train_data_indices3,test_data_indices3,val_data_indices3,id2token3= get_dt_indices_voca_size(all_quagrams_train,quagram_train,quagram_test,quagram_val,voca_size[0])
MAX_SENTENCE_LENGTH = 500
train_loader4 = data_loader(BATCH_SIZE=32,indices = train_data_indices4,y = train_y,shuffle_ind=True)
test_loader4 = data_loader(BATCH_SIZE=32,indices = test_data_indices4,y = test_y,shuffle_ind=True)
val_loader4 = data_loader(BATCH_SIZE=32,indices = val_data_indices4,y = val_y,shuffle_ind=False)
temp = output_model_test(emb_dim = 200,learning_rate = 0.045,num_epochs = 2,
                    id_token_dt = id2token4,train_loader_dt = train_loader4,
                    test_loader_dt = test_loader4,val_loader_dt=val_loader4)
temp

[200, 0.045, 2, 88.945, 74.64, 74.28]