In [157]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
import numpy as np 
from random import shuffle
from time import time 



# 1. Word Averaging Binary Classiﬁer (20 point)

## 1.1 Implementation and Experimentation (15 points) 

In [158]:
def dataset_preparation(data,max_sequence_len):
    corpus = data.split("\n")
    predictors = []
    labels = []
    mean_len = []
    max_sequence_len
    
    for line in corpus:
        sent, score = line.split("\t")
        labels.append(int(score))
        token_list = tokenizer.texts_to_sequences([sent])[0]
        mean_len.append(max_sequence_len/len(token_list))
        token_list = np.array(pad_sequences([token_list], maxlen=max_sequence_len, padding='post'))[0]
        predictors.append(torch.tensor(token_list,dtype=torch.int64))
    return predictors, labels, mean_len

def evaluate(model,predictors,labels,mean_len,mb_size):
    model.eval()
    with torch.no_grad():
            total_pred = 0
            total_correct = 0
            total_loss =0 
            sequences = [i for i in range(len(predictors))]
            last_size = len(sequences) % mb_size
            for mb in range(1+ len(sequences)//mb_size):
                if mb < len(sequences)//mb_size:
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.float).cuda()
                    mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.float).\
                                view(mb_size,1).repeat(1,d).cuda()
                else: 
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.float).cuda()
                    mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:]),dtype=torch.float).\
                                view(last_size,1).repeat(1,d).cuda()
                    mb_size = last_size 
                
                probs = model(batch.cuda(),mean_l)
                loss =  - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.000001).view(mb_size)))
                total_loss += loss.item()
                total_pred += mb_size 
                predict = list(map(lambda x : x >= 0.5,probs.view(mb_size).cpu().data.numpy() ))
                target = target.cpu().data.numpy() 
                for j in range(len(predict)):
                    if predict[j] == target[j]:
                        total_correct += 1     
    model.train()    
    return total_pred,total_correct,total_loss/total_pred


In [159]:
# build vocabulary and data preprocessing
tokenizer = Tokenizer(lower=False,filters='\t')
train = open('senti.train.tsv').read()
dev = open('senti.dev.tsv').read()
test = open('senti.test.tsv').read()
data = train + dev + test
corpus = data.split("\n")
max_sequence_len = 0 
corpus = list(map(lambda x: x[:-1],corpus))
for i in corpus: 
    max_sequence_len = max(max_sequence_len, len(re.split(' |\t',i))-1)
    
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)

predictors, labels,mean_len = dataset_preparation(train,max_sequence_len)
d_predictors, d_labels, d_mean_len = dataset_preparation(dev,max_sequence_len)
t_predictors, t_labels, t_mean_len = dataset_preparation(test,max_sequence_len)

In [89]:
# model implementation and training
from torch.nn.utils import clip_grad_norm_
class WordAvg(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(WordAvg, self).__init__()
        self.embeddings = nn.Embedding(vocab_size+1, embedding_dim,padding_idx=0)
        self.embeddings.weight.data.uniform_(-1, 1)
        self.embeddings.weight.data[0] =  torch.zeros_like(self.embeddings.weight.data[0])
        self.linear1 = nn.Linear(embedding_dim, 1,bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs,mean_l):
        embeds = self.embeddings(inputs)
        out = torch.mean(embeds,dim=1)/mb_size
        out = torch.mul(out, mean_l)
        out = self.linear1(out)
        probs = self.sigmoid(out)
        return probs

d =100
model = WordAvg(total_words, d)
optimizer = optim.Adam(model.parameters(), lr = 0.001)
model = model.cuda()
 
a = time()

for epoch in range(10):
    mb_size = 32
    sequences = [i for i in range(len(predictors))]
    shuffle(sequences)
    last_size = len(sequences) % mb_size
    for mb in range(1+len(sequences)//mb_size):
        if mb < len(sequences)//mb_size:
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.float).cuda()
                    mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.float).\
                                view(mb_size,1).repeat(1,d).cuda()
        else: 
            if last_size == 0:
                break
            batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
            target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.double).cuda()
            mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:]),dtype=torch.double).\
                        view(last_size,1).repeat(1,d).cuda()
            mb_size = last_size
        
        model.zero_grad()
        probs = model(batch.cuda(),mean_l)
        
        loss = - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.000001).view(mb_size)))
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.05)
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), "./model_earlystop_" + str(epoch))    
    total_pred,total_correct,avg_loss= evaluate(model,predictors, labels,mean_len,mb_size)
    print("epoch: {}".format(epoch))
    print("total prediction: {}, train_accuracy: {}, train avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
    total_pred,total_correct,avg_loss= evaluate(model,d_predictors, d_labels,d_mean_len,mb_size)
    print("total prediction: {}, validation_accuracy: {}, validation avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
print(time()-a)

epoch: 0
total prediction: 872, train_accuracy: 0.8061926605504587, train avg_loss: 0.42769300992335746
total prediction: 872, validation_accuracy: 0.6938073394495413, validation avg_loss: 0.5665919075318433
epoch: 1
total prediction: 872, train_accuracy: 0.6938073394495413, train avg_loss: 0.5665919075318433
total prediction: 872, validation_accuracy: 0.7958715596330275, validation avg_loss: 0.4552579795548675
epoch: 2
total prediction: 872, train_accuracy: 0.7958715596330275, train avg_loss: 0.4552579795548675
total prediction: 872, validation_accuracy: 0.8107798165137615, validation avg_loss: 0.4274528190630292
epoch: 3
total prediction: 872, train_accuracy: 0.8107798165137615, train avg_loss: 0.4274528190630292
total prediction: 872, validation_accuracy: 0.8142201834862385, validation avg_loss: 0.4275120947885951
epoch: 4
total prediction: 872, train_accuracy: 0.8142201834862385, train avg_loss: 0.4275120947885951
total prediction: 872, validation_accuracy: 0.8176605504587156, vali

In [38]:
# test set evaluation
model = WordAvg(total_words, d)
model.cuda()
model.load_state_dict(torch.load("./model_earlystop_7"))
total_pred,total_correct,avg_loss= evaluate(model,t_predictors, t_labels,t_mean_len,mb_size)
print("dev_accuracy: {}, dev avg_loss: {}".format(0.8153669724770642,0.486369001209189))
print("test_accuracy: {}, test avg_loss: {}".format(total_correct/total_pred,avg_loss))

dev_accuracy: 0.8153669724770642, dev avg_loss: 0.486369001209189
test_accuracy: 0.8039538714991763, test avg_loss: 0.44997723478330875


## 1.2 Analysis (5 points)

In [92]:
weights = []
for param in model.parameters():
    weights.append(param.data)
embed_weights = weights[0]
l2_norms = torch.norm(embed_weights,dim=1).cpu().data.numpy()
sorted_norm = sorted(list(zip(range(len(l2_norms)),l2_norms)),key=lambda x: x[1])
large =  sorted_norm[-15:]
small =  sorted_norm[0:16] # account for 0

print("------largest 15 words with norms-----")
for i in range(15):
    print("{}: {}".format(tokenizer.index_word[large[14-i][0]],large[14-i][1]))
print("")
print("------smallest 15 words with norms------")
for i in range(16):
    if small[i][0] != 0:
        print("{}: {}".format(tokenizer.index_word[small[i][0]],small[i][1]))

------largest 15 words with norms-----
worst: 26.194583892822266
unfunny: 25.550601959228516
poorly: 25.409120559692383
wonderful: 25.06730842590332
suffers: 25.066425323486328
devoid: 24.72293472290039
touching: 24.331138610839844
remarkable: 23.52764892578125
mess: 23.515443801879883
playful: 23.075441360473633
heartwarming: 23.02166175842285
terrific: 23.005342483520508
unnecessary: 22.89522361755371
flat: 22.829540252685547
badly: 22.794445037841797

------smallest 15 words with norms------
Yimou: 4.877057075500488
combination: 4.896606922149658
Crimen: 4.912820816040039
unplundered: 4.942306041717529
Randall: 4.950387477874756
Mixes: 4.953327178955078
theater: 4.958885669708252
arguable: 4.9900031089782715
caring: 4.990123271942139
medicine: 4.997435569763184
lie: 5.021319389343262
spouse: 5.029397964477539
brassy: 5.034116268157959
Birot: 5.051749229431152
oatmeal: 5.053561687469482


In [107]:
#torch.save(model.state_dict(), "./model1")
ab = torch.load( "./model1")

# 2. Attention-Weighted Word Averaging (20 points)


## 2.1. Implementation and Experimentation (10 points) 

In [39]:
# model implementation and training
def masked_softmax(A, dim=1, epsilon=1e-5):
    A_exp = torch.exp(A)
    A_exp = A_exp * (A != 0).type(torch.FloatTensor).cuda() # this step masks
    A_softmax = A_exp / torch.sum(A_exp,dim,keepdim=True)
    return A_softmax

class Attention(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Attention, self).__init__()
        self.embeddings = nn.Embedding(vocab_size+1, embedding_dim,padding_idx=0)
        self.embeddings.weight.data.uniform_(-0.1, 0.1)
        self.embeddings.weight.data[0] =  torch.zeros_like(self.embeddings.weight.data[0])
        
        self.u = torch.nn.Parameter(torch.randn(embedding_dim).view(1,1,embedding_dim).cuda())
        self.linear1 = nn.Linear(embedding_dim, 1,bias=False)
        self.sigmoid = nn.Sigmoid()


    def forward(self, inputs,mb_size):
        embeds = self.embeddings(inputs)                
        att =torch.nn.functional.cosine_similarity(self.u,embeds,dim=2)
        att = masked_softmax(att,dim=1)
        att =  torch.unsqueeze(att, 2)
        att = att.repeat(1,1,100)
        out = torch.mul(embeds, att)
        out = torch.sum(out,dim=1)
        out = self.linear1(out)
        probs = self.sigmoid(out)
        return probs


def evaluate2(model,predictors,labels,mb_size,max_sequence_len):
    model.eval()
    with torch.no_grad():
            total_pred = 0
            total_correct = 0
            total_loss =0 
            sequences = [i for i in range(len(predictors))]
            last_size = len(sequences) % mb_size
            for mb in range(1+ len(sequences)//mb_size):
                if mb < len(sequences)//mb_size:
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.float).cuda()    
                else: 
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.float).cuda()
                    mb_size = last_size 
                probs = model(batch.cuda(),max_sequence_len)
                loss =  - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.000001).view(mb_size)))
                total_loss += loss.item()
                total_pred += mb_size 
                predict = list(map(lambda x : x >= 0.5,probs.view(mb_size).cpu().data.numpy() ))
                target = target.cpu().data.numpy() 
                for j in range(len(predict)):
                    if predict[j] == target[j]:
                        total_correct += 1     
    model.train()    
    return total_pred,total_correct,total_loss/total_pred
    
d =100
mb_size = 100
model2 = Attention(total_words, d)
optimizer = optim.Adam(model2.parameters(), lr = 0.0003)
model2 = model2.cuda()
 
a = time()

for epoch in range(10):
    mb_size = 32
    sequences = [i for i in range(len(predictors))]
    shuffle(sequences)
    last_size = len(sequences) % mb_size
    for mb in range(1+len(sequences)//mb_size):
        if mb < len(sequences)//mb_size:
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.float).cuda()
        else: 
            if last_size == 0:
                break
            batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
            target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.float).cuda()
            mb_size = last_size
        model2.zero_grad()
        probs = model2(batch.cuda(),max_sequence_len)
        loss = - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.00001).view(mb_size)))
        loss.backward()
        optimizer.step()
    torch.save(model2.state_dict(), "./model_earlystop_" + str(epoch))    
    total_pred,total_correct,avg_loss= evaluate2(model2,predictors, labels,mb_size,max_sequence_len)
    print("epoch: {}".format(epoch))
    print("total prediction: {}, train_accuracy: {}, train avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
    total_pred,total_correct,avg_loss= evaluate2(model2,d_predictors, d_labels,mb_size,max_sequence_len)
    print("total prediction: {}, validation_accuracy: {}, validation avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
print(time()-a)

epoch: 0
total prediction: 67349, train_accuracy: 0.8854474453963682, train avg_loss: 0.3273802132777774
total prediction: 872, validation_accuracy: 0.7958715596330275, validation avg_loss: 0.4668108049882661
epoch: 1
total prediction: 67349, train_accuracy: 0.9246313976451024, train avg_loss: 0.2125090480989779
total prediction: 872, validation_accuracy: 0.8256880733944955, validation avg_loss: 0.41458561278264455
epoch: 2
total prediction: 67349, train_accuracy: 0.9403109177567596, train avg_loss: 0.16834892617262087
total prediction: 872, validation_accuracy: 0.8222477064220184, validation avg_loss: 0.4094048216255433
epoch: 3
total prediction: 67349, train_accuracy: 0.9481358297821794, train avg_loss: 0.14370701670992708
total prediction: 872, validation_accuracy: 0.8211009174311926, validation avg_loss: 0.4197514059893582
epoch: 4
total prediction: 67349, train_accuracy: 0.9536147530030141, train avg_loss: 0.12745158337327117
total prediction: 872, validation_accuracy: 0.810779816

In [47]:
# test set evaluation
model2 = Attention(total_words, d)
model2.cuda()
model2.load_state_dict(torch.load("./model_earlystop_2"))
total_pred,total_correct,avg_loss= evaluate2(model2,t_predictors, t_labels,mb_size,max_sequence_len)
print("dev_accuracy: {}, dev avg_loss: {}".format(0.8256880733944955,0.41458561278264455))
print("test_accuracy: {}, test avg_loss: {}".format(total_correct/total_pred,avg_loss))

dev_accuracy: 0.8256880733944955, dev avg_loss: 0.41458561278264455
test_accuracy: 0.8121911037891268, test avg_loss: 0.4002359582460038


## 2.2. Analysis: Word Embeddings and the Attention Vector (5 points) 

In [68]:
weights = []
for param in model2.parameters():
    weights.append(param.data)
embed_weights = weights[1]
u = weights[0].view(1,100)
cos_similarity =torch.nn.functional.cosine_similarity(embed_weights,u,dim=1).cpu().data.numpy()
sorted_similarity = sorted(list(zip(range(len(cos_similarity)),cos_similarity)),key=lambda x: x[1])
large =  sorted_similarity[-15:]
small =  sorted_similarity[0:15] # account for 0

print("------largest 15 words with similarity-----")
for i in range(15):
    print("{}: {}".format(tokenizer.index_word[large[14-i][0]],large[14-i][1]))
print("")
print("------smallest 15 words with similarity------")
for i in range(15):
    if small[i][0] != 0:
        print("{}: {}".format(tokenizer.index_word[small[i][0]],small[i][1]))

------largest 15 words with similarity-----
never: 0.7919396758079529
bad: 0.779381513595581
not: 0.7775062322616577
wrong: 0.7515763640403748
less: 0.6921879649162292
n't: 0.686932384967804
drag: 0.6524785161018372
missed: 0.6430465579032898
inadvertent: 0.6347647309303284
no: 0.6342142224311829
nor: 0.6315809488296509
falls: 0.6269453167915344
too: 0.6262093186378479
loud: 0.6106578707695007
none: 0.6090220212936401

------smallest 15 words with similarity------
,: -0.9990133047103882
that: -0.9986844062805176
a: -0.9979445338249207
it: -0.9977834224700928
's: -0.9958602786064148
and: -0.9949108362197876
to: -0.9870907664299011
the: -0.9857791662216187
is: -0.9839534163475037
The: -0.9826574325561523
--: -0.9804767966270447
all: -0.9666380882263184
.: -0.9649766087532043
this: -0.9649695754051208
in: -0.9637553095817566


## 2.3. Analysis: Variance of Attentions (5 points)

In [240]:
cos = torch.nn.functional.cosine_similarity(embed_weights,u,dim=1)
predictors2 = torch.stack(predictors).cuda()
att_weights = torch.where(predictors2 != 0 ,cos[predictors2], torch.zeros_like(predictors2).float().cuda())
att_weights = masked_softmax(att_weights,dim=1)
pred = predictors2.cpu().numpy()
att = att_weights.cpu().numpy()

att_dict = {}
for i in range(pred.shape[0]):
    for j in range(pred.shape[1]):
        if pred[i][j] in att_dict: 
            att_dict[pred[i][j]].append(att[i][j])
        else: 
            att_dict[pred[i][j]] = [att[i][j]]  
att_dict.pop(0,None)
att_list = []
for key, val in att_dict.items():
    if len(val) >= 100:
        val2 = np.array(val)
        att_list.append((key,val2.std()/val2.mean()))
    
sorted_att = sorted(att_list,key=lambda x: x[1])
large =  sorted_att[-30:]

print("------largest 30 words with Variance -----")
for i in range(30):
    print("{}: {}".format(tokenizer.index_word[large[29-i][0]],large[29-i][1]))


------largest 30 words with Variance -----
create: 1.4647003412246704
Like: 1.4148080348968506
hero: 1.347995638847351
seat: 1.3016000986099243
New: 1.2686944007873535
quality: 1.2547513246536255
All: 1.2311038970947266
kind: 1.2117339372634888
interest: 1.192842960357666
give: 1.1914682388305664
talent: 1.1802623271942139
war: 1.1794241666793823
written: 1.178371548652649
comes: 1.1640310287475586
children: 1.1596442461013794
sentimental: 1.1452240943908691
full: 1.1182912588119507
personal: 1.1128511428833008
remains: 1.1107527017593384
us: 1.1040270328521729
man: 1.1016396284103394
fans: 1.0930447578430176
lives: 1.0926544666290283
imagination: 1.0903058052062988
everyone: 1.0865960121154785
-RRB-: 1.0865715742111206
theater: 1.0792664289474487
visually: 1.0787477493286133
moment: 1.0761202573776245
classic: 1.072454571723938


# 3. Simple Self-Attention (15 points)

## 3.1. Implementation and Experimentation 

### Self Attention without Mean

In [9]:
# model implementation and training
def masked_softmax(A, dim=1, epsilon=1e-5):
    B = (A != 0).type(torch.DoubleTensor).cuda()
    s = A.size()[0]
    A_exp = torch.exp(A)
    A_exp = A_exp * B # this step masks
    A_softmax = A_exp / (torch.sum(A_exp,dim,keepdim=True)+epsilon)
    
    return A_softmax

class Self_Attention(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Self_Attention, self).__init__()
        self.embeddings = nn.Embedding(vocab_size+1, embedding_dim,padding_idx=0).double()
        self.embeddings.weight.data.uniform_(-0.05, 0.05)
        self.embeddings.weight.data[0] =  torch.zeros_like(self.embeddings.weight.data[0])
        self.linear1 = nn.Linear(embedding_dim, 1,bias=False).double()
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs,mb_size):
        embeds = self.embeddings(inputs)
        A = torch.transpose(embeds,1,2)
        A = torch.matmul(embeds, A)
        att = torch.sum(A,dim=1)
        att = masked_softmax(att,dim=1)

        att =  torch.unsqueeze(att, 2)
        att = att.repeat(1,1,100)
        out = torch.mul(embeds, att)
        out = torch.sum(out,dim=1)
        out = self.linear1(out)
        probs = self.sigmoid(out)
        return probs


def evaluate2(model,predictors,labels,mb_size,max_sequence_len):
    model.eval()
    with torch.no_grad():
            total_pred = 0
            total_correct = 0
            total_loss =0 
            sequences = [i for i in range(len(predictors))]
            last_size = len(sequences) % mb_size
            for mb in range(1+ len(sequences)//mb_size):
                if mb < len(sequences)//mb_size:
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).cuda()    
                else: 
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.double).cuda()
                    mb_size = last_size 
                probs = model(batch.cuda(),max_sequence_len)
                loss =  - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.0001).view(mb_size)))
                total_loss += loss.item()
                total_pred += mb_size 
                predict = list(map(lambda x : x >= 0.5,probs.view(mb_size).cpu().data.numpy()))
                target = target.cpu().data.numpy() 
                for j in range(len(predict)):
                    if predict[j] == target[j]:
                        total_correct += 1     
    model.train()    
    return total_pred,total_correct,total_loss/total_pred
    
d =100

model3 = Self_Attention(total_words, d)
optimizer = optim.Adam(model3.parameters(), lr = 0.0003)
model3 = model3.cuda()
 
a = time()

for epoch in range(10):
    mb_size = 32
    sequences = [i for i in range(len(predictors))]
    shuffle(sequences)
    last_size = len(sequences) % mb_size
    for mb in range(1+len(sequences)//mb_size):
        if mb < len(sequences)//mb_size:
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).cuda()
        else: 
            if last_size == 0:
                break
            batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
            target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.double).cuda()
            mb_size = last_size
        model3.zero_grad()
        probs = model3(batch.cuda(),max_sequence_len)
        loss = - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.00001).view(mb_size)))
        loss.backward()
        optimizer.step()
    torch.save(model3.state_dict(), "./model_earlystop_" + str(epoch))    
    mb_size = 100
    total_pred,total_correct,avg_loss= evaluate2(model3,predictors, labels,mb_size,max_sequence_len)
    print("epoch: {}".format(epoch))
    print("total prediction: {}, train_accuracy: {}, train avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
    total_pred,total_correct,avg_loss= evaluate2(model3,d_predictors, d_labels,mb_size,max_sequence_len)
    print("total prediction: {}, validation_accuracy: {}, validation avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
print(time()-a)

epoch: 0
total prediction: 67349, train_accuracy: 0.8930644849960653, train avg_loss: 0.28914963394176807
total prediction: 872, validation_accuracy: 0.8038990825688074, validation avg_loss: 0.46741994397568143
epoch: 1
total prediction: 67349, train_accuracy: 0.9276752438788994, train avg_loss: 0.19371040116303223
total prediction: 872, validation_accuracy: 0.8096330275229358, validation avg_loss: 0.5056266776019307
epoch: 2
total prediction: 67349, train_accuracy: 0.9414987601894609, train avg_loss: 0.1528761970414319
total prediction: 872, validation_accuracy: 0.801605504587156, validation avg_loss: 0.5708180678734782
epoch: 3
total prediction: 67349, train_accuracy: 0.9514766366241518, train avg_loss: 0.12894692133284658
total prediction: 872, validation_accuracy: 0.8084862385321101, validation avg_loss: 0.6132693307177669
epoch: 4
total prediction: 67349, train_accuracy: 0.956925863784169, train avg_loss: 0.11297883763044998
total prediction: 872, validation_accuracy: 0.8107798165

In [12]:
# test set evaluation
model3 = Self_Attention(total_words, d)
model3.cuda()
model3.load_state_dict(torch.load("./model_earlystop_4"))
total_pred,total_correct,avg_loss= evaluate2(model3,t_predictors, t_labels,mb_size,max_sequence_len)
print("dev_accuracy: {}, dev avg_loss: {}".format(0.8107798165137615,0.6930137940848276))
print("test_accuracy: {}, test avg_loss: {}".format(total_correct/total_pred,avg_loss))


dev_accuracy: 0.8107798165137615, dev avg_loss: 0.6930137940848276
test_accuracy: 0.8034047226798462, test avg_loss: 0.619780004929865


### Residual Connection

In [16]:
# model implementation and training
def masked_softmax(A, dim=1, epsilon=1e-5):
    B = (A != 0).type(torch.DoubleTensor).cuda()
    A_exp = torch.exp(A)
    A_exp = A_exp * B # this step masks
    A_softmax = A_exp / (torch.sum(A_exp,dim,keepdim=True)+epsilon)
    return A_softmax

class Residual_Connection(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Residual_Connection, self).__init__()
        self.embeddings = nn.Embedding(vocab_size+1, embedding_dim,padding_idx=0).double()
        self.embeddings.weight.data.uniform_(-0.1, 0.1)
        self.embeddings.weight.data[0] =  torch.zeros_like(self.embeddings.weight.data[0])
        self.linear1 = nn.Linear(embedding_dim, 1,bias=False).double()
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs,mean_l):
        embeds = self.embeddings(inputs)
        avg = torch.mean(embeds,dim=1)
        avg = torch.mul(avg, mean_l)
        A = torch.matmul(embeds, torch.transpose(embeds,1,2))
        att = torch.sum(A,dim=1) 
        att = masked_softmax(att,dim=1)
        att =  torch.unsqueeze(att, 2)
        att = att.repeat(1,1,100)
        out = torch.mul(embeds, att)
        out = torch.sum(out,dim=1)
        out = out + avg
        out = self.linear1(out)
        probs = self.sigmoid(out)
        return probs


def evaluate3(model,predictors,labels,mb_size,mean_len,max_sequence_len):
    model.eval()
    with torch.no_grad():
            total_pred = 0
            total_correct = 0
            total_loss =0 
            sequences = [i for i in range(len(predictors))]
            last_size = len(sequences) % mb_size
            for mb in range(1+len(sequences)//mb_size):
                
                if mb < len(sequences)//mb_size:
                            batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                            target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).cuda()
                            mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).\
                                        view(mb_size,1).repeat(1,d).cuda()
                else: 
                    if last_size == 0:
                        break
                    
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.double).cuda()
                    mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:]),dtype=torch.double).\
                                view(last_size,1).repeat(1,d).cuda()
                    mb_size = last_size
                probs = model(batch.cuda(),mean_l)
                loss =  - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.0001).view(mb_size)))
                total_loss += loss.item()
                total_pred += mb_size 
                predict = list(map(lambda x : x >= 0.5,probs.view(mb_size).cpu().data.numpy() ))
                target = target.cpu().data.numpy() 
                for j in range(len(predict)):
                    if predict[j] == target[j]:
                        total_correct += 1     
    model.train()    
    return total_pred,total_correct,total_loss/total_pred
    
d =100

model4 = Residual_Connection(total_words, d)
optimizer = optim.Adam(model4.parameters(), lr = 0.0003)
model4 = model4.cuda()
 
a = time()

for epoch in range(10):
    mb_size = 100
    sequences = [i for i in range(len(predictors))]
    shuffle(sequences)
    last_size = len(sequences) % mb_size
    for mb in range(1+len(sequences)//mb_size):
        if mb < len(sequences)//mb_size:
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).cuda()
                    mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).\
                                view(mb_size,1).repeat(1,d).cuda()
        else: 
            if last_size == 0:
                break
            batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
            target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.double).cuda()
            mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:]),dtype=torch.double).\
                        view(last_size,1).repeat(1,d).cuda()
            mb_size = last_size
        model4.zero_grad()
        probs = model4(batch.cuda(),mean_l)
        loss = - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.00001).view(mb_size)))
        loss.backward()
        optimizer.step()
    mb_size = 100
    torch.save(model4.state_dict(), "./model_earlystop_" + str(epoch))    
    total_pred,total_correct,avg_loss= evaluate3(model4,predictors, labels,mb_size,mean_len,max_sequence_len)
    print("epoch: {}".format(epoch))
    print("total prediction: {}, train_accuracy: {}, train avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
    total_pred,total_correct,avg_loss= evaluate3(model4,d_predictors, d_labels,mb_size,d_mean_len,max_sequence_len)
    print("total prediction: {}, validation_accuracy: {}, validation avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
print(time()-a)

epoch: 0
total prediction: 67349, train_accuracy: 0.8553207916969814, train avg_loss: 0.4039127049165434
total prediction: 872, validation_accuracy: 0.783256880733945, validation avg_loss: 0.49383315046221965
epoch: 1
total prediction: 67349, train_accuracy: 0.9120996599801037, train avg_loss: 0.25111061599153783
total prediction: 872, validation_accuracy: 0.8119266055045872, validation avg_loss: 0.4474375131036764
epoch: 2
total prediction: 67349, train_accuracy: 0.9320108687582592, train avg_loss: 0.19218755941552232
total prediction: 872, validation_accuracy: 0.8084862385321101, validation avg_loss: 0.46405982689414643
epoch: 3
total prediction: 67349, train_accuracy: 0.9430874994431988, train avg_loss: 0.15905578937600126
total prediction: 872, validation_accuracy: 0.8107798165137615, validation avg_loss: 0.48975637376359554
epoch: 4
total prediction: 67349, train_accuracy: 0.9500957697961365, train avg_loss: 0.1370567823563549
total prediction: 872, validation_accuracy: 0.81192660

In [34]:
# test set evaluation
model4 = Residual_Connection(total_words, d)
model4.cuda()
model4.load_state_dict(torch.load("./model_earlystop_3"))
total_pred,total_correct,avg_loss= evaluate3(model4,t_predictors, t_labels,mb_size,t_mean_len,max_sequence_len)
print("dev_accuracy: {}, dev avg_loss: {}".format(0.8119266055045872,0.5219749716070143))
print("test_accuracy: {}, test avg_loss: {}".format(total_correct/total_pred,avg_loss))


dev_accuracy: 0.8119266055045872, dev avg_loss: 0.5219749716070143
test_accuracy: 0.8110928061504667, test avg_loss: 0.432737020717729


# 4. Enriching the Attention Function (15 points)

In [305]:
#I embedded position and sentence length, and concatenate them with the word embedding 

def masked_softmax(A, dim=1, epsilon=1e-5):
    B = (A != 0).type(torch.DoubleTensor).cuda()
    A_exp = torch.exp(A)
    A_exp = A_exp * B # this step masks
    A_softmax = A_exp / (torch.sum(A_exp,dim,keepdim=True)+epsilon)
    return A_softmax

class Enriched_Att(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Enriched_Att, self).__init__()
        self.embeddings = nn.Embedding(vocab_size+1, embedding_dim,padding_idx=0).double()
        self.embeddings.weight.data.uniform_(-0.1, 0.1)
        self.embeddings.weight.data[0] =  torch.zeros_like(self.embeddings.weight.data[0])
        
        self.embeddings2 = nn.Embedding(56, 40,padding_idx=0).double() # position
        self.embeddings3 = nn.Embedding(56, 10,padding_idx=0).double()# sent length
        
        self.embeddings2.weight.data.uniform_(-0.05, 0.05)
        self.embeddings3.weight.data.uniform_(-0.05, 0.05)
        
        self.linear1 = nn.Linear(embedding_dim//2 + embedding_dim, 1,bias=False).double()
        self.sigmoid = nn.Sigmoid()

        
    def forward(self, inputs,mean_l):
        
        pos = torch.tensor([[i for i in range(56)] for j in range(inputs.shape[0])],dtype=torch.long).cuda()
        pos_embeds =  self.embeddings2(pos)
        
        embeds = self.embeddings(inputs)
        B = (inputs != 0).type(torch.DoubleTensor).cuda()
        C = B.sum(dim=1).long()
        
        B = B.unsqueeze(2).repeat(1,1,50)
        C = C.unsqueeze(1).repeat(1,56)
        
        len_embeds = self.embeddings3(C)
        
        pos_embeds = torch.cat((pos_embeds,len_embeds),2)
        pos_embeds = B * pos_embeds
        embeds = torch.cat((embeds,pos_embeds),2)
        avg = torch.mean(embeds,dim=1)
        avg = torch.mul(avg, mean_l)
        A = torch.matmul(embeds, torch.transpose(embeds,1,2))
        self_att = torch.sum(A,dim=1)         
        att = self_att  
        att = masked_softmax(att,dim=1)
        att =  torch.unsqueeze(att, 2)
        att = att.repeat(1,1,150)
        
        out = torch.mul(embeds, att)
        out = torch.sum(out,dim=1)
        out = out + avg
        out = self.linear1(out)
        
        probs = self.sigmoid(out)
        return probs


def evaluate3(model,predictors,labels,mb_size,mean_len,max_sequence_len):
    model.eval()
    with torch.no_grad():
            total_pred = 0
            total_correct = 0
            total_loss =0 
            sequences = [i for i in range(len(predictors))]
            last_size = len(sequences) % mb_size
            for mb in range(1+len(sequences)//mb_size):
                
                if mb < len(sequences)//mb_size:
                            batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                            target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).cuda()
                            mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).\
                                        view(mb_size,1).repeat(1,150).cuda()
                else: 
                    if last_size == 0:
                        break
                    
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.double).cuda()
                    mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:]),dtype=torch.double).\
                                view(last_size,1).repeat(1,150).cuda()
                    mb_size = last_size
                probs = model(batch.cuda(),mean_l)
                loss =  - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.0001).view(mb_size)))
                total_loss += loss.item()
                total_pred += mb_size 
                predict = list(map(lambda x : x >= 0.5,probs.view(mb_size).cpu().data.numpy() ))
                target = target.cpu().data.numpy() 
                for j in range(len(predict)):
                    if predict[j] == target[j]:
                        total_correct += 1     
    model.train()    
    return total_pred,total_correct,total_loss/total_pred
    
d =100

model4 = Enriched_Att(total_words, d)
optimizer = optim.Adam(model4.parameters(), lr = 0.0003)
model4 = model4.cuda()

a = time()

for epoch in range(10):
    mb_size = 50
    sequences = [i for i in range(len(predictors))]
    shuffle(sequences)
    last_size = len(sequences) % mb_size
    for mb in range(1+len(sequences)//mb_size):
        #print(mb)
        if mb < len(sequences)//mb_size:
                    batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:(mb+1)*mb_size])).cuda()
                    target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).cuda()
                    mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:(mb+1)*mb_size]),dtype=torch.double).\
                                view(mb_size,1).repeat(1,150).cuda()
        else: 
            if last_size == 0:
                break
            batch = torch.stack(list(predictors[i] for i in sequences[mb*mb_size:])).cuda()
            target = torch.tensor(list(labels[i] for i in sequences[mb*mb_size:]),dtype=torch.double).cuda()
            mean_l = torch.tensor(list(mean_len[i] for i in sequences[mb*mb_size:]),dtype=torch.double).\
                        view(last_size,1).repeat(1,150).cuda()
            mb_size = last_size
        model4.zero_grad()
        probs = model4(batch.cuda(),mean_l)
        loss = - torch.sum(target*torch.log(probs).view(mb_size)) - torch.sum(((1-target)*torch.log(1-probs+0.00001).view(mb_size)))
        loss.backward()
        optimizer.step()
    mb_size = 100
    torch.save(model4.state_dict(), "./model_earlystop_" + str(epoch))    
    total_pred,total_correct,avg_loss= evaluate3(model4,predictors, labels,mb_size,mean_len,max_sequence_len)
    print("epoch: {}".format(epoch))
    print("total prediction: {}, train_accuracy: {}, train avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
    total_pred,total_correct,avg_loss= evaluate3(model4,d_predictors, d_labels,mb_size,d_mean_len,max_sequence_len)
    print("total prediction: {}, validation_accuracy: {}, validation avg_loss: {}".format(total_pred,total_correct/total_pred,avg_loss))
print(time()-a)

epoch: 0
total prediction: 67349, train_accuracy: 0.8789588561077373, train avg_loss: 0.33624531412840963
total prediction: 872, validation_accuracy: 0.7901376146788991, validation avg_loss: 0.46895605635812876
epoch: 1
total prediction: 67349, train_accuracy: 0.9246759417363287, train avg_loss: 0.2092989992390989
total prediction: 872, validation_accuracy: 0.8027522935779816, validation avg_loss: 0.47445440058949256
epoch: 2
total prediction: 67349, train_accuracy: 0.943265675808104, train avg_loss: 0.1573031131694194
total prediction: 872, validation_accuracy: 0.8038990825688074, validation avg_loss: 0.5132261945575513
epoch: 3
total prediction: 67349, train_accuracy: 0.953555360881379, train avg_loss: 0.12787920629052996
total prediction: 872, validation_accuracy: 0.8119266055045872, validation avg_loss: 0.546792625309283
epoch: 4
total prediction: 67349, train_accuracy: 0.9601775824436889, train avg_loss: 0.10836914784203588
total prediction: 872, validation_accuracy: 0.81651376146

In [320]:
# test set evaluation
model4 = Enriched_Att(total_words, d)
model4.cuda()
model4.load_state_dict(torch.load("./model_earlystop_5"))
total_pred,total_correct,avg_loss= evaluate3(model4,t_predictors, t_labels,mb_size,t_mean_len,max_sequence_len)
print("dev_accuracy: {}, dev avg_loss: {}".format(0.8176605504587156,0.6364916900036274))
print("test_accuracy: {}, test avg_loss: {}".format(total_correct/total_pred,avg_loss))


dev_accuracy: 0.8119266055045872, dev avg_loss: 0.5219749716070143
test_accuracy: 0.8215266337177375, test avg_loss: 0.4196136991457575
