# Assignment 2
You will tackle with a sentiment classification task using LSTM model and attention mechanism in this assigment.

# Dependencies
Please make sure that you are using **GPU** to accelarate computation.

Colab FAQ: https://research.google.com/colaboratory/faq.html

## Import dependencies

In [255]:
import torch
import os
import collections
from torch import nn,optim
from torch.utils.data import TensorDataset, DataLoader,Dataset
import torch.nn.functional as F
from torch.utils.data.dataset import T_co
from tqdm import tqdm
import math
import random
import numpy as np


In [256]:
# Set up your device 
cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda else "cpu")
print('Using {} device'.format(device))
# The assertion is to make sure GPU is available
assert cuda == True

Using cuda:0 device


In [257]:
# Set up random seed to 1008. Do not change the random seed.
# Yes, these are all necessary when you run experiments!
seed = 1008
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


## Data
The script below will download the required sentiment analysis data.

Data folder will be visible in the Colab file-explorer pane, which is loacted at left side of the page.


In [258]:
# !wget --no-check-certificate "https://docs.google.com/uc?export=download&id=1jqYJ9jhjukhXvEk4GnMAPYE-SvhSG24i" -O data.zip
# !unzip data.zip

## Corpus
Glove will be used as the word embedding tool in this assigment.

In [259]:
# !wget https://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip

# Preprocess
Preprocess data, then construct dataloader and vocabulary.

## Load Glove pretrained word embedding.

In [260]:
# TODO
vocab,embeddings = [],[]
with open('glove.6B.50d.txt','rt',encoding="utf-8") as fi:
    full_content = fi.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    vocab.append(i_word)
    embeddings.append(i_embeddings)

In [261]:
vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)

#insert '<pad>' and '<unk>' tokens at start of vocab_npa.
vocab_npa = np.insert(vocab_npa, 0, '<pad>')
vocab_npa = np.insert(vocab_npa, 1, '<unk>')
print(vocab_npa[:10])


pad_emb_npa = np.zeros((1,embs_npa.shape[1]))   #embedding for '<pad>' token.
unk_emb_npa = np.mean(embs_npa,axis=0,keepdims=True)    #embedding for '<unk>' token.

#insert embeddings for pad and unk tokens at top of embs_npa.
embs_npa = np.vstack((pad_emb_npa,unk_emb_npa,embs_npa))
print(embs_npa.shape)

['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']
(400002, 50)


In [262]:
my_embedding_layer = torch.nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float())

assert my_embedding_layer.weight.shape == embs_npa.shape
print(my_embedding_layer.weight.shape)

torch.Size([400002, 50])


## Construct your own vocabulary without other corpus.
Hint: You should construct a vocabulary to map the word to index.

In [263]:
# TODO
wordmap={}

idx=0
for item in vocab_npa:
    wordmap[item]=idx
    idx+=1

showw=0
for it in wordmap.items():
    print(it)
    showw+=1
    if showw>=30:
        break

('<pad>', 0)
('<unk>', 1)
('the', 2)
(',', 3)
('.', 4)
('of', 5)
('to', 6)
('and', 7)
('in', 8)
('a', 9)
('"', 10)
("'s", 11)
('for', 12)
('-', 13)
('that', 14)
('on', 15)
('is', 16)
('was', 17)
('said', 18)
('with', 19)
('he', 20)
('as', 21)
('it', 22)
('by', 23)
('at', 24)
('(', 25)
(')', 26)
('from', 27)
('his', 28)
("''", 29)


## Load data
Load data and construct dataloader.

In [264]:
# TODO

import torch
from torch import nn

data_dir = 'sentiment'
trainTextPath=data_dir+os.sep+"train_text.txt"
trainLabelPath=data_dir+os.sep+"train_labels.txt"

testTextPath=data_dir+os.sep+"test_text.txt"
testLabelPath=data_dir+os.sep+"test_labels.txt"

valTextPath=data_dir+os.sep+"val_text.txt"
valLabelPath=data_dir+os.sep+"val_labels.txt"

maxSentenceLength=0
with open(trainTextPath, "r", encoding="utf-8") as f:
    sentences = f.read().strip().split("\n")
    for sentence in sentences:
        sentence = sentence.split()
        if maxSentenceLength < len(sentence):
            maxSentenceLength = len(sentence)

with open(testTextPath, "r", encoding="utf-8") as f:
    sentences = f.read().strip().split("\n")
    for sentence in sentences:
        sentence = sentence.split()
        if maxSentenceLength < len(sentence):
            maxSentenceLength = len(sentence)

with open(valTextPath, "r", encoding="utf-8") as f:
    sentences = f.read().strip().split("\n")
    for sentence in sentences:
        sentence = sentence.split()
        if maxSentenceLength < len(sentence):
            maxSentenceLength = len(sentence)


maxSentenceLength+=1
print(maxSentenceLength)

padList=torch.zeros(maxSentenceLength,dtype=int)

class myDataset(Dataset):
    def __init__(self,textpath,labelpath):
        self.vecList=[]
        with open(textpath,mode="r",encoding="utf-8") as f:
            tempList=f.read().lower().strip().split("\n")
            # self.textList=tempList[:]
            for line in tempList:
                addList=padList.clone().detach()
                line=line.split()
                # print("length",len(line))
                for indx, itm in enumerate(line):
                    if itm in wordmap:
                        addList[indx]=wordmap[itm]
                    else:
                        addList[indx]=wordmap["<unk>"]
                self.vecList.append(addList.clone().detach())


        with open(labelpath,mode="r",encoding="utf-8") as f:
            self.labelList=f.read().strip().split("\n")
            self.labelList=torch.tensor([int(i) for i in self.labelList])

        if len(self.vecList)==len(self.labelList):
            print("myDataset from",textpath,"is created,length is",len(self.vecList))
        else:
            print("length different error")
    def __getitem__(self, index):
        # print("----------\n",index,self.vecList[index],"\n----------")
        return self.vecList[index],self.labelList[index]

    def __len__(self):
        return len(self.vecList)


36


In [361]:
trainDataset=myDataset(trainTextPath,trainLabelPath)
valDataset=myDataset(valTextPath,valLabelPath)
testDataset=myDataset(testTextPath,testLabelPath)

trainDataLoader=DataLoader(trainDataset,batch_size=128,shuffle=True)
valDataLoader=DataLoader(valDataset,batch_size=128,shuffle=False)
testDataLoader=DataLoader(testDataset,batch_size=128,shuffle=False)


myDataset from sentiment\train_text.txt is created,length is 45615
myDataset from sentiment\val_text.txt is created,length is 2000
myDataset from sentiment\test_text.txt is created,length is 12284


## Model Zoo

In [439]:
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, pretrained_embedding=None, **kwargs):
        super(BiRNN, self).__init__()
        if pretrained_embedding is None:
            self.embedding = nn.Embedding(vocab_size, embed_size,device=device)
        else:
            self.embedding= nn.Embedding.from_pretrained(torch.tensor(pretrained_embedding.clone().detach(), dtype=torch.float).clone().detach(), freeze=True).to(device)
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers, bidirectional=True, batch_first=True).to(device)
        self.decoder = nn.Sequential(nn.Linear(4 * num_hiddens, num_hiddens).to(device),
                                     nn.Linear(num_hiddens, 3)).to(device)
        self.to(device)
        print("init")

    def forward(self, inputs):
        # print("forward")

        inputs=inputs.to(device)
        self.to(device)
        self.embedding.to(device)

        embeddingOut = self.embedding(inputs)

        self.encoder.flatten_parameters()
        outputs, _ = self.encoder(embeddingOut)
        encoding = torch.cat((outputs[:,0,:], outputs[:,-1,:]), dim=1)
        outs = self.decoder(encoding)
        softmax_fun=nn.Softmax(dim=1)
        outs=softmax_fun(outs)
        return outs

In [440]:
class BiRNN_attention(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, pretrained_embedding=None, **kwargs):
        super(BiRNN_attention, self).__init__()
        if pretrained_embedding is None:
            self.embedding = nn.Embedding(vocab_size, embed_size)
        else:
            self.embedding = nn.Embedding.from_pretrained(torch.tensor(pretrained_embedding, dtype=torch.float),
                                                          freeze=True)
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.weight_W = nn.Parameter(torch.Tensor(2 * num_hiddens, 2 * num_hiddens))
        self.weight_proj = nn.Parameter(torch.Tensor(2 * num_hiddens, 1))

        self.decoder = nn.Sequential(nn.Linear(2 * num_hiddens, num_hiddens),
                                     nn.Linear(num_hiddens, 3))
        nn.init.uniform_(self.weight_W, -0.1, 0.1)
        nn.init.uniform_(self.weight_proj, -0.1, 0.1)


    def forward(self, inputs):
        mask = 1 - torch.clamp(inputs, min=0, max=1)
        embeddings = self.embedding(inputs)
        states, hidden = self.encoder(embeddings.permute([0, 1, 2]))
        u = torch.tanh(torch.matmul(states, self.weight_W))
        att = torch.matmul(u, self.weight_proj)
        att = att + mask.unsqueeze(2) * -1e7
        att_score = F.softmax(att, dim=1)
        scored_x = states * att_score
        encoding = torch.sum(scored_x, dim=1)
        outputs = self.decoder(encoding)

        return outputs

## Training
You should train two models above with Glove pretrained word embedding and random initialized word embedding.

Evaluation on the validation set and print out accuracy after training one epoch is required.

You can tune some parameters and try different techniques, such as learning rate scheduler.

In [444]:
def train(model,train_loader=trainDataLoader,val_loader=valDataLoader,epoch=30,log_interval = 100):
    print("statr training")
    for ep in range(epoch):
        optimizer = optim.SGD(model.parameters(), lr=0.1,momentum=0.5)
        lossFunction=nn.NLLLoss(reduction="sum")

        for batch_idx, (data, target) in enumerate(train_loader):
            model.train()
            model.to(device)
            data=data.to(device)
            target=target.to(device)
            optimizer.zero_grad()

            output=model.forward(data)
            loss=lossFunction(output,target)
            loss.backward()
            optimizer.step()
            # TODO:还没写val
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    ep, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
                model.eval()
                test_loss = 0
                num_correct = 0
                with torch.no_grad():
                    for data, target in val_loader:
                        data=data.to(device)
                        target=target.to(device)
                        output=model.forward(data)
                        test_loss+=lossFunction(output,target)
                        pred = output.data.max(1, keepdim=True)[1]
                        # print("pred",pred)
                        # print("target",target)
                        num_correct+= pred.eq(target.data.view_as(pred)).sum()
                avg_test_loss = test_loss/ len(val_loader.dataset)
                print('\nVal set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                    avg_test_loss, num_correct, len(val_loader.dataset),
                    100. * num_correct / len(val_loader.dataset)))

In [445]:
def test(model, test_loader=testDataLoader,device=device):
    model.eval()
    test_loss = 0
    num_correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data.to(device)
            target.to(device)
            output=model.forward(data)
            lossFunction=nn.NLLLoss(reduction="sum")
            test_loss+=lossFunction(output,target)
            pred = output.data.max(1, keepdim=True)[1]
            num_correct+= pred.eq(target.data.view_as(pred)).sum()
    avg_test_loss = test_loss/ len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_test_loss, num_correct, len(test_loader.dataset),
        100. * num_correct / len(test_loader.dataset)))

In [446]:
# Train BiRNN with Glove pretrained word embedding
# TODO
# num_hiddens随便大小吗
myBiRNNpretrained=BiRNN(vocab_size=len(wordmap),embed_size=50,num_hiddens=5,num_layers=2,pretrained_embedding=my_embedding_layer.weight.clone().detach())
train(myBiRNNpretrained)
# test(myBiRNNpretrained)

  self.embedding= nn.Embedding.from_pretrained(torch.tensor(pretrained_embedding.clone().detach(), dtype=torch.float).clone().detach(), freeze=True).to(device)


init
statr training

Val set: Average loss: -0.4125, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Accuracy: 869/2000 (43%)


Val set: Average loss: -0.4345, Acc

In [412]:
with myBiRNNpretrained.

SyntaxError: expected ':' (514093880.py, line 1)

In [None]:
with myBiRNNpretrained.

In [None]:
# Train BiRNN without pretrained word embedding
# TODO

In [None]:
# Train BiRNN_attention with Glove pretrained embedding
# TODO

In [None]:
# Train BiRNN_attention without pretrained word embedding
# TODO

# Report (optional)
You can briefly report what strategies you attempted in this assignment.