In [1]:
import math
import numpy as np
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

torch.manual_seed(1)

<torch._C.Generator at 0x7fa9e1998b90>

In [2]:
def get_embvec(sentence, vol):
    words = sentence.split()
    words = list(filter(lambda token: token not in string.punctuation, words))
    vec = torch.FloatTensor(len(words), EMBEDDING_DIM)
    for i in range(0,len(words)):
        if words[i] in vol.keys():
            vec[i] = (vol.get(words[i]))
        else:
            vec[i] = (vol.get("#UNK#"))

    return vec

In [7]:
class BagOfWords(nn.Module):

    def __init__(self,embedding_dim, hidden_dim, tagset_size, vol):
        super(BagOfWords, self).__init__()
        self.embedding_dim = embedding_dim
        
        self.hidden_dim = hidden_dim

        self.vol = vol
        
        self.hidden2tag = nn.Linear(embedding_dim, hidden_dim)
        
        self.activation_function1 = nn.Tanh()
        
        self.hidden3tag = nn.Linear(hidden_dim, len(vol))


    def bow_vec(self, sentence):
        embeds = get_embvec(sentence, self.vol)
        
        embeds = torch.mean(embeds, dim=0)
               
        return embeds
    
    def forward(self, sentence):
        bilstm_out = self.bow_vec(sentence)

        tag_space = self.hidden2tag(bilstm_out)
        
        tag_space = self.activation_function1(tag_space)
        
        tag_space = self.hidden3tag(tag_space)

        tag_scores = F.log_softmax(tag_space, dim = -1)

        return tag_scores.unsqueeze(0)



In [8]:
def read_by_tokens(fileobj):

    for line in fileobj:

        for token in line.split():

            yield token

In [9]:
class ReadCorpus:
    def __init__(self,corpus_name):
        self.corpus_name = corpus_name

    def generate_sentences(self):
        count = 0
        features = []
        labels = []
        with open(self.corpus_name,'r') as file:
            for line in file:
                sentence = line.replace('\n','')
                [label, feature]= sentence.split(sep=' ', maxsplit=1)
                for sent in sentence:
                    count += 1
                labels.append(label)
                features.append(feature)
        file.close()

        return features, labels



In [10]:

    vols = []
    vecs = []
    vec = []
    with open("../data/glove.small.txt") as f:

        tokenized = read_by_tokens(f)
        count = 0
        for token in tokenized:
            if count % 301 == 0:
                vols.append(token)
                if count != 0:
                    vecs.append(torch.Tensor(vec))
                    vec = []
            else:
                v = torch.tensor(float(token), dtype=torch.float64)
                vec.append(v)
            count += 1
        vecs.append(torch.Tensor(vec))
    f.close()

    glove_vec = {}
    for i in range(0, len(vols)):
        v = vols[i]
        glove_vec[v] = vecs[i]

    RC = ReadCorpus("../data/train.txt")
    features,labels = RC.generate_sentences()

    tag_to_ix = {}
    id = 0
    for l in labels:
        if l not in tag_to_ix.keys():
            tag_to_ix[l] = id
            id += 1

    EMBEDDING_DIM = 300
    HIDDEN_DIM = int(EMBEDDING_DIM*2/3)

    model = BagOfWords(EMBEDDING_DIM, HIDDEN_DIM, len(tag_to_ix), glove_vec)
    
    loss_function = nn.NLLLoss()
    optimizer = optim.Adagrad(model.parameters(), lr=0.1)
        
    for epoch in range(10):  # again, normally you would NOT do 300 epochs, it is toy data
        for sentence, tags in zip(features, labels):
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()
    
            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            targets = torch.tensor([tag_to_ix[tags]],dtype=torch.long)
    
            # Step 3. Run our forward pass.
            tag_scores = model.forward(sentence)

            #
            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            
            loss.backward()
            optimizer.step()
        
        print("Epoch", epoch)
        with torch.no_grad():
            acc = 0
            for sentence, tags in zip(features,labels):
                
                tag_scores = model.forward(sentence)

                ind = torch.argmax(tag_scores)

                k = [k for k, v in tag_to_ix.items() if v == ind]

                if k[0] == tags:
                    acc += 1
            print(acc / len(features))
        
#     with torch.no_grad():
#         inputs = get_embvec("What are liver enzymes ?", glove_vec)
#         print(inputs)
#         tag_scores = model.forward("What are liver enzymes ?")
#         print(tag_scores)

Epoch 0
0.7278842233999184
Epoch 1
0.8071748878923767
Epoch 2
0.852017937219731
Epoch 3
0.8783122706889523
Epoch 4
0.9003261312678353
Epoch 5
0.918467183041174
Epoch 6
0.930900937627395
Epoch 7
0.9423155320016307
Epoch 8
0.9518956379942927
Epoch 9
0.9580105992662047


In [11]:
RC = ReadCorpus("../data/test.txt")
features,labels = RC.generate_sentences()

with torch.no_grad():
    acc = 0
    for sentence, tags in zip(features,labels):
                
        tag_scores = model.forward(sentence)

        ind = torch.argmax(tag_scores)

        k = [k for k, v in tag_to_ix.items() if v == ind]

        if k[0] == tags:
            acc += 1
    print(acc / len(features))

0.742
