In [9]:
import math
import numpy as np
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

torch.manual_seed(1)

<torch._C.Generator at 0x7fed2fb99b90>

In [10]:
# Width of each word vector. get_embvec() sizes its output with this value and
# the GloVe loading cell expects 1 word + 300 values (301 tokens) per entry.
EMBEDDING_DIM = 300
# Hidden-state size handed to BiLSTMTagger; the LSTM is bidirectional, so the
# classifier layer sees 2 * HIDDEN_DIM features.
HIDDEN_DIM = 4

In [11]:
def get_embvec(sentence, vol):
    """Turn a whitespace-tokenised sentence into a matrix of word vectors.

    Args:
        sentence: Raw sentence string; tokens are separated by whitespace.
        vol: Mapping from word to its embedding vector. It should contain the
            key "#UNK#", whose vector stands in for every unknown word.

    Returns:
        A float32 tensor with one row per kept token, in sentence order.
        When no token survives the punctuation filter the result has shape
        (0, EMBEDDING_DIM).

    Raises:
        KeyError: If a token is missing from ``vol`` and ``vol`` has no
            "#UNK#" entry to fall back on.
    """
    # NOTE: this is a substring test against string.punctuation, so it drops
    # single punctuation marks (and runs such as "()" that happen to occur in
    # that string) but keeps tokens like "``" or "''". Kept as-is on purpose:
    # changing it would change which tokens the model sees.
    words = [token for token in sentence.split() if token not in string.punctuation]
    if not words:
        return torch.empty(0, EMBEDDING_DIM, dtype=torch.float32)

    unknown = vol.get("#UNK#")
    rows = []
    for word in words:
        # Single lookup with the unknown-word vector as the fallback.
        vector = vol.get(word, unknown)
        if vector is None:
            raise KeyError(
                "no embedding for %r and the vocabulary has no '#UNK#' entry" % word
            )
        rows.append(torch.as_tensor(vector, dtype=torch.float32))

    # stack() copies the rows into a fresh (len(words), dim) tensor.
    return torch.stack(rows)

In [12]:
class BiLSTMTagger(nn.Module):
    """Sentence classifier: a bidirectional LSTM followed by a linear layer.

    The sentence is embedded with ``get_embvec`` using the vocabulary passed
    at construction time, run through the LSTM as a batch of one, and the
    final hidden states of both directions are mapped to log-probabilities
    over the tag set.
    """

    def __init__(self, embedding_dim, hidden_dim, tagset_size, vol):
        """Build the LSTM and the output layer.

        Args:
            embedding_dim: Size of each input word vector.
            hidden_dim: Hidden size of the LSTM (per direction).
            tagset_size: Number of output classes.
            vol: Word -> vector mapping forwarded to ``get_embvec``.
        """
        super().__init__()
        self.vol = vol
        self.hidden_dim = hidden_dim

        # The LSTM is created before the linear layer, so parameter
        # initialisation consumes the random generator in that order.
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        # Both directions are concatenated, hence twice the hidden size.
        self.hidden2tag = nn.Linear(2 * hidden_dim, tagset_size)

    def bilstm_vec(self, sentence):
        """Encode ``sentence`` as one vector of size ``2 * hidden_dim``.

        Returns:
            A tensor of shape (1, 2 * hidden_dim): the last two slices of the
            LSTM's final hidden state, joined along the feature axis.
        """
        word_vectors = get_embvec(sentence, self.vol)
        # Reshape to (seq_len, batch=1, features) as the LSTM expects.
        lstm_input = word_vectors.view(len(word_vectors), 1, -1)

        _, (final_hidden, _) = self.bilstm(lstm_input)

        second_to_last = final_hidden[-2]
        last = final_hidden[-1]
        return torch.cat((second_to_last, last), dim=1)

    def forward(self, sentence):
        """Return log-probabilities over the tag set, shape (1, tagset_size)."""
        sentence_vector = self.bilstm_vec(sentence)
        logits = self.hidden2tag(sentence_vector)
        return F.log_softmax(logits, dim=1)


In [13]:
def read_by_tokens(fileobj):
    """Lazily yield every whitespace-separated token of ``fileobj``.

    ``fileobj`` is anything that iterates over text lines (an open file, a
    list of strings, ...). Tokens come out in reading order; lines that are
    empty or contain only whitespace contribute nothing.
    """
    for text_line in fileobj:
        yield from text_line.split()

In [14]:
class ReadCorpus:
    """Reader for a corpus file holding one ``<label> <sentence>`` pair per line."""

    def __init__(self,corpus_name):
        # Path of the corpus file; it is only opened in generate_sentences().
        self.corpus_name = corpus_name

    def generate_sentences(self):
        """Read the whole corpus and split every line into label and sentence.

        The label is everything before the first space on the line; the
        sentence is everything after it. Blank lines are skipped.

        Returns:
            A ``(features, labels)`` pair of equally long lists, where
            ``features[i]`` is the sentence text and ``labels[i]`` its label.

        Raises:
            ValueError: If a non-blank line contains no space, i.e. has a
                label but no sentence.
        """
        features = []
        labels = []
        # The with-block closes the file; no explicit close() is needed.
        with open(self.corpus_name,'r') as file:
            for line_no, line in enumerate(file, start=1):
                sentence = line.rstrip('\n')
                if not sentence.strip():
                    continue
                label, separator, feature = sentence.partition(' ')
                if not separator:
                    raise ValueError(
                        "%s:%d: expected '<label> <sentence>', got %r"
                        % (self.corpus_name, line_no, sentence)
                    )
                labels.append(label)
                features.append(feature)

        return features, labels


In [15]:

    # Parse the GloVe text file as one flat token stream: every entry is a
    # word followed by EMBEDDING_DIM numbers, i.e. EMBEDDING_DIM + 1 tokens.
    tokens_per_entry = EMBEDDING_DIM + 1

    vols = []  # vocabulary words, in file order
    vecs = []  # vecs[i] is the embedding of vols[i]
    vec = []   # values of the entry currently being read
    # The with-block closes the file; no explicit close() is needed.
    with open("../data/glove.small.txt") as f:
        tokenized = read_by_tokens(f)
        for count, token in enumerate(tokenized):
            if count % tokens_per_entry == 0:
                # A new word starts here; flush the previous entry first.
                if count != 0:
                    vecs.append(torch.tensor(vec, dtype=torch.float32))
                    vec = []
                vols.append(token)
            else:
                # Collect plain floats and build one tensor per word instead
                # of one 0-d tensor per number.
                vec.append(float(token))
    # Flush the last entry: inside the loop an entry is only flushed when the
    # next word shows up.
    vecs.append(torch.tensor(vec, dtype=torch.float32))

    # Word -> vector lookup; if a word occurs twice the later vector wins.
    glove_vec = dict(zip(vols, vecs))

    # Load the training sentences and their labels.
    RC = ReadCorpus("../data/train.txt")
    features,labels = RC.generate_sentences()

    # Give every distinct label an integer class index, numbered in order of
    # first appearance.
    tag_to_ix = {}
    for label in labels:
        # len(tag_to_ix) is the next unused index; setdefault leaves labels
        # that were already seen untouched. (This also avoids a notebook-level
        # variable named `id`, which would shadow the builtin.)
        tag_to_ix.setdefault(label, len(tag_to_ix))

    # EMBEDDING_DIM and HIDDEN_DIM come from the configuration cell near the
    # top of the notebook; they are not redefined here so that there is a
    # single place to change them.
    model = BiLSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(tag_to_ix), glove_vec)

    # The model's forward() ends in log_softmax, which is the input NLLLoss expects.
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    

    for epoch in range(10):  # 10 passes over the training set
        for sentence, tags in zip(features, labels):
            # Step 1. PyTorch accumulates gradients across backward() calls,
            # so clear them before each training example.
            model.zero_grad()

            # Step 2. Wrap the gold class index in a 1-element long tensor so
            # it lines up with the single-row score matrix.
            targets = torch.tensor([tag_to_ix[tags]],dtype=torch.long)

            # Step 3. Run the forward pass. Call the module itself rather than
            # .forward() so that any registered hooks run.
            tag_scores = model(sentence)

            # Step 4. Compute the loss and gradients, then update the
            # parameters.
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

        print("Epoch", epoch)
        # Accuracy on the training set after this epoch; no gradients needed.
        with torch.no_grad():
            acc = 0
            for sentence, tags in zip(features,labels):
                tag_scores = model(sentence)

                # tag_scores has a single row, so the flat argmax is the
                # predicted class index.
                ind = torch.argmax(tag_scores)

                # Compare class indices directly instead of scanning
                # tag_to_ix for the label that owns `ind`.
                if ind.item() == tag_to_ix[tags]:
                    acc += 1
            print(acc / len(features))
        
#     with torch.no_grad():
#         inputs = get_embvec("What are liver enzymes ?", glove_vec)
#         print(inputs)
#         tag_scores = model.forward("What are liver enzymes ?")
#         print(tag_scores)


Epoch 0
0.4739094985731757
Epoch 1
0.5538116591928252
Epoch 2
0.5917244190786791
Epoch 3
0.6434977578475336
Epoch 4
0.6390134529147982
Epoch 5
0.6785568691398288
Epoch 6
0.6877293110476967
Epoch 7
0.7024052181002853
Epoch 8
0.713819812474521
Epoch 9
0.7258459029759479


In [16]:
# Evaluate the trained model on the test set.
# NOTE: only accuracy is computed below; the F1 score this cell was meant to
# report as well is still TODO.
# NOTE: this rebinds RC / features / labels, which held the training data.
RC = ReadCorpus("../data/test.txt")
features,labels = RC.generate_sentences()

with torch.no_grad():
    acc = 0
    for sentence, tags in zip(features,labels):
        # Call the module itself rather than .forward() so hooks run.
        tag_scores = model(sentence)

        # tag_scores has a single row, so the flat argmax is the predicted
        # class index.
        ind = torch.argmax(tag_scores)

        # .get() returns None for a test label that never occurred in
        # training, so such a sentence simply counts as a miss.
        if tag_to_ix.get(tags) == ind.item():
            acc += 1
    print(acc / len(features))

0.606


In [17]:
# Display the label -> class-index mapping built from the training labels.
tag_to_ix

{'ENTY:cremat': 0,
 'NUM:count': 1,
 'HUM:ind': 2,
 'LOC:country': 3,
 'DESC:manner': 4,
 'NUM:money': 5,
 'ENTY:body': 6,
 'NUM:perc': 7,
 'ABBR:exp': 8,
 'DESC:def': 9,
 'LOC:city': 10,
 'HUM:desc': 11,
 'ENTY:veh': 12,
 'LOC:other': 13,
 'DESC:desc': 14,
 'LOC:mount': 15,
 'ENTY:other': 16,
 'ENTY:event': 17,
 'ENTY:food': 18,
 'DESC:reason': 19,
 'ENTY:termeq': 20,
 'ENTY:lang': 21,
 'HUM:gr': 22,
 'NUM:date': 23,
 'ENTY:techmeth': 24,
 'ENTY:animal': 25,
 'NUM:period': 26,
 'NUM:other': 27,
 'ENTY:dismed': 28,
 'HUM:title': 29,
 'ENTY:sport': 30,
 'ENTY:product': 31,
 'NUM:temp': 32,
 'ENTY:color': 33,
 'LOC:state': 34,
 'ENTY:word': 35,
 'NUM:dist': 36,
 'ENTY:letter': 37,
 'ABBR:abb': 38,
 'ENTY:substance': 39,
 'ENTY:religion': 40,
 'ENTY:symbol': 41,
 'ENTY:currency': 42,
 'NUM:code': 43,
 'ENTY:instru': 44,
 'NUM:weight': 45,
 'NUM:speed': 46,
 'ENTY:plant': 47,
 'NUM:volsize': 48,
 'NUM:ord': 49}

In [15]:
# Reverse lookup: list every label whose class index is 1.
# NOTE(review): the saved output (['ENTY:cremat']) disagrees with the mapping
# displayed earlier, where index 1 belongs to 'NUM:count'; this cell's
# execution count is also out of order, so the output is presumably stale.
# Re-run the notebook top to bottom to confirm.
[k for k, v in tag_to_ix.items() if v == 1]

['ENTY:cremat']

In [32]:
    # Classify one example question and print the intermediate values.
    with torch.no_grad():
        sentence = "Who replies `` I know '' to Princess Leia 's confession `` I love you '' in The Empire Strikes Back ?"
        # Printed for inspection only; the model embeds the sentence itself.
        inputs = get_embvec(sentence, glove_vec)
        print(inputs)
        # Call the module itself rather than .forward() so hooks run.
        tag_scores = model(sentence)
        print(tag_scores)

        # tag_scores has a single row, so the flat argmax is the predicted
        # class index.
        ind = torch.argmax(tag_scores)

        # Map the class index back to its label; compare plain ints rather
        # than an int against a 0-d tensor.
        k = [label for label, index in tag_to_ix.items() if index == ind.item()]

        print(k)

tensor([[-0.0474,  0.0093,  0.3878,  ..., -0.0024,  0.5159,  0.0343],
        [-0.3816,  0.2921, -0.2116,  ...,  0.3511,  0.3492,  0.5205],
        [ 0.0837,  0.1675, -0.5997,  ...,  0.0799, -0.0178, -0.1576],
        ...,
        [-0.1770, -0.3460, -0.1370,  ..., -0.0861,  0.2466,  0.2335],
        [-0.4350, -0.2168,  0.1815,  ..., -0.5749, -0.0223,  0.2256],
        [-0.2397,  0.0764,  0.0171,  ...,  0.4198,  0.0359, -0.0668]])
tensor([[ -3.3651, -22.4164,  -0.0391, -20.5387, -20.9542, -16.1962, -13.2518,
         -17.1531, -18.3534, -18.4736, -22.3520,  -7.4908,  -8.8826,  -7.9462,
          -6.5135, -21.3394,  -8.0555,  -8.3200, -11.0501, -11.9784, -10.8983,
         -12.9006,  -8.0495, -10.3201, -11.8362, -10.5255, -13.4433, -17.7182,
         -20.3574,  -9.0610, -15.6763,  -9.3264, -16.5689, -10.8354, -19.2723,
         -13.3709, -19.4676, -12.3230, -15.5846, -13.4462, -10.5748, -14.2968,
         -15.0885, -15.8220, -10.1026, -16.5966, -16.6027, -12.8928, -15.1876,
         -13.