# LSTM part-of-speech tagger with character-level features

In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

torch.manual_seed(1)

<torch._C.Generator at 0x2233e140990>

In [2]:
# Load the training split. Each non-blank line is split on whitespace; the
# first field is taken as the word and the second as its tag. A blank line
# marks the end of a sentence. Everything is lower-cased before splitting.
train_data = []  # list of (words, tags) pairs, one pair per sentence
words = []       # every word of the file, in order
tags = []        # every tag of the file, in order
word = []        # words of the sentence currently being read
tag = []         # tags of the sentence currently being read

with open("conll2000/train.txt", "r") as f:
    for line in f:
        fields = line.lower().split()
        if not fields:
            # Sentence boundary. Repeated or leading blank lines must not
            # produce empty sentences.
            if word:
                train_data.append((word, tag))
                word = []
                tag = []
            continue

        token, label = fields[0], fields[1]
        word.append(token)
        tag.append(label)
        words.append(token)
        tags.append(label)

# A file that does not end with a blank line would otherwise silently lose
# its final sentence.
if word:
    train_data.append((word, tag))
    word = []
    tag = []


In [3]:
# Load the test split. Each non-blank line is split on whitespace; the first
# field is taken as the word and the second as its tag. A blank line marks
# the end of a sentence. Everything is lower-cased before splitting.
test_data = []  # list of (words, tags) pairs, one pair per sentence
words = []      # every word of the file, in order
tags = []       # every tag of the file, in order
word = []       # words of the sentence currently being read
tag = []        # tags of the sentence currently being read

with open("conll2000/test.txt", "r") as f:
    for line in f:
        fields = line.lower().split()
        if not fields:
            # Sentence boundary. Repeated or leading blank lines must not
            # produce empty sentences.
            if word:
                test_data.append((word, tag))
                word = []
                tag = []
            continue

        token, label = fields[0], fields[1]
        word.append(token)
        tag.append(label)
        words.append(token)
        tags.append(label)

# A file that does not end with a blank line would otherwise silently lose
# its final sentence.
if word:
    test_data.append((word, tag))
    word = []
    tag = []


In [4]:
# Index maps: every distinct word, character and tag receives the next free
# integer id in order of first appearance. The training split is scanned
# first, then the test split, so tokens seen only in the test data still get
# an id.
word_to_ix = {}
tag_to_ix = {}
char_to_ix = {}

for dataset in (train_data, test_data):
    for sent, tags in dataset:
        for word in sent:
            # setdefault leaves an existing id alone and otherwise stores the
            # current map size, which is the next unused id.
            word_to_ix.setdefault(word, len(word_to_ix))
            for char in word:
                char_to_ix.setdefault(char, len(char_to_ix))
        for tag in tags:
            tag_to_ix.setdefault(tag, len(tag_to_ix))

In [5]:
# Number of distinct words, tags and characters in the index maps.
vocab_size, tag_size, char_size = (
    len(index_map) for index_map in (word_to_ix, tag_to_ix, char_to_ix)
)
print(vocab_size, tag_size, char_size)

19460 44 54


In [6]:
# Hyper-parameters passed to LSTMTagger, plus the logging interval of the
# training loop.

# Size of each character embedding vector.
CHAR_EMBEDDING_DIM = 100
# Hidden size of the character-level LSTM, i.e. the size of the per-word
# character representation.
CHAR_REP_DIM = 100
# Size of each word embedding vector.
WORD_EMBEDDING_DIM = 1000
# Hidden size of the sentence-level LSTM.
HIDDEN_DIM = 1000
# The training loop prints loss/accuracy whenever idx % PRINT_EVERY == 0.
PRINT_EVERY = 100

In [7]:
def prepare_sequence(seq, to_ix, is_train=False):
    """Map every item of ``seq`` to its integer id and wrap the ids for the model.

    Args:
        seq: iterable of keys (a list of words or tags, or a string whose
            characters are the keys).
        to_ix: mapping from key to integer id; every key must be present
            (a plain dict raises KeyError otherwise).
        is_train: when False the result is flagged ``volatile``.

    Returns:
        An ``autograd.Variable`` around a ``LongTensor`` of ids, created with
        ``requires_grad=False``.

    NOTE(review): ``volatile`` is a legacy autograd flag; newer PyTorch
    releases are understood to ignore it in favour of ``torch.no_grad()``.
    Confirm against the PyTorch version this notebook targets.
    """
    index_tensor = torch.LongTensor([to_ix[item] for item in seq])
    # Outside training no backward pass is needed, so mark the input volatile.
    inference_only = not is_train
    return autograd.Variable(
        index_tensor, requires_grad=False, volatile=inference_only
    )

In [240]:
class CharLvlRep(nn.Module):
    """Character-level word encoder.

    Embeds the character ids of one word, feeds the embeddings through a
    single LSTM and returns the LSTM output at the last character.
    """

    def __init__(self, embedding_dim, rep_dim, char_size):
        """
        Args:
            embedding_dim: size of each character embedding vector.
            rep_dim: hidden size of the character LSTM, which is also the
                size of the representation returned by ``forward``.
            char_size: number of entries in the character embedding table.
        """
        super().__init__()
        self.rep_dim = rep_dim
        self.char_embeddings = nn.Embedding(char_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, rep_dim)

        # Built once and handed to the LSTM on every call. forward() throws
        # away the updated state, so each word starts from this zero state.
        self.char_hidden = self.init_hidden()

    def init_hidden(self):
        """Return an all-zero (h_0, c_0) pair, each of shape (1, 1, rep_dim)."""
        state_shape = (1, 1, self.rep_dim)
        return tuple(
            autograd.Variable(torch.zeros(*state_shape)) for _ in range(2)
        )

    def forward(self, word, is_train=False):
        """Encode one word.

        Args:
            word: sequence of character ids for a single word.
            is_train: accepted so callers can pass it; not used here.

        Returns:
            The LSTM output at the final character, shape (1, rep_dim).
        """
        n_chars = len(word)
        # Reshape to (seq_len, batch=1, embedding_dim) for the LSTM.
        char_vectors = self.char_embeddings(word).view(n_chars, 1, -1)
        outputs, _ = self.lstm(char_vectors, self.char_hidden)

        last_step = outputs.size(0) - 1
        return outputs[last_step]


class LSTMTagger(nn.Module):
    """Sentence tagger that combines word embeddings with character-level features.

    Each word is represented by its word embedding concatenated with the
    output of a ``CharLvlRep`` encoder run over the word's characters. The
    concatenated vectors go through an LSTM, and a linear layer followed by
    log-softmax produces per-word tag scores.
    """

    def __init__(self, char_embedding_dim, char_rep_dim, char_size, word_embedding_dim, vocab_size, hidden_dim, tagset_size):
        """
        Args:
            char_embedding_dim: size of each character embedding vector.
            char_rep_dim: size of the character-level representation of a word.
            char_size: number of entries in the character embedding table.
            word_embedding_dim: size of each word embedding vector.
            vocab_size: number of entries in the word embedding table.
            hidden_dim: hidden size of the sentence-level LSTM.
            tagset_size: number of output tags.
        """
        super(LSTMTagger, self).__init__()
        self.char_rep_dim = char_rep_dim
        self.hidden_dim = hidden_dim

        # Character-level encoder, applied to one word at a time.
        self.model_char = CharLvlRep(char_embedding_dim, char_rep_dim, char_size)

        self.word_embeddings = nn.Embedding(vocab_size, word_embedding_dim)

        # The LSTM consumes the word embedding concatenated with the
        # character representation and outputs hidden states of size
        # hidden_dim.
        self.lstm = nn.LSTM(word_embedding_dim + char_rep_dim, hidden_dim)

        # Maps from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # Built once and handed to the LSTM on every call. forward() discards
        # the updated state, so every sentence starts from this zero state.
        self.word_hidden = self.init_hidden()

    def init_hidden(self):
        """Return an all-zero (h_0, c_0) pair, each of shape (1, 1, hidden_dim)."""
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence, words, is_train=False):
        """Score every tag for every word of one sentence.

        Args:
            sentence: sequence of word ids for the sentence.
            words: one character-id sequence per word, in the same order as
                ``sentence``.
            is_train: forwarded to the character encoder.

        Returns:
            Log-probabilities of shape (len(sentence), tagset_size).
        """
        seq_len = len(sentence)
        word_embeds = self.word_embeddings(sentence).view(seq_len, 1, -1)

        # The encoder returns a (1, char_rep_dim) tensor per word. Stacking
        # along a new leading axis gives (len(words), 1, char_rep_dim)
        # without in-place writes into a pre-allocated buffer.
        char_reps = torch.stack(
            [self.model_char(chars, is_train) for chars in words], 0
        )

        embeds_cat = torch.cat((word_embeds, char_reps), dim=2)

        lstm_out, _ = self.lstm(embeds_cat, self.word_hidden)
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
        # Normalise over the tag axis explicitly; relying on the implicit
        # dimension is deprecated.
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [241]:
# Build the tagger from the hyper-parameters and vocabulary sizes defined above.
model = LSTMTagger(
    char_embedding_dim=CHAR_EMBEDDING_DIM,
    char_rep_dim=CHAR_REP_DIM,
    char_size=char_size,
    word_embedding_dim=WORD_EMBEDDING_DIM,
    vocab_size=vocab_size,
    hidden_dim=HIDDEN_DIM,
    tagset_size=tag_size,
)
# Negative log-likelihood loss, applied to the log-softmax scores that the
# tagger's forward pass returns.
loss_function = nn.NLLLoss()
# Plain SGD with momentum over all parameters of the tagger.
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)


In [251]:
running_accuracy = 0   # not read in this cell; kept in case another cell uses it
losses = []            # training loss of every sentence, in iteration order
train_accuracies = []  # mean per-sentence training accuracy, one entry per epoch
test_accuracies = []   # mean per-sentence test accuracy, one entry per epoch

NUM_EPOCHS = 1


def _encode_example(sentence, tags, is_train):
    """Convert one (words, tags) pair into index tensors.

    Returns (sentence_in, words_in, targets): the word ids of the sentence,
    one character-id sequence per word, and the tag ids.
    """
    words_in = [prepare_sequence(word, char_to_ix, is_train) for word in sentence]
    sentence_in = prepare_sequence(sentence, word_to_ix, is_train)
    targets = prepare_sequence(tags, tag_to_ix, is_train)
    return sentence_in, words_in, targets


def _tagging_accuracy(tag_scores, targets):
    """Return the fraction of words whose highest-scoring tag equals the target."""
    predicted = np.argmax(tag_scores.data.numpy(), 1)
    gold = targets.data.numpy()
    return float((predicted == gold).sum()) / len(gold)


def _loss_value(loss):
    """Return the scalar loss as a Python float."""
    # `loss.data[0]` only works while the loss is a one-element tensor. When
    # the loss object offers `.item()` (0-dim loss tensors), use that instead.
    if hasattr(loss, "item"):
        return loss.item()
    return loss.data[0]


for epoch in range(NUM_EPOCHS):
    # ---- training pass over every training sentence ----
    is_train = True
    train_accuracy = 0.0
    for idx, (sentence, tags) in enumerate(train_data):
        # New training sequence: clear the gradients left by the previous one.
        model.zero_grad()

        # Index encoding of the words, their characters and the gold tags.
        sentence_in, words_in, targets = _encode_example(sentence, tags, is_train)

        # Forward pass: per-word log-probabilities over tags.
        tag_scores = model(sentence_in, words_in, is_train)

        accuracy = _tagging_accuracy(tag_scores, targets)
        train_accuracy += accuracy

        # Compute the loss and gradients, then update the parameters.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

        loss_value = _loss_value(loss)
        losses.append(loss_value)

        if idx % PRINT_EVERY == 0:
            print("It {}: loss = {}, accuracy = {}".format(idx, loss_value, accuracy))

    train_accuracy = train_accuracy / len(train_data)
    train_accuracies.append(train_accuracy)
    print("Epoch {}: train_accuracy = {}".format(epoch, train_accuracy))

    # ---- evaluation pass over the test split after each epoch ----
    is_train = False
    test_accuracy = 0.0
    for sentence, tags in test_data:
        sentence_in, words_in, targets = _encode_example(sentence, tags, is_train)

        # Forward pass only; no parameter update during evaluation.
        tag_scores = model(sentence_in, words_in, is_train)

        test_accuracy += _tagging_accuracy(tag_scores, targets)

    test_accuracy = test_accuracy / len(test_data)
    test_accuracies.append(test_accuracy)

    print("Epoch {}: test_accuracy = {}".format(epoch, test_accuracy))
    
#     # save the best model so far
#     if valid_accuracy.data[0] > best_valid_accuracy:
#         torch.save(model.state_dict(), SAVE_PATH)
#         best_valid_accuracy = valid_accuracy.data[0]



It 0: loss = 0.0060774278827011585, accuracy = 1.0
It 100: loss = 0.3171910047531128, accuracy = 0.8333333333333334
It 200: loss = 0.8777325749397278, accuracy = 0.6774193548387096
It 300: loss = 0.5802592635154724, accuracy = 0.8461538461538461
It 400: loss = 1.1272722482681274, accuracy = 0.625
It 500: loss = 0.6786277890205383, accuracy = 0.7631578947368421
It 600: loss = 0.1586916744709015, accuracy = 1.0
It 700: loss = 0.39816898107528687, accuracy = 0.9333333333333333
It 800: loss = 0.07978050410747528, accuracy = 1.0
It 900: loss = 0.8574633002281189, accuracy = 0.7619047619047619
It 1000: loss = 0.31176719069480896, accuracy = 0.9130434782608695
It 1100: loss = 0.39146387577056885, accuracy = 0.8125
It 1200: loss = 0.42262521386146545, accuracy = 0.8529411764705882
It 1300: loss = 0.7608103156089783, accuracy = 0.7647058823529411
It 1400: loss = 0.18127794563770294, accuracy = 0.8333333333333334
It 1500: loss = 0.02548396587371826, accuracy = 1.0
It 1600: loss = 0.2161622047424