In [15]:
from __future__ import print_function
import sys

import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F
import torch.optim as optim

## Corpus Processing

In [16]:
def read_dataset(filename='./penn.train.pos'):
    """Read a POS-tagged corpus with one sentence per line.

    Each line is a whitespace-separated sequence of ``word/TAG`` tokens. The
    tag is whatever follows the *last* slash, so words that themselves contain
    a slash (e.g. ``1/2/CD``) are handled correctly.

    Args:
        filename: Path of the corpus file to read.

    Returns:
        A list with one ``(words, tags)`` pair per line, where ``words`` and
        ``tags`` are parallel lists of strings. A blank line yields ``([], [])``.

    Raises:
        IndexError: If a token contains no ``/`` separator.
    """
    dataset = []
    # The context manager closes the file even if a malformed line raises.
    with open(filename, 'r') as fp:
        for line in fp:
            # Split every token once; pair[0] is the word, pair[1] the tag.
            pairs = [token.rsplit('/', 1) for token in line.strip().split()]
            dataset.append(([pair[0] for pair in pairs],
                            [pair[1] for pair in pairs]))
    return dataset

# Load both corpora, keeping only the first 200 training sentences and the
# first 100 held-out sentences.
training_data = read_dataset(filename="train.pos")[:200]
devel_data = read_dataset(filename="test.pos")[:100]

CONTEXT_SIZE = 3 # an odd number
def gen_instances(corpus, context_size=None):
    """Turn tagged sentences into (context window, tag) classification instances.

    Every word yields one instance: a window of ``context_size`` words centred
    on it, paired with its tag. Sentences are padded with ``'<S>'`` on the left
    and ``'</S>'`` on the right so that boundary words also get a full window.

    Args:
        corpus: Iterable of ``(words, tags)`` pairs of equal-length sequences.
        context_size: Window width; must be a positive odd integer. Defaults
            to the module-level ``CONTEXT_SIZE`` (looked up at call time).

    Returns:
        A list of ``(window, tag)`` tuples, where ``window`` is a list of
        ``context_size`` strings.

    Raises:
        ValueError: If the window width is not a positive odd integer.
    """
    if context_size is None:
        context_size = CONTEXT_SIZE
    if context_size < 1 or context_size % 2 == 0:
        raise ValueError(
            "context_size must be a positive odd integer, got %r" % (context_size,))

    # Floor division keeps n_pad an int on Python 3 as well as Python 2;
    # true division would yield a float and break the list repetition below.
    n_pad = (context_size - 1) // 2

    instances = []
    for sent, tags in corpus:
        padded_sent = ['<S>'] * n_pad + list(sent) + ['</S>'] * n_pad
        for i in range(len(sent)):
            # padded_sent[i + n_pad] is sent[i], so this slice is centred on it.
            instances.append((padded_sent[i:i + context_size], tags[i]))
    return instances

# Flatten each corpus into per-word (context window, tag) instances.
training_instances = gen_instances(corpus=training_data)
devel_instances = gen_instances(corpus=devel_data)

# Vocabularies are built from the training data only. The first ids are
# reserved for the unknown token and the two sentence-boundary pads.
word_to_ix = {
    'UNK': 0,
    '<S>': 1,
    '</S>': 2,
}
tag_to_ix = {
    'UNK': 0,
}
for sent, tags in training_data:
    for word, tag in zip(sent, tags):
        # setdefault inserts only unseen keys; len(...) is evaluated before the
        # insertion, so every new entry receives the next free id.
        word_to_ix.setdefault(word, len(word_to_ix))
        tag_to_ix.setdefault(tag, len(tag_to_ix))

def prepare_sequence(seq, to_ix):
    """Convert a sequence of symbols into a tensor of integer ids.

    Args:
        seq: Iterable of symbols (words or tags).
        to_ix: Mapping from symbol to integer id. It needs an ``'UNK'`` entry
            only if ``seq`` contains a symbol that is missing from the mapping.

    Returns:
        A 1-D ``torch.LongTensor`` (wrapped in ``autograd.Variable``) holding
        one id per element of ``seq``; unknown symbols map to ``to_ix['UNK']``.

    Raises:
        KeyError: If a symbol is unknown and ``to_ix`` has no ``'UNK'`` entry.
    """
    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
    # torch.LongTensor does not accept.
    idxs = [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

## MLP Tagger

<img src="images/mlp.png" style="width: 500px;"/>

In [17]:
class MLPTagger(nn.Module):
    """Feed-forward POS tagger over a fixed-size window of words.

    The embeddings of the words in the window are concatenated, passed through
    one tanh hidden layer and projected to log-probabilities over the tag set.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, context_size, tagset_size):
        """Build the embedding, hidden and output layers.

        Args:
            embedding_dim: Size of each word embedding vector.
            hidden_dim: Number of units in the hidden layer.
            vocab_size: Number of distinct word ids.
            context_size: Number of words in each input window.
            tagset_size: Number of distinct tag ids.
        """
        super(MLPTagger, self).__init__()

        # The word embedding layer maps word ids to their embedding vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Maps the concatenated window embeddings to the hidden layer.
        # Input size: context_size * embedding_dim
        self.input2hidden = nn.Linear(context_size * embedding_dim, hidden_dim)

        # Maps the hidden layer to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, context):
        """Score every tag for the given context window(s).

        Args:
            context: LongTensor of word ids, either a single window of shape
                ``(context_size,)`` or a batch of shape ``(batch, context_size)``.

        Returns:
            Log-probabilities of shape ``(batch, tagset_size)``; ``batch`` is 1
            for a single window.
        """
        embeds = self.word_embeddings(context)
        # Concatenate the embeddings of each window into one row: (N, in_features).
        n_rows = 1 if context.dim() == 1 else context.size(0)
        embeds = embeds.view(n_rows, -1)
        hidden = torch.tanh(self.input2hidden(embeds))

        # Output layer: map the MLP hidden layer to tag space, then normalise
        # over the tag dimension (dim=1) explicitly rather than relying on the
        # deprecated implicit-dimension behaviour.
        tag_space = self.hidden2tag(hidden)
        log_probs = F.log_softmax(tag_space, dim=1)
        return log_probs

## Training and Testing

In [18]:
# Seed the RNG before the model is constructed so that the parameter
# initialisation is repeatable from run to run.
torch.manual_seed(66666)

EMBEDDING_DIM = 10
HIDDEN_DIM = 10

model = MLPTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    context_size=CONTEXT_SIZE,
    tagset_size=len(tag_to_ix),
)

# Negative log-likelihood loss; the model's forward pass already returns
# log-probabilities.
loss_function = nn.NLLLoss()

# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.01)

def train(epoch):
    """Run one epoch of per-instance SGD and print the summed training loss.

    Iterates over the module-level ``training_instances`` and updates ``model``
    after every single instance. The status line is printed without a trailing
    newline so that the evaluation result can be appended to the same line.

    Args:
        epoch: Epoch number, used only in the printed message.
    """
    model.train()
    train_loss = torch.Tensor([0])
    for window, gold_tag in training_instances:
        # Clear gradients left over from the previous instance.
        model.zero_grad()

        word_ids = prepare_sequence(window, word_to_ix)
        gold = prepare_sequence([gold_tag], tag_to_ix)
        loss = loss_function(model(word_ids), gold)

        # Accumulate the detached loss value for reporting only.
        train_loss += loss.data

        loss.backward()
        optimizer.step()
    print("Epoch %d: loss = %.2f, " % (epoch, train_loss[0]), end='')

def test(epoch):
    """Evaluate ``model`` on the module-level ``devel_instances`` and print accuracy.

    Accuracy is the percentage of instances whose highest-scoring tag equals
    the gold tag. Gold tags that are missing from ``tag_to_ix`` are mapped to
    the ``'UNK'`` id by ``prepare_sequence``.

    Args:
        epoch: Epoch number. Unused; kept so the signature mirrors ``train``.
    """
    model.eval()
    correct = 0
    tot_words = len(devel_instances)
    for context, tag in devel_instances:
        context_in = prepare_sequence(context, word_to_ix)
        target = prepare_sequence([tag], tag_to_ix)
        log_probs = model(context_in)
        # max over the tag dimension returns (values, indices); keep the indices.
        pred = log_probs.data.max(dim=1)[1]
        # int(...) keeps `correct` a plain Python int whether .sum() returns a
        # Python number or a 0-dim tensor.
        correct += int(pred.eq(target.data).sum())
    # Report 0% instead of dividing by zero when there is nothing to evaluate.
    accuracy = 100. * correct / tot_words if tot_words else 0.
    print("Testing accuracy = %.2f%%" % accuracy)

# Number of full passes over the training instances.
N_EPOCHS = 30

# `range` works on both Python 2 and Python 3; `xrange` exists only on Python 2.
# train() prints without a newline and test() completes the same output line.
for epoch in range(N_EPOCHS):
    train(epoch)
    test(epoch)

Epoch 0: loss = 14141.99, Testing accuracy = 35.69%
Epoch 1: loss = 10994.79, Testing accuracy = 39.61%
Epoch 2: loss = 9976.27, Testing accuracy = 43.00%
Epoch 3: loss = 9350.19, Testing accuracy = 45.37%
Epoch 4: loss = 8864.57, Testing accuracy = 47.28%
Epoch 5: loss = 8462.36, Testing accuracy = 48.43%
Epoch 6: loss = 8131.51, Testing accuracy = 49.29%
Epoch 7: loss = 7852.69, Testing accuracy = 49.98%
Epoch 8: loss = 7608.44, Testing accuracy = 50.67%
Epoch 9: loss = 7387.95, Testing accuracy = 51.45%
Epoch 10: loss = 7184.34, Testing accuracy = 51.98%
Epoch 11: loss = 6993.12, Testing accuracy = 51.98%
Epoch 12: loss = 6813.90, Testing accuracy = 52.23%
Epoch 13: loss = 6645.93, Testing accuracy = 52.47%
Epoch 14: loss = 6486.15, Testing accuracy = 53.04%
Epoch 15: loss = 6332.50, Testing accuracy = 53.49%
Epoch 16: loss = 6186.03, Testing accuracy = 54.10%
Epoch 17: loss = 6047.36, Testing accuracy = 54.10%
Epoch 18: loss = 5915.98, Testing accuracy = 54.39%
Epoch 19: loss = 579