# BiLSTM for PoS Tagging

A multi-layer bi-directional LSTM followed by a Conditional Random Field (CRF) for Part-of-Speech (PoS) Tagging.  

In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Single seed shared by every random number generator used in this notebook.
SEED = 515

# Seed Python's `random`, NumPy's legacy global RNG, and PyTorch in turn.
for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(SEED)

# Ask cuDNN to pick deterministic kernels so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

## Preparing Data

The dataset is the Universal Dependencies English Web Treebank (UDPOS).  
Each token is annotated with two different sets of tags: [universal dependency (UD) tags](https://universaldependencies.org/u/pos/) and [Penn Treebank (PTB) tags](https://www.sketchengine.eu/penn-treebank-tagset/).  

In [2]:
from torchtext.data import Field, BucketIterator

# Word field: tokens are lower-cased, and each batch comes back as a
# (padded tensor, lengths) pair because `include_lengths` is set.
TEXT = Field(include_lengths=True, lower=True)

# The tag inventories are closed sets, so no unknown token is reserved for them.
# Both tag fields share the same configuration.
UD_TAGS, PTB_TAGS = [Field(include_lengths=True, unk_token=None) for _ in range(2)]

In [3]:
from torchtext.datasets import UDPOS

# Pair every column name of the dataset with the Field that processes it.
fields = list(zip(('text', 'udtags', 'ptbtags'), (TEXT, UD_TAGS, PTB_TAGS)))

# Load the train/validation/test splits, storing the corpus under `data/`.
train_data, valid_data, test_data = UDPOS.splits(root='data/', fields=fields)

In [4]:
# Show the tokens and both tag sequences of the first training example.
first_example = train_data[0]
for column in ('text', 'udtags', 'ptbtags'):
    print(getattr(first_example, column))

['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.']
['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']
['NNP', 'HYPH', 'NNP', ':', 'JJ', 'NNS', 'VBD', 'NNP', 'NNP', 'NNP', 'HYPH', 'NNP', ',', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'NNP', ',', 'IN', 'DT', 'JJ', 'NN', '.']


In [5]:
# Word vocabulary: keep tokens seen at least twice in the training set and
# attach pre-trained GloVe vectors; `unk_init` is set to a normal-distribution
# initialiser (presumably used for words without a pre-trained vector — see
# the torchtext Vocab documentation).
TEXT.build_vocab(
    train_data,
    min_freq=2,
    vectors="glove.6B.100d",
    vectors_cache="vector_cache",
    unk_init=torch.Tensor.normal_,
)

# Tag vocabularies are built from the training set with default settings.
for tag_field in (UD_TAGS, PTB_TAGS):
    tag_field.build_vocab(train_data)

# Vocabulary sizes: (words, UD tags, PTB tags).
tuple(len(vocab_field.vocab) for vocab_field in (TEXT, UD_TAGS, PTB_TAGS))

(8866, 18, 51)

In [6]:
# Number of sentences per mini-batch.
BATCH_SIZE = 128

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; batches are placed directly on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)

In [7]:
# Pull a single batch from the training iterator and stop immediately.
# Each field was declared with `include_lengths=True`, so it unpacks into
# a (padded tensor, lengths) pair.
for batch in train_iterator:
    (batch_text, batch_text_lens), (batch_tags, batch_tags_lens) = batch.text, batch.udtags
    break

# Display the token indices, their lengths, the tag indices, and their lengths.
for item in (batch_text, batch_text_lens, batch_tags, batch_tags_lens):
    print(item)

tensor([[  27,   56,  116,  ...,  127,    9, 3715],
        [  12,  244,    4,  ...,    4,   76,    1],
        [  73,   13,    1,  ...,    1, 1904,    1],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
tensor([19, 16,  2, 20, 44, 11, 29, 13, 10, 38, 22, 71, 17,  7, 15, 12,  7, 10,
        12, 29, 20,  5, 42, 20, 25, 11, 11,  4, 22, 16, 31, 28,  2, 24, 60, 18,
         4,  7,  4, 17, 26, 38, 34,  5,  2,  6,  1,  4, 23, 24, 33,  9, 16,  1,
        20, 27, 26, 23, 20, 13, 14, 20, 29, 14,  7, 13,  6, 23, 15, 11, 14, 27,
        31, 18,  2, 38, 52,  2,  2,  5,  7, 22,  7, 12, 16, 12,  5, 42, 18, 19,
        15,  8, 11, 13,  3, 33,  7,  4,  7,  1, 25, 48, 20, 11,  2, 26, 22, 19,
        21,  4, 12,  9, 33, 16, 15, 25, 10, 36,  3,  9,  5, 20, 17, 14,  4,  2,
        19,  1], device='cuda:0')
tensor([[14, 13,  8,  ...,  1,  4,  7],
        [ 4,  1,  2,  .

## Building the Model

A Seq2Seq model  
* The elements of the two sequences are not aligned one-to-one  
* The two sequences may have different lengths  

A PoS tagger  
* The elements of the two sequences are aligned strictly one-to-one  
* The two sequences always have the same length  