In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Single seed shared by every random number generator used in this notebook.
SEED = 515

# Seed Python's `random`, NumPy's global RNG and PyTorch, in that order.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# Request deterministic cuDNN kernels so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

## Preparing Data

In [2]:
from torchtext.data import Field, BucketIterator

# Refs: https://github.com/pytorch/text/blob/master/test/sequence_tagging.py

# Word field: tokens are lower-cased, and each batch also carries the
# per-sequence lengths (batch.inputs unpacks to a (tensor, lengths) pair).
INPUTS = Field(include_lengths=True, lower=True)

# Chunk-tag field: no unknown token is reserved, so the tag vocabulary holds
# only the padding symbol plus the tags seen in the data; lengths are included.
CHUNK_TAGS = Field(include_lengths=True, unk_token=None)

In [3]:
from torchtext.datasets import CoNLL2000Chunking

# Column-to-field mapping for the dataset files. The middle column is mapped
# to (None, None), i.e. it is not loaded (presumably the POS tag -- confirm
# against the CoNLL-2000 file layout).
fields = [
    ('inputs', INPUTS),
    (None, None),
    ('tags', CHUNK_TAGS),
]

# Load the train / validation / test splits, storing the files under data/.
train_data, valid_data, test_data = CoNLL2000Chunking.splits(
    root='data/',
    fields=fields,
)

In [4]:
# Sanity check: show the tokens and the chunk tags of the first training example.
first_example = train_data[0]
print(first_example.inputs)
print(first_example.tags)

['construction', 'of', 'apartments', 'and', 'other', 'multi-family', 'dwellings', 'slipped', '2.2', '%', 'to', 'an', 'annual', 'rate', 'of', '1,022,000', 'following', 'a', '3.5', '%', 'decline', 'in', 'august', '.']
['B-NP', 'B-PP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'O']


In [5]:
# Word vocabulary: built from the training split only, keeping tokens with a
# frequency of at least 2 and attaching 100-d GloVe vectors (cached locally in
# vector_cache/). `unk_init` supplies a normal-distribution initialiser for
# tokens that have no pretrained vector.
INPUTS.build_vocab(
    train_data,
    min_freq=2,
    vectors="glove.6B.100d",
    vectors_cache="vector_cache",
    unk_init=torch.Tensor.normal_,
)

# Tag vocabulary: every tag that occurs in the training split.
CHUNK_TAGS.build_vocab(train_data)

# Report both vocabulary sizes, then the index-to-tag table.
print(len(INPUTS.vocab), len(CHUNK_TAGS.vocab))
print(CHUNK_TAGS.vocab.itos)

8389 22
['<pad>', 'I-NP', 'B-NP', 'O', 'B-VP', 'B-PP', 'I-VP', 'B-ADVP', 'B-SBAR', 'B-ADJP', 'I-ADJP', 'B-PRT', 'I-ADVP', 'I-PP', 'I-CONJP', 'I-SBAR', 'B-CONJP', 'B-INTJ', 'B-LST', 'I-INTJ', 'I-UCP', 'B-UCP']


In [6]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 128

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)

In [7]:
# Pull a single batch from the training iterator for inspection.
batch = next(iter(train_iterator))

# Both fields were declared with include_lengths=True, so each attribute
# unpacks into an (index tensor, lengths tensor) pair.
batch_text, batch_text_lens = batch.inputs
batch_tags, batch_tags_lens = batch.tags

# Print the four tensors in the same order as before.
for shown in (batch_text, batch_text_lens, batch_tags, batch_tags_lens):
    print(shown)

tensor([[   0,    3,  166,  ...,    3,    3,    8],
        [  67,  823,  450,  ..., 3203, 2359,   40],
        [   1,   17,   29,  ..., 3570,  334,  103],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
tensor([ 2, 37, 24, 28, 28, 43, 36, 34, 20, 22, 34, 23, 32, 27, 42,  8, 28, 23,
         3, 42, 19, 18, 24, 26, 12, 21, 25, 24, 19, 20, 39,  9, 28, 11,  4, 13,
        19, 40, 26, 26,  7,  9, 30, 26, 21, 35, 18,  8,  9, 25, 12,  6, 40, 18,
        29,  6, 19, 24, 44, 28, 42, 29, 56, 13, 11, 12, 48, 13, 15, 28, 21, 15,
        24,  9,  8, 24, 28, 24, 16, 34, 14, 28, 16,  6, 50, 30, 31, 27, 19, 31,
        10, 19, 16, 19, 28, 22, 18, 14, 32, 14, 29, 11, 28, 13, 29, 55, 37, 14,
        30, 27, 31, 39, 18, 26, 12, 24, 15, 23, 19, 21, 20,  2, 41, 40, 42, 20,
        39, 23], device='cuda:0')
tensor([[2, 2, 2,  ..., 2, 2, 5],
        [3, 1, 1,  ..., 1, 1,

## Building the Model