In [0]:
#@title Installation
!pip install --quiet allennlp
!pip install --quiet spacy==2.2.0

In [0]:
#@title Imports
import os

import torch
import torch.nn as nn
import torch.optim as optim

from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer

from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.models import CrfTagger

In [0]:
#@title Load datasets
train_path = "./drive/My Drive/CoNLL2003/train.txt" #@param {type: "string"}
valid_path = "./drive/My Drive/CoNLL2003/valid.txt" #@param {type: "string"}

# A single reader handles both splits. With the delimiters configured here,
# tokens are separated by a space and each token is joined to its tag by '###'.
reader = SequenceTaggingDatasetReader(token_delimiter=' ', word_tag_delimiter='###')

# Read the training split first, then the validation split.
train_dataset, valid_dataset = reader.read(train_path), reader.read(valid_path)

# The vocabulary is built from the instances of both splits combined.
vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

In [0]:
#@title Build a BiLSTM-CRF
EMBED_DIM = 256 #@param {type: "number"}
HIDDEN_DIM = 256 #@param{type: "number"}

# Token embedding table sized to the 'tokens' namespace of the vocabulary,
# wrapped in a text-field embedder under the 'tokens' key.
token_embedding = Embedding(
  embedding_dim=EMBED_DIM,
  num_embeddings=vocab.get_vocab_size('tokens'),
)
word_embeddings = BasicTextFieldEmbedder({'tokens': token_embedding})

# Bidirectional, batch-first LSTM over the embedded tokens, adapted to
# AllenNLP's Seq2Seq encoder interface via the wrapper.
bilstm = nn.LSTM(
  input_size=EMBED_DIM,
  hidden_size=HIDDEN_DIM,
  bidirectional=True,
  batch_first=True,
)
encoder = PytorchSeq2SeqWrapper(bilstm)

# CRF tagger assembled from the embedder and encoder above; every other
# CrfTagger option is left at its library default.
model = CrfTagger(vocab=vocab, text_field_embedder=word_embeddings, encoder=encoder)

In [0]:
#@title Set a Trainer
BATCH_SIZE = 32 #@param {type: "number"}
LR = 0.1 #@param {type: "number"}
PATIENCE = 10 #@param {type: "number"}
MAX_EPOCH = 50 #@param {type: "number"}
GRAD_NORM = 1.0 #@param {type: "number"}

# Pick the device and move the model BEFORE building the optimizer: the
# PyTorch optimizer docs recommend calling `.cuda()` first so the optimizer
# is constructed over the parameters that will actually be trained.
# `cuda_device` is 0 when a GPU is available and -1 otherwise (passed to the
# Trainer below; -1 is AllenNLP's convention for CPU).
if torch.cuda.is_available():
  cuda_device = 0
  model = model.cuda(cuda_device)
else:
  cuda_device = -1

# Plain SGD over all model parameters with learning rate LR.
optimizer = optim.SGD(model.parameters(), lr=LR)

# Batches of BATCH_SIZE instances, sorted on the `num_tokens` key of the
# `tokens` field. The iterator needs the vocabulary to index the instances.
iterator = BucketIterator(
  batch_size=BATCH_SIZE,
  sorting_keys=[("tokens", "num_tokens")],
)
iterator.index_with(vocab)

# NOTE(review): `patience` presumably controls early stopping on the
# validation set and `grad_norm` gradient-norm rescaling -- confirm against
# the Trainer documentation for the installed AllenNLP version.
trainer = Trainer(
  model=model,
  optimizer=optimizer,
  iterator=iterator,
  train_dataset=train_dataset,
  validation_dataset=valid_dataset,
  patience=PATIENCE,
  num_epochs=MAX_EPOCH,
  grad_norm=GRAD_NORM,
  cuda_device=cuda_device,
)

In [0]:
#@title Run training
# Run the training loop configured on `trainer` in the previous cell.
# Because this call is the last expression in the cell, whatever `train()`
# returns is rendered as the cell's output.
trainer.train()

In [0]:
#@title Save a model
save_dir = './drive/My Drive/CoNLL2003' #@param{type: "string"}
file_name = 'bilstm-crf' #@param{type: "string"}

# `makedirs(..., exist_ok=True)` creates any missing parent directories and
# is a no-op when the directory already exists, so there is no separate
# existence check that could race or fail on a nested path.
os.makedirs(save_dir, exist_ok=True)

# Model weights go to `<save_dir>/<file_name>.th`; the vocabulary is written
# next to them under `<save_dir>/<file_name>.vocab`.
weights_path = os.path.join(save_dir, f'{file_name}.th')
vocab_dir = os.path.join(save_dir, f'{file_name}.vocab')

with open(weights_path, 'wb') as f:
  torch.save(model.state_dict(), f)
vocab.save_to_files(vocab_dir)