In [37]:
from anntools import Collection
from pathlib import Path
from flair.data import Sentence, Corpus
import numpy as np


In [6]:
c = Collection()
for fname in Path("../2021/training/").rglob("*.txt"):
    c.load(fname)

In [39]:
#
def brat2flair(brat):
    out=[]
    for s in brat:
        sent=Sentence(s.text)
        for t in sent:
            for k in s.keyphrases:
                if t.text in k.text:
                    t.add_tag('ner', k.label)
        out.append(sent)
    return out

x = np.split(c, [int(.8 * len(c)), int(.9 * len(c))])
sentences_train=brat2flair(x[0])
sentences_dev=brat2flair(x[1])
sentences_test=brat2flair(x[2])
corpus: Corpus = Corpus(sentences_train, sentences_dev, sentences_test)

In [62]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
import torch
from torch.optim.lr_scheduler import OneCycleLR


# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

print(tag_dictionary)

# 4. initialize fine-tuneable transformer embeddings WITH document context
from flair.embeddings import TransformerWordEmbeddings

embeddings = TransformerWordEmbeddings(
    #model='distilbert-base-uncased',
    model='xlm-roberta-base',
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=True,
)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

#tagger: SequenceTagger = SequenceTagger(hidden_size=256,
#                                        embeddings=embeddings,
#                                        tag_dictionary=tag_dictionary,
#                                        tag_type=tag_type,
#                                         use_crf=True)

from flair.models import SequenceTagger
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.AdamW)

# 7. start training
trainer.train('resources/taggers/ner-roberta-large',
             learning_rate=5.0e-5,
              mini_batch_size=4,
              mini_batch_chunk_size=1,
              max_epochs=1,
              scheduler=OneCycleLR,
              embeddings_storage_mode='none',
              weight_decay=0.,
              )



Dictionary with 9 tags: <unk>, O, , Action, Concept, Predicate, Reference, <START>, <STOP>
2021-03-18 14:53:39,287 ----------------------------------------------------------------------------------------------------
2021-03-18 14:53:39,292 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_fea

{'test_score': 0.7899,
 'dev_score_history': [0.7806],
 'train_loss_history': [0.6464005837837855],
 'dev_loss_history': [0.5483847260475159]}