In [1]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.data_fetcher import NLPTaskDataFetcher
from flair.data import Corpus
from typing import List
from flair.embeddings import (
    TokenEmbeddings,
    WordEmbeddings,
    StackedEmbeddings,
    FlairEmbeddings,
    TransformerWordEmbeddings,
    CharacterEmbeddings,)
from flair.training_utils import EvaluationMetric
from flair.visual.training_curves import Plotter
# from flair.datasets import ClassificationCorpus doesnt work with this
from flair.datasets import ColumnCorpus

# this is the folder in which train, test and dev files reside
data_folder = './corpus'

# define columns
columns = {0: 'text', 1: 'iso'}

# load corpus containing training, test and dev data
corpus: Corpus = ColumnCorpus(data_folder, columns,
                                      test_file='test.txt', 
                                      train_file='train.txt')
print(corpus)

# 2. what tag do we want to predict?
tag_type = "iso"


# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    #WordEmbeddings("glove"),
    TransformerWordEmbeddings('distilbert-base-uncased', fine_tune=True),
    # comment in this line to use character embeddings
    # CharacterEmbeddings(),
    # comment in these lines to use contextual string embeddings
    #
    # FlairEmbeddings('news-forward'),
    #
    # FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train(
    "resources2/taggers/example-iso",
    learning_rate=0.5,
    mini_batch_size=32,
    max_epochs=3,
)

2021-02-06 14:28:02,210 Reading data from corpus
2021-02-06 14:28:02,214 Train: corpus\train.txt
2021-02-06 14:28:02,215 Dev: None
2021-02-06 14:28:02,215 Test: corpus\test.txt
Corpus: 1003 train + 112 dev + 289 test sentences
[b'<unk>', b'O', b'PLACE', b'SPATIAL_ENTITY', b'NONMOTION_EVENT', b'SPATIAL_SIGNAL', b'MOTION', b'MOTION_SIGNAL', b'PATH', b'MEASURE', b'<START>', b'<STOP>']
2021-02-06 14:28:12,752 ----------------------------------------------------------------------------------------------------
2021-02-06 14:28:12,764 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transforme

{'test_score': 0.0549,
 'dev_score_history': [],
 'train_loss_history': [],
 'dev_loss_history': []}