In [20]:
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from icecream import ic

In [21]:
# Column layout handed to ColumnCorpus below: column index -> annotation name
# (index 0 is 'text', index 1 is 'ner').
columns = dict(enumerate(("text", "ner")))

In [22]:
# Directory that contains the three column-formatted split files.
data_folder = "./training_test"

# Load the train/dev/test splits using the column layout defined in `columns`.
corpus = ColumnCorpus(
    data_folder,
    column_format=columns,
    train_file="train.txt",
    dev_file="dev.txt",
    test_file="test.txt",
)

2024-12-16 06:03:19,751 Reading data from training_test
2024-12-16 06:03:19,754 Train: training_test/train.txt
2024-12-16 06:03:19,755 Dev: training_test/dev.txt
2024-12-16 06:03:19,757 Test: training_test/test.txt


In [23]:
# One classic word embedding plus a forward and a backward Flair embedding.
glove_embedding = WordEmbeddings("glove")
flair_forward_embedding = FlairEmbeddings("news-forward")
flair_backward_embedding = FlairEmbeddings("news-backward")

# Order matters for the stacked representation: GloVe, forward, backward.
embedding_types = [
    glove_embedding,
    flair_forward_embedding,
    flair_backward_embedding,
]

# Combine the three embeddings into the single embedding object the tagger uses.
embeddings = StackedEmbeddings(embeddings=embedding_types)

In [24]:
# Collect the set of 'ner' labels that occur in the corpus.
# NOTE(review): the recorded run logged a label 'TITLE' seen only once next to
# 'ART_TITLE' (seen 62 times) - possibly an annotation typo in the data files;
# verify before relying on per-class scores.
ner_label_dictionary = corpus.make_label_dictionary(label_type="ner")

# Sequence tagger over the stacked embeddings, with a CRF enabled and a
# hidden size of 256, predicting the 'ner' label type.
tagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=ner_label_dictionary,
    tag_type="ner",
    hidden_size=256,
    use_crf=True,
)

2024-12-16 06:03:25,630 Computing label dictionary. Progress:


0it [00:00, ?it/s]
63it [00:00, 9725.47it/s]

2024-12-16 06:03:25,646 Dictionary created for label 'ner' with 6 values: DIMENSION (seen 64 times), MEDIUM (seen 63 times), ARTIST (seen 63 times), DATE (seen 63 times), ART_TITLE (seen 62 times), TITLE (seen 1 times)
2024-12-16 06:03:25,647 SequenceTagger predicts: Dictionary with 25 tags: O, S-DIMENSION, B-DIMENSION, E-DIMENSION, I-DIMENSION, S-MEDIUM, B-MEDIUM, E-MEDIUM, I-MEDIUM, S-ARTIST, B-ARTIST, E-ARTIST, I-ARTIST, S-DATE, B-DATE, E-DATE, I-DATE, S-ART_TITLE, B-ART_TITLE, E-ART_TITLE, I-ART_TITLE, S-TITLE, B-TITLE, E-TITLE, I-TITLE





In [25]:
# Bind the tagger to the corpus; the actual training run is started in the next cell.
trainer = ModelTrainer(model=tagger, corpus=corpus)

In [26]:
# Hyperparameters for this training run, named so they are easy to find and tune.
# NOTE(review): the recorded run reports a micro F-score of about 0.01 on the test
# split with these settings - the cause is not established here; revisit the
# epoch count / data before trusting the model.
LEARNING_RATE = 0.1
MINI_BATCH_SIZE = 32
MAX_EPOCHS = 10

# Train and store the model files under ./ner-model. The call is deliberately the
# last expression of the cell so that its return value is displayed as cell output.
trainer.train(
    "./ner-model",
    learning_rate=LEARNING_RATE,
    mini_batch_size=MINI_BATCH_SIZE,
    max_epochs=MAX_EPOCHS,
)

2024-12-16 06:03:25,904 ----------------------------------------------------------------------------------------------------
2024-12-16 06:03:25,906 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=27, bias=True)
  (loss_f



2024-12-16 06:03:27,737 epoch 1 - iter 1/2 - loss 3.91807073 - time (sec): 1.79 - samples/sec: 227.19 - lr: 0.100000 - momentum: 0.000000
2024-12-16 06:03:29,655 epoch 1 - iter 2/2 - loss 3.73599163 - time (sec): 3.71 - samples/sec: 214.84 - lr: 0.100000 - momentum: 0.000000
2024-12-16 06:03:29,657 ----------------------------------------------------------------------------------------------------
2024-12-16 06:03:29,659 EPOCH 1 done: loss 3.7360 - lr: 0.100000
2024-12-16 06:03:29,661  - 0 epochs without improvement
2024-12-16 06:03:29,665 saving best model
2024-12-16 06:03:31,648 ----------------------------------------------------------------------------------------------------
2024-12-16 06:03:32,308 epoch 2 - iter 1/2 - loss 3.26421360 - time (sec): 0.66 - samples/sec: 611.37 - lr: 0.100000 - momentum: 0.000000
2024-12-16 06:03:32,948 epoch 2 - iter 2/2 - loss 3.13899627 - time (sec): 1.30 - samples/sec: 614.08 - lr: 0.100000 - momentum: 0.000000
2024-12-16 06:03:32,951 -----------

100%|██████████| 1/1 [00:01<00:00,  1.81s/it]

2024-12-16 06:03:49,618 
Results:
- F-score (micro) 0.0118
- F-score (macro) 0.0065
- Accuracy 0.0068

By class:
              precision    recall  f1-score   support

   DIMENSION     0.0308    0.0526    0.0388        38
        DATE     0.0000    0.0000    0.0000        38
      ARTIST     0.0000    0.0000    0.0000        38
   ART_TITLE     0.0000    0.0000    0.0000        36
      MEDIUM     0.0000    0.0000    0.0000        38
       TITLE     0.0000    0.0000    0.0000         1

   micro avg     0.0133    0.0106    0.0118       189
   macro avg     0.0051    0.0088    0.0065       189
weighted avg     0.0062    0.0106    0.0078       189

2024-12-16 06:03:49,619 ----------------------------------------------------------------------------------------------------





{'test_score': 0.011799410029498525}

In [27]:
# Reload the tagger from disk; the file lives in the ./ner-model directory that
# was passed to trainer.train().
final_model_path = "./ner-model/final-model.pt"
model = SequenceTagger.load(final_model_path)

2024-12-16 06:03:51,370 SequenceTagger predicts: Dictionary with 27 tags: O, S-DIMENSION, B-DIMENSION, E-DIMENSION, I-DIMENSION, S-MEDIUM, B-MEDIUM, E-MEDIUM, I-MEDIUM, S-ARTIST, B-ARTIST, E-ARTIST, I-ARTIST, S-DATE, B-DATE, E-DATE, I-DATE, S-ART_TITLE, B-ART_TITLE, E-ART_TITLE, I-ART_TITLE, S-TITLE, B-TITLE, E-TITLE, I-TITLE, <START>, <STOP>


In [30]:
# Smoke-test the reloaded tagger on one hand-written artwork caption.
# `ic` comes from the notebook's import cell at the top, so every dependency is
# declared in one place and a fresh "Restart & Run All" fails early if it is missing.
test_string = "'The Kiss' by Gustaf Klimt; 1908 - oil and gold leaf on canvas, 180x180 cm"
test_sentence = Sentence(test_string)

# The return value is not used: the predicted spans are read back from
# `test_sentence` below.
model.predict(test_sentence)

# Print text, character offsets and tag for every predicted 'ner' span.
# Keep the loop variable named `entity`: icecream echoes the argument expression
# (e.g. "ic| entity.text: ...") as part of its output.
for entity in test_sentence.get_spans("ner"):
    ic(entity.text)
    ic(entity.start_position)
    ic(entity.end_position)
    ic(entity.tag)

ic| entity.text: "'The Kiss'"
ic| entity.start_position: 0
ic| entity.end_position: 10
ic| entity.tag: 'ART_TITLE'
ic| entity.text: 'by'
ic| entity.start_position: 11
ic| entity.end_position: 13
ic| entity.tag: 'ART_TITLE'
ic| entity.text: 'Gustaf Klimt'
ic| entity.start_position: 14
ic| entity.end_position: 26
ic| entity.tag: 'ARTIST'
ic| entity.text: 'oil and'
ic| entity.start_position: 35
ic| entity.end_position: 42
ic| entity.tag: 'MEDIUM'
ic| entity.text: 'gold leaf on canvas,'
ic| entity.start_position: 43
ic| entity.end_position: 63
ic| entity.tag: 'MEDIUM'
ic| entity.text: '180x180 cm'
ic| entity.start_position: 64
ic| entity.end_position: 74
ic| entity.tag: 'DIMENSION'
