In [1]:
import random
from typing import List

import flair
import torch
from flair.data import Corpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
from flair.embeddings import CamembertEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
# 1. get the corpus
# WikiNER French, reduced in size with downsample(0.1).
# The subset is drawn at random, so the random number generators are seeded
# first; without a seed every fresh kernel would train and evaluate on a
# different sample and the reported scores could not be compared across runs.
# NOTE(review): assumes flair samples through Python's `random` module and/or
# torch's global generator -- confirm against the installed flair version.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
corpus: Corpus = flair.datasets.WIKINER_FRENCH().downsample(0.1)

In [None]:
# 2. what tag do we want to predict?
# The same label name is used below both to build the tag dictionary and to
# configure the sequence tagger.
tag_type: str = 'ner'

In [None]:
# 3. make the tag dictionary from the corpus
# Built for `tag_type` only; the result is later passed to the SequenceTagger
# as its `tag_dictionary` argument.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [None]:
# 4. initialize embeddings
# One TransformerWordEmbeddings is created per checkpoint name, in the order
# listed, and the resulting list is then wrapped in a single StackedEmbeddings.
embedding_types: List[TokenEmbeddings] = [
    TransformerWordEmbeddings(model_name)
    for model_name in ('camembert-base', 'illuin/lepetit')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
# 5. initialize sequence tagger
# The tagger consumes the stacked embeddings and predicts `tag_type` labels
# drawn from `tag_dictionary`; it is configured with a hidden size of 256 and
# with the CRF option switched on.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
    use_crf=True,
)

In [None]:
# 6. initialize trainer
# Pairs the tagger with the corpus; nothing is trained until trainer.train()
# is called in the next step.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [2]:
# 7. start training
# Output goes under the given base path. The call is deliberately left as the
# cell's final bare expression so that the result dict it returns (test score
# plus dev-score / loss histories) is displayed as the cell output.
# NOTE(review): checkpoint=True presumably also writes a resumable checkpoint
# into the base path -- confirm in the flair ModelTrainer documentation.
trainer.train(
    'resources/taggers/example-ner-combined-cam+lep',
    max_epochs=3,
    mini_batch_size=32,
    learning_rate=0.1,
    checkpoint=True,
)


2020-12-07 08:56:19,461 Reading data from /Users/amalbedoui/.flair/datasets/wikiner_french
2020-12-07 08:56:19,462 Train: /Users/amalbedoui/.flair/datasets/wikiner_french/aij-wikiner-fr-wp3.train
2020-12-07 08:56:19,462 Dev: None
2020-12-07 08:56:19,463 Test: None
2020-12-07 08:56:35,289 ----------------------------------------------------------------------------------------------------
2020-12-07 08:56:35,293 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): CamembertModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(32005, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (

2020-12-07 08:56:35,295 ----------------------------------------------------------------------------------------------------
2020-12-07 08:56:35,296 Corpus: "Corpus: 10713 train + 1190 dev + 1323 test sentences"
2020-12-07 08:56:35,296 ----------------------------------------------------------------------------------------------------
2020-12-07 08:56:35,297 Parameters:
2020-12-07 08:56:35,298  - learning_rate: "0.1"
2020-12-07 08:56:35,299  - mini_batch_size: "32"
2020-12-07 08:56:35,299  - patience: "3"
2020-12-07 08:56:35,300  - anneal_factor: "0.5"
2020-12-07 08:56:35,301  - max_epochs: "3"
2020-12-07 08:56:35,301  - shuffle: "True"
2020-12-07 08:56:35,302  - train_with_dev: "False"
2020-12-07 08:56:35,303  - batch_growth_annealing: "False"
2020-12-07 08:56:35,303 ----------------------------------------------------------------------------------------------------
2020-12-07 08:56:35,304 Model training base path: "resources/taggers/example-ner-combined-cam+lep"
2020-12-07 08:56:35,3

{'test_score': 0.8222137983320699,
 'dev_score_history': [0.758683314415437,
  0.8013856812933026,
  0.8147465437788018],
 'train_loss_history': [6.677065119814517,
  3.0292117912377883,
  2.394785000672981],
 'dev_loss_history': [2.668398141860962,
  1.9775595664978027,
  1.743804931640625]}

In [None]:
# Reload a tagger from the best-model.pt file inside the base path that was
# used for training above.
# NOTE(review): best-model.pt is presumably the epoch with the best dev score;
# confirm it is actually written for this training configuration.
model: SequenceTagger = SequenceTagger.load('resources/taggers/example-ner-combined-cam+lep/best-model.pt')

In [5]:
# create example sentence
# A French test input mixing a person name, a street address, a postcode and
# city, a country, a date, a company name and a phone-like number.
sentence: Sentence = Sentence('Emma Louise, habite au 26 rue Alexandre, 75005 Paris, France, née le 11/11/1993. Elle travaille chez Zenika et elle est joignable sur 06660006.')

In [13]:
%%time
# Predict tags for the example sentence; the predictions are attached to
# `sentence` itself and are printed by the following cell, not by this one.
# The %%time cell magic above reports how long the call takes.
model.predict(sentence)

CPU times: user 209 ms, sys: 12.8 ms, total: 222 ms
Wall time: 218 ms


In [9]:
# Show the sentence as text, with each predicted tag written inline in angle
# brackets right after the token it belongs to.
print(sentence.to_tagged_string())

Emma <B-PER> Louise <E-PER> , habite au 26 rue <B-LOC> Alexandre <E-LOC> , 75005 Paris <S-LOC> , France <S-LOC> , née le 11 / 11 / 1993 . Elle travaille chez Zenika <S-ORG> et elle est joignable sur 06660006 .
