In [12]:
from flair.data import Corpus
import flair
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
from typing import List
from flair.embeddings import CamembertEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
# Step 1 - load the French WikiNER corpus through flair and keep only a
# 0.1 (10%) downsample of it.
# NOTE(review): the recorded log reports "Dev: None" / "Test: None" on disk
# while the corpus still has dev and test sentences, so those splits are
# presumably carved out of the train file by flair - confirm.
# NOTE(review): no random seed is set in the visible cells; if the sampling
# is random the selected subset will differ between runs - confirm.
corpus: Corpus = (
    flair.datasets.WIKINER_FRENCH()
    .downsample(0.1)
)

In [None]:
# Step 2 - name of the annotation layer the tagger will be trained to predict.
# The same value is reused to build the tag dictionary and the tagger.
tag_type: str = 'ner'

In [None]:
# Step 3 - ask the corpus for the dictionary of tags of the chosen layer;
# it is handed to the sequence tagger when the model is constructed.
tag_dictionary = corpus.make_tag_dictionary(
    tag_type=tag_type,
)

In [None]:
# Step 4 - set up the token embeddings.
# Only one embedding is used: a transformer word embedding built from the
# 'camembert-base' model name. It is still placed in a list and wrapped in
# StackedEmbeddings, which is the object passed to the tagger.
embedding_types: List[TokenEmbeddings] = [TransformerWordEmbeddings('camembert-base')]

embeddings: StackedEmbeddings = StackedEmbeddings(
    embeddings=embedding_types,
)

In [None]:
# Step 5 - build the sequence tagger.
# Inputs: the stacked embeddings, the tag dictionary and the tag layer name
# defined in the previous steps. Hyper-parameters: hidden size of 256 and
# use_crf=True.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
    use_crf=True,
)

In [None]:
# Step 6 - create the trainer from the tagger (first argument) and the
# corpus it will be trained and evaluated on (second argument).
trainer: ModelTrainer = ModelTrainer(
    tagger,
    corpus,
)

In [4]:
# Step 7 - launch training.
# Outputs are written under 'resources/taggers/example-ner-camembert'
# (logged as the "Model training base path"). The run uses a learning rate
# of 0.1, mini-batches of 32 and at most 3 epochs.
# NOTE(review): checkpoint=True presumably makes the trainer save a
# resumable checkpoint - confirm against the flair documentation.
# The call is left as the last bare expression so that its return value
# (a dict with the test score and the per-epoch loss/score histories)
# is rendered as the cell output.
trainer.train(
    'resources/taggers/example-ner-camembert',
    max_epochs=3,
    mini_batch_size=32,
    learning_rate=0.1,
    checkpoint=True,
)

2020-12-06 14:06:14,027 Reading data from /Users/amalbedoui/.flair/datasets/wikiner_french
2020-12-06 14:06:14,027 Train: /Users/amalbedoui/.flair/datasets/wikiner_french/aij-wikiner-fr-wp3.train
2020-12-06 14:06:14,028 Dev: None
2020-12-06 14:06:14,029 Test: None
2020-12-06 14:06:26,464 ----------------------------------------------------------------------------------------------------
2020-12-06 14:06:26,466 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): CamembertModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(32005, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (

2020-12-06 14:06:26,468 ----------------------------------------------------------------------------------------------------
2020-12-06 14:06:26,469 Corpus: "Corpus: 10713 train + 1190 dev + 1323 test sentences"
2020-12-06 14:06:26,470 ----------------------------------------------------------------------------------------------------
2020-12-06 14:06:26,470 Parameters:
2020-12-06 14:06:26,471  - learning_rate: "0.1"
2020-12-06 14:06:26,472  - mini_batch_size: "32"
2020-12-06 14:06:26,473  - patience: "3"
2020-12-06 14:06:26,474  - anneal_factor: "0.5"
2020-12-06 14:06:26,474  - max_epochs: "3"
2020-12-06 14:06:26,475  - shuffle: "True"
2020-12-06 14:06:26,476  - train_with_dev: "False"
2020-12-06 14:06:26,477  - batch_growth_annealing: "False"
2020-12-06 14:06:26,478 ----------------------------------------------------------------------------------------------------
2020-12-06 14:06:26,478 Model training base path: "resources/taggers/example-ner-camembert"
2020-12-06 14:06:26,479 ----

{'test_score': 0.8351297405189619,
 'dev_score_history': [0.8262095899327403,
  0.8371689101172383,
  0.8550347222222222],
 'train_loss_history': [5.999284717929897,
  2.484362497614391,
  1.955211645809572],
 'dev_loss_history': [2.063162088394165,
  1.7164278030395508,
  1.3991682529449463]}

In [8]:
# Load a tagger from the 'best-model.pt' file located under the training
# base path used by the training cell.
model = SequenceTagger.load(
    'resources/taggers/example-ner-camembert/best-model.pt'
)

2020-12-08 13:43:06,222 loading file resources/taggers/example-ner-camembert/best-model.pt


In [39]:
# Build the example input: a short French text mentioning a person, a street
# address, a postal code and city, a country, a birth date, an employer and a
# phone number.
sentence = Sentence(
    'Emma Louise, habite au 26 rue Alexandre, 75005 Paris, France, née le 11/11/1993. Elle travaille chez Zenika et elle est joignable sur 06660006.'
)

In [42]:
%%time
# Run the loaded tagger on the example sentence and time the whole cell
# (the %%time cell magic has to stay on the first line of the cell).
# The predicted tags are read back from `sentence` itself in the next cell,
# so the return value of predict() is not used here.
model.predict(sentence)

CPU times: user 172 ms, sys: 9.91 ms, total: 182 ms
Wall time: 177 ms


In [43]:
# Render the sentence as text with each predicted tag shown in angle
# brackets after its token (e.g. "Emma <B-PER>"), then print it.
tagged_text = sentence.to_tagged_string()
print(tagged_text)

Emma <B-PER> Louise <E-PER> , habite au 26 rue <B-LOC> Alexandre <E-LOC> , 75005 <B-LOC> Paris <E-LOC> , France <S-LOC> , née le 11 / 11 / 1993 . Elle travaille chez Zenika <S-ORG> et elle est joignable sur 06660006 .
