In [1]:
from flair.data import Corpus
import flair
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
from typing import List
from flair.embeddings import CamembertEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
# 1. get the corpus
# Seed the random number generators before subsampling so that a
# "Restart Kernel -> Run All" selects the same sentences (and therefore
# reports comparable scores) on every run.
# NOTE(review): which generator downsample() draws from is not visible in
# this notebook, so both Python's and torch's are seeded -- confirm against
# the flair source.
import random

import torch

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Load the French WikiNER dataset and downsample it with a factor of 0.1.
corpus: Corpus = flair.datasets.WIKINER_FRENCH().downsample(0.1)

In [None]:
# 2. choose which tag type the model should learn to predict
tag_type = "ner"

In [None]:
# 3. build the dictionary of `tag_type` tags that occur in the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type)

In [None]:
# 4. initialize embeddings
# One transformer word embedding per pretrained model name, created in the
# order listed.
embedding_types: List[TokenEmbeddings] = [
    TransformerWordEmbeddings(model_name)
    for model_name in (
        "camembert-base",
        "illuin/lepetit",
        "flaubert/flaubert_large_cased",
    )
]

# Combine the individual embeddings into a single stacked embedding.
embeddings: StackedEmbeddings = StackedEmbeddings(embedding_types)

In [None]:
# 5. initialize sequence tagger
# The tagger is built on the stacked embeddings and predicts `tag_type`
# labels drawn from `tag_dictionary`; CRF decoding is switched on.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
    use_crf=True,
)

In [None]:
# 6. initialize trainer
# Pair the tagger with the corpus it will be trained on.
trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus)

In [2]:
# 7. start training
# Output is written under the base path given as the first argument.
# The call is left as the cell's last expression so that its return value
# is displayed as the cell output.
trainer.train(
    "resources/taggers/example-ner-combined",
    max_epochs=3,
    mini_batch_size=32,
    learning_rate=0.1,
    checkpoint=True,
)

2020-12-06 22:42:05,501 Reading data from /Users/amalbedoui/.flair/datasets/wikiner_french
2020-12-06 22:42:05,501 Train: /Users/amalbedoui/.flair/datasets/wikiner_french/aij-wikiner-fr-wp3.train
2020-12-06 22:42:05,501 Dev: None
2020-12-06 22:42:05,502 Test: None
2020-12-06 22:42:32,627 ----------------------------------------------------------------------------------------------------
2020-12-06 22:42:32,636 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): CamembertModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(32005, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (

2020-12-06 22:42:32,639 ----------------------------------------------------------------------------------------------------
2020-12-06 22:42:32,640 Corpus: "Corpus: 10713 train + 1190 dev + 1323 test sentences"
2020-12-06 22:42:32,642 ----------------------------------------------------------------------------------------------------
2020-12-06 22:42:32,643 Parameters:
2020-12-06 22:42:32,644  - learning_rate: "0.1"
2020-12-06 22:42:32,644  - mini_batch_size: "32"
2020-12-06 22:42:32,645  - patience: "3"
2020-12-06 22:42:32,646  - anneal_factor: "0.5"
2020-12-06 22:42:32,647  - max_epochs: "3"
2020-12-06 22:42:32,648  - shuffle: "True"
2020-12-06 22:42:32,649  - train_with_dev: "False"
2020-12-06 22:42:32,649  - batch_growth_annealing: "False"
2020-12-06 22:42:32,650 ----------------------------------------------------------------------------------------------------
2020-12-06 22:42:32,651 Model training base path: "resources/taggers/example-ner-combined"
2020-12-06 22:42:32,652 -----

{'test_score': 0.7504445761707172,
 'dev_score_history': [0.6179429275302813,
  0.7025683512841755,
  0.7682775712515489],
 'train_loss_history': [12.647268992751393,
  6.36160932939444,
  4.675048357693117],
 'dev_loss_history': [6.80029296875, 4.265444278717041, 3.039752244949341]}

In [None]:
# Load the `best-model.pt` file from the training base path for inference.
model = SequenceTagger.load(
    "resources/taggers/example-ner-combined/best-model.pt"
)

In [5]:
# Build the French example sentence that the loaded tagger will be run on.
sentence = Sentence(
    "Emma Louise, habite au 26 rue Alexandre, 75005 Paris, France, née le 11/11/1993. Elle travaille chez Zenika et elle est joignable sur 06660006."
)

In [6]:
%%time
# predict tags and print
model.predict(sentence)

CPU times: user 1.04 s, sys: 3.1 s, total: 4.14 s
Wall time: 8 s


In [7]:
# Print the sentence text with each predicted tag shown in angle brackets
# after the token it applies to.
print(sentence.to_tagged_string())

Emma <B-PER> Louise <E-PER> , habite au 26 rue Alexandre <E-PER> , 75005 <B-LOC> Paris <E-LOC> , France <S-LOC> , née le 11 / 11 / 1993 . Elle travaille chez Zenika <S-MISC> et elle est joignable sur 06660006 .
