In [1]:
from typing import List

import flair
from flair.data import Corpus
from flair.data import Sentence
from flair.embeddings import CamembertEmbeddings
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

In [None]:
# 1. get the corpus
# Load the French WikiNER dataset and downsample it with a factor of 0.1
# so the demo trains on a small subset.
corpus: Corpus = (
    flair.datasets.WIKINER_FRENCH()
    .downsample(0.1)
)

In [None]:
# 2. what tag do we want to predict?
# This label type is passed both to the tag-dictionary builder and to the tagger.
tag_type: str = 'ner'

In [None]:
# 3. make the tag dictionary from the corpus
# Built from the (downsampled) corpus for the label type chosen in `tag_type`;
# it is handed to the SequenceTagger as its tag dictionary.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [None]:
# 4. initialize embeddings
# A single transformer model ('illuin/lepetit') is the only embedding source;
# it is still wrapped in a StackedEmbeddings container before being given to the tagger.
embedding_types: List[TokenEmbeddings] = [TransformerWordEmbeddings('illuin/lepetit')]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
# 5. initialize sequence tagger
# Wire together the embeddings, the tag dictionary and the label type defined above.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
    use_crf=True,
)

In [2]:
# 6. initialize trainer
# Bind the tagger to the corpus. Without this step `trainer` is undefined and
# the cell fails with NameError when the notebook is run on a fresh kernel.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
# Model files and logs are written under the base path given as first argument.
# The call is deliberately the last expression of the cell so that the value
# returned by `train` is displayed as the cell output.
trainer.train('resources/taggers/example-ner-lepetit',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=3,
              checkpoint=True)



2020-12-06 22:05:19,155 Reading data from /Users/amalbedoui/.flair/datasets/wikiner_french
2020-12-06 22:05:19,156 Train: /Users/amalbedoui/.flair/datasets/wikiner_french/aij-wikiner-fr-wp3.train
2020-12-06 22:05:19,156 Dev: None
2020-12-06 22:05:19,156 Test: None


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=507.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=810912.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=210.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=71917496.0), HTML(value='')))


2020-12-06 22:05:39,983 ----------------------------------------------------------------------------------------------------
2020-12-06 22:05:39,985 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): CamembertModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(32005, 256, padding_idx=1)
          (position_embeddings): Embedding(514, 256, padding_idx=1)
          (token_type_embeddings): Embedding(1, 256)
          (LayerNorm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0): RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=256, out_features=256, bias=True)
                  (key): Linear(in_features=256, out_features=256, bias=Tru

2020-12-06 22:05:39,986 ----------------------------------------------------------------------------------------------------
2020-12-06 22:05:39,987 Corpus: "Corpus: 10713 train + 1190 dev + 1323 test sentences"
2020-12-06 22:05:39,988 ----------------------------------------------------------------------------------------------------
2020-12-06 22:05:39,988 Parameters:
2020-12-06 22:05:39,989  - learning_rate: "0.1"
2020-12-06 22:05:39,989  - mini_batch_size: "32"
2020-12-06 22:05:39,990  - patience: "3"
2020-12-06 22:05:39,991  - anneal_factor: "0.5"
2020-12-06 22:05:39,991  - max_epochs: "3"
2020-12-06 22:05:39,992  - shuffle: "True"
2020-12-06 22:05:39,993  - train_with_dev: "False"
2020-12-06 22:05:39,993  - batch_growth_annealing: "False"
2020-12-06 22:05:39,994 ----------------------------------------------------------------------------------------------------
2020-12-06 22:05:39,995 Model training base path: "resources/taggers/example-ner-lepetit"
2020-12-06 22:05:39,996 ------

{'test_score': 0.7578014898328973,
 'dev_score_history': [0.735024048972453,
  0.7414803559800304,
  0.7701674277016742],
 'train_loss_history': [6.715684566924821,
  3.360003551084604,
  2.7185269476762457],
 'dev_loss_history': [2.9247255325317383,
  2.4928743839263916,
  2.0861685276031494]}

In [None]:
# Load a saved tagger from the training base path.
# NOTE(review): presumably `best-model.pt` is the best checkpoint written by the
# training run — confirm against the flair version in use.
model: SequenceTagger = SequenceTagger.load(
    'resources/taggers/example-ner-lepetit/best-model.pt'
)

In [5]:
# create example sentence
# French sample text mixing a person, a street address, a date, an organisation
# and a phone-like number; the literal is split in two only for readability.
sentence = Sentence(
    'Emma Louise, habite au 26 rue Alexandre, 75005 Paris, France, née le 11/11/1993. '
    'Elle travaille chez Zenika et elle est joignable sur 06660006.'
)

In [6]:
%%time
# Run the loaded tagger on the example sentence. The predicted tags are attached
# to `sentence` itself (they are read back from it in the next cell); nothing is
# printed here. `%%time` must remain the first line of the cell and reports how
# long the prediction takes.
model.predict(sentence)

CPU times: user 46.6 ms, sys: 99.1 ms, total: 146 ms
Wall time: 256 ms


In [7]:
# Print the sentence text with each predicted tag shown inline as <TAG> after its token.
print(sentence.to_tagged_string())

Emma <B-PER> Louise <E-PER> , habite au 26 rue <B-LOC> Alexandre <E-LOC> , 75005 Paris <S-LOC> , France <S-LOC> , née le 11 / 11 / 1993 . Elle travaille chez Zenika <S-ORG> et elle est joignable sur 06660006 .
