In [1]:
from flair.data import Corpus
import flair
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
from typing import List
from flair.embeddings import CamembertEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
# 1. get the corpus
# Load the French WikiNER dataset, then downsample it with a factor of 0.1
# so that the experiment runs on a smaller corpus.
wikiner_french = flair.datasets.WIKINER_FRENCH()
corpus: Corpus = wikiner_french.downsample(0.1)

In [None]:
# 2. what tag do we want to predict?
# This label type is reused below when building the tag dictionary and when
# configuring the SequenceTagger.
tag_type: str = 'ner'

In [None]:
# 3. make the tag dictionary from the corpus
# Ask the corpus for the dictionary of `tag_type` labels; it is later handed to
# the SequenceTagger as its `tag_dictionary`.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [None]:
# 4. initialize embeddings
# A single transformer-based word embedding (FlauBERT large, cased) is used.
flaubert_embedding = TransformerWordEmbeddings('flaubert/flaubert_large_cased')

# The list form is kept so that further embeddings can be added to the stack.
embedding_types: List[TokenEmbeddings] = [flaubert_embedding]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
# 5. initialize sequence tagger
# The tagger is built on the stacked embeddings and predicts labels of type
# `tag_type` drawn from `tag_dictionary`, with the CRF option enabled.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

In [None]:
# 6. initialize trainer
# Pair the tagger with the corpus it will be trained and evaluated on.
trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus)

In [2]:
# 7. start training
# Hyper-parameters are grouped in one place so they are easy to tune.
training_options = dict(
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=3,
    checkpoint=True,
)

# The call stays the last expression of the cell so that the returned summary
# (test score plus dev/train history) is displayed as the cell output.
trainer.train('resources/taggers/example-ner-flaubert', **training_options)

2020-12-06 16:42:24,300 Reading data from /Users/amalbedoui/.flair/datasets/wikiner_french
2020-12-06 16:42:24,300 Train: /Users/amalbedoui/.flair/datasets/wikiner_french/aij-wikiner-fr-wp3.train
2020-12-06 16:42:24,301 Dev: None
2020-12-06 16:42:24,302 Test: None


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1516.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1561415.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=895731.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1493194721.0), HTML(value='')))


2020-12-06 16:44:23,525 ----------------------------------------------------------------------------------------------------
2020-12-06 16:44:23,527 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): FlaubertModel(
        (position_embeddings): Embedding(512, 1024)
        (embeddings): Embedding(68729, 1024, padding_idx=2)
        (layer_norm_emb): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attentions): ModuleList(
          (0): MultiHeadAttention(
            (q_lin): Linear(in_features=1024, out_features=1024, bias=True)
            (k_lin): Linear(in_features=1024, out_features=1024, bias=True)
            (v_lin): Linear(in_features=1024, out_features=1024, bias=True)
            (out_lin): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (1): MultiHeadAttention(
            (q_lin): Linear(in_features=1024, out_features=1024, bias=True)
            (k_lin)

2020-12-06 16:44:23,528 ----------------------------------------------------------------------------------------------------
2020-12-06 16:44:23,528 Corpus: "Corpus: 10713 train + 1190 dev + 1323 test sentences"
2020-12-06 16:44:23,529 ----------------------------------------------------------------------------------------------------
2020-12-06 16:44:23,530 Parameters:
2020-12-06 16:44:23,530  - learning_rate: "0.1"
2020-12-06 16:44:23,531  - mini_batch_size: "32"
2020-12-06 16:44:23,531  - patience: "3"
2020-12-06 16:44:23,532  - anneal_factor: "0.5"
2020-12-06 16:44:23,532  - max_epochs: "3"
2020-12-06 16:44:23,533  - shuffle: "True"
2020-12-06 16:44:23,533  - train_with_dev: "False"
2020-12-06 16:44:23,534  - batch_growth_annealing: "False"
2020-12-06 16:44:23,534 ----------------------------------------------------------------------------------------------------
2020-12-06 16:44:23,535 Model training base path: "resources/taggers/example-ner-flaubert"
2020-12-06 16:44:23,535 -----

{'test_score': 0.7853423336547734,
 'dev_score_history': [0.6436340371081255,
  0.7040235938487466,
  0.7684707903780068],
 'train_loss_history': [11.052329261267364,
  5.711321242887582,
  4.327354967772071],
 'dev_loss_history': [5.495382785797119,
  3.8199679851531982,
  2.8995649814605713]}

In [None]:
# Reload the best model saved under the training base path.
best_model_path = 'resources/taggers/example-ner-flaubert/best-model.pt'
model = SequenceTagger.load(best_model_path)

In [5]:
# create example sentence
# Raw French text containing a name, an address, a date, an employer and a phone number.
example_text = 'Emma Louise, habite au 26 rue Alexandre, 75005 Paris, France, née le 11/11/1993. Elle travaille chez Zenika et elle est joignable sur 06660006.'
sentence = Sentence(example_text)

In [6]:
%%time
# predict tags and print
# The cell magic above must stay on the first line; it reports CPU and wall time.
# The predicted tags are read back from `sentence` in the next cell.
model.predict(sentence)

CPU times: user 705 ms, sys: 2.02 s, total: 2.72 s
Wall time: 5.13 s


In [7]:
# Show the sentence with the predicted tag placed after each tagged token.
tagged_text = sentence.to_tagged_string()
print(tagged_text)

Emma <B-PER> Louise <E-PER> , habite au 26 rue Alexandre <S-PER> , 75005 <B-LOC> Paris <E-LOC> , France <S-LOC> , née le 11 / 11 / 1993 . Elle travaille chez Zenika <S-MISC> et elle est joignable sur 06660006 .
