In [1]:
import glob
import os

import flair
from flair.data import Corpus, MultiCorpus, Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import StackedEmbeddings, TokenEmbeddings, TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

from typing import List

In [2]:
# Locate the processed dataset relative to the directory the notebook is run from.
path = os.getcwd()
data_folder = os.path.join(path, '../../data/processed')

# Column layout handed to the corpus reader: index 0 -> 'text', index 1 -> 'ner'.
columns = dict(enumerate(('text', 'ner')))

In [3]:
# Read the column-formatted corpus from `data_folder`.
# Only train and test files are named here; no dev file is passed.
corpus: Corpus = ColumnCorpus(
    data_folder=data_folder,
    column_format=columns,
    train_file='train.txt',
    test_file='test.txt',
)

# -------- To combine this corpus with the Flair WIKINER corpus, uncomment the two lines below --------
#flair_corpus: Corpus = flair.datasets.WIKINER_FRENCH().downsample(0.1)
#corpus: MultiCorpus = MultiCorpus([flair_corpus, corpus])

2021-05-14 15:57:36,250 Reading data from /home/ec2-user/SageMaker/processed
2021-05-14 15:57:36,251 Train: /home/ec2-user/SageMaker/processed/train.txt
2021-05-14 15:57:36,251 Dev: None
2021-05-14 15:57:36,252 Test: /home/ec2-user/SageMaker/processed/test.txt


In [4]:
# Annotation layer to train on; same name as the 'ner' entry in the column map.
tag_type = 'ner'

# Build the tag dictionary for that layer from the loaded corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type)

In [5]:
# Single embedding source: the pretrained 'camembert-base' transformer.
embedding_types: List[TokenEmbeddings] = [TransformerWordEmbeddings('camembert-base')]

# Wrap the list in a StackedEmbeddings object; this is what the tagger receives.
embeddings: StackedEmbeddings = StackedEmbeddings(embedding_types)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Sequence tagger built on the stacked embeddings, predicting `tag_type` labels
# from `tag_dictionary`, with a hidden size of 256 and the CRF option enabled.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
    use_crf=True,
)

In [7]:
# Pair the tagger with the corpus it will be trained and evaluated on.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# Run training, using 'resources/taggers/ner-our-own-corpus' as the base path.
# The call is deliberately the cell's last expression so that the returned
# result dictionary is displayed as the cell output.
trainer.train(
    'resources/taggers/ner-our-own-corpus',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=70,
    write_weights=True,
)

2021-05-14 15:57:39,657 ----------------------------------------------------------------------------------------------------
2021-05-14 15:57:39,659 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): CamembertModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(32005, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0): RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True

Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


2021-05-14 16:04:24,548 0.9118	0.6739	0.7750
2021-05-14 16:04:24,549 
Results:
- F1-score (micro) 0.7750
- F1-score (macro) 0.5672

By class:
ADDRESS    tp: 1 - fp: 0 - fn: 0 - precision: 1.0000 - recall: 1.0000 - f1-score: 1.0000
DATE       tp: 0 - fp: 0 - fn: 3 - precision: 0.0000 - recall: 0.0000 - f1-score: 0.0000
EMAIL      tp: 1 - fp: 0 - fn: 0 - precision: 1.0000 - recall: 1.0000 - f1-score: 1.0000
LANG       tp: 0 - fp: 0 - fn: 1 - precision: 0.0000 - recall: 0.0000 - f1-score: 0.0000
LOC        tp: 17 - fp: 1 - fn: 4 - precision: 0.9444 - recall: 0.8095 - f1-score: 0.8718
ORG_ENS    tp: 0 - fp: 2 - fn: 2 - precision: 0.0000 - recall: 0.0000 - f1-score: 0.0000
PER        tp: 0 - fp: 0 - fn: 4 - precision: 0.0000 - recall: 0.0000 - f1-score: 0.0000
PERIODE    tp: 6 - fp: 0 - fn: 0 - precision: 1.0000 - recall: 1.0000 - f1-score: 1.0000
TEL        tp: 2 - fp: 0 - fn: 1 - precision: 1.0000 - recall: 0.6667 - f1-score: 0.8000
URL        tp: 4 - fp: 0 - fn: 0 - precision: 1.0000 - r

{'test_score': 0.775,
 'dev_score_history': [0.0,
  0.2857142857142857,
  0.2765957446808511,
  0.3716814159292035,
  0.49382716049382713,
  0.5822784810126582,
  0.6375,
  0.6231884057971013,
  0.676056338028169,
  0.7183098591549295,
  0.7284768211920529,
  0.6814814814814815,
  0.7333333333333333,
  0.7721518987341772,
  0.7741935483870968,
  0.7922077922077922,
  0.7421383647798743,
  0.751592356687898,
  0.8133333333333335,
  0.746987951807229,
  0.7810650887573963,
  0.8227848101265823,
  0.7948717948717949,
  0.7948717948717949,
  0.8266666666666665,
  0.8496732026143792,
  0.8311688311688312,
  0.8400000000000001,
  0.8074534161490684,
  0.8227848101265823,
  0.8662420382165605,
  0.8774193548387097,
  0.8535031847133757,
  0.8354430379746836,
  0.8481012658227848,
  0.8427672955974843,
  0.8662420382165605,
  0.8553459119496856,
  0.8749999999999999,
  0.8662420382165605,
  0.8553459119496856,
  0.8607594936708861,
  0.8805031446540881,
  0.8987341772151899,
  0.89873417721518

In [8]:
# Reload the 'best-model.pt' checkpoint from the training base path.
best_model_path = 'resources/taggers/ner-our-own-corpus/best-model.pt'
model = SequenceTagger.load(best_model_path)

2021-05-14 16:04:24,563 loading file resources/taggers/ner-our-own-corpus/best-model.pt


In [9]:
# Sample French sentence used to try out the reloaded tagger.
phrase = "François Louise consultant 20 ans d'expérience, 20 rue Gabriel Péri, Paris, 09098798 françois.louise@zenika.com"

sentence = Sentence(phrase)
# The return value of predict() is not used; the tags are read back from `sentence`.
model.predict(sentence)

tagged = sentence.to_tagged_string()
print(tagged)

François <B-PER> Louise <I-PER> consultant 20 <B-PERIODE> ans <I-PERIODE> d'expérience , 20 rue <B-ADDRESS> Gabriel <I-ADDRESS> Péri <I-ADDRESS> , Paris <B-LOC> , 09098798 françois.louise <B-EMAIL> @ zenika.com
