## CamemBERT

In [83]:
from transformers import CamembertTokenizer, CamembertForTokenClassification
from transformers import pipeline

In [84]:
tokenizer = CamembertTokenizer.from_pretrained("gilf/french-camembert-postag-model")

model = CamembertForTokenClassification.from_pretrained("gilf/french-camembert-postag-model")

In [85]:
nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)

nlp_token_class('Emma Louise, 26 rue Alexandre, 75005 Paris, France, née le 11/11/2007 et joignable sur 06660006')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'NPP', 'score': 0.997456818819046, 'word': 'Emma Louise'},
 {'entity_group': 'PONCT', 'score': 0.5417522192001343, 'word': ','},
 {'entity_group': 'ADJ', 'score': 0.8075023293495178, 'word': '26'},
 {'entity_group': 'NC', 'score': 0.9991540312767029, 'word': 'rue'},
 {'entity_group': 'NPP', 'score': 0.9982922077178955, 'word': 'Alexandre'},
 {'entity_group': 'PONCT', 'score': 0.7300308346748352, 'word': ','},
 {'entity_group': 'ADJ', 'score': 0.634022057056427, 'word': '750'},
 {'entity_group': 'DET', 'score': 0.4953150451183319, 'word': '05'},
 {'entity_group': 'NPP', 'score': 0.9984453320503235, 'word': 'Paris'},
 {'entity_group': 'P', 'score': 0.45102745294570923, 'word': ','},
 {'entity_group': 'NPP', 'score': 0.9979045391082764, 'word': 'France'},
 {'entity_group': 'PONCT', 'score': 0.6692338585853577, 'word': ','},
 {'entity_group': 'VPP', 'score': 0.998975932598114, 'word': 'née'},
 {'entity_group': 'DET', 'score': 0.9993326663970947, 'word': 'le'},
 {'entity_g

## Flair + CamemBERT Embeddings

In [86]:
from flair.embeddings import FlairEmbeddings, TransformerWordEmbeddings,CamembertEmbeddings, StackedEmbeddings, TokenEmbeddings, WordEmbeddings
from flair.data import Sentence
import flair.datasets
from flair.models import SequenceTagger

In [87]:
# init Flair embeddings
flair_forward_embedding = FlairEmbeddings('french-forward')
flair_backward_embedding = FlairEmbeddings('french-backward')

# init multilingual BERT
bert_embedding = CamembertEmbeddings()
#wordembedding = WordEmbeddings("fr")

  


In [88]:
stacked_embeddings = StackedEmbeddings([
                                        bert_embedding,
                                        flair_forward_embedding,
                                        flair_backward_embedding,
                                       ])
#stacked_embeddings = StackedEmbeddings([
#                                        wordembedding,
#                                        flair_forward_embedding,
#                                        flair_backward_embedding,
#                                       ])

In [94]:
#french_corpus = flair.datasets.XTREME('fr')
corpus = flair.datasets.WIKINER_FRENCH()

2020-11-26 21:26:28,042 Reading data from /Users/amalbedoui/.flair/datasets/wikiner_french
2020-11-26 21:26:28,043 Train: /Users/amalbedoui/.flair/datasets/wikiner_french/aij-wikiner-fr-wp3.train
2020-11-26 21:26:28,044 Dev: None
2020-11-26 21:26:28,044 Test: None


In [100]:
tag_type = 'ner'

#make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

tagger: SequenceTagger = SequenceTagger(hidden_size=1,
                                        embeddings=stacked_embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type)

In [101]:
sentence = Sentence("Emma Louise s'est installée au 26 rue Alexandre, 75005 Paris, France, née le 11/11/2007 et joignable sur 06660006")

In [102]:
tagger.predict(sentence)

In [103]:
for entity in sentence.get_spans('ner'):
    print(entity.to_dict())

{'text': 'Emma', 'start_pos': 0, 'end_pos': 4, 'labels': [LOC (0.2098)]}
{'text': 'Louise', 'start_pos': 5, 'end_pos': 11, 'labels': [ORG (0.1289)]}
{'text': "s'est", 'start_pos': 12, 'end_pos': 17, 'labels': [PER (0.188)]}
{'text': 'installée', 'start_pos': 18, 'end_pos': 27, 'labels': [LOC (0.1761)]}
{'text': 'au 26', 'start_pos': 28, 'end_pos': 33, 'labels': [ORG (0.1188)]}
{'text': 'rue', 'start_pos': 34, 'end_pos': 37, 'labels': [ORG (0.1653)]}
{'text': 'Alexandre', 'start_pos': 38, 'end_pos': 47, 'labels': [MISC (0.1535)]}
{'text': ', 75005 Paris,', 'start_pos': 47, 'end_pos': 61, 'labels': [MISC (0.1221)]}
{'text': 'France', 'start_pos': 62, 'end_pos': 68, 'labels': [ORG (0.1246)]}
{'text': ',', 'start_pos': 68, 'end_pos': 69, 'labels': [MISC (0.1186)]}
{'text': 'née le 11/11', 'start_pos': 70, 'end_pos': 82, 'labels': [MISC (0.1564)]}
{'text': '/', 'start_pos': 82, 'end_pos': 83, 'labels': [ORG (0.1177)]}
{'text': '2007', 'start_pos': 83, 'end_pos': 87, 'labels': [MISC (0.1204)