In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

import torch

import pandas as pd

In [2]:
# Load the interventions table that the rest of the notebook filters and annotates.
interventions = pd.read_csv(filepath_or_buffer='out/interventions.csv')

In [3]:
# Device index handed to the pipeline below: 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

In [10]:
# Load the tokenizer and token-classification weights for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")

model = (
    AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")
    # Move to the GPU only when one exists; a hard-coded .to("cuda") raises on
    # CPU-only machines even though the notebook computes a CPU fallback in `device`.
    .to("cuda" if torch.cuda.is_available() else "cpu")
)

In [5]:
# Keep rows whose code_grammaire is PAROLE_GENERIQUE or INTERRUPTION_1_10 and
# whose text does not contain "parole est" (presumably the chair's
# "la parole est à ..." announcements -- confirm against the data).
# texte is cast to str first so the .str accessor works on every row.
paroles = (
    interventions
    .assign(texte=interventions.texte.astype(str))
    .loc[lambda frame: frame.code_grammaire.isin(["PAROLE_GENERIQUE", "INTERRUPTION_1_10"])]
    .loc[lambda frame: ~frame.texte.str.contains("parole est")]
)

paroles

Unnamed: 0,id_syceron,acteur,code_grammaire,code_style,stime,texte,seance,mots
3,2845798,PA719472,PAROLE_GENERIQUE,NORMAL,957.22,Je suis ravi de vous présenter les conclusion...,RUANR5L16S2022IDS26244,101
4,2845799,PA642847,INTERRUPTION_1_10,NORMAL,981.52,Un Lorrain !,RUANR5L16S2022IDS26244,3
5,2845800,PA719472,PAROLE_GENERIQUE,NORMAL,983.07,Les discussions franches et efficaces que nous...,RUANR5L16S2022IDS26244,110
6,2845802,PA719118,INTERRUPTION_1_10,NORMAL,1012.55,Très bien !,RUANR5L16S2022IDS26244,3
7,2845859,PA793736,INTERRUPTION_1_10,NORMAL,1012.78,Vous préférez l’ORTF !,RUANR5L16S2022IDS26244,4
...,...,...,...,...,...,...,...,...
156966,2945099,PA795778,PAROLE_GENERIQUE,NORMAL,9466.82,...tandis qu’un mix avec moins d’énergies ren...,RUANR5L16S2023IDS26598,60
156968,2944886,PA794008,PAROLE_GENERIQUE,NORMAL,9487.18,Je veux dire au Rassemblement national que nou...,RUANR5L16S2023IDS26598,66
156969,2944941,PA793382,INTERRUPTION_1_10,NORMAL,9500.31,"Honteux ! Vous êtes malhonnête, madame !",RUANR5L16S2023IDS26598,7
156970,2944940,PA794008,PAROLE_GENERIQUE,NORMAL,9502.38,Nous refusons le sous-amendement que vous prop...,RUANR5L16S2023IDS26598,83


In [6]:
# Display the device index that is passed to the NER pipeline (0 = CUDA, -1 = no CUDA).
device

0

In [16]:
# Build the NER pipeline around the already-loaded model and tokenizer.
# Only `device` is passed: combining it with device_map="auto" is contradictory
# (transformers warns that `device` overrides `device_map`), and device_map has
# no effect on a model object that is already instantiated.
# aggregation_strategy="simple" yields grouped entities (entity_group / word /
# start / end), which is the shape the later cells rely on.
nlp = pipeline('ner', model=model, tokenizer=tokenizer, device=device, aggregation_strategy="simple")

In [26]:
%%time
def get_entities(text):
    """Run the NER pipeline on ``text`` and return its entities as pairs.

    Args:
        text: Input passed unchanged to the module-level ``nlp`` pipeline.

    Returns:
        A list of ``(word, entity_group)`` tuples, one per entity dict that
        the pipeline returns, in the pipeline's order.
    """
    pairs = []
    for found in nlp(text):
        pairs.append((found['word'], found['entity_group']))
    return pairs
    
# Run the pipeline once over every retained text and attach the per-row result
# (a list of entity dicts, possibly empty) as a new `entities` column.
entities = paroles.assign(entities=nlp(paroles.texte.tolist()))

entities

CPU times: user 7min 41s, sys: 96 ms, total: 7min 41s
Wall time: 7min 41s


Unnamed: 0,id_syceron,acteur,code_grammaire,code_style,stime,texte,seance,mots,entities
3,2845798,PA719472,PAROLE_GENERIQUE,NORMAL,957.22,Je suis ravi de vous présenter les conclusion...,RUANR5L16S2022IDS26244,101,"[{'entity_group': 'LOC', 'score': 0.5331272, '..."
4,2845799,PA642847,INTERRUPTION_1_10,NORMAL,981.52,Un Lorrain !,RUANR5L16S2022IDS26244,3,"[{'entity_group': 'LOC', 'score': 0.6631856, '..."
5,2845800,PA719472,PAROLE_GENERIQUE,NORMAL,983.07,Les discussions franches et efficaces que nous...,RUANR5L16S2022IDS26244,110,"[{'entity_group': 'LOC', 'score': 0.92980194, ..."
6,2845802,PA719118,INTERRUPTION_1_10,NORMAL,1012.55,Très bien !,RUANR5L16S2022IDS26244,3,[]
7,2845859,PA793736,INTERRUPTION_1_10,NORMAL,1012.78,Vous préférez l’ORTF !,RUANR5L16S2022IDS26244,4,"[{'entity_group': 'ORG', 'score': 0.9869406, '..."
...,...,...,...,...,...,...,...,...,...
156966,2945099,PA795778,PAROLE_GENERIQUE,NORMAL,9466.82,...tandis qu’un mix avec moins d’énergies ren...,RUANR5L16S2023IDS26598,60,"[{'entity_group': 'LOC', 'score': 0.9333397, '..."
156968,2944886,PA794008,PAROLE_GENERIQUE,NORMAL,9487.18,Je veux dire au Rassemblement national que nou...,RUANR5L16S2023IDS26598,66,"[{'entity_group': 'ORG', 'score': 0.98559886, ..."
156969,2944941,PA793382,INTERRUPTION_1_10,NORMAL,9500.31,"Honteux ! Vous êtes malhonnête, madame !",RUANR5L16S2023IDS26598,7,"[{'entity_group': 'PER', 'score': 0.46364522, ..."
156970,2944940,PA794008,PAROLE_GENERIQUE,NORMAL,9502.38,Nous refusons le sous-amendement que vous prop...,RUANR5L16S2023IDS26598,83,"[{'entity_group': 'LOC', 'score': 0.97599787, ..."


In [25]:
%%time
# Timing benchmark on 1000 rows. The sample is seeded so reruns time the same
# texts, and only the first few results are displayed to keep the output readable.
nlp(list(paroles.sample(1000, random_state=42).texte))[:10]

CPU times: user 36.6 s, sys: 8 ms, total: 36.6 s
Wall time: 36.6 s


[[{'entity_group': 'LOC',
   'score': 0.59871626,
   'word': 'Gouvernement',
   'start': 18,
   'end': 31}],
 [{'entity_group': 'PER',
   'score': 0.99900293,
   'word': 'Raphaël Schellenberger',
   'start': 120,
   'end': 144},
  {'entity_group': 'LOC',
   'score': 0.932279,
   'word': 'la France',
   'start': 222,
   'end': 232},
  {'entity_group': 'ORG',
   'score': 0.9867693,
   'word': 'RN',
   'start': 456,
   'end': 459},
  {'entity_group': 'ORG',
   'score': 0.98748046,
   'word': 'LR',
   'start': 462,
   'end': 465},
  {'entity_group': 'ORG',
   'score': 0.839422,
   'word': 'RE.',
   'start': 515,
   'end': 519}],
 [],
 [],
 [],
 [{'entity_group': 'MISC',
   'score': 0.6044164,
   'word': 'Mais',
   'start': 0,
   'end': 4}],
 [],
 [{'entity_group': 'ORG',
   'score': 0.4414477,
   'word': 'Pour',
   'start': 0,
   'end': 4},
  {'entity_group': 'ORG',
   'score': 0.9854587,
   'word': 'sécurité sociale',
   'start': 17,
   'end': 36}],
 [],
 [{'entity_group': 'ORG',
   'scor

In [35]:
# Reshape to one row per detected entity: explode the per-intervention lists,
# drop interventions with no entity (an empty list explodes to NaN), then pull
# the word, label and confidence out of each entity dict into flat columns.
paroles_entities = (
    entities
    .loc[:, ['id_syceron', 'acteur', 'entities']]
    .explode('entities')
    .dropna(subset="entities")
    .assign(
        entity=lambda rows: rows.entities.map(lambda ent: ent['word']),
        label=lambda rows: rows.entities.map(lambda ent: ent['entity_group']),
        score=lambda rows: rows.entities.map(lambda ent: ent['score']),
    )
)

paroles_entities

Unnamed: 0,id_syceron,acteur,entities,entity,label,score
3,2845798,PA719472,"{'entity_group': 'LOC', 'score': 0.5331272, 'w...",l,LOC,0.533127
3,2845798,PA719472,"{'entity_group': 'ORG', 'score': 0.8141303, 'w...",Assemblée nationale,ORG,0.814130
3,2845798,PA719472,"{'entity_group': 'LOC', 'score': 0.8394284, 'w...",Sénat,LOC,0.839428
3,2845798,PA719472,"{'entity_group': 'ORG', 'score': 0.73330164, '...",Dem,ORG,0.733302
3,2845798,PA719472,"{'entity_group': 'ORG', 'score': 0.6866712, 'w...",HOR,ORG,0.686671
...,...,...,...,...,...,...
156969,2944941,PA793382,"{'entity_group': 'PER', 'score': 0.59652823, '...",madame,PER,0.596528
156970,2944940,PA794008,"{'entity_group': 'LOC', 'score': 0.97599787, '...",France,LOC,0.975998
156970,2944940,PA794008,"{'entity_group': 'LOC', 'score': 0.9880072, 'w...",Russie,LOC,0.988007
156970,2944940,PA794008,"{'entity_group': 'ORG', 'score': 0.98948157, '...",Écolo-NUPES,ORG,0.989482


In [41]:
# Spot check: every extracted entity whose text contains "Mélenchon".
paroles_entities[paroles_entities.entity.str.contains("Mélenchon")]

Unnamed: 0,id_syceron,acteur,entities,entity,label,score
209,2846112,PA719118,"{'entity_group': 'PER', 'score': 0.99541086, '...",Mélenchon,PER,0.995411
1658,3106851,PA795386,"{'entity_group': 'PER', 'score': 0.99909717, '...",Jean-Luc Mélenchon,PER,0.999097
2747,2894021,PA720468,"{'entity_group': 'PER', 'score': 0.99897784, '...",Jean-Luc Mélenchon,PER,0.998978
2840,2894266,PA331582,"{'entity_group': 'PER', 'score': 0.9988296, 'w...",Jean-Luc Mélenchon,PER,0.998830
3701,2940924,PA773443,"{'entity_group': 'PER', 'score': 0.8622595, 'w...",. Mélenchon,PER,0.862260
...,...,...,...,...,...,...
152794,3019865,PA793464,"{'entity_group': 'PER', 'score': 0.9784897, 'w...",Mélenchon,PER,0.978490
153786,2829324,PA718850,"{'entity_group': 'PER', 'score': 0.998731, 'wo...",Jean-Luc Mélenchon,PER,0.998731
154414,2819674,PA720286,"{'entity_group': 'PER', 'score': 0.8236387, 'w...",. Mélenchon,PER,0.823639
154425,2819773,PA720892,"{'entity_group': 'PER', 'score': 0.99897826, '...",Jean-Luc Mélenchon,PER,0.998978


In [39]:
# 30 most frequent entity strings among those labelled PER.
paroles_entities.loc[paroles_entities.label == "PER", "entity"].value_counts().head(30)

Président de la République     1499
Emmanuel Macron                 748
Première ministre               551
Benjamin Lucas                  445
Marine Le Pen                   422
Macron                          412
Monsieur                        351
Madame la Première ministre     337
M. le ministre                  312
Bruno Le Maire                  286
la Première ministre            255
État                            250
Mme la Première ministre        231
M. le rapporteur                213
Gouvernement                    195
Mme la rapporteure              195
Bernalicis                      193
M. le                           173
François Hollande               167
M.                              156
Le Pen                          156
.                               156
Madame                          153
Jumel                           151
Mme la ministre                 151
Antoine Léaument                136
Jean-Luc Mélenchon              134
Monsieur le ministre        

In [42]:
# 30 most frequent entity strings among those labelled ORG.
paroles_entities.loc[paroles_entities.label == "ORG", "entity"].value_counts().head(30)

LFI-NUPES                    12993
RE                            8609
Écolo-NUPES                   5117
RN                            4806
commission                    3927
GDR-NUPES                     3403
SOC                           3391
Dem                           2851
LR                            2812
Parlement                     1861
Dem.                          1741
Rassemblement national        1730
HOR                           1705
Union européenne              1455
RN.                           1315
Gouvernement                  1269
NUPES                         1195
Assemblée nationale           1072
Assemblée                     1045
EDF                            886
Europe                         838
Les Républicains               804
État                           656
LIOT                           626
Conseil d                      576
La France insoumise            558
l                              553
HOR.                           449
Cour des comptes    

In [43]:
# Persist the per-entity table without the (non-unique, post-explode) index.
paroles_entities.to_csv(
    path_or_buf='out/interventions_entities_camenbert.csv',
    index=False,
)