# Reconnaissance d'entit√©s nomm√©es avec SpaCy

La documentation est accessible ici: https://spacy.io/api

## Imports

In [1]:
from collections import defaultdict
import os
import spacy
from spacy.lang.fr.examples import sentences
!python -m spacy download fr_core_news_md
import pandas as pd

2025-11-04 14:39:06.307063: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-04 14:39:07.936275: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
DEPRECATION: https://github.com/explosion/spacy-models/releases/download/-fr_core_news_md/-fr_core_news_md.tar.gz#egg===fr_core_news_md contains an egg fragment with a non-PEP 508 name. pip 25.3 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/13157

[notice] A new release of pi

In [10]:
nlp = spacy.load('fr_core_news_md')

## Reconnaissance des entit√©s dans le corpus de 1965

In [11]:
annee_choisie = "1965"
folder_path = r"C:\Users\tommy\TAC2\TAC\data\txt"
corpus_file = os.path.join(folder_path, f"corpus_{annee_choisie}.txt")
output_file = os.path.join(folder_path, f"entites_{annee_choisie}.csv")



# Charger le texte complet de 1965 --> version trop longue !!! et sans sauvegarde !!!
# corpus_file = os.path.join(folder_path, f"corpus_{annee_choisie}.txt")
# with open(corpus_file, 'r', encoding='utf-8') as f:
  #  texte = f.read()

# Analyse linguistique compl√®te du corpus
# nlp.max_length = 3_000_000  # ‚Üê augmente la limite √† 3 millions de caract√®res
# doc = nlp(texte)

# D√©couper le texte en phrases
# sentences = list(doc.sents)

# Parcourir chaque phrase et extraire les entit√©s
# for sent in sentences:
    # entities = [f"{ent.text} ({ent.label_})" for ent in sent.ents]
    # if entities:
        # print(f"‚Üí '{sent.text.strip()}' contient les entit√©s : {', '.join(entities)}")

# refaire en d√©coupant en blocs pour gagner du temps
# === Param√®tres ===
# === Charger le mod√®le spaCy (NER uniquement) ===
nlp = spacy.load("fr_core_news_md", disable=["tagger", "parser", "lemmatizer", "attribute_ruler"])
nlp.max_length = 3_000_000

# === Lire le texte complet ===
with open(corpus_file, 'r', encoding='utf-8') as f:
    texte = f.read()

# === Param√®tres de traitement ===
chunk_size = 200_000
entites = []

# === Si le fichier existe d√©j√†, on ne l‚Äô√©crase pas ===
if not os.path.exists(output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Texte;Type\n")

# === Boucle par blocs ===
for i in range(0, len(texte), chunk_size):
    bloc = texte[i:i+chunk_size]
    doc = nlp(bloc)
    bloc_ents = [(ent.text, ent.label_) for ent in doc.ents]

    # Sauvegarde imm√©diate du bloc dans le CSV
    df_bloc = pd.DataFrame(bloc_ents, columns=["Texte", "Type"])
    df_bloc.to_csv(output_file, mode='a', index=False, sep=';', header=False)

    print(f"üíæ Bloc {i//chunk_size + 1} trait√© ({len(bloc):,} caract√®res, {len(bloc_ents)} entit√©s)")
    entites.extend(bloc_ents)

print(f"\n‚úÖ Analyse termin√©e : {len(entites):,} entit√©s reconnues.")
print(f"üíæ R√©sultat enregistr√© dans : {output_file}")

üíæ Bloc 1 trait√© (200,000 caract√®res, 3899 entit√©s)
üíæ Bloc 2 trait√© (200,000 caract√®res, 2806 entit√©s)
üíæ Bloc 3 trait√© (200,000 caract√®res, 3654 entit√©s)
üíæ Bloc 4 trait√© (200,000 caract√®res, 4579 entit√©s)
üíæ Bloc 5 trait√© (200,000 caract√®res, 5243 entit√©s)
üíæ Bloc 6 trait√© (200,000 caract√®res, 5879 entit√©s)
üíæ Bloc 7 trait√© (200,000 caract√®res, 4056 entit√©s)
üíæ Bloc 8 trait√© (200,000 caract√®res, 4952 entit√©s)
üíæ Bloc 9 trait√© (200,000 caract√®res, 5248 entit√©s)
üíæ Bloc 10 trait√© (200,000 caract√®res, 5564 entit√©s)
üíæ Bloc 11 trait√© (200,000 caract√®res, 3547 entit√©s)
üíæ Bloc 12 trait√© (200,000 caract√®res, 3691 entit√©s)
üíæ Bloc 13 trait√© (200,000 caract√®res, 4940 entit√©s)
üíæ Bloc 14 trait√© (200,000 caract√®res, 4666 entit√©s)
üíæ Bloc 15 trait√© (109,828 caract√®res, 1876 entit√©s)

‚úÖ Analyse termin√©e : 64,600 entit√©s reconnues.
üíæ R√©sultat enregistr√© dans : C:\Users\tommy\TAC2\TAC\data\txt\entites_1965.csv


## Compter, trier et imprimer les entit√©s du corpus de 1965

In [12]:
print('doc' in locals())

True


In [14]:
from collections import defaultdict, Counter

In [15]:
# === Charger le CSV des entit√©s ===
annee_choisie = "1965"
folder_path = r"C:\Users\tommy\TAC2\TAC\data\txt"
entites_file = os.path.join(folder_path, f"entites_{annee_choisie}.csv")

df = pd.read_csv(entites_file, sep=';')

# === Cr√©er des dictionnaires pour chaque type d'entit√© ===
people = defaultdict(int)
places = defaultdict(int)
orgs   = defaultdict(int)

# === Parcourir toutes les entit√©s du CSV ===
for _, row in df.iterrows():
    label = row["Type"]
    text = str(row["Texte"]).strip()

    if len(text) <= 2:  # filtrer les entit√©s trop courtes
        continue

    if label == "PER":
        people[text] += 1
    elif label == "LOC":
        places[text] += 1
    elif label == "ORG":
        orgs[text] += 1

# === Trier les r√©sultats (15 premiers de chaque type) ===
top_people = Counter(people).most_common(15)
top_places = Counter(places).most_common(15)
top_orgs   = Counter(orgs).most_common(15)

# === Cr√©er un tableau r√©capitulatif ===
df_summary = pd.DataFrame({
    "Personnes": [p[0] for p in top_people],
    "Fr√©quence (PER)": [p[1] for p in top_people],
    "Lieux": [p[0] for p in top_places],
    "Fr√©quence (LOC)": [p[1] for p in top_places],
    "Organisations": [p[0] for p in top_orgs],
    "Fr√©quence (ORG)": [p[1] for p in top_orgs],
})

# === Afficher le tableau ===
print(df_summary)

    Personnes  Fr√©quence (PER)       Lieux  Fr√©quence (LOC)  Organisations  \
0      Rossel              307   Bruxelles              551  Agence Rossel   
1         Ecr              126        Brux              302         Mutuel   
2    Monsieur              105    Belgique              135            bur   
3        SENT               84      T√©l√©ph              101            dem   
4       - T√©l               77       Paris               90            maz   
5        trav               44       Li√®ge               87            chf   
6       Bonne               43  Anderlecht               87           Fiat   
7         T√©l               43        Etat               80       Standard   
8    MONSIEUR               40      Anvers               79      T√©l√©phone   
9     Moli√®re               38     Ixelles               67        Conseil   
10        Sem               31        Gand               66            adr   
11        Roi               31       Uccle            