# Reconnaissance d'entit√©s nomm√©es avec SpaCy

La documentation est accessible ici: https://spacy.io/api

## Imports

In [1]:
from collections import defaultdict
import os
import spacy
from spacy.lang.fr.examples import sentences
!python -m spacy download fr_core_news_md
import pandas as pd

2025-11-04 14:39:06.307063: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-04 14:39:07.936275: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
DEPRECATION: https://github.com/explosion/spacy-models/releases/download/-fr_core_news_md/-fr_core_news_md.tar.gz#egg===fr_core_news_md contains an egg fragment with a non-PEP 508 name. pip 25.3 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/13157

[notice] A new release of pi

In [10]:
nlp = spacy.load('fr_core_news_md')

## Reconnaissance des entit√©s dans le corpus de 1965

In [11]:
annee_choisie = "1965"
folder_path = r"C:\Users\tommy\TAC2\TAC\data\txt"
corpus_file = os.path.join(folder_path, f"corpus_{annee_choisie}.txt")
output_file = os.path.join(folder_path, f"entites_{annee_choisie}.csv")



# Charger le texte complet de 1965 --> version trop longue !!! et sans sauvegarde !!!
# corpus_file = os.path.join(folder_path, f"corpus_{annee_choisie}.txt")
# with open(corpus_file, 'r', encoding='utf-8') as f:
  #  texte = f.read()

# Analyse linguistique compl√®te du corpus
# nlp.max_length = 3_000_000  # ‚Üê augmente la limite √† 3 millions de caract√®res
# doc = nlp(texte)

# D√©couper le texte en phrases
# sentences = list(doc.sents)

# Parcourir chaque phrase et extraire les entit√©s
# for sent in sentences:
    # entities = [f"{ent.text} ({ent.label_})" for ent in sent.ents]
    # if entities:
        # print(f"‚Üí '{sent.text.strip()}' contient les entit√©s : {', '.join(entities)}")

# refaire en d√©coupant en blocs pour gagner du temps
# === Param√®tres ===
# === Charger le mod√®le spaCy (NER uniquement) ===
nlp = spacy.load("fr_core_news_md", disable=["tagger", "parser", "lemmatizer", "attribute_ruler"])
nlp.max_length = 3_000_000

# === Lire le texte complet ===
with open(corpus_file, 'r', encoding='utf-8') as f:
    texte = f.read()

# === Param√®tres de traitement ===
chunk_size = 200_000
entites = []

# === Si le fichier existe d√©j√†, on ne l‚Äô√©crase pas ===
if not os.path.exists(output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Texte;Type\n")

# === Boucle par blocs ===
for i in range(0, len(texte), chunk_size):
    bloc = texte[i:i+chunk_size]
    doc = nlp(bloc)
    bloc_ents = [(ent.text, ent.label_) for ent in doc.ents]

    # Sauvegarde imm√©diate du bloc dans le CSV
    df_bloc = pd.DataFrame(bloc_ents, columns=["Texte", "Type"])
    df_bloc.to_csv(output_file, mode='a', index=False, sep=';', header=False)

    print(f"üíæ Bloc {i//chunk_size + 1} trait√© ({len(bloc):,} caract√®res, {len(bloc_ents)} entit√©s)")
    entites.extend(bloc_ents)

print(f"\n‚úÖ Analyse termin√©e : {len(entites):,} entit√©s reconnues.")
print(f"üíæ R√©sultat enregistr√© dans : {output_file}")

üíæ Bloc 1 trait√© (200,000 caract√®res, 3899 entit√©s)
üíæ Bloc 2 trait√© (200,000 caract√®res, 2806 entit√©s)
üíæ Bloc 3 trait√© (200,000 caract√®res, 3654 entit√©s)
üíæ Bloc 4 trait√© (200,000 caract√®res, 4579 entit√©s)
üíæ Bloc 5 trait√© (200,000 caract√®res, 5243 entit√©s)
üíæ Bloc 6 trait√© (200,000 caract√®res, 5879 entit√©s)
üíæ Bloc 7 trait√© (200,000 caract√®res, 4056 entit√©s)
üíæ Bloc 8 trait√© (200,000 caract√®res, 4952 entit√©s)
üíæ Bloc 9 trait√© (200,000 caract√®res, 5248 entit√©s)
üíæ Bloc 10 trait√© (200,000 caract√®res, 5564 entit√©s)
üíæ Bloc 11 trait√© (200,000 caract√®res, 3547 entit√©s)
üíæ Bloc 12 trait√© (200,000 caract√®res, 3691 entit√©s)
üíæ Bloc 13 trait√© (200,000 caract√®res, 4940 entit√©s)
üíæ Bloc 14 trait√© (200,000 caract√®res, 4666 entit√©s)
üíæ Bloc 15 trait√© (109,828 caract√®res, 1876 entit√©s)

‚úÖ Analyse termin√©e : 64,600 entit√©s reconnues.
üíæ R√©sultat enregistr√© dans : C:\Users\tommy\TAC2\TAC\data\txt\entites_1965.csv


## Compter, trier et imprimer les entit√©s du corpus de 1965

In [12]:
print('doc' in locals())

True


In [14]:
from collections import defaultdict, Counter

In [None]:
# === Charger le CSV des entit√©s ===
annee_choisie = "1965"
folder_path = r"C:\Users\tommy\TAC2\TAC\data\txt"
entites_file = os.path.join(folder_path, f"entites_{annee_choisie}.csv")

df = pd.read_csv(entites_file, sep=';')

# === Cr√©er des dictionnaires pour chaque type d'entit√© ===
people = defaultdict(int)
places = defaultdict(int)
orgs   = defaultdict(int)

# === Parcourir toutes les entit√©s du CSV ===
for _, row in df.iterrows():
    label = row["Type"]
    text = str(row["Texte"]).strip()

    if len(text) <= 2:  # filtrer les entit√©s trop courtes
        continue

    if label == "PER":
        people[text] += 1
    elif label == "LOC":
        places[text] += 1
    elif label == "ORG":
        orgs[text] += 1

# === Trier les r√©sultats (10 premiers de chaque type) ===
top_people = Counter(people).most_common(10)
top_places = Counter(places).most_common(10)
top_orgs   = Counter(orgs).most_common(10)

# === Cr√©er un tableau r√©capitulatif ===
df_summary = pd.DataFrame({
    "Personnes": [p[0] for p in top_people],
    "Fr√©quence (PER)": [p[1] for p in top_people],
    "Lieux": [p[0] for p in top_places],
    "Fr√©quence (LOC)": [p[1] for p in top_places],
    "Organisations": [p[0] for p in top_orgs],
    "Fr√©quence (ORG)": [p[1] for p in top_orgs],
})

# === Afficher le tableau ===
print(df_summary)

  Personnes  Fr√©quence (PER)       Lieux  Fr√©quence (LOC)  Organisations  \
0    Rossel              307   Bruxelles              551  Agence Rossel   
1       Ecr              126        Brux              302         Mutuel   
2  Monsieur              105    Belgique              135            bur   
3      SENT               84      T√©l√©ph              101            dem   
4     - T√©l               77       Paris               90            maz   
5      trav               44       Li√®ge               87            chf   
6     Bonne               43  Anderlecht               87           Fiat   
7       T√©l               43        Etat               80       Standard   
8  MONSIEUR               40      Anvers               79      T√©l√©phone   
9   Moli√®re               38     Ixelles               67        Conseil   

   Fr√©quence (ORG)  
0              118  
1               61  
2               40  
3               38  
4               34  
5               29  
6    

## Nettoyer la liste des entit√©s

In [21]:
import unicodedata
import re

In [22]:
# === Charger le CSV des entit√©s ===
df = pd.read_csv(entites_file, sep=';')

# === Liste de stopwords simples ===
sw = {
    "le","la","les","de","du","des","d","l","au","aux","et","ou","√†","en","sur","sous",
    "un","une","pour","par","dans","avec","chez","sans","entre","contre","vers",
    "ce","cet","cette","ces","se","sa","son","leurs","leur","plus","moins","fait", "bruxelles", "belgique", "rue", "plus", "t√©l", "rossel", "ans", "deux", "tout", "cette", "van", "dem", "prix", "apr√®s", "bien", "sans", "tr√®s", "brux", "comme", "faire", 
    "faire","√™tre", "sous", "heures", "grand", "√©crire", "soir", "tous", "fait", "part", "ecrire", "place", "demande", "maison", "jours", "dont", "app", "bon", "temps", 
    "avenue", "entre", "service", "encore", "gar", "aussi", "leurs", "non", "contre", "premi√®re", "avant", "bonne", "peut", "mois", "lieu", "peu", "autre", "ecr", "jeune", 
    "jour", "jour", "samedi", "lundi", "mardi", "mercredi", "jeudi", "vendredi", "samedi", "dimanche", "janvier", "f√©vrier", "mars", "avril", "mai", "ao√ªt", "septembre", "octobre",
    "novembre", "d√©cembre",  "autres", "t√©l√©ph", "monsieur", "pr√©s", "grande", "moins", "pays", "midi", "madame", "dimanche", "cours", "toutes", "semaine", "ainsi", 
    "toute", "premier", "dit", "francs", "quelques", "quelque", "fois", "importante", "cuis", "etc", "vente", "terr", "jeudi", "conf", "avoir", "jeunes", "depuis", "chauss√©e", 
     "vers", "ann√©e", "juin", "juillet", "mai", "d√©j√†", "chez", "d√®s", "cet", "mercredi", "jusqu", "cherche", "pr√®s", "mod", "louer", "partie", "celui", "belle", "fin", 
     "vendre", "engage", "bel", "alors", "toujours", "petit", "suite", "partir", "ceux", "dire", "trav", "faut", "car", "rez", "devant", "jard", "celle", "doit", "frs", 
    "rien", "dernier", "num√©ro", "tel", "beau", "chaque", "elles", "je", "tu", "vous", "il", "nous", "ils", "elles", "t√©l√©phone", "petits", "points", "cela", "nouvelle", "donc", 
     "aff", "voir", "plusieurs", "trop", "beaux", "quand", "assez", "demi", "haut", "gros", "ann√©es", "heure", "bur", "vend", "cependant", "six", "puis", "seul", "cas", "parmi", 
     "h√¥tel", "recherche", "appart", "beaucoup", "petite", "pers", "bat", "prendre", "grd", "deuxi√®me", "troisi√®me", "quatre", "cinq", "sept", "huit", "neuf", "dix", "sent", "√©galement", 
    "nouveau", "bas", "pendant", "ici", "l√†", "txt", "tant",  "courant", "surtout", "rem", "fa√ß", "bonnes", "minutes", "jamais", "enfin", "bons", "certains", "mieux", "quart", 
    "seulement", "voit", "maz", "mat", "d√©s", "sem", "poss", "mise", "notamment","villa", "services", "bureau", "bureaux", "jardin", "chambres","agence", "vue", "trois", "garage", "march√©", 
    "fit", "adresser", "serie", "adr", "imm", "com", "peuvent", "agit", "selon", "les", "plus", "cette", "fait", "faire", "√™tre", "deux", "comme", "dont", "tout", "ils", "bien", "sans", "peut", 
    "tous", "apr√®s", "ainsi", "donc", "cet", "sous","celle", "entre", "encore", "toutes", "pendant", "moins", "dire", "cela", "non", "faut", "trois", "aussi", "dit", "avoir", "doit", "contre", 
    "depuis", "autres","van", "het", "autre", "jusqu", "ville", "rossel", "dem", "t√©l", "Soir", "ecr", "rue", "bon", "Bruxelles","bruxelles", "prix", "Brux", "ans", "maison", "√©crire", "pr√®s", 
    "peu", "d√©s", "ecrire", "Brux", "brux", "part", "grand", "vendre", "tr√®s", "vend", "pr√©s", "mod", "etc", "avant", "pet", "cherche", "vente"
}

def norm_unicode(s: str) -> str:
    return unicodedata.normalize("NFKC", s)

def clean_entity(text: str, label: str) -> str | None:
    """Nettoie une entit√© extraite par spaCy (supprime stopwords, ponctuation, etc.)"""
    if not isinstance(text, str):
        return None
    t = norm_unicode(text).strip()
    # Retire ponctuation en bord
    t = re.sub(r"^[^\w]+|[^\w]+$", "", t)
    if not t:
        return None
    toks = re.split(r"\s+", t)
    # Si tous les tokens sont des stopwords, on ignore
    if all(tok.lower() in sw for tok in toks):
        return None
    # Retire les stopwords au d√©but et √† la fin
    while toks and toks[0].lower() in sw:
        toks.pop(0)
    while toks and toks[-1].lower() in sw:
        toks.pop()
    if not toks:
        return None
    t = " ".join(toks)
    if label == "PER":
        t = t.title()
    else:
        t = re.sub(r"\s+", " ", t).strip()
    if len(t) <= 2:
        return None
    return t

# === Appliquer le nettoyage aux entit√©s existantes ===
df["Texte_net"] = [clean_entity(txt, lbl) for txt, lbl in zip(df["Texte"], df["Type"])]

# === Garder uniquement les entit√©s valides ===
df_clean = df[df["Texte_net"].notna() & df["Type"].isin(["PER", "LOC", "ORG"])].copy()

# === Calculer la fr√©quence de chaque entit√© ===
df_clean["Fr√©quence"] = df_clean.groupby(["Type", "Texte_net"])["Texte_net"].transform("count")
df_clean = df_clean.drop_duplicates(subset=["Type", "Texte_net"])
df_clean = df_clean.sort_values(["Type", "Fr√©quence"], ascending=[True, False])

# === Afficher le tableau complet ===
print(df_clean[["Type", "Texte_net", "Fr√©quence"]])

# === (Optionnel) Sauvegarder le tableau complet ===
output_file = os.path.join(folder_path, f"entites_nettoyees_{annee_choisie}.csv")
df_clean.to_csv(output_file, index=False, sep=";")
print(f"\nüíæ Tableau complet export√© : {output_file}")

      Type         Texte_net  Fr√©quence
45     LOC            France        108
133    LOC             Paris         91
1535   LOC             Li√®ge         88
4339   LOC        Anderlecht         87
200    LOC              Etat         80
...    ...               ...        ...
64580  PER    Pierre Vasnier          1
64584  PER         Ir F- K J          1
64590  PER              Mc M          1
64594  PER          Verlaine          1
64599  PER  M√©dicis U - Le K          1

[27490 rows x 3 columns]

üíæ Tableau complet export√© : C:\Users\tommy\TAC2\TAC\data\txt\entites_nettoyees_1965.csv


## Compter, trier et afficher les entit√©s nettoy√©es

In [25]:
clean_file = os.path.join(folder_path, f"entites_nettoyees_{annee_choisie}.csv")

# === Charger le CSV nettoy√© ===
df = pd.read_csv(clean_file, sep=';')

# Choisir la bonne colonne de texte (nettoy√©e si pr√©sente)
col_txt = "Texte_net" if "Texte_net" in df.columns else "Texte"

# === Compter par type ===
people = Counter(df.loc[df["Type"]=="PER", col_txt])
places = Counter(df.loc[df["Type"]=="LOC", col_txt])
orgs   = Counter(df.loc[df["Type"]=="ORG", col_txt])

# === Top N (ajuste comme tu veux) ===
topN = 10
top_people = people.most_common(topN)
top_places = places.most_common(topN)
top_orgs   = orgs.most_common(topN)

# === Tableau r√©capitulatif ===
df_summary = pd.DataFrame({
    "Personnes": [p for p,_ in top_people],
    "Fr√©quence (PER)": [n for _,n in top_people],
    "Lieux": [p for p,_ in top_places],
    "Fr√©quence (LOC)": [n for _,n in top_places],
    "Organisations": [p for p,_ in top_orgs],
    "Fr√©quence (ORG)": [n for _,n in top_orgs],
})

print(df_summary)

  Personnes  Fr√©quence (PER)       Lieux  Fr√©quence (LOC) Organisations  \
0   Moli√®re                1      France                1        Mutuel   
1       Roi                1       Paris                1           chf   
2   Jacques                1       Li√®ge                1          Fiat   
3       Enf                1  Anderlecht                1      Standard   
4      Curr                1        Etat                1       Conseil   
5     S.D.B                1      Anvers                1    ANDERLECHT   
6    Louise                1        Gand                1     Parlement   
7    Mozart                1     Ixelles                1      Couronne   
8    Tt Cft                1       Uccle                1           s.b   
9       Emp                1       Namur                1          FIAT   

   Fr√©quence (ORG)  
0                1  
1                1  
2                1  
3                1  
4                1  
5                1  
6                1  
7