# Exploration sémantique du sous-corpus lié aux articles de 1920



## 1) Entités nommées
Objectif : repérer les personnes, organisations et lieux les plus cités dans les articles de 1920.

In [7]:
from pathlib import Path
import os, re
from collections import Counter, defaultdict
import spacy

# Corpus directory and the year tag used to pick files (names containing "_1920-").
DATA_DIR = Path("../data/txt")
DECADE = "1920"
files_1920 = sorted(
    name
    for name in os.listdir(DATA_DIR)
    if name.endswith(".txt") and f"_{DECADE}-" in name
)

# French spaCy pipeline; its entity recogniser is what the loop below relies on.
nlp = spacy.load("fr_core_news_sm")

def first_chunk(path, n_chars=4000):
    """Return the first ``n_chars`` characters of a UTF-8 text file.

    Only the requested prefix is read and decoded, instead of loading the
    whole file and slicing it afterwards. Bytes that are not valid UTF-8 are
    replaced with U+FFFD rather than raising.

    Parameters
    ----------
    path : str or os.PathLike
        File to read. Plain strings are accepted as well as ``Path`` objects.
    n_chars : int or None, default 4000
        Maximum number of characters to return. ``None`` or a negative value
        keeps Python slice semantics (``text[:n_chars]``), which requires
        reading the full file.

    Returns
    -------
    str
        The leading characters of the file (shorter if the file is shorter).
    """
    path = Path(path)
    if n_chars is None or n_chars < 0:
        # Slice semantics with None/negative bounds need the complete text.
        return path.read_text(encoding="utf-8", errors="replace")[:n_chars]
    with path.open(encoding="utf-8", errors="replace") as handle:
        return handle.read(n_chars)

# ent_counts[label][entity text] -> number of mentions across the selected files.
ent_counts = defaultdict(Counter)

for f in files_1920:
    # Collapse every whitespace run to a single space before running the pipeline.
    flat_text = re.sub(r"\s+", " ", first_chunk(DATA_DIR / f))
    for ent in nlp(flat_text).ents:
        # Skip very short spans and any label other than person / organisation / location.
        if len(ent.text) <= 2 or ent.label_ not in {"PER", "ORG", "LOC"}:
            continue
        ent_counts[ent.label_][ent.text.strip()] += 1

# Report the ten most frequent entities for each label.
for label in ("PER","ORG","LOC"):
    print(f"\nTop 10 {label} :")
    ranking = ent_counts[label].most_common(10)
    for txt, c in ranking:
        print(f"  {txt} — {c}")


Top 10 PER :
  Roi — 7
  sach — 7
  Blancs — 7
  Rossel — 6
  Etranger — 6
  culs — 5
  Dujardin — 5
  Bru — 4
  Louise — 4
  qu’ — 4

Top 10 ORG :
  SOIR — 10
  Chambre — 10
  ANNONCES — 8
  ABONNEMENTS — 8
  PROVINCE — 7
  Sénat — 6
  Conseil — 6
  DEM — 6
  Eor — 5
  Société des nations — 5

Top 10 LOC :
  Bruxelles — 101
  qu’ — 39
  Allemagne — 38
  Belgique — 28
  Anvers — 19
  Paris — 17
  Etat — 16
  Berlin — 15
  Angleterre — 14
  France — 13


## 2) Sentiments

In [8]:
import numpy as np

# Hand-made polarity lexicons (lower-case French word forms).
# Each word is listed once; the resulting sets are the same as before.
positifs = {
    "accueil",
    "aide",
    "protection",
    "asile",
    "intégration",
    "travail",
    "solidarité",
    "réfugiés",
    "droit",
    "citoyenneté",
    "naturalisation",
    "humanitaire",
    "secours",
    "protéger",
    "emplois",
    "insertion",
    "étudier",
    "autoriser",
    "visa",
}
negatifs = {
    "crainte",
    "crise",
    "illégal",
    "expulsion",
    "rejet",
    "xénophobie",
    "trafic",
    "violence",
    "menace",
    "charge",
    "fraude",
    "encombrement",
    "peur",
    "tension",
    "fermer",
    "refuser",
}

def tokenize_fr(text):
    """Lower-case ``text`` and return, in order, every run of at least three
    characters drawn from a-z, the listed accented French letters, and the hyphen."""
    word_pattern = r"[a-zàâçéèêëîïôûùüÿœ\-]{3,}"
    return [match.group(0) for match in re.finditer(word_pattern, text.lower())]

def polarite(text):
    """Lexicon-based polarity of ``text``.

    Counts the tokens found in ``positifs`` (p) and in ``negatifs`` (n) and
    returns (p - n) / (p + n), or 0.0 when no lexicon word occurs.
    The result lies between -1.0 and 1.0.
    """
    pos_hits = 0
    neg_hits = 0
    for tok in tokenize_fr(text):
        if tok in positifs:
            pos_hits += 1
        if tok in negatifs:
            neg_hits += 1
    total = pos_hits + neg_hits
    # Avoid dividing by zero when the text contains no lexicon word.
    denominator = total if total > 1 else 1
    return (pos_hits - neg_hits) / denominator

# Load the full text of every selected file, then score each one.
texts_1920 = []
for name in files_1920:
    texts_1920.append((DATA_DIR / name).read_text(encoding="utf-8", errors="replace"))
scores = list(map(polarite, texts_1920))

mean_score = float(np.mean(scores))
print("Articles 1920 :", len(scores))
print("Score moyen (≈ tonalité) :", round(mean_score, 3))
print("5 scores exemples :", [round(float(s),3) for s in scores[:5]])

Articles 1920 : 100
Score moyen (≈ tonalité) : 0.29
5 scores exemples : [-1.0, 0.6, -0.2, -0.2, 1.0]


In [5]:
# Install scikit-learn and NLTK with the %pip magic (quiet mode; versions are not pinned).
%pip install -q scikit-learn nltk
import nltk
# Fetch NLTK's stopword corpus. In the saved output of this cell the download
# failed with an SSL certificate error and the call returned False.
# NOTE(review): the later cells visible here define their own STOP set and do not
# appear to use the NLTK list — confirm whether this cell is still needed.
nltk.download('stopwords')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


False

## 3) Bigrammes
L'objectif est de voir les paires de mots les plus fréquentes, après suppression des mots vides (stopwords).

In [14]:

import os, re
from pathlib import Path
from collections import Counter

# NOTE(review): this rebinds DATA_DIR to "./data/txt" while the entity cell above
# uses "../data/txt", and the filter here is "_1920" rather than "_1920-". The
# saved outputs report 100 files there and 998 here — confirm which corpus is intended.
DATA_DIR = Path("./data/txt")

# Keep the .txt files whose name contains "_1920", in sorted order.
files_1920 = []
for entry in sorted(os.listdir(DATA_DIR)):
    if entry.endswith(".txt") and "_1920" in entry:
        files_1920.append(entry)
print("Fichiers 1920 :", len(files_1920))

def tokenize(txt):
    """Lower-case ``txt`` and return, in order, every run of at least three
    letters (a-z plus the listed accented French letters). Hyphens, apostrophes
    and digits act as separators."""
    lowered = txt.lower()
    return [match.group() for match in re.finditer(r"[a-zàâçéèêëîïôûùüÿœ]{3,}", lowered)]

# Hand-written French stopword set, used to discard bigrams containing a function word.
# NOTE(review): tokenize() only emits tokens of three or more letters, so the
# one- and two-letter entries below can never match a token.
STOP = set(
    """
    a à au aux avec car ce cela ces cette chaque ci comme d dans de des du
    elle en et être eux il ils je la le les leur lui ma mais mes moi mon
    ne nos notre nous on ou où par pas pour qu que qui sa se ses sans son
    sur ta te tes toi ton tu un une vos votre vous y ça cest cet dont
    """.split()
)

# One token list per selected file, in the same order as files_1920.
docs = [
    tokenize((DATA_DIR / fn).read_text(encoding="utf-8", errors="replace"))
    for fn in files_1920
]

print(f"Documents tokenisés : {len(docs)}")

Fichiers 1920 : 998
Documents tokenisés : 998


In [15]:
from collections import Counter

# `docs` is the list of tokenised documents built earlier in the notebook.
# Count adjacent token pairs inside each document; pairs never span two documents.
bigrams = Counter(pair for tokens in docs for pair in zip(tokens, tokens[1:]))

# Filter: reject any bigram that contains a stopword.
def is_clean(bg):
    """Return True when neither the first nor the second word of ``bg`` is in STOP."""
    return not (bg[0] in STOP or bg[1] in STOP)

# Collect the 20 most frequent bigrams that contain no stopword.
# The full ranking is walked (stopping as soon as 20 are found) instead of
# filtering only the top 200: with a pre-truncated ranking, fewer than 20 rows
# come back whenever most of the top 200 pairs contain a stopword.
top20 = []
for (a, b), c in bigrams.most_common():
    if not is_clean((a, b)):
        continue
    top20.append((a, b, c))
    if len(top20) == 20:
        break

for a, b, c in top20:
    print(f"{a} {b}  -> {c}")

ont été  -> 2258
aujourd hui  -> 1875
après midi  -> 940
ordre jour  -> 814
avait été  -> 778
chemins fer  -> 720
aura lieu  -> 702
après avoir  -> 688
etats unis  -> 664
point vue  -> 623
autre part  -> 617
lloyd george  -> 526
est plus  -> 514
agence rossel  -> 490
tout faire  -> 477
maison commerce  -> 462
jeune fille  -> 457
avant guerre  -> 454
pendant guerre  -> 449
février heures  -> 448


## 4) Collocations fortes (PMI)
Objectif : repérer les paires « surreprésentées » au-delà de la simple fréquence.

In [16]:
import math

# Unigram frequencies over all tokenised documents.
uni = Counter()
for tokens in docs:
    uni.update(tokens)

# Totals used to turn raw counts into probabilities.
N_uni = sum(uni.values())
N_bi = sum(bigrams.values())

def pmi(w1, w2, min_count=3):
    """Pointwise mutual information (base 2) of the ordered pair (w1, w2).

    Computes log2( P(w1, w2) / (P(w1) * P(w2)) ), where
    P(w1, w2) = bigrams[(w1, w2)] / N_bi and P(w) = uni[w] / N_uni.

    Parameters
    ----------
    w1, w2 : str
        Left and right word of the bigram.
    min_count : int, default 3
        Minimum number of occurrences of the bigram for a score to be returned.

    Returns
    -------
    float or None
        The PMI score, or None when the bigram occurs fewer than ``min_count``
        times, was never observed, or one of its words has no unigram count.
    """
    c12 = bigrams[(w1, w2)]
    # An unseen pair has no defined PMI: without this guard, min_count <= 0
    # would reach log2(0) (ValueError) or a division by zero.
    if c12 < min_count or c12 <= 0:
        return None
    if uni[w1] == 0 or uni[w2] == 0:
        # The unigram and bigram tables disagree; no score can be computed.
        return None
    p12 = c12 / N_bi
    p1  = uni[w1] / N_uni
    p2  = uni[w2] / N_uni
    return math.log2(p12 / (p1*p2))

# Words and expressions whose PMI neighbours are inspected below.
# NOTE(review): the loop compares each entry with single tokens of the bigram
# table, so entries containing a space, hyphen or apostrophe cannot match and
# always give empty lists (see the saved output for « permis de séjour »).
targets = [
    # general notions
    "immigration", "étranger", "asile", "réfugié",
    # residence and identity papers
    "séjour", "permis de séjour", "titre de séjour", "carte d’identité d’étranger",
    "visa", "passeport", "laissez-passer",
    # borders and control
    "frontière", "douanes", "contrôle",
    "expulsion", "refoulement", "internement",
    # administrations
    "administration des étrangers", "police des étrangers", "office des réfugiés",
    "consulat", "ministère de l’intérieur",
    # statuses
    "apatride", "demandeur d’asile",
    # relief and return
    "rapatriement", "déplacés", "secours", "réinstallation",
    # labour and housing
    "main-d’œuvre étrangère", "recrutement", "logement", "ouvriers étrangers",
]

# Index the scored neighbours of every target in ONE pass over the bigram table.
# The previous version rescanned the whole table twice per target and computed
# each PMI twice; the lists built here hold the same pairs in the same order,
# so the printed rankings are unchanged.
right_neighbours = {t: [] for t in targets}   # target -> [(following word, pmi)]
left_neighbours = {t: [] for t in targets}    # target -> [(preceding word, pmi)]
for (w1, w2) in bigrams:
    if w1 in right_neighbours:
        score = pmi(w1, w2)
        if score is not None:
            right_neighbours[w1].append((w2, score))
    if w2 in left_neighbours:
        score = pmi(w1, w2)
        if score is not None:
            left_neighbours[w2].append((w1, score))

for t in targets:
    # Ten strongest collocates on each side, by decreasing PMI.
    R = sorted(right_neighbours[t], key=lambda x: -x[1])[:10]
    L = sorted(left_neighbours[t], key=lambda x: -x[1])[:10]
    print(f"\n=== Collocations PMI autour de « {t} » ===")
    print("Gauche :", [w for w,_ in L])
    print("Droite :", [w for w,_ in R])


=== Collocations PMI autour de « immigration » ===
Gauche : []
Droite : []

=== Collocations PMI autour de « étranger » ===
Gauche : ['vinco', 'vlnce', 'tributaire', 'vince', 'achetées', 'tributaires', 'joug', 'disponibilités', 'réfugié', 'envoyée']
Droite : ['dangereux', 'exposer', 'tarif', 'dublin', 'ouvre', 'revenu', 'centimes', 'soient', 'nombreux', 'aide']

=== Collocations PMI autour de « asile » ===
Gauche : ['interné', 'droit', 'donner', 'œuvre']
Droite : ['indignement', 'aliénés', 'accordé', 'ces', 'sur', 'aux', 'des']

=== Collocations PMI autour de « réfugié » ===
Gauche : ['était']
Droite : ['hollande', 'étranger', 'déclaré', 'dans']

=== Collocations PMI autour de « séjour » ===
Gauche : ['assigne', 'court', 'beau', 'permis', 'lille', 'années', 'long', 'mon', 'son', 'leur']
Droite : ['lucerne', 'esneux', 'auprès', 'bas', 'paris', 'avenue', 'trois', 'dans', 'bruxelles', 'aux']

=== Collocations PMI autour de « permis de séjour » ===
Gauche : []
Droite : []

=== Collocation