# Segmentation en phrases

## Imports

In [1]:
import os
import sys
from collections import defaultdict
from itertools import islice

import nltk
import spacy
from nltk.tokenize import sent_tokenize
from spacy.lang.fr.examples import sentences

In [2]:
# Load the spaCy pipeline named 'fr_core_news_md' (the model package has to be installed).
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook.
nlp = spacy.load('fr_core_news_md')

## Fichiers d'entrée et de sortie

In [22]:
# List the input files
# `title` is the base name of the combined corpus file; `txt_path` is the
# directory that holds the per-decade text files.
title = "Ecole_1900-1910"
txt_path = '../data/tmp_tp4'
# os.listdir returns entries in arbitrary order; sort them so the files are
# always read (and later concatenated) in the same, reproducible order.
files = sorted(os.listdir(txt_path))

In [25]:
# Names of the two decade files to keep; they are matched against the start of
# each directory entry (str.startswith) when the file listing is filtered.
ext = ["1900s.txt", "1910s.txt"]

In [26]:
# Keep only the files whose name starts with one of the entries of `ext`
prefixes = tuple(ext)
Ecole_files = [name for name in files if name.startswith(prefixes)]
len(Ecole_files)

2

In [27]:
# Store the content of each selected file in a list (one string per file).
# Decode explicitly as UTF-8 with errors="backslashreplace", the same way the
# segmentation cells read these files, instead of relying on the platform's
# default encoding.
content_list = []
for txt in Ecole_files:
    with open(os.path.join(txt_path, txt), 'r', encoding='utf-8', errors='backslashreplace') as f:
        content_list.append(f.read())

In [28]:
# Count the number of elements (= files) in the list
len(content_list)

2

In [29]:
# Show the first 200 characters of the first file's content
content_list[0][:200]

"(31\n\nJanvier\n\n1910)\n\n—\n\n100\n\n—\n\nCes comptes soldent par un déficit de. . . fr.\nalors que la somme prévue au bu iget de la Ville\nn'est que de\nDifférence.\n\n626.568 90\n532,970 37\n\n. fr.\n\n93,598 53\n\nEn vu"

In [30]:
# Write the whole content (all files joined by a single space) to a temporary file
temp_path = '../data/tmp_tp4'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists (no check-then-create race).
os.makedirs(temp_path, exist_ok=True)
# Write explicitly as UTF-8: the segmentation cell reads this file back as UTF-8,
# so the platform default encoding must not be used here.
with open(os.path.join(temp_path, f'{title}.txt'), 'w', encoding='utf-8') as f:
    f.write(' '.join(content_list))

In [31]:
# Read the combined file back and look at the "noise" it contains.
# Use the same decoding (UTF-8, errors="backslashreplace") as the segmentation
# cell so that this preview shows exactly the text that will be segmented.
with open(os.path.join(temp_path, f'{title}.txt'), 'r', encoding='utf-8', errors='backslashreplace') as f:
    before = f.read()

before[:500]

"(31\n\nJanvier\n\n1910)\n\n—\n\n100\n\n—\n\nCes comptes soldent par un déficit de. . . fr.\nalors que la somme prévue au bu iget de la Ville\nn'est que de\nDifférence.\n\n626.568 90\n532,970 37\n\n. fr.\n\n93,598 53\n\nEn vue de permettre de liquider le montant de ce déficit, le\nCollège vous propose, Messieurs, le vote d'un crédit supplémentaire defr. 9;5,598-ô3 à l'art. 131 des dépenses ordinaires\ndu budget de 1901) : « Subside éventuel à\nl'Administration\ndes hospices et secours ».\n\nLes ressources ordinaires de l'exer"

In [32]:
# Input: the combined corpus file; output: the file that receives one sentence per line
infile = "../data/tmp_tp4/Ecole_1900-1910.txt"
outfile = "../data/tmp_tp4/sents_tp4.txt"

## Segmentation en phrases du corpus des années 1900 et 1910 et création d'un nouveau fichier

In [33]:
# Maximum number of input lines to segment; None means the whole file is processed
LIMIT = None

In [34]:
# Segment the combined corpus into sentences and write one sentence per line.
# The input is read first, so a missing or unreadable input file no longer
# truncates an existing output file.
with open(infile, encoding="utf-8", errors="backslashreplace") as f:
    # islice stops after LIMIT lines (everything when LIMIT is None) instead of
    # loading the whole file and slicing afterwards.
    content = list(islice(f, LIMIT))
n_lines = len(content)

with open(outfile, 'w', encoding="utf-8") as output:
    for i, line in enumerate(content):
        # Progress message every 10,000 lines
        if i % 10000 == 0:
            print(f'processing line {i}/{n_lines}')
        # The corpus is French: use the French Punkt model explicitly, since
        # sent_tokenize falls back to the English model by default.
        for sent in sent_tokenize(line, language="french"):
            output.write(sent + "\n")
print("Done")

processing line 0/3700567
processing line 10000/3700567
processing line 20000/3700567
processing line 30000/3700567
processing line 40000/3700567
processing line 50000/3700567
processing line 60000/3700567
processing line 70000/3700567
processing line 80000/3700567
processing line 90000/3700567
processing line 100000/3700567
processing line 110000/3700567
processing line 120000/3700567
processing line 130000/3700567
processing line 140000/3700567
processing line 150000/3700567
processing line 160000/3700567
processing line 170000/3700567
processing line 180000/3700567
processing line 190000/3700567
processing line 200000/3700567
processing line 210000/3700567
processing line 220000/3700567
processing line 230000/3700567
processing line 240000/3700567
processing line 250000/3700567
processing line 260000/3700567
processing line 270000/3700567
processing line 280000/3700567
processing line 290000/3700567
processing line 300000/3700567
processing line 310000/3700567
processing line 320000

## Segmentation en phrases pour la décennie 1900

In [3]:
# Input: the 1900s text file; output: the file that receives one sentence per line
infile = "../data/tmp_tp4/1900s.txt"
outfile = "../data/tmp_tp4/sents_1900_tp4.txt"

In [4]:
# Segment at most the first 1,000,000 lines of the input file
LIMIT = 1000000

In [5]:
# Segment the 1900s file into sentences and write one sentence per line.
# The input is read first, so a missing or unreadable input file no longer
# truncates an existing output file.
with open(infile, encoding="utf-8", errors="backslashreplace") as f:
    # islice stops after LIMIT lines (everything when LIMIT is None) instead of
    # loading the whole file and slicing afterwards.
    content = list(islice(f, LIMIT))
n_lines = len(content)

with open(outfile, 'w', encoding="utf-8") as output:
    for i, line in enumerate(content):
        # Progress message every 10,000 lines
        if i % 10000 == 0:
            print(f'processing line {i}/{n_lines}')
        # The corpus is French: use the French Punkt model explicitly, since
        # sent_tokenize falls back to the English model by default.
        for sent in sent_tokenize(line, language="french"):
            output.write(sent + "\n")
print("Done")

processing line 0/1000000
processing line 10000/1000000
processing line 20000/1000000
processing line 30000/1000000
processing line 40000/1000000
processing line 50000/1000000
processing line 60000/1000000
processing line 70000/1000000
processing line 80000/1000000
processing line 90000/1000000
processing line 100000/1000000
processing line 110000/1000000
processing line 120000/1000000
processing line 130000/1000000
processing line 140000/1000000
processing line 150000/1000000
processing line 160000/1000000
processing line 170000/1000000
processing line 180000/1000000
processing line 190000/1000000
processing line 200000/1000000
processing line 210000/1000000
processing line 220000/1000000
processing line 230000/1000000
processing line 240000/1000000
processing line 250000/1000000
processing line 260000/1000000
processing line 270000/1000000
processing line 280000/1000000
processing line 290000/1000000
processing line 300000/1000000
processing line 310000/1000000
processing line 320000

## Segmentation en phrases pour la décennie 1910

In [6]:
# Input: the 1910s text file; output: the file that receives one sentence per line
infile = "../data/tmp_tp4/1910s.txt"
outfile = "../data/tmp_tp4/sents_1910_tp4.txt"

In [7]:
# Segment at most the first 1,000,000 lines of the input file
LIMIT = 1000000

In [8]:
# Segment the 1910s file into sentences and write one sentence per line.
# The input is read first, so a missing or unreadable input file no longer
# truncates an existing output file.
with open(infile, encoding="utf-8", errors="backslashreplace") as f:
    # islice stops after LIMIT lines (everything when LIMIT is None) instead of
    # loading the whole file and slicing afterwards.
    content = list(islice(f, LIMIT))
n_lines = len(content)

with open(outfile, 'w', encoding="utf-8") as output:
    for i, line in enumerate(content):
        # Progress message every 10,000 lines
        if i % 10000 == 0:
            print(f'processing line {i}/{n_lines}')
        # The corpus is French: use the French Punkt model explicitly, since
        # sent_tokenize falls back to the English model by default.
        for sent in sent_tokenize(line, language="french"):
            output.write(sent + "\n")
print("Done")

processing line 0/1000000
processing line 10000/1000000
processing line 20000/1000000
processing line 30000/1000000
processing line 40000/1000000
processing line 50000/1000000
processing line 60000/1000000
processing line 70000/1000000
processing line 80000/1000000
processing line 90000/1000000
processing line 100000/1000000
processing line 110000/1000000
processing line 120000/1000000
processing line 130000/1000000
processing line 140000/1000000
processing line 150000/1000000
processing line 160000/1000000
processing line 170000/1000000
processing line 180000/1000000
processing line 190000/1000000
processing line 200000/1000000
processing line 210000/1000000
processing line 220000/1000000
processing line 230000/1000000
processing line 240000/1000000
processing line 250000/1000000
processing line 260000/1000000
processing line 270000/1000000
processing line 280000/1000000
processing line 290000/1000000
processing line 300000/1000000
processing line 310000/1000000
processing line 320000