In [1]:
import pandas as pd
import datetime
import subprocess
import os
from pathlib import Path

# Process *les journaux officiels*

In [2]:
# Incrementally extend the processed-records table with every decree PDF found
# under decrets/ whose JO (journal officiel) date is not yet in the table.
tmpfile = "tmp.txt"
df_file = "data_processed.csv"
columns = ["numero", "serie", "sexe", "age", "dep", "jo", "delai"]
DAYS_PER_YEAR = 365.2425  # mean Gregorian year, used for age and delay


def normalize_text(content):
    """Re-flow the raw pdf2txt output and return it as a list of lines.

    The replacements below join the line breaks produced by the PDF
    extraction and then split again wherever a line ended with ". " or ": ".
    "^" is used as a temporary marker for those kept line breaks.
    """
    for key in ("\n\n", ", \n"):
        content = content.replace(key, "\n")
    content = content.replace("\ndép", ", dép")
    content = content.replace("dép. \n", ", dép. ")
    content = content.replace("X \n", "X")
    content = content.replace(". \n", "^")
    content = content.replace(": \n", "^")
    content = content.replace("\n", "")
    content = content.replace("^", ".\n")
    return content.split("\n")


def parse_entry(line, jo, date_jo):
    """Turn one normalized line into a record dict, or None if it is skipped.

    Parameters
    ----------
    line : str
        One line as returned by ``normalize_text``.
    jo : int
        JO identifier (date written as YYYYMMDD), stored as-is in the record.
    date_jo : datetime.datetime
        Publication date of the JO, used to compute ``age`` and ``delai``.

    Returns
    -------
    dict or None
        None when the line has no "NAT," marker, no parsable birth date,
        no "dép." value, or a week number above 53.

    Raises
    ------
    ValueError
        If the year or week part of the file number is not numeric.
    """
    if "NAT," not in line:
        return None

    sexe = "F" if "née" in line else "H"

    # Unify the place prepositions so the birth date is always followed by " à ".
    line = line.replace(" au ", " à ").replace(" aux ", " à ")
    naissance = line.partition(" le ")[2].partition(" à ")[0].replace(" ", "")
    try:
        date_naissance = datetime.datetime.strptime(naissance, "%d/%m/%Y")
    except ValueError:
        return None
    age = (date_jo - date_naissance).days / DAYS_PER_YEAR

    dep = line.partition("dép.")[2].partition(",")[0].replace(" ", "")
    if len(dep) == 0:
        return None
    if "Dt" in dep:
        dep = dep.partition("Dt")[0]

    numero = line.partition("NAT,")[2].partition(",")[0].replace(" ", "")
    serie = numero[:8]
    year = int(serie.partition("X")[0])
    # The three digits after "X" are shifted by one and read as an ISO week.
    weeknum = int(serie.partition("X")[2][:3]) + 1
    if weeknum > 53:
        return None
    # "-3" selects the Wednesday of that ISO week as the reference day.
    date_serie = datetime.datetime.strptime(f"{year} {weeknum}-3", "%G %V-%w")
    delai = (date_jo - date_serie).days / (DAYS_PER_YEAR / 12)  # in months

    return {"numero": numero, "serie": serie, "sexe": sexe, "age": age,
            "dep": dep, "jo": jo, "delai": delai}


if os.path.isfile(df_file):
    df = pd.read_csv(df_file)
    jo_list = sorted(df["jo"].unique())
else:
    # Column names must match the record keys exactly ("serie", no accent).
    df = pd.DataFrame(columns=columns)
    jo_list = []

# Records are collected in a list and added to the frame once at the end:
# row-by-row DataFrame.append is quadratic and no longer exists in pandas >= 2.0.
records = []
try:
    for decret in Path(".").glob("decrets/*.pdf"):
        # Skip if current JO is already treated
        jo = str(decret).partition("joe_")[2].partition("_")[0]
        jo_number = int(jo)
        if jo_number in jo_list:
            continue

        print(f"Process JO {jo}...")
        date_jo = datetime.datetime.strptime(jo, "%Y%m%d")
        # pdf2txt.py must be reachable through PATH.
        with open(tmpfile, "w") as f:
            subprocess.check_call(["pdf2txt.py", str(decret)], stdout=f)

        with open(tmpfile) as f:
            lines = normalize_text(f.read())

        for line in lines:
            # jo is stored as an integer, like the values read back from the CSV.
            record = parse_entry(line, jo_number, date_jo)
            if record is not None:
                records.append(record)
finally:
    # The temporary file only exists if at least one PDF was converted.
    if os.path.exists(tmpfile):
        os.remove(tmpfile)

if records:
    new_rows = pd.DataFrame(records, columns=columns)
    df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

Process JO 20190831...


In [3]:
# Show the last five processed records as a quick sanity check.
df.tail(n=5)

Unnamed: 0,numero,serie,sexe,age,dep,jo,delai
22155,2019X004251,2019X004,H,39.116477,95,20190831,6.99809
22156,2018X041046,2018X041,F,33.936357,75,20190831,10.447853
22157,2019X009784,2019X009,F,28.775403,76,20190831,5.848169
22158,2019X006304,2019X006,H,49.057818,34,20190831,6.538122
22159,2019X006731,2019X006,H,48.838785,69,20190831,6.538122


In [4]:
# True if any cell of the table is missing.
df.isna().to_numpy().any()

False

In [5]:
# Write back to the same file the processing cell loads from (df_file),
# so the read and write paths cannot drift apart.
df.to_csv(df_file, index=False)