In [36]:
# cSpell:disable
# consider : https://marketplace.visualstudio.com/items?itemName=streetsidesoftware.code-spell-checker-french


from pathlib import Path
from typing import List, Optional

import json_repair
from devtools import debug
from loguru import logger
from pydantic import BaseModel, ConfigDict


class JsonModel(BaseModel):
    model_config = ConfigDict(extra="ignore")


class DesgnEtab(JsonModel):
    eta_uai: str
    eta_libelle: str
    eta_name: str


class Adresse(JsonModel):
    eta_ville: str | None = None


class Etab(JsonModel):
    desgn_etab: DesgnEtab


class LieuxItem(JsonModel):
    site: str
    ville: str
    geo: str


class InformationsPedagogiques(JsonModel):
    lien_fiche: str
    mot_cle_disciplinaire: Optional[List[str]] = None
    mot_cle_sectoriel: Optional[List[str]] = None
    mot_cle_metier: Optional[List[str]] = None
    mot_cle_libre: Optional[List[str]] = None
    # langues: List[str]
    # lieux: List[LieuxItem]


class Parcour(JsonModel):
    for_inmp: str
    intitule_parcours: str
    informations_pedagogiques: InformationsPedagogiques | None = None
    # licences_conseillees: List[str]
    # attendus: List[str]
    # criteres: List[str]
    # criteres_examen: List[str]
    modalite_enseignement: List[str] | None = None


class Dnm(JsonModel):
    for_inm: str | None = None
    for_intitule: str
    dom_libelle: List[str]
    informations_pedagogiques: Optional[InformationsPedagogiques] = None
    # licences_conseillees: Optional[List[str]] = None
    parcours: Optional[List[Parcour]] = None


class Formation(JsonModel):
    etab: Etab
    dnms: List[Dnm]


# cSpell:disable

REPO = Path("/mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024")

FILE = REPO / "Offre_DNM_2024-2025_AMIENS_0801344B_PUBLIABLE.json"
FILE = "Offre_DNM_2024-2025_AGROPARISTECH_0912456A_PUBLIABLE.json"
FILE = "Offre_DNM_2024-2025_AIX-MARSEILLE_0134009M_PUBLIABLE.json"
FILE = "Offre_DNM_2024-2025_ENSI_CAEN_0141720U_PUBLIABLE.json"


with open(REPO / FILE, "r") as io:
    json_file = io.read()
    json_obj = json_repair.loads(json_file)
    master = Formation(**json_obj)  # type: ignore
    debug(master)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024/Offre_DNM_2024-2025_ENSI_CAEN_0141720U_PUBLIABLE.json'

In [45]:
import json
from typing import Iterator

from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document

# cSpell: disable


def format_info_pedago(intitule: str, info_pedago: InformationsPedagogiques):
    content = []
    content.append(f"intitulé: {intitule}")
    if info := info_pedago.mot_cle_disciplinaire:
        content.append(f"disciplines: {','.join(info)}")
    if info := info_pedago.mot_cle_sectoriel:
        content.append(f"secteurs: {','.join(info)}")
    if info := info_pedago.mot_cle_metier:
        content.append(f"métier: {','.join(info)}")
    if info := info_pedago.mot_cle_libre:
        content.append(f"autre: {','.join(info)}")
    return "\n".join(content)


class offre_formation_loader(BaseLoader):
    def __init__(self, doc_list: Path):
        self.parcours_directory = doc_list

    def lazy_load(self) -> Iterator[Document]:
        # metadata = dict()
        for file_name in self.parcours_directory.glob(pattern="Offre_*.json"):
            logger.debug(f"load {file_name}")
            try:
                with open(file_name, "r") as io:
                    json_file = io.read()
                    json_obj = json_repair.loads(json_file)
                    master = Formation(**json_obj)  # type: ignore
            except Exception as ex:
                logger.error(f"cannot load {file_name.name} - {ex}")
                continue

            metadata_offre = {
                "source": file_name.name,
                "eta_uai": master.etab.desgn_etab.eta_uai,
                "eta_libelle": master.etab.desgn_etab.eta_libelle,
                "eta_name": master.etab.desgn_etab.eta_name,
            }
            for dmn in master.dnms:
                if dmn.parcours:
                    for partour in dmn.parcours:
                        metadata_for = {"inmp": partour.for_inmp}
                        if partour.informations_pedagogiques:
                            content = format_info_pedago(
                                partour.intitule_parcours,
                                partour.informations_pedagogiques,
                            )
                            if lien := partour.informations_pedagogiques.lien_fiche:
                                metadata_for |= {"lien_fiche": lien}

                            yield Document(
                                page_content=content,
                                metadata=metadata_for | metadata_offre,
                            )

                dmn_info_pedago = dmn.informations_pedagogiques
                if dmn_info_pedago:
                    content = format_info_pedago(
                        "".join(dmn.dom_libelle), dmn_info_pedago
                    )
                    metadata_for = {"inm": dmn.for_inm}
                    if lien := dmn_info_pedago.lien_fiche:
                        metadata_for |= {"lien_fiche": lien}
                    yield Document(
                        page_content=content,
                        metadata=metadata_for | metadata_offre,
                    )


REPO = Path("/mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024")
assert REPO.exists

loader = offre_formation_loader(REPO)
processed = list(loader.load())


json_data = json.dumps([item.dict() for item in processed], indent=4)
with open(REPO / "synthesis.json", "w") as io:
    io.write(json_data)

[32m2024-05-22 18:54:23.229[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mlazy_load[0m:[36m31[0m - [34m[1mload /mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024/Offre_DNM_2024-2025_AGROPARISTECH_0912456A_PUBLIABLE.json[0m
[32m2024-05-22 18:54:23.255[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mlazy_load[0m:[36m31[0m - [34m[1mload /mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024/Offre_DNM_2024-2025_AIX-MARSEILLE_0134009M_PUBLIABLE.json[0m


[32m2024-05-22 18:54:23.298[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mlazy_load[0m:[36m31[0m - [34m[1mload /mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024/Offre_DNM_2024-2025_AMIENS_0801344B_PUBLIABLE.json[0m
[32m2024-05-22 18:54:23.330[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mlazy_load[0m:[36m31[0m - [34m[1mload /mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024/Offre_DNM_2024-2025_ANGERS_0490970N_PUBLIABLE.json[0m
[32m2024-05-22 18:54:23.357[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mlazy_load[0m:[36m31[0m - [34m[1mload /mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024/Offre_DNM_2024-2025_ARTOIS_0623957P_PUBLIABLE.json[0m
[32m2024-05-22 18:54:23.384[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mlazy_load[0m:[36m31[0m - [34m[1mload /mnt/c/Users/a184094/OneDrive - Eviden/_En cours/mon_master/Offres_2024/Offre_DNM_2024-2025_AVIGNON_0840685N_PUBLIABLE.json

In [50]:
from typing import Iterable


def save_docs_to_jsonl(array: Iterable[Document], file_path: Path) -> None:
    with open(file_path, "w") as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + "\n")


def load_docs_from_jsonl(file_path: Path) -> Iterable[Document]:
    array = []
    with open(file_path, "r") as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            obj = Document(**data)
            array.append(obj)
    return array


FILE = Path("test.jsonl")
save_docs_to_jsonl(processed, Path("test.jsonl"))
l = load_docs_from_jsonl(FILE)
debug(l)

/tmp/ipykernel_470622/1480742817.py:23 <module>
    l: [
        Document(
            page_content='intitulé: SCIENCES, TECHNOLOGIES, SANTÉ',
            metadata={
                'inm': '1800328G',
                'lien_fiche': (
                    'https://www.agroparistech.fr/formations-ingenieur/mention-agrosciences-environnement-territoires-'
                    'paysage-foret-nancy-parcours-m1-agrosciences-environnement-territoires-paysage-foret'
                ),
                'source': 'Offre_DNM_2024-2025_AGROPARISTECH_0912456A_PUBLIABLE.json',
                'eta_uai': '0912456A',
                'eta_libelle': 'AGROPARISTECH',
                'eta_name': 'AgroParisTech',
            },
        ),
        Document(
            page_content='intitulé: SCIENCES, TECHNOLOGIES, SANTÉ',
            metadata={
                'inm': '1501316R',
                'lien_fiche': (
                    'https://www.agroparistech.fr/formations-ingenieur/mention-biodiversite-ecologie

[Document(page_content='intitulé: SCIENCES, TECHNOLOGIES, SANTÉ', metadata={'inm': '1800328G', 'lien_fiche': 'https://www.agroparistech.fr/formations-ingenieur/mention-agrosciences-environnement-territoires-paysage-foret-nancy-parcours-m1-agrosciences-environnement-territoires-paysage-foret', 'source': 'Offre_DNM_2024-2025_AGROPARISTECH_0912456A_PUBLIABLE.json', 'eta_uai': '0912456A', 'eta_libelle': 'AGROPARISTECH', 'eta_name': 'AgroParisTech'}),
 Document(page_content='intitulé: SCIENCES, TECHNOLOGIES, SANTÉ', metadata={'inm': '1501316R', 'lien_fiche': 'https://www.agroparistech.fr/formations-ingenieur/mention-biodiversite-ecologie-evolution-montpellier-parcours-m1-biodiversite-vegetale-gestion-ecosystemes-tropicaux-montpellier', 'source': 'Offre_DNM_2024-2025_AGROPARISTECH_0912456A_PUBLIABLE.json', 'eta_uai': '0912456A', 'eta_libelle': 'AGROPARISTECH', 'eta_name': 'AgroParisTech'}),
 Document(page_content='intitulé: SCIENCES, TECHNOLOGIES, SANTÉ', metadata={'inm': '1702222S', 'lien_f