### Analyse des données initiales

In [70]:
import json
import os
import re
from typing import Dict, List 
import pandas as pd

In [71]:
SOURCE_FOLDER = "./data/ladrome-pages-json-v1 (Markdown)/json-v1"
source_files = os.listdir(SOURCE_FOLDER)

In [72]:
def read_json_file(filename):
    with open(os.path.join(SOURCE_FOLDER, filename), 'r') as f:
        read_file = json.load(f)
    return read_file


def get_title_and_text(file_content):
    return pd.Series([file_content["hasPart"][0]["title"], file_content["hasPart"][0]["text"]])


def get_chunk_length(text):
    return pd.Series([len(text), len(text.split())])

In [73]:
source_data = pd.DataFrame(source_files, columns=["filename"])

In [74]:
source_data["file_content"] = source_data["filename"].apply(read_json_file)

In [75]:
source_data[["title", "text"]] = source_data["file_content"].apply(get_title_and_text)

In [76]:
source_data[["chunk_len_chars", "chunk_len_words"]] = source_data["text"].apply(get_chunk_length)
source_data["chunk_len_words"].describe()

count     190.000000
mean      241.652632
std       370.707839
min         1.000000
25%        63.500000
50%       144.500000
75%       277.750000
max      3007.000000
Name: chunk_len_words, dtype: float64

Problems :
- Some chunks ave low amount of word
- variation in chunk size is high
- some chunks are too big
- some chunks need cleaning (symbols and links everywhere)
- some chunks are just links toward other pages (ladrome-page-558941.json)

Ok stuff :
- The scrapping and parsing of the webpages is actually good (no messy stuff)

In [77]:
sorted_chunks = source_data.sort_values(by="chunk_len_words")
sorted_chunks.head(5)

Unnamed: 0,filename,file_content,title,text,chunk_len_chars,chunk_len_words
95,ladrome-page-11223.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Agenda,Agenda\n\n,8,1
37,ladrome-page-10773.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Handicap,Handicap\n\n,10,1
1,ladrome-page-10513.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Les cantons,Les cantons\n\n,13,2
77,ladrome-page-10993.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Le nucléaire,Le nucléaire\n\n,14,2
134,ladrome-page-43852.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Menu des collèges,Menu des collèges\n\n,19,3


In [78]:
sorted_chunks.tail(5)

Unnamed: 0,filename,file_content,title,text,chunk_len_chars,chunk_len_words
171,ladrome-page-574603.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Presse,Presse\n\nVous êtes journaliste ? Cet espace v...,16516,1431
113,ladrome-page-13874.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Conditions générales d’utilisation du formulai...,Conditions générales d’utilisation du formulai...,10905,1585
174,ladrome-page-581681.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Politique de cookies,Politique de cookies\n\n_Cette politique de co...,17242,2054
137,ladrome-page-480456.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Conditions générales d’utilisation de l’applic...,Conditions générales d’utilisation de l’applic...,17470,2519
148,ladrome-page-547344.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",FAQ – futur collège Mercurol-Veaunes,FAQ – futur collège Mercurol-Veaunes\n\nRetrou...,21333,3007


In [79]:
HTML_FOLDER = "./data/ladrome-pages-2023-12-22-json-v1 (HTML)/json-v1"
#FILE = "ladrome-page-480456.json"
#FILE = "ladrome-page-10516.json"
#FILE = "ladrome-page-567239.json"
#FILE = "ladrome-page-573222.json"
#FILE = "ladrome-page-555850.json"
FILE = "ladrome-page-581681.json"
with open(os.path.join(HTML_FOLDER, FILE), "r") as f:
    file = json.load(f)

In [80]:
def compare_HTML_with_Markdown():
    with open(os.path.join(HTML_FOLDER, FILE), "r") as f:
        file = json.load(f)
        print(file["hasPart"][0]["text"])
    with open(os.path.join(SOURCE_FOLDER, FILE), "r") as f:
        file = json.load(f)
        print(file["hasPart"][0]["text"])

compare_HTML_with_Markdown()

<h1>Politique de cookies</h1>


<!-- Legal document generated by Complianz | GDPR/CCPA Cookie Consent https://wordpress.org/plugins/complianz-gdpr -->
<div id="cmplz-document" class="cmplz-document cookie-statement cmplz-document-eu"><p><i>Cette politique de cookies a été mise à jour pour la dernière fois le 21/11/2023 et s’applique aux citoyens et aux résidents permanents légaux de l’Espace Économique Européen et de la Suisse.</i><br></p><h2>1. Introduction</h2><p>Notre site web, <a href="https://www.ladrome.fr">https://www.ladrome.fr</a> (ci-après : « le site web ») utilise des cookies et autres technologies liées (par simplification, toutes ces technologies sont désignées par le terme « cookies »). Des cookies sont également placés par des tierces parties que nous avons engagées. Dans le document ci-dessous, nous vous informons de l’utilisation des cookies sur notre site web.</p><h2>2. Que sont les cookies ?</h2><p>Un cookie est un petit fichier simple envoyé avec les pages de ce si

We see that the parsing of the HTML file removed the h2 titles (they are converted as "\n\n**here is the title text**") which makes the splitting difficult. Keepeing a style like "##here is h2 title" would be better

## New chunking method based on titles

In [81]:
from markdownify import markdownify


class ChunkParserException(Exception):
    def __init__(self, message):
        pass


class ChunkSizeExceeded(Exception):
    def __init__(self, message):
        pass


class ChunkParser:
    def __init__(self):
        pass

    
    def __call__(self, filepath:str, **kwargs) -> str:
        text = self.read_json_file(filepath)
        chunks = self.chunk_document(text, **kwargs)
        return chunks

    
    @staticmethod
    def read_json_file(filepath:str) -> str:
        """Reads a json file and applies markdownify to it

        Args:
            filepath (str): path to a json file

        Returns:
            str: the text, mardkownified
        """
        try:
            with open(filepath, 'r') as f:
                read_file = json.load(f)
            md_file = markdownify(read_file["hasPart"][0]['text'], strip=["figure", "img"], bullets="-*+")
        except Exception as e:
            raise ChunkParserException(f"Can't open JSON file : {e}")

        return md_file


    @property
    def regex_patterns(self):
        return {
            "h1": re.compile(r"(.+?)\n={3,}"),
            "h2": re.compile(r"(.+?)\n-{3,}"),
            "h3": re.compile(r"### (.+)\n"),
            "h4": re.compile(r"#### (.+)\n"),
            "h5": re.compile(r"##### (.+)\n"),
            "link": re.compile(r"\[(.+?)\]\((https?:.+?)\)")
        }


    def get_toc(self, text:str) -> Dict:
        """Gets the table of content of the provided text.
        Parses the text to find headers, from h1 to h5.
        Handles specials cases, such as:
        - we have no header (-> returns empty dict)
        - we have a level missing (e.g. we have h1 and h3 but no h2)

        Args:
            text (str): the table of content, as a dict structured like so:
                {title: {
                    "level": the level (0-based),
                    "parents": the parent titles and their level,
                    "text": the text that is within this title
                    }}
        """
        def get_subtoc(text:str, level:int, parents:List[Dict], id_prefix:str):
            titles = re.findall(self.regex_patterns[f"h{level+1}"], text)
            text_splited_by_title = re.split(self.regex_patterns[f"h{level+1}"], text)
            text_splited_by_title = [t for t in text_splited_by_title if t not in titles]
            # remove first text if we found titles as it is before the first title. else keep the only element
            text_splited_by_title = text_splited_by_title[1:] if titles else text_splited_by_title
            toc = [
                {"title":t,
                "level": level,
                "text": txt,
                "id": f"{id_prefix}{i}",
                "parents": parents, 
                } for i, (t, txt) in enumerate(zip(titles, text_splited_by_title))]

            return titles, text_splited_by_title, toc


        def build_parents(toc:List[Dict], parent_ids:List[str]):
            assert isinstance(parent_ids, List), f"parent_titles shoud be a list, not {type(parent_ids)}"
            # remove Nones
            parent_ids = [t for t in parent_ids if not t.endswith("_")]
            # get the id of each parent title
            parent_titles = [
                ChunkParser.get_item(toc, conditions={"id": t})["title"]
                for t in parent_ids
                ]
            # build dict of parent titles
            parents = [{"level": i, "title": t, "id": tid}
                for i, (t, tid) in enumerate(zip(parent_titles, parent_ids))
                ]

            return parents


        def get_children(toc:Dict):
            # iterate through each element of toc
            for toc_piece in toc:
                children = []
                # reiterate through the toc to see if our toc_piece is in the parents of other to piceces
                for potential_child in toc:
                    parents = potential_child["parents"]
                    for p in parents:
                        if p["id"] == toc_piece["id"]:
                            children.append({
                                "level": potential_child["level"],
                                "title": potential_child["title"],
                                "id": potential_child["id"]
                                })
                toc_piece["children"] = children

            return toc

        h1_titles, h1_texts, toc = get_subtoc(text, 0, parents=[], id_prefix="0")
        for i, subtext_i in enumerate(h1_texts):
            h1_id = f"0{i}" if h1_titles else f"0_"
            parents = build_parents(toc, [h1_id])
            h2_titles, h2_texts, subtoc = get_subtoc(subtext_i, 1, parents=parents, id_prefix=h1_id)
            toc.extend(subtoc)
            for j, subtext_j in enumerate(h2_texts):
                h2_id = f"{h1_id}{j}" if h2_titles else f"{h1_id}_"
                parents = build_parents(toc, [h1_id, h2_id])
                h3_titles, h3_texts, subtoc = get_subtoc(subtext_j, 2, parents=parents, id_prefix=h2_id)
                toc.extend(subtoc)
                for k, subtext_k in enumerate(h3_texts):
                    h3_id = f"{h2_id}{k}" if h3_titles else f"{h2_id}_"
                    parents = build_parents(toc, [h1_id, h2_id, h3_id])
                    h4_titles, h4_texts, subtoc = get_subtoc(subtext_k, 3, parents=parents, id_prefix=h3_id)
                    toc.extend(subtoc)
                    for l, subtext_l in enumerate(h4_texts):
                        h4_id = f"{h3_id}{l}" if h4_titles else f"{h3_id}_"
                        parents = build_parents(toc, [h1_id, h2_id, h3_id, h4_id])
                        h5_titles, h5_texts, subtoc = get_subtoc(subtext_l, 4, parents=parents, id_prefix=h4_id)
                        toc.extend(subtoc)

        toc = get_children(toc)
        

        return toc


    @staticmethod
    def get_item(toc:List[Dict], conditions:Dict, return_attributes:List[str]=None, raise_errors=True):
        """Gets a toc element (=title) matching the specified condictions.
        'condictions' is a dict of conditions {key:value} -> we want to find
        the title in the toc that matches these conditions by having 
        these key-value pairs

        Args:
            toc (List[Dict]): the table of content
            conditions (Dict, optional): a dict of conditions to match. Defaults to Dict.
            raise_errors (bool, optional): if True, it will raise an error if the some keys in the condictions 
                can't be found in the toc element. Defaults to True.

        Raises:
            KeyError: _description_
            Exception: _description_
            Exception: _description_

        Returns:
            _type_: _description_
        """
        results = []
        for d in toc:
            if raise_errors and not all([k in d.keys() for k in conditions.keys()]):
                missing_keys = [k for k in conditions.keys() if k not in d.keys() ]
                raise KeyError(f"Some conditions specified can't be verified as the keys {missing_keys} are not present in dictionnary {d}")
                
            if all([d[k] == v for k, v in conditions.items() if k in d.keys()]):
                results.append(d)

        if len(results) > 1:
            raise Exception(f"More than one item corresponds to specified conditions: {conditions}. This should not happen. Items: {results}")
        if len(results) == 0:
            raise Exception(f"No item have been found corresponding to conditions : {conditions}")

        result = results[0]
        if return_attributes:
            return [result[a] for a in return_attributes]

        return result


    def chunk_document(self, text:str, max_chunk_size:int=300, chunk_on_title_level:int=None, raise_errors=True, max_title_level_to_use=5, **kwargs):
        """Chunks a document using titles
        The title level used to chunk can be forced by specifying chunk_on_title.
        Otherwise the each part will be chunk recursively using its titles until the chunks reach
        a size lower than the specified max_chunk_size.

        Args:
            text (str): _description_
            max_chunk_size (int, optional): _description_. Defaults to 300.
            chunk_on_title_level (int, optional): _description_. Defaults to None.
            raise_errors (bool, optional): _description_. Defaults to False.
            max_title_level_to_use (int, optional) : the max level of the titles to use for subdivision

        Raises:
            ChunkSizeExceeded: _description_

        Returns:
            _type_: _description_
        """
        # TODO: Handle case in which a portion of the text in a title is out of all its subtitles
        # For exemple the introduction of a part
        toc = self.get_toc(text)
        
        if chunk_on_title_level is not None:
            assert chunk_on_title_level in set(tp["level"] for tp in toc),\
            f"Can't chunk on level '{chunk_on_title_level}' as no title at that level are found"
            total_chunks = [self.format_chunk(tp) for tp in toc if tp["level"] == chunk_on_title_level]
            if any([len(c.split()) > max_chunk_size for c in total_chunks]) and raise_errors:
                raise ChunkSizeExceeded(f"Chunking on level '{chunk_on_title_level}' lead to oversized chunks")
        
        else:
            # Sort the titles by level (from lvl 0 to lvl 5)
            toc = sorted(toc, key=lambda x: x["level"])
            # store the toc parts that have been treated and the chunks
            treated_toc_ids = []
            total_chunks = []
            # iterate through each toc title
            for toc_piece in toc:
                # if this title have been already treated, then don't use it again
                if toc_piece["id"] in treated_toc_ids:
                    continue
                # build the chunk with the title we are using
                chunk = self.format_chunk(toc_piece)
                # check if the chunk is too big, if so subdivide it
                chunk_is_too_big = len(chunk.split()) > max_chunk_size
                if not chunk_is_too_big:
                    total_chunks.append(chunk)
                    treated_toc_ids.append(toc_piece["id"])
                    treated_toc_ids.extend([child["id"] for child in toc_piece["children"]])
                    toc_pieces_to_subdivide = []
                    continue
                else:
                    toc_pieces_to_subdivide = [toc_piece]
                # while we have toc pieces that needs to be subdivided...
                while toc_pieces_to_subdivide:
                    # ...iterate over the toc pieces that needs to be subdivided
                    new_toc_pieces_to_subdivide = []
                    for tp2subdivide in toc_pieces_to_subdivide:
                        # if the level is of the title is >= to the max level we want to use, just make a chunk without using children
                        if tp2subdivide["level"] == max_title_level_to_use:
                            treated_toc_ids.append(tp2subdivide["id"])
                            treated_toc_ids.extend([child["id"] for child in tp2subdivide["children"]])
                            total_chunks.append(self.format_chunk(tp2subdivide))
                        # check if the tocpiece have children titles that we can use
                        elif tp2subdivide["children"]:
                            children = []
                            for tpc in tp2subdivide["children"]:
                                child = ChunkParser.get_item(toc, conditions={"id": tpc["id"]})
                                if child["level"] == tp2subdivide["level"] -1:
                                    child_id = child["id"]
                                    children.append(child)
                                    treated_toc_ids.append(child_id)
                            chunks = [self.format_chunk(child) for child in children]
                            new_toc_pieces_to_subdivide.extend([
                                child
                                for child, chunk in zip(children, chunks)
                                if len(chunk.split()) > max_chunk_size
                                ])
                            chunks_that_are_ok = [c for c in chunks if len(c.split()) < max_chunk_size]
                            total_chunks.extend(chunks_that_are_ok)
                        # if we don't have children (=subtitles) then just leave this a big chunk
                        # TODO: find other subdivision method for cases where we have no children titles
                        else:
                            treated_toc_ids.append(tp2subdivide["id"])
                            total_chunks.append(self.format_chunk(tp2subdivide))
                    toc_pieces_to_subdivide = new_toc_pieces_to_subdivide

        return total_chunks


    def format_chunk(self, toc_piece):
        chunk = ""
        if toc_piece["parents"]:
            chunk = "\n".join([ChunkParser.cleanup_text(p["title"]) for p in toc_piece["parents"]])
            chunk += "\n"
        chunk += f"{ChunkParser.cleanup_text(toc_piece['title'])}\n"
        chunk += ChunkParser.cleanup_text(toc_piece["text"])
        chunk = self.change_links_format(chunk, link_position="end_of_chunk")
        chunk += "\n"

        return chunk


    def change_links_format(self, text, link_position:str=None, **kwargs) -> str:
        """Removes the markdown format of the links in the text.
        The links are treated as specified by 'link_position':
        - None : links are removed
        - in_sentence : the link is placed in the sentence, between parenthesis
        - end_of_chunk : all links are added at the end of the text
        - end_of_sentence : each link is added at the end of the sentence it is found in

        Args:
            text (str): the text to find the links in
            link_position (str, optional): How the links should be handled. Defaults to None.

        Raises:
            NotImplementedError: _description_

        Returns:
            str: the formated text
        """
        matches = re.finditer(self.regex_patterns["link"], text)
        if matches is not None:
            for i, m in enumerate(matches):
                match link_position:
                    case None:
                        text = text.replace(m[0], m[1])
                    case "end_of_chunk":
                        if i == 0:
                            text += "\nPour plus d'informations:\n"
                        text = text.replace(m[0], m[1])
                        text += f"- {m[1]}: {m[2]}\n"
                    case "in_sentence":
                        text = text.replace(m[0], f"{m[1]} (pour plus d'informations : {m[2]})")
                    case "end_of_sentence":
                        link_end_position = m.span(2)[1]
                         #next_breakpoint = ChunkParser.find_end_of_sentence(text, link_end_position)
                        raise NotImplementedError

        return text


    @staticmethod
    def find_end_of_sentence(text, start_idx):
        next_breakpoint = text.find(".", start_idx)
        # if there is no ".", find \n
        if not next_breakpoint:
            next_breakpoint = text.find("\n", start_idx)
            # make sure this \n doesn't annouce a list, if so, go to next \n
            if text[next_breakpoint-1] == ":":
                next_breakpoint = text.find("\n", start_idx+3)
        # if we still don't have a breakpoint, then breakpoint is end of text
        if not next_breakpoint:
            next_breakpoint = len(text)

        return next_breakpoint


    @staticmethod
    def cleanup_text(text):

        # remove special characters
        text = text.replace("\xa0", "").replace("###", "")#.replace("–", "-")
        # remove strong markdown
        text = text.replace("**", "")
        # remove whitespaces and newlines
        text = " ".join(text.split())
        # restore newline for bullet-point lists
        text = "\n- ".join(text.split("- "))
        
        return text

In [82]:
cp = ChunkParser()

SOURCE_FOLDER = "./data/ladrome-pages-2023-12-22-json-v1 (HTML)/json-v1"
#FILE = "ladrome-page-13830.json"
FILE = "ladrome-page-61504.json"
FILE = "ladrome-page-504967.json"
FILE = "ladrome-page-567239.json"
FILE = "ladrome-page-13718.json"
FILE = "ladrome-page-10516.json"
FILE = "ladrome-page-547344.json" # big chunks
FILE = "ladrome-page-574603.json" # no title, just a list of links
FILE = "ladrome-page-581681.json" # plein de petits titres

file = ChunkParser.read_json_file(os.path.join(SOURCE_FOLDER, FILE))
chunks = cp(os.path.join(SOURCE_FOLDER, FILE), max_chunk_size=200, max_title_level_to_use=1, raise_errors=False)

print_chunks = True
if print_chunks:
    for c in chunks:
        print(c)

Politique de cookies
1. Introduction
Notre site web, <https://www.ladrome.fr> (ci-après: «le site web») utilise des cookies et autres technologies liées (par simplification, toutes ces technologies sont désignées par le terme «cookies»). Des cookies sont également placés par des tierces parties que nous avons engagées. Dans le document ci-dessous, nous vous informons de l’utilisation des cookies sur notre site web.

Politique de cookies
2. Que sont les cookies?
Un cookie est un petit fichier simple envoyé avec les pages de ce site web et stocké par votre navigateur sur le disque dur de votre ordinateur ou d’un autre appareil. Les informations qui y sont stockées peuvent être renvoyées à nos serveurs ou aux serveurs des tierces parties concernées lors d’une visite ultérieure.

Politique de cookies
3. Que sont les scripts?
Un script est un élément de code utilisé pour que notre site web fonctionne correctement et de manière interactive. Ce code est exécuté sur notre serveur ou sur votre 

In [83]:
cp = ChunkParser()
file_list = os.listdir(SOURCE_FOLDER)
df_constructor = []
for file in file_list:
    chunks = cp(os.path.join(SOURCE_FOLDER, file), max_chunk_size=200, max_title_level_to_use=1)
    destination_filenames = [file.replace(".json", f"_{i}.json") for i, _ in enumerate(chunks)]
    df_constructor.extend([(file, dest_file, c) for dest_file, c in zip(destination_filenames, chunks)])

drome_df = pd.DataFrame(df_constructor, columns=["orig_filename", "dest_file", "chunk"])
drome_df.head()

Unnamed: 0,orig_filename,dest_file,chunk
0,ladrome-page-10511.json,ladrome-page-10511_0.json,Un peu d’histoire\nSituée à mi-chemin entre l’...
1,ladrome-page-10513.json,ladrome-page-10513_0.json,Les cantons\n\n
2,ladrome-page-10516.json,ladrome-page-10516_0.json,Filières d’excellence\nAgriculture bio\nClassé...
3,ladrome-page-10516.json,ladrome-page-10516_1.json,Filières d’excellence\nGastronomie\nL’excellen...
4,ladrome-page-10516.json,ladrome-page-10516_2.json,Filières d’excellence\nŒnologie\nLien entre la...


In [84]:
drome_df[["chunk_len_chars", "chunk_len_words"]] = drome_df["chunk"].apply(get_chunk_length)
drome_df["chunk_len_words"].describe()

count     383.000000
mean      133.963446
std       213.438010
min         1.000000
25%        56.000000
50%        90.000000
75%       152.500000
max      3103.000000
Name: chunk_len_words, dtype: float64

In [85]:
sorted_chunks = drome_df.sort_values(by="chunk_len_words")
sorted_chunks.tail(10)

Unnamed: 0,orig_filename,dest_file,chunk,chunk_len_chars,chunk_len_words
203,ladrome-page-19076.json,ladrome-page-19076_0.json,Missions\nLes prérogatives des CLI ont été déf...,3722,569
200,ladrome-page-17218.json,ladrome-page-17218_0.json,La carte Top Dép’Art\nPour commander la carte ...,4254,594
197,ladrome-page-13874.json,ladrome-page-13874_2.json,Conditions générales d’utilisation du formulai...,4371,640
196,ladrome-page-13874.json,ladrome-page-13874_1.json,Conditions générales d’utilisation du formulai...,5054,745
269,ladrome-page-547344.json,ladrome-page-547344_0.json,FAQ – futur collège Mercurol-Veaunes\nSectoris...,6047,795
270,ladrome-page-547344.json,ladrome-page-547344_1.json,FAQ – futur collège Mercurol-Veaunes\nTranspor...,7340,1134
336,ladrome-page-581681.json,ladrome-page-581681_5.json,Politique de cookies\n6. Cookies placés\nWordP...,9660,1151
271,ladrome-page-547344.json,ladrome-page-547344_2.json,FAQ – futur collège Mercurol-Veaunes\nEnseigne...,8063,1171
157,ladrome-page-11061.json,ladrome-page-11061_0.json,Archives des actes administratifs\nA compter d...,12679,1269
328,ladrome-page-574603.json,ladrome-page-574603_1.json,Presse\nAnnée 2023\nDécembre Fin des travaux d...,25886,3103


In [86]:
sorted_chunks.head(50)

Unnamed: 0,orig_filename,dest_file,chunk,chunk_len_chars,chunk_len_words
160,ladrome-page-11223.json,ladrome-page-11223_0.json,Agenda\n\n,8,1
62,ladrome-page-10773.json,ladrome-page-10773_0.json,Handicap\n\n,10,1
1,ladrome-page-10513.json,ladrome-page-10513_0.json,Les cantons\n\n,13,2
142,ladrome-page-10993.json,ladrome-page-10993_0.json,Le nucléaire\n\n,14,2
112,ladrome-page-10905.json,ladrome-page-10905_0.json,Vivre en établissement\n\n,24,3
102,ladrome-page-10866.json,ladrome-page-10866_0.json,Vivre à domicile\n\n,18,3
126,ladrome-page-10944.json,ladrome-page-10944_0.json,Le laboratoire départemental\n\n,30,3
224,ladrome-page-43852.json,ladrome-page-43852_0.json,Menu des collèges\n\n,19,3
63,ladrome-page-10775.json,ladrome-page-10775_0.json,Le handicap chez l’enfant\n\n,27,4
68,ladrome-page-10781.json,ladrome-page-10781_0.json,Scolarisation\nEn milieu ordinaire\n##\n,37,5


In [87]:
DESTINATION_FOLDER = "./data/chunked-pages-v1/"
MIN_CHUNK_SIZE = 19 # chunks with less words than that will we discarded

def save_chunk_as_json(row):
    to_dump = {
        "source_file": row["orig_filename"],
        "chunk_id": int(row["dest_file"].split("_")[-1].replace(".json", "")),
        "text": row["chunk"]
    }
    with open(os.path.join(DESTINATION_FOLDER, row["dest_file"]), "w") as f:
        json.dump(to_dump, f, ensure_ascii=False)

drome_df = drome_df[drome_df["chunk_len_words"] >= MIN_CHUNK_SIZE]
null = drome_df.apply(save_chunk_as_json, axis=1)

0      None
2      None
3      None
4      None
5      None
       ... 
378    None
379    None
380    None
381    None
382    None
Length: 356, dtype: object

In [195]:
import random
random_id = random.randint(0, len(drome_df))
print(f"---{drome_df.iloc[random_id]['dest_file']}---")
print(drome_df.iloc[random_id]["chunk"])

---ladrome-page-10708_0.json---
Les aides à l’investissement
Aide à l’immobilier d’entreprise, co-financée avec les groupements de communes (EPCI)
Ce dispositif voté par les groupements de communes drômoises vise à encourager l’installation des activités industrielles et de services à l’industrie dans la Drôme (acquisition /construction /extension de bâtiments). Il soutient plus particulièrement les projets qui créent de l’emploi durable sur notre territoire. Plaquette Aide à l’Immobilier d’Entreprise (AIE) #
Pour plus d'informations:
- Plaquette Aide à l’Immobilier d’Entreprise (AIE): https://www.ladrome.fr/wp-content/uploads/2020/03/plaquetteaie.pdf




In [147]:
with open(os.path.join(DESTINATION_FOLDER, "ladrome-page-10511_0.json")) as f:
    yolo = json.load(f)
print(yolo)