### Analyse des données initiales

In [130]:
import json
import os
import re
from typing import Dict, List, Tuple
import pandas as pd

In [9]:
# Folder holding the Markdown export of the pages (presumably one JSON file per page — verify).
SOURCE_FOLDER = "./data/ladrome-pages-json-v1-Markdown/"
# os.listdir returns every entry of the folder, in arbitrary order.
source_files = os.listdir(SOURCE_FOLDER)

In [10]:
def read_json_file(filename):
    """Load one JSON document from ``SOURCE_FOLDER``.

    Args:
        filename: name of the file, relative to the ``SOURCE_FOLDER`` global.

    Returns:
        The parsed JSON content.
    """
    # Decode explicitly as UTF-8: without it, accented text is read with the
    # platform's locale encoding, which differs between machines.
    with open(os.path.join(SOURCE_FOLDER, filename), "r", encoding="utf-8") as f:
        return json.load(f)


def get_title_and_text(file_content):
    """Extract the title and the text of the first part of a document.

    Args:
        file_content: parsed JSON document; only ``file_content["hasPart"][0]`` is read.

    Returns:
        A two-element ``pd.Series``: the title, then the text.
    """
    first_part = file_content["hasPart"][0]
    title, text = first_part["title"], first_part["text"]
    return pd.Series([title, text])


def get_chunk_length(text):
    """Measure a text in characters and in whitespace-separated words.

    Args:
        text: the text to measure.

    Returns:
        A two-element ``pd.Series``: character count, then word count.
    """
    char_count = len(text)
    word_count = len(text.split())
    return pd.Series([char_count, word_count])

In [11]:
source_data = pd.DataFrame(source_files, columns=["filename"])

In [12]:
source_data["file_content"] = source_data["filename"].apply(read_json_file)

In [13]:
source_data[["title", "text"]] = source_data["file_content"].apply(get_title_and_text)

In [14]:
source_data[["chunk_len_chars", "chunk_len_words"]] = source_data["text"].apply(get_chunk_length)
source_data["chunk_len_words"].describe()

count     190.000000
mean      241.652632
std       370.707839
min         1.000000
25%        63.500000
50%       144.500000
75%       277.750000
max      3007.000000
Name: chunk_len_words, dtype: float64

Problems :
- Some chunks have a very low word count
- variation in chunk size is high
- some chunks are too big
- some chunks need cleaning (symbols and links everywhere)
- some chunks are just links toward other pages (ladrome-page-558941.json)

Ok stuff :
- The scraping and parsing of the webpages is actually good (no messy stuff)

In [15]:
sorted_chunks = source_data.sort_values(by="chunk_len_words")
sorted_chunks.head(5)

Unnamed: 0,filename,file_content,title,text,chunk_len_chars,chunk_len_words
95,ladrome-page-11223.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Agenda,Agenda\n\n,8,1
37,ladrome-page-10773.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Handicap,Handicap\n\n,10,1
1,ladrome-page-10513.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Les cantons,Les cantons\n\n,13,2
77,ladrome-page-10993.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Le nucléaire,Le nucléaire\n\n,14,2
134,ladrome-page-43852.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Menu des collèges,Menu des collèges\n\n,19,3


In [16]:
sorted_chunks.tail(5)

Unnamed: 0,filename,file_content,title,text,chunk_len_chars,chunk_len_words
171,ladrome-page-574603.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Presse,Presse\n\nVous êtes journaliste ? Cet espace v...,16516,1431
113,ladrome-page-13874.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Conditions générales d’utilisation du formulai...,Conditions générales d’utilisation du formulai...,10905,1585
174,ladrome-page-581681.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Politique de cookies,Politique de cookies\n\n_Cette politique de co...,17242,2054
137,ladrome-page-480456.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",Conditions générales d’utilisation de l’applic...,Conditions générales d’utilisation de l’applic...,17470,2519
148,ladrome-page-547344.json,"{'@context': 'https://wikit.ai', '@type': 'Doc...",FAQ – futur collège Mercurol-Veaunes,FAQ – futur collège Mercurol-Veaunes\n\nRetrou...,21333,3007


In [18]:
# Folder holding the HTML export of the pages (dated 2023-12-22 in its name).
HTML_FOLDER = "./data/ladrome-pages-2023-12-22-json-v1-HTML"
# Pages inspected so far; uncomment one of them to switch.
#FILE = "ladrome-page-480456.json"
#FILE = "ladrome-page-10516.json"
#FILE = "ladrome-page-567239.json"
#FILE = "ladrome-page-573222.json"
#FILE = "ladrome-page-555850.json"
FILE = "ladrome-page-581681.json"
# Load the selected page from the HTML export into the `file` global.
with open(os.path.join(HTML_FOLDER, FILE), "r") as f:
    file = json.load(f)

In [80]:
def compare_HTML_with_Markdown():
    """Print the text of ``FILE`` as stored in ``HTML_FOLDER``, then as stored in ``SOURCE_FOLDER``.

    Reads the ``HTML_FOLDER``, ``SOURCE_FOLDER`` and ``FILE`` globals; only the
    text of the first part (``hasPart[0]``) of each document is printed.
    """
    for folder in (HTML_FOLDER, SOURCE_FOLDER):
        # Explicit UTF-8 so accented text does not depend on the platform locale.
        with open(os.path.join(folder, FILE), "r", encoding="utf-8") as f:
            document = json.load(f)
        print(document["hasPart"][0]["text"])

compare_HTML_with_Markdown()

<h1>Politique de cookies</h1>


<!-- Legal document generated by Complianz | GDPR/CCPA Cookie Consent https://wordpress.org/plugins/complianz-gdpr -->
<div id="cmplz-document" class="cmplz-document cookie-statement cmplz-document-eu"><p><i>Cette politique de cookies a été mise à jour pour la dernière fois le 21/11/2023 et s’applique aux citoyens et aux résidents permanents légaux de l’Espace Économique Européen et de la Suisse.</i><br></p><h2>1. Introduction</h2><p>Notre site web, <a href="https://www.ladrome.fr">https://www.ladrome.fr</a> (ci-après : « le site web ») utilise des cookies et autres technologies liées (par simplification, toutes ces technologies sont désignées par le terme « cookies »). Des cookies sont également placés par des tierces parties que nous avons engagées. Dans le document ci-dessous, nous vous informons de l’utilisation des cookies sur notre site web.</p><h2>2. Que sont les cookies ?</h2><p>Un cookie est un petit fichier simple envoyé avec les pages de ce si

We see that the parsing of the HTML file removed the h2 titles (they are converted to "\n\n**here is the title text**"), which makes splitting difficult. Keeping a style like "## here is the h2 title" would be better.

## New chunking method based on titles

In [383]:
import json
import re
import os
from typing import Dict, List, Tuple, TypedDict
from markdownify import markdownify
import tiktoken


# Types
class ShortTitle(TypedDict, total=True):
    """A header found in the text, without its content or its relations to other headers."""

    id: int  # index given to the title by get_titles()
    text: str  # header text captured by the header regex
    level: int  # 0 for h1, 1 for h2, and so on
    start_position: int  # offset in the text where the header match starts
    end_position: int  # offset in the text where the header match ends


class Title(TypedDict, total=False):
    """A header found in the text, with its content and its relations to other headers.

    Every key is optional (total=False): get_titles() fills the first five,
    get_toc() adds "children", "parents" and "content".
    """

    id: int  # index given to the title by get_titles()
    text: str  # header text captured by the header regex
    level: int  # 0 for h1, 1 for h2, and so on
    start_position: int  # offset in the text where the header match starts
    end_position: int  # offset in the text where the header match ends
    content: str  # cleaned-up text located between this header and the next one
    parents: List[ShortTitle]  # enclosing headers, closest one first
    children: List[ShortTitle]  # every header located inside this header's section


Titles = List[Title]


class Chunk(TypedDict, total=True):
    """A chunk as built by HTMLChunkNorris.get_chunks() and split_big_chunk()."""

    # The declaration now mirrors the dicts actually produced: the id is a
    # string such as "<source file stem>-<index>.json" and the originating
    # file is stored under "source_file" (not "source_doc").
    id: str
    token_count: int  # number of tokens of `text` according to the tokenizer
    word_count: int  # number of whitespace-separated words of `text`
    source_file: str  # name of the file the chunk comes from
    text: str  # the chunk itself


Chunks = List[Chunk]


# Exceptions
class HTMLChunkNorrisException(Exception):
    """Error raised by HTMLChunkNorris, e.g. when a file cannot be read or a title lookup fails.

    Args:
        message: human-readable description of the problem.
    """

    def __init__(self, message):
        # Chain to Exception instead of silently discarding the argument, and
        # keep the message available as an attribute.
        super().__init__(message)
        self.message = message


class ChunkSizeExceeded(Exception):
    """Error raised when a chunk is bigger than the allowed number of tokens.

    Args:
        message: human-readable description of the problem.
    """

    def __init__(self, message):
        # Chain to Exception instead of silently discarding the argument, and
        # keep the message available as an attribute.
        super().__init__(message)
        self.message = message


# ChunkNorris
class HTMLChunkNorris:
    def __init__(self):
        """Create the chunker and load the tokenizer used to count tokens."""
        # "cl100k_base" is the name of the tiktoken encoding requested here.
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def __call__(self, filepath: str, **kwargs) -> Chunks:
        """Chunk one JSON file: read it, build its table of content, build the chunks.

        Args:
            filepath (str): path to the JSON file to chunk
            **kwargs: forwarded unchanged to get_toc() and get_chunks()

        Returns:
            Chunks: the chunks built from the file (the annotation used to say
                ``str``, which did not match the returned value)
        """
        text = HTMLChunkNorris.read_json_file(filepath)
        titles = self.get_toc(text, **kwargs)
        # chunk ids and "source_file" are derived from the file's base name
        return self.get_chunks(titles, os.path.basename(filepath), **kwargs)

    @property
    def regex_patterns(self):
        return {
            "h1": re.compile(r"(.+?)\n={3,}", re.MULTILINE),
            "h2": re.compile(r"(.+?)\n-{3,}", re.MULTILINE),
            "h3": re.compile(r"^#{3} (.+)\n", re.MULTILINE),
            "h4": re.compile(r"^#{4} (.+)\n", re.MULTILINE),
            "h5": re.compile(r"^#{5} (.+)\n", re.MULTILINE),
            "link": re.compile(r"\[(.+?)\]\((https?:.+?)\)"),
        }

    @staticmethod
    def read_json_file(filepath: str) -> str:
        """Reads a json file and applies markdownify to the text of its first part

        Args:
            filepath (str): path to a json file

        Raises:
            HTMLChunkNorrisException: if the file can't be read, parsed or
                converted; the original error is kept as ``__cause__``

        Returns:
            str: the text, markdownified
        """
        try:
            # explicit UTF-8 so accented text does not depend on the platform locale
            with open(filepath, "r", encoding="utf-8") as f:
                read_file = json.load(f)
            md_file = markdownify(
                read_file["hasPart"][0]["text"], strip=["figure", "img"], bullets="-*+"
            )
        except Exception as e:
            # chain the original exception so its traceback is not lost
            raise HTMLChunkNorrisException(f"Can't open JSON file : {e}") from e

        return md_file

    @staticmethod
    def check_string_argument_is_valid(
        argname: str, argvalue: str, allowed_values: List[str]
    ):
        """Checks that an argument has a valid value

        Args:
            argname (str): the name of the argument
            argvalue (str): the value of the argument
            allowed_values (List[str]): list of allowed values
        """
        assert argvalue in allowed_values, ValueError(
            f"Argument '{argname}' should be one of {allowed_values}. Got '{argvalue}'"
        )

    def get_toc(self, text: str, **kwargs) -> Titles:
        """Get the Table Of Content i.e the list
        of titles and their relation with each other

        Args:
            text (str): the text to get the toc from

        Returns:
            Titles: list of dicts describing the titles. For more info, look at Title class
        """
        titles = self.get_titles(text)
        for title in titles:
            title["children"] = HTMLChunkNorris.get_titles_children(title, titles)
            title["parents"] = HTMLChunkNorris.get_titles_parents(title, titles)
        titles = HTMLChunkNorris.get_titles_content(titles, text)

        return titles

    def get_titles(self, text: str, max_title_level_to_use: str = "h4") -> Titles:
        """Gets the titles (=headers h1, h2 ...) in the text
        using regex

        Args:
            text (str): the titles to look for titles in
            max_title_level_to_use (str, optional): the max level of headers to look for (included). Defaults to "h4".

        Returns:
            Titles: a list of dicts describing titles. For more info, look at Title class
        """
        HTMLChunkNorris.check_string_argument_is_valid(
            "max_title_level", max_title_level_to_use, ["h1", "h2", "h3", "h4", "h5"]
        )

        # Get the titles types to consider (h1, h2, ...)
        title_types_to_consider = [
            f"h{i}" for i in range(1, int(max_title_level_to_use[1]) + 1)
        ]
        titles = []
        for title_level, title_type in enumerate(title_types_to_consider):
            regex_pattern = self.regex_patterns[title_type]
            for match in re.finditer(regex_pattern, text):
                title_text = match.group(1)
                start_position = match.start()
                end_position = match.end()
                titles.append(
                    {
                        "text": title_text,
                        "level": title_level,
                        "start_position": start_position,
                        "end_position": end_position,
                    }
                )

        # add id to each title
        titles = [{"id": i} | title for i, title in enumerate(titles)]

        return titles

    @staticmethod
    def get_titles_content(titles: Titles, text: str) -> Titles:
        """Get the text content of each title, meaning
        the text that is between the title and the next title

        Args:
            titles (Titles): the titles found in the text
            text (str): the text from which titles were extracted

        Returns:
            Titles: the titles, with a "content" section added
        """
        # make sure titles are sorted by order of appearance
        titles = sorted(titles, key=lambda x: x["start_position"])
        for i, title in enumerate(titles):
            if i + 1 < len(titles):
                next_title = titles[i + 1]
                content = text[title["end_position"] : next_title["start_position"]]
            else:  # its the last title
                content = text[title["end_position"] :]
            title["content"] = HTMLChunkNorris.cleanup_text(content)

        return titles

    @staticmethod
    def cleanup_text(text: str) -> str:
        """Cleans up a text using various operations

        Args:
            text (str): the text to cleanup

        Returns:
            str: the cleanedup text
        """
        # remove special characters
        special_chars = ["**", "\xa0"]
        for char in special_chars:
            text = text.replace(char, "")
        # remove white spaces and newlines
        text = " ".join(text.split())
        # restore newline for bullet-point lists
        text = "\n- ".join(text.split("- "))

        return text

    @staticmethod
    def get_titles_children(title: Title, titles: Titles) -> List[ShortTitle]:
        """Gets the children of a title among titles,
        meaning the titles of its subsections.

        A child is:
         - after in terms of position (higher 'start_position')
         - lower in terms of level (higher 'level')

        Args:
            title (Title): The title for which we want the children
            titles (Titles): The titles found in the text

        Returns:
            Titles: A list of child titles
        """
        # in the children, put the titles without their "content", "children" or "parents" fields
        child_keys_to_keep = ["id", "text", "level", "start_position", "end_position"]
        title_of_next_section = HTMLChunkNorris.get_title_of_next_section(title, titles)
        if title_of_next_section is None:
            return [
                {k: v for k, v in t.items() if k in child_keys_to_keep}
                for t in titles
                if t["start_position"] > title["end_position"]
            ]
        else:
            return [
                {k: v for k, v in t.items() if k in child_keys_to_keep}
                for t in titles
                if t["start_position"] > title["end_position"]
                and t["end_position"] < title_of_next_section["start_position"]
            ]

    @staticmethod
    def get_titles_parents(title: Title, titles: Titles) -> List[ShortTitle]:
        """Gets the parents of the specified title

        Parents are titles of the section that the specified title
        belongs to.

        Args:
            title (Title): The title we want the parents of
            titles (Titles): The titles found in the text

        Returns:
            List[ShortTitle]: A list of parents
        """
        # in the parents, put the titles without their "content", "children" or "parents" fields
        parent_keys_to_keep = ["id", "text", "level", "start_position", "end_position"]
        parents = []
        # find the title's parent, and consecutive parents of parents
        direct_parent = title
        while direct_parent:
            direct_parent = HTMLChunkNorris.get_direct_parent_of_title(
                direct_parent, titles
            )
            if direct_parent is not None:
                direct_parent = {
                    k: v for k, v in direct_parent.items() if k in parent_keys_to_keep
                }
                parents.append(direct_parent)

        return parents

    @staticmethod
    def get_direct_parent_of_title(title: Title, titles: Titles) -> Title:
        """Considering a title, gets the direct parent of a title,
        meaning the title of the section this title belongs to.

        The direct parent is:
        - Higher in terms of level (lower level index)
        - has its position in text before the title considered

        Example : considering an h2, its direct parent would be
        the h1 before it.

        WARNING: May return None if no parent title is found

        Args:
            title (Title): the title to consider
            titles (Titles): the titles found in the text

        Returns:
            Title: the direct parent of the provided Title
        """
        direct_parent = [
            t
            for t in titles
            if t["level"] < title["level"]
            and t["end_position"] < title["start_position"]
        ]
        if not direct_parent:
            return None

        return max(direct_parent, key=lambda x: x["start_position"])

    @staticmethod
    def get_title_of_next_section(title: Title, titles: Titles) -> Title:
        """Considering a title, gets the title of the next section.
        The next section comes when we reach a title that as a
        equal or higher level than the provided title.

        Example: considering a h3, the next section's title would be when
        we encounter an other h3 or h2 or h1

        WARNING: May return None if no next section's title is found

        Args:
            title (Title): The title that we consider
            titles (Titles): List of titles of the document

        Returns:
            Title: The title of next section
        """
        same_level_titles = [
            t
            for t in titles
            if t["level"] <= title["level"]
            and t["start_position"] > title["end_position"]
        ]
        # if we have no next sections's title
        if not same_level_titles:
            return None

        return min(same_level_titles, key=lambda x: x["start_position"])

    @staticmethod
    def get_title_of_next_subsection(title: Title, titles: Titles) -> Title:
        """Considering a title, gets the title of the next subsection.
        The next subsection comes when we reach a title that as a
        lower level than the provided title.

        Example: considering a h3, the next section's title would be when
        we encounter an other h4 or h5

        WARNING: May return None if no next section's title is found

        Args:
            title (Title): The title that we consider
            titles (Titles): List of titles of the document

        Returns:
            Title: The title of next section
        """
        same_level_titles = [
            t
            for t in titles
            if t["level"] > title["level"]
            and t["start_position"] > title["end_position"]
        ]
        # if we have no next sections's title
        if not same_level_titles:
            return None

        return min(same_level_titles, key=lambda x: x["start_position"])

    @staticmethod
    def get_title_using_condition(
        titles: Titles, conditions: Dict, raise_errors: bool = True
    ) -> Title:
        """Get the titles corresponding to the conditions.
        The conditions must be a dict. The method check that the keys-values pairs of conditions
        exist in the title.

        Example of conditions : {"id": 23, "level":2} will match the title that has id=23 and level=2

        Args:
            titles (Titles): the titles extracted from the text
            conditions (Dict): dict of conditions, i.e key value paris that must exist in the title we look for
            raise_errors (bool, optional): raise error if matched title is not exaclty 1. Defaults to True.

        Raises:
            HTMLChunkNorrisException: If not title matches the conditions
            HTMLChunkNorrisException: If more than one title matches the conditions

        Returns:
            Title: The title that matches the conditions
        """
        conditions = list(conditions.items())
        title = [
            t
            for t in titles
            if all([condition in t.items() for condition in conditions])
        ]
        if raise_errors:
            if len(title) == 0:
                raise HTMLChunkNorrisException(
                    f"No title matched conditions : {conditions}"
                )
            elif len(title) > 1:
                raise HTMLChunkNorrisException(
                    f"More than 1 title matches conditions : {conditions} in\n{title}"
                )
        else:
            return title  # list of 0 to many titles

        return title[0]

    @staticmethod
    def get_direct_children_of_title(title: Title, titles: Titles) -> Titles:
        """Gets the directs children of a title, meaning
        the children without their children

        We assume that direct children have highest title level (closest to 0) among children

        Args:
            title (Title): the titles we want the children from

        Returns:
            Titles: the children titles
        """
        # if the title has no children, return empty list
        if not title["children"]:
            return []

        direct_children = [
            c
            for c in title["children"]
            if c["level"] == min(title["children"], key=lambda x: x["level"])["level"]
        ]

        return [
            HTMLChunkNorris.get_title_using_condition(titles, c)
            for c in direct_children
        ]

    def get_chunks(self, titles: Titles, source_filename: str, **kwargs) -> Chunks:
        """Builds the chunks based on the titles

        Args:
            titles (Titles): The titles, obtained from get_toc() method
            max_title_level_to_use (str, optional): The lowest level of titles to consider.
                Defaults to "h4".
            max_chunk_word_length (int, optional): The max size a chunk can be. Defaults to 250.
            hard_limit (bool, optional): if True, it will raise error if the chunk couldn't be chunked
                down to max_chunk_word_length. Defaults to False.

        Returns:
            Chunks: a list of Chunk
        """
        text_chunks = HTMLChunkNorris.get_chunks_text_content(titles, **kwargs)
        # Change position of links in the text
        text_chunks = [self.change_links_format(t, **kwargs) for t in text_chunks]
        # build list of chunks object
        chunks = []
        for i, text in enumerate(text_chunks):
            chunks.append(
                {
                    "id": f"{source_filename.replace('.json', '')}-{i}.json",
                    "token_count": len(self.tokenizer.encode(text)),
                    "word_count": len(text.split()),
                    "source_file": source_filename,
                    "text": text,
                }
            )
        # check that chunks don't exceed the hard token limit
        chunks = self.check_chunks(chunks, **kwargs)

        return chunks

    @staticmethod
    def get_chunks_text_content(
        titles: Titles,
        max_title_level_to_use: str = "h4",
        max_chunk_word_length: int = 250,
        **kwargs,
    ) -> List[str]:
        """Builds the chunks based on the titles obtained by the get_toc() method.

        It will split the text recursively using the titles. Here's what happens:
        - it takes a title and builds of chunk text (using title + content + content of children)
        - if the chunk obtained is too big and the title has children, it will subdivide it
        - otherwise the chunk is kept as is

        Note : titles that have a lower level that the one specified in max_title_level_to_use
        will not be considered for splitting

        Args:
            titles (Titles): The titles, obtained from get_toc() method
            max_title_level_to_use (str, optional): The lowest level of titles to consider.
                Defaults to "h4".
            max_chunk_word_length (int, optional): The max size a chunk can be. Defaults to 250.

        Returns:
            List[str]: the chunk's texts
        """
        HTMLChunkNorris.check_string_argument_is_valid(
            "max_title_level_to_use",
            max_title_level_to_use,
            ["h1", "h2", "h3", "h4", "h5"],
        )

        total_chunks = []
        total_used_ids = []
        # make sure titles are sorted by order of appearance
        titles = sorted(titles, key=lambda x: x["start_position"])
        for title in titles:
            if title["id"] not in total_used_ids:
                chunk, used_ids = HTMLChunkNorris.create_chunk(title, titles)
                # Note : we work on a lists to enable recursivity
                # if chunk is too big and but title can be subdivide (because they have children)
                if HTMLChunkNorris.chunk_is_too_big(
                    chunk, max_chunk_word_length
                ) and HTMLChunkNorris.title_has_children(title, max_title_level_to_use):
                    titles_to_subdivide = [title]
                    total_used_ids.append(title["id"])
                # if chunk is OK, or too big but can't be subdivided with children
                else:
                    titles_to_subdivide = []
                    total_chunks.append(chunk)
                    total_used_ids.extend(used_ids)
                # While we have chunks to subdivide, recursive subdivision
                while titles_to_subdivide:
                    new_titles_to_subdivide = []
                    for title2subdivide in titles_to_subdivide:
                        direct_children = HTMLChunkNorris.get_direct_children_of_title(
                            title2subdivide, titles
                        )
                        for child in direct_children:
                            chunk, used_ids = HTMLChunkNorris.create_chunk(
                                child, titles
                            )
                            if HTMLChunkNorris.chunk_is_too_big(
                                chunk, max_chunk_word_length
                            ) and HTMLChunkNorris.title_has_children(
                                child, max_title_level_to_use
                            ):
                                new_titles_to_subdivide.append(child)
                                total_used_ids.append(child["id"])
                            else:
                                total_chunks.append(chunk)
                                total_used_ids.extend(used_ids)

                    titles_to_subdivide = new_titles_to_subdivide

        return total_chunks

    @staticmethod
    def chunk_is_too_big(chunk: str, max_chunk_length: int) -> bool:
        """Returns True if the chunk is bigger than the value
        specified as max_chunk_length in terms of word count

        Args:
            chunk (str): a chunk of text
            max_chunk_length (int): the max size of the chunk in words

        Returns:
            bool: True if the chunk is too big, else false
        """
        return len(chunk.split()) > max_chunk_length

    @staticmethod
    def title_has_children(title: Title, max_title_level_to_use: str = "h4") -> bool:
        """Returns True if the title has children, ecluding title level
        that are lower than max_title_level_use

        Args:
            title (Title): the title to check for children
            max_title_level_to_use (str, optional): the max level of title to consider, included.
                Defaults to "h4".

        Returns:
            bool: True if the title has children
        """
        max_level = int(max_title_level_to_use[1]) - 1

        return bool([c for c in title["children"] if c["level"] <= max_level])

    @staticmethod
    def create_chunk(title: Title, titles: Titles) -> Tuple[str, List[int]]:
        """Creates a chunk, based on a title.
        A chunk is made from :
        - the title text
        - the content
        - the title text of its children
        - the content of its children
        - ... recursively for the children of children

        Args:
            title (Title): a title element
            titles (Titles): all the titles of the document

        Returns:
            Tuple[str, List[int]]: Returns two things :
                - the chunk (as a string)
                - the list of ids of titles used to build the chunk
        """
        chunk = ""
        # add title text of parents + their content
        if title["parents"]:
            parents = sorted(title["parents"], key=lambda x: x["level"])
            for parent in parents:
                parent = HTMLChunkNorris.get_title_using_condition(
                    titles, {"id": parent["id"]}
                )
                chunk += HTMLChunkNorris.create_title_text(parent)
        # add title + content of current title
        chunk += HTMLChunkNorris.create_title_text(title)
        used_titles_ids = [title["id"]]
        # add title + content of all children
        if title["children"]:
            for child in title["children"]:
                child = HTMLChunkNorris.get_title_using_condition(
                    titles, {"id": child["id"]}
                )
                chunk += HTMLChunkNorris.create_title_text(child)
                used_titles_ids.append(child["id"])

        return chunk, used_titles_ids

    @staticmethod
    def create_title_text(title: Title) -> str:
        """Generate the text of the title, using the title name and it's content

        Args:
            title (Title): a title element

        Returns:
            str: the text to put in the chunk for that title
        """
        # add # before title (markdown style)
        title_text = "#" * (title["level"] + 1) + " " + title["text"] + "\n"
        # add title content
        title_text += title["content"] + "\n" if title["content"] else ""

        return title_text

    def change_links_format(
        self, text, link_placement: str = "end_of_chunk", **kwargs
    ) -> str:
        """Removes the markdown format of the links in the text.
        The links are treated as specified by 'link_position':
        - None : links are removed
        - in_sentence : the link is placed in the sentence, between parenthesis
        - end_of_chunk : all links are added at the end of the text
        - end_of_sentence : each link is added at the end of the sentence it is found in

        Args:
            text (str): the text to find the links in
            link_placement (str, optional): How the links should be handled. Defaults to None.

        Raises:
            NotImplementedError: _description_

        Returns:
            str: the formated text
        """
        allowed_link_placements = [
            "remove",
            "end_of_chunk",
            "in_sentence",
            "end_of_sentence",
        ]
        HTMLChunkNorris.check_string_argument_is_valid(
            "link_placement", link_placement, allowed_link_placements
        )

        matches = re.finditer(self.regex_patterns["link"], text)
        if matches is not None:
            for i, m in enumerate(matches):
                match link_placement:
                    case "remove":
                        text = text.replace(m[0], m[1])
                    case "end_of_chunk":
                        if i == 0:
                            text += "Pour plus d'informations:\n"
                        text = text.replace(m[0], m[1])
                        text += f"- {m[1]}: {m[2]}\n"
                    case "in_sentence":
                        text = text.replace(
                            m[0], f"{m[1]} (pour plus d'informations : {m[2]})"
                        )
                    case "end_of_sentence":
                        link_end_position = m.span(2)[1]
                        # next_breakpoint = HTMLChunkNorris.find_end_of_sentence(text, link_end_position)
                        raise NotImplementedError()

        return text

    def check_chunks(
        self,
        chunks: Chunks,
        max_chunk_tokens: int = 8191,
        chunk_tokens_exceeded_handling: str = "raise_error",
        **kwargs,
    ):
        """Checks that the chunks do not exceed the token limit, considered as a hard limit
        If chunk_tokens_exceeded_handling is:
        - "raise_error" -> it will raise an error in case a chunk to big is found
        for it to be investigated.
        - "split" -> Chunks exceeding the max size will be split to fit max_chunk_tokens

        Args:
            chunks (Chunks): The chunks obtained from the get_chunks() method
            max_chunk_tokens (int, optional): the maximum size a chunk is allowed to be,
                in tokens. Defaults to 8191.
            chunk_tokens_exceeded_handling (bool, optional): whether or not error sould be raised if a big
                chunk is encountered, or split. Defaults to True.
        """
        HTMLChunkNorris.check_string_argument_is_valid(
            "chunk_tokens_exceeded_handling",
            chunk_tokens_exceeded_handling,
            ["raise_error", "split"],
        )

        splitted_chunks = []
        for chunk in chunks:
            if chunk["token_count"] < max_chunk_tokens:
                splitted_chunks.append(chunk)
            else:
                match chunk_tokens_exceeded_handling:
                    case "raise_error":
                        raise ChunkSizeExceeded(
                            (
                                f"Found chunk bigger than the specified token limit {max_chunk_tokens}:",
                                "You can disable this error and allow dummy splitting of this chunk by passing 'raise_error=False'",
                                f"The chunk : {chunk}",
                            )
                        )
                    case "split":
                        splitted_chunk = self.split_big_chunk(chunk, max_chunk_tokens)
                        splitted_chunks.extend(splitted_chunk)

        return splitted_chunks

    def split_big_chunk(
        self,
        chunk: Chunk,
        max_chunk_tokens: int = 8191,
    ) -> Chunks:
        """Splits the chunk so that the subchunk fit un max_chunk_size

        Args:
            chunk (Chunk): _description_
            max_chunk_tokens (int, optional): _description_. Defaults to 8191.

        Returns:
            _type_: _description_
        """
        # if chunk is smaller that specified limit, just return the chunk
        if chunk["token_count"] < max_chunk_tokens:
            return [chunk]

        split_count = (chunk["token_count"] // max_chunk_tokens) + 1
        split_token_size = chunk["token_count"] // split_count
        # split the chunk's text
        tokenized_text = self.tokenizer.encode(chunk["text"])
        splitted_text = [
            self.tokenizer.decode(
                tokenized_text[i * split_token_size : (i + 1) * split_token_size]
            )
            for i in range(split_count)
        ]
        # recreate subchunks from the initial chunk
        splitted_chunk = [
            {
                "id": f"{chunk['id'].replace('.json', '')}-{i}.json",
                "token_count": len(self.tokenizer.encode(sct)),
                "word_count": len(sct.split()),
                "source_file": chunk["source_file"],
                "text": sct,
            }
            for i, sct in enumerate(splitted_text)
        ]

        return splitted_chunk


In [384]:
# Instantiating the class loads the tiktoken encoding (see HTMLChunkNorris.__init__).
html_cn = HTMLChunkNorris()

# NOTE: this rebinds the SOURCE_FOLDER global, which pointed at the Markdown export
# at the top of the notebook and is read by the module-level read_json_file() helper.
SOURCE_FOLDER = "./data/ladrome-pages-2023-12-22-json-v1-HTML"
# Pages tried so far; uncomment one of them to switch.
# FILE = "ladrome-page-13830.json"
# FILE = "ladrome-page-10516.json"
# FILE = "ladrome-page-61504.json"
# FILE = "ladrome-page-504967.json"
# FILE = "ladrome-page-567239.json"
# FILE = "ladrome-page-13718.json"
# FILE = "ladrome-page-10516.json"
# FILE = "ladrome-page-547344.json" # big chunks
FILE = "ladrome-page-574603.json" # no title, just a list of links
# FILE = "ladrome-page-581681.json" # lots of small titles

# Step-by-step run, kept to inspect the intermediate results (markdown text, then titles).
file = HTMLChunkNorris.read_json_file(os.path.join(SOURCE_FOLDER, FILE))
# print(file)
titles = html_cn.get_toc(file)
# print(titles)
#chunks = html_cn.get_chunks(titles, FILE)
# print(chunks)
# Full pipeline in one call: the keyword arguments are forwarded to get_toc() and get_chunks().
chunks = html_cn(
    os.path.join(SOURCE_FOLDER, FILE),
    max_title_level_to_use="h3",
    max_chunk_word_length=200,
    link_placement="end_of_chunk",
    chunk_tokens_exceeded_handling="split"
    )
print("\n======================================\n")
# Print the text of every chunk for visual inspection.
for c in chunks:
    print(c["text"])



# Presse
Vous êtes journaliste ? Cet espace vous est réservé. Il vous permet de prendre connaissance des communiqués et dossiers adressés à la presse par le Département.
### **Contacter le service presse**
Yvan GUILHOT – Attaché de presse Tél. : 04.75.79.27.84 – 06.99.23.32.75 Email : [yguilhot@ladrome.fr](mailto:yguilhot@ladrome.fr)

# Presse
Vous êtes journaliste ? Cet espace vous est réservé. Il vous permet de prendre connaissance des communiqués et dossiers adressés à la presse par le Département.
### Année 2023
Décembre Fin des travaux d’aménagements cyclables sur le pont Mistral (pdf – 302 ko) Éducation : Inauguration des travaux de rénovation du collège Jean-Perrin Bâtiments : Inauguration des travaux de rénovation énergétique de l’Hôtel du Département (pdf – 461 ko) Éducation : Le Département soutient les sections sportives des collèges (pdf – 524 ko) Environnement : Le Département s’engage pour les sports de nature Budget primitif 2024 : 836,8 M€ pour les Drômois Enfance : A

In [385]:
html_cn = HTMLChunkNorris()
# os.listdir() returns entries in arbitrary order: sort them so the row order (and
# therefore the index) of the DataFrame is the same on every run, and ignore anything
# that is not a JSON page (hidden files, checkpoint folders, ...).
file_list = sorted(
    name for name in os.listdir(SOURCE_FOLDER) if name.endswith(".json")
)
# Flat list of chunk records, accumulated over all pages
df_constructor = []
for file in file_list:
    chunks = html_cn(
        os.path.join(SOURCE_FOLDER, file),
        max_title_level_to_use="h3",
        max_chunk_word_length=250,
        link_placement="end_of_chunk",
        chunk_tokens_exceeded_handling="split"
    )
    df_constructor.extend(chunks)

# Build the frame once from the full list of records (one row per chunk)
drome_df = pd.DataFrame(df_constructor)
drome_df.head()

Unnamed: 0,id,token_count,word_count,source_file,text
0,ladrome-page-10511-0.json,660,361,ladrome-page-10511.json,# Un peu d’histoire\nSituée à mi-chemin entre ...
1,ladrome-page-10513-0.json,5,3,ladrome-page-10513.json,# Les cantons\n
2,ladrome-page-10516-0.json,91,43,ladrome-page-10516.json,# Filières d’excellence\n### Agriculture bio\n...
3,ladrome-page-10516-1.json,164,62,ladrome-page-10516.json,# Filières d’excellence\n### Gastronomie\nL’ex...
4,ladrome-page-10516-2.json,108,47,ladrome-page-10516.json,# Filières d’excellence\n### Œnologie\nLien en...


In [386]:
# Summary statistics of the number of words per chunk produced by the HTML chunker
drome_df["word_count"].describe()

count     352.000000
mean      174.914773
std       194.311741
min         2.000000
25%        80.750000
50%       137.000000
75%       198.250000
max      1949.000000
Name: word_count, dtype: float64

In [387]:
# Order the chunks by ascending word count, then show the 10 longest ones
sorted_chunks = drome_df.sort_values(by="word_count")
sorted_chunks.tail(10)

Unnamed: 0,id,token_count,word_count,source_file,text
289,ladrome-page-566648-0.json,1263,607,ladrome-page-566648.json,"# Alimentation\n#### Mieux manger, c’est mange..."
78,ladrome-page-10791-0.json,1473,637,ladrome-page-10791.json,# Les aides de la Maison Départementale de l’A...
334,ladrome-page-582176-0.json,1410,768,ladrome-page-582176.json,"# Expressions politiques\nDans cette rubrique,..."
173,ladrome-page-13832-0.json,1874,791,ladrome-page-13832.json,# Consultations réglementaires des CLI\nLes CL...
262,ladrome-page-547344-0.json,2013,908,ladrome-page-547344.json,# FAQ – futur collège Mercurol-Veaunes\nRetrou...
300,ladrome-page-574603-1-1.json,4235,1182,ladrome-page-574603.json,d’une liaison cyclable entre Tain-l’Hermitage...
263,ladrome-page-547344-1.json,2323,1247,ladrome-page-547344.json,# FAQ – futur collège Mercurol-Veaunes\nRetrou...
137,ladrome-page-11061-0.json,4289,1270,ladrome-page-11061.json,# Archives des actes administratifs\nA compter...
264,ladrome-page-547344-2.json,2520,1284,ladrome-page-547344.json,# FAQ – futur collège Mercurol-Veaunes\nRetrou...
299,ladrome-page-574603-1-0.json,4235,1949,ladrome-page-574603.json,# Presse\nVous êtes journaliste ? Cet espace v...


In [388]:
# Show the 50 shortest chunks (sorted_chunks is in ascending word-count order)
sorted_chunks.head(50)

Unnamed: 0,id,token_count,word_count,source_file,text
70,ladrome-page-10773-0.json,4,2,ladrome-page-10773.json,# Handicap\n
1,ladrome-page-10513-0.json,5,3,ladrome-page-10513.json,# Les cantons\n
122,ladrome-page-10993-0.json,7,3,ladrome-page-10993.json,# Le nucléaire\n
219,ladrome-page-43852-0.json,7,4,ladrome-page-43852.json,# Menu des collèges\n
93,ladrome-page-10905-0.json,8,4,ladrome-page-10905.json,# Vivre en établissement\n
106,ladrome-page-10944-0.json,8,4,ladrome-page-10944.json,# Le laboratoire départemental\n
85,ladrome-page-10866-0.json,6,4,ladrome-page-10866.json,# Vivre à domicile\n
71,ladrome-page-10775-0.json,8,5,ladrome-page-10775.json,# Le handicap chez l’enfant\n
136,ladrome-page-11059-0.json,13,6,ladrome-page-11059.json,# Délibérations\n#### Articles non trouvés\n
131,ladrome-page-11047-0.json,16,6,ladrome-page-11047.json,# Actualités\n[archives type=monthly limit=20 ...


In [389]:
# Folder in which every kept chunk is written as its own JSON file
DESTINATION_FOLDER = "./data/chunked-pages-v1/"
MIN_CHUNK_SIZE = 15 # chunks with fewer words than this will be discarded

def save_record_as_json(record, destination_folder=None):
    """Write one chunk record as a UTF-8 JSON file named after its "id".

    Args:
        record (dict): JSON-serialisable mapping. Its "id" value is used as the
            file name inside the destination folder.
        destination_folder (str, optional): Folder to write into. When omitted,
            the module-level DESTINATION_FOLDER is used (looked up at call time).

    Raises:
        KeyError: if the record has no "id" key.
    """
    if destination_folder is None:
        destination_folder = DESTINATION_FOLDER
    # Create the folder on first use so that a fresh checkout does not fail here
    os.makedirs(destination_folder, exist_ok=True)
    # ensure_ascii=False writes accented characters as-is, so the encoding has to be
    # pinned: the platform default encoding is not UTF-8 everywhere.
    with open(
        os.path.join(destination_folder, record["id"]), "w", encoding="utf-8"
    ) as f:
        json.dump(record, f, ensure_ascii=False)

# Keep only the chunks that reach the minimum word count, then write each one to disk
has_enough_words = drome_df["word_count"] >= MIN_CHUNK_SIZE
drome_df = drome_df[has_enough_words]
for record in drome_df.to_dict(orient="records"):
    save_record_as_json(record)

In [390]:
import random

# Spot check of the export: print one randomly chosen saved chunk,
# its metadata first and then its text.
DESTINATION_FOLDER = "./data/chunked-pages-v1/"
# Only JSON files are candidates, so stray entries (hidden files, checkpoint
# folders, ...) can never be picked and then fail to open/parse.
FILES = [name for name in os.listdir(DESTINATION_FOLDER) if name.endswith(".json")]
RANDOM_FILE = random.choice(FILES)
print(f"---{RANDOM_FILE}---")
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on the
# platform default, which would garble or reject the accented characters elsewhere.
with open(os.path.join(DESTINATION_FOLDER, RANDOM_FILE), "r", encoding="utf-8") as f:
    tmp = json.load(f)
for k,v in tmp.items():
    if k != "text":
        print(f"{k} : {v}")
    else:
        print("Texte:")
        print(v)


---ladrome-page-573222-1.json---
id : ladrome-page-573222-1.json
token_count : 412
word_count : 185
source_file : ladrome-page-573222.json
Texte:
# Le cinéma d’animation dans la Drôme
### Le Fonds de Soutien aux Œuvres Animées
Afin de soutenir cette filière, le Département de la Drôme a co-créé en 2017 un Fonds de Soutien aux Œuvres Animées, qui dispose depuis 2023 d’une enveloppe de 380 000€ : 170 000€ du Département, 110 000€ de Valence Romans Agglo et 100 000€ du Centre National du Cinéma et de l’image animée (CNC). Il s’agit d’un dispositif d’aides sélectives aux œuvres cinématographiques et audiovisuelles utilisant des techniques d’animation (court métrage, série et spécial TV), basé sur des critères artistiques et culturels. 15 à 20 projets sont soutenus chaque année par le Département de la Drôme, aidant ainsi les studios drômois dans le développement ou la production de leurs projets. Au total, 83 courts métrages ou séries ont déjà bénéficié d’une subvention. 
- Palmarès du Fon