In [149]:
from langchain.document_loaders import UnstructuredFileLoader
from typing import List
from statistics import mean

import pandas as pd

In [159]:
# PDF document to parse, relative to the notebook's working directory.
file = "./data/raw/1701963097.697041-Prise en main et installation de Microsoft Teams.pdf"

# mode="elements" is requested because the following cells rely on the
# per-element metadata (e.g. "category" and "coordinates").
loader = UnstructuredFileLoader(file, mode="elements")
loaded_file = loader.load()

In [155]:
# Show the distinct element categories found in the loaded document.
print(set(element.metadata["category"] for element in loaded_file))

{'ListItem', 'Title', 'UncategorizedText', 'NarrativeText'}


In [160]:
# Discard uncategorized text: it is often useless content such as headers and footers.
loaded_file = [
    element
    for element in loaded_file
    if element.metadata["category"] != "UncategorizedText"
]
# Text that appears more than once is likely a repeated header or footer, so
# every occurrence is dropped (keep=False), not just the extra copies.
idx_of_non_duplicates = (
    pd.DataFrame([element.page_content for element in loaded_file])
    .drop_duplicates(keep=False)
    .index
)
# The frame has a default integer index, so the surviving labels are also the
# positions of the elements to keep, in their original order.
loaded_file = [loaded_file[position] for position in idx_of_non_duplicates]
# Remove headers and footers using thresholding
def is_not_header_or_footer(
    element_points: List[List[float]],
    layout_height: float,
    top_threshold: float = .08,
    bottom_threshold: float = .92
    ) -> bool:
    """Tell whether an element lies in the body of the page rather than in its margins.

    The vertical position of the element is the mean of the y coordinates of
    its bounding-box points. The element is kept when that position lies within
    [top_threshold * layout_height, bottom_threshold * layout_height], bounds
    included. Otherwise the element might be a header or a footer.

    NOTE(review): this assumes that y grows downward from the top of the page,
    so that small y values are close to the top edge — confirm against the
    coordinate system reported in the element metadata.

    Args:
        element_points (List[List[float]]): The points of the box location of the text.
            Must be a non-empty sequence of (x, y) coordinates; only y (index 1) is used.
        layout_height (float): The height of the page, expected in the same unit as the points.
        top_threshold (float, optional): Elements whose y position is smaller than
            top_threshold * layout_height are considered headers. It is relative to
            the page height (so must be between 0 and 1). Defaults to .08.
        bottom_threshold (float, optional): Elements whose y position is greater than
            bottom_threshold * layout_height are considered footers. It is relative to
            the page height (so must be between 0 and 1). Defaults to .92.

    Returns:
        bool: True if the element is inside the kept band, False if it might be
        a header or a footer.

    Raises:
        statistics.StatisticsError: If element_points is empty.
    """
    # Average the y coordinate of every corner to get a single vertical position.
    y_pos = mean(point[1] for point in element_points)
    upper_limit = top_threshold * layout_height
    lower_limit = bottom_threshold * layout_height
    return upper_limit <= y_pos <= lower_limit


# Keep only the elements whose vertical position is outside the header and
# footer bands (default thresholds of is_not_header_or_footer).
loaded_file = [
    element
    for element in loaded_file
    if is_not_header_or_footer(
        element.metadata["coordinates"]["points"],
        element.metadata["coordinates"]["layout_height"],
    )
]

# Group elements using the following rules:
## - consecutive titles stay in the same group as long as no paragraph has been seen
## - paragraphs are merged with the titles above them
## - a title that follows a paragraph starts a new group
# Every element that is not a "Title" is treated as a paragraph.
# The group index is stored in each element's metadata under "merging_group",
# and `chunks` collects the text of each group, in document order.
title_goup_idx = 0
prev_was_paragraph = False
chunks = []
for element in loaded_file:
    is_title = element.metadata["category"] == "Title"
    if is_title and prev_was_paragraph:
        title_goup_idx += 1
    element.metadata["merging_group"] = title_goup_idx
    prev_was_paragraph = not is_title
    # The group index grows by at most one per element, so a new group is
    # always the next chunk to create. Chunks are filled in this same pass
    # instead of rescanning every element once per group; an empty document
    # therefore yields no chunk at all rather than a single empty one.
    if title_goup_idx == len(chunks):
        chunks.append([])
    chunks[title_goup_idx].append(element.page_content)

In [161]:
# Inspect the grouped chunks: one list of text strings per merging group.
chunks

[['Pôle QVT-DH 04 mai 2020',
  'Table des matières',
  'Accès en mode Web à la plateforme Teams du Conseil Départemental ........................ 4'],
 ['Planifier une réunion en vidéo-conférence pour une équipe, un canal .......................... 16',
  'Planifier une vidéo-conférence avec des correspondants isolés .................................... 16',
  'Pour démarrer une visio-conférence entre tous les membres d’un canal ........................... 20',
  'Annexe : installer l’application Teams sur un ordinateur du CD 29 ................................ 24',
  'Document en cours d’élaboration. Si besoin appeler le 4857'],
 ['Accéder à Teams',
  "Microsoft Teams est une plateforme collaborative pour le travail d'équipe autour d’Office 365. Vos conversations, fichiers, réunions et applications sont rassemblés dans un seul espace de travail partagé accessible depuis un navigateur (depuis un pc, une tablette, un smartphone)",
  'Il permet la vidéoconférence, la messagerie instantan

In [162]:
# Inspect the remaining elements and their metadata, including the
# "merging_group" index assigned during grouping.
loaded_file

[Document(page_content='Pôle QVT-DH 04 mai 2020', metadata={'source': './data/raw/1701963097.697041-Prise en main et installation de Microsoft Teams.pdf', 'coordinates': {'points': ((344.7, 748.175), (344.7, 764.175), (523.596, 764.175), (523.596, 748.175)), 'system': 'PixelSpace', 'layout_width': 595.4, 'layout_height': 841.8}, 'file_directory': './data/raw', 'filename': '1701963097.697041-Prise en main et installation de Microsoft Teams.pdf', 'languages': ['eng'], 'last_modified': '2023-12-14T16:43:24', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'Title', 'merging_group': 0}),
 Document(page_content='Table des matières', metadata={'source': './data/raw/1701963097.697041-Prise en main et installation de Microsoft Teams.pdf', 'coordinates': {'points': ((70.82499999999993, 74.02999999999997), (70.82499999999993, 90.02999999999997), (195.86600799999994, 90.02999999999997), (195.86600799999994, 74.02999999999997)), 'system': 'PixelSpace', 'layout_width': 595.4, 'layout_he