In [1]:
from collections import Counter
import re
from typing import Dict, List, Tuple
import unicodedata
import fitz
from unidecode import unidecode

In [24]:
class WikitPDFParser:
    def __init__(
            self,
            filepath:str,
            chunking_method:str="auto",
            sort_by_reading_order:bool=False,
            top_bot_margin:float=.08,
            left_right_margin:float=.0,
            ):
        """A class used to parse pdf file intelligently

        Args:
            filepath (str): path to the file (must be pdf)
            sort_by_reading_order (bool, optional): if True, forces the sorting of all detected text boxes 
                from top to bottom and left to right. But may destroy structure of tables of graphics.
                Only use if the text order seems off. Defaults to False.
            top_bot_margin (float, optional): the top and bottom margins to remove, used to 
                remove headers and footers, as a float of page height percentage. Defaults to .08.
            left_right_margin (float, optional): the left and right margins to remove, as a percentage
                of page width. Defaults to .0.
        """
        if not filepath.endswith(".pdf"):
            raise ValueError("The provided file must be a PDF.")
        self.document = fitz.open(filepath)

        self.chunking_method = chunking_method
        self.doc_as_dict = self.build_doc_as_dict(sort_by_reading_order, top_bot_margin, left_right_margin)
        self.doc_as_lines = self.consolidate_lines()
        self.toc = self.infer_table_of_content()
        self.chunks = self.chunk_document()
        

    def build_doc_as_dict(
            self,
            sort_by_reading_order:bool=False,
            top_bot_margin:float=.08,
            left_right_margin:float=.0
            ) -> List[Dict]:
        """Extract every page of self.document with page.get_text("dict").

        Args:
            sort_by_reading_order (bool, optional): forwarded as the `sort`
                argument of get_text(). Defaults to False.
            top_bot_margin (float, optional): forwarded to get_page_scan_area().
                Defaults to .08.
            left_right_margin (float, optional): forwarded to get_page_scan_area().
                Defaults to .0.

        Returns:
            List[Dict]: one get_text("dict") result per page, in page order
        """
        pages_as_dicts = []
        for page in self.document.pages():
            # NOTE(review): the scan area is computed but never handed to
            # get_text() (the `clip=scan_area` argument is disabled), so the
            # margins currently have no effect on what is extracted.
            scan_area = WikitPDFParser.get_page_scan_area(page, top_bot_margin, left_right_margin)
            page_content = page.get_text("dict", sort=sort_by_reading_order)
            pages_as_dicts.append(page_content)

        return pages_as_dicts


    @staticmethod
    def get_page_scan_area(
        page:fitz.Page,
        top_bot_margin:float=.08,
        left_right_margin:float=.0
        ) -> fitz.Rect:
        """Shrink the bounding rectangle of a page by the given margins.
        Meant to leave headers and footers out of the scanned area.

        Args:
            page (fitz.Page): a page from the document
            top_bot_margin (float, optional): the amount of page to remove on y axis. Defaults to .08.
            left_right_margin (float, optional): the amount of page to remove on x axis. Defaults to .0

        Returns:
            fitz.Rect: the area of the page to take into account, with
                coordinates truncated to integers
        """
        bounds = page.bound()
        left, top, right, bottom = (bounds[i] for i in range(4))
        # The margins are a fraction of the right / bottom coordinate, which is
        # the page width / height only when the rectangle starts at (0, 0)
        x_margin = left_right_margin*right
        y_margin = top_bot_margin*bottom
        return fitz.Rect([
            int(left + x_margin),
            int(top + y_margin),
            int(right - x_margin),
            int(bottom - y_margin)
            ])
    

    def infer_table_of_content(self) -> List[Dict]:
        """Tries to find the table of content based on regex matches        

        Returns:
            List[Dict]: The potential titles of the table of content
                and their caracteristics
        """
        # Try to find a table of content (TOC)
        # A TOC is just a line whose text maching the regex "text .... number"
        potential_toc_titles = []
        for line in self.doc_as_lines:
            toc_regex_match = re.match(r"(.+?)(\s+)?[._-]{3,}(\s+)?(\d+)", line["text"])
            if toc_regex_match is not None:
                page_number = toc_regex_match[4]
                potential_toc_titles.append({
                    "title": toc_regex_match[1],
                    "page_id": int(page_number),
                    "fonts": line["fonts"],
                    "sizes": line["sizes"],
                    "x_offset": round(line["bbox"][0], 1)
                    })

        if len(potential_toc_titles) > 0:
            potential_toc_titles = WikitPDFParser._infer_table_of_content_schema(potential_toc_titles)
            print("Table of content found !")

        return potential_toc_titles


    @staticmethod
    def _infer_table_of_content_schema(potential_toc_titles:List[Dict]) -> List[Dict]:
        """Guess the level of each title, taking into account the position of the title
        We assume we go down in levels for each right tab

        Args:
            potential_toc_titles (List[Dict]): the table of content title infered from
            infer_table_of_content()

        Raises:
            NotImplementedError: _description_

        Returns:
            List[Dict]: _description_
        """
        # Get the different x_offsets
        x_pos_levels = sorted(set([title["x_offset"] for title in potential_toc_titles]))
        # If we have various x offsets, we can infer title level using them
        # So far, infering using regex is buggy so we only use offsets
        if len(x_pos_levels) > 1 or True:
            # save a mapping of xoffset -> level
            level2offset_mapping = {x:i for i, x in enumerate(x_pos_levels)}
            potential_toc_titles = [
                d|{"level": level2offset_mapping[d["x_offset"]]}
                for d in potential_toc_titles
                ]
        # Else use regex to detect numerotation and infer levels
        else:
            potential_toc_titles = [
                d|dict(zip(["level", "numerotation"], WikitPDFParser._infer_with_regex(d["title"])))
                for d in potential_toc_titles
                ]

        # Find the parents titles of each subtitle in multiple level TOC
        parents_buffer = [None] * (max([d["level"] for d in potential_toc_titles]) + 1)
        for title in potential_toc_titles:
            parents_buffer[title["level"]] = title["title"]
            parents = parents_buffer[:title["level"]]
            title["parents"] = [p for p in parents if p is not None]

        return potential_toc_titles
    

    @staticmethod
    def _infer_with_regex(text:str) -> Tuple[int, str]:
        """Infer the level of the title based on its numerotation 
        (such as 1.1)

        Args:
            text (str): the title text

        Returns:
            (int, str): the level (of the title), and its numerotation
        """
        regex = "(^[a-zA-Z0-9])[.)\s]?([a-zA-Z0-9])?[.)]?([a-zA-Z0-9])?[.)]?\s+?"
        matches = re.match(regex, text)
        level = matches.span()[1]-1 if matches is not None else 0
        numerotation = matches[0] if matches is not None else None

        return level, numerotation


    def show_page_elements(self, elem_to_show:str="blocks"):
        """Shows the coordinates of the blocks or the chunks
        on the pdf by drawing squares

        Args:
            elem_to_show (str): "blocks" or "chunks"
        Raises:
            ValueError: if page_numbers is not a list
        """
        match elem_to_show:
            case "blocks":
                for page_id, page_elems in enumerate(self.doc_as_dict):
                    page = self.document.load_page(page_id)
                    for block in page_elems["blocks"]:
                        rect = fitz.Rect(block["bbox"])
                        page.draw_rect(rect, color=(1,0,0), fill=(1,1,0), width=1, stroke_opacity=1, fill_opacity=.3)

            case "chunks":
                color_switch = True
                for chunk in self.chunks:
                    color_switch = not color_switch 
                    for page_id, box in zip(chunk["page_ids"],chunk["bboxes"]):
                        page = self.document.load_page(page_id-1)
                        rect = fitz.Rect(box)
                        page.draw_rect(rect, color=(1,0,0),
                                    fill=(int(color_switch),1,int(not color_switch)),
                                    width=1, stroke_opacity=1, fill_opacity=.3)
            case other:
                raise ValueError(f"elem_to_show argument must be 'blocks or 'chunks'. Got {other}")
        self.document.save("WikitPDFParser_Output.pdf")
        print("Document saved as:  WikitPDFParser_Output.pdf")

    
    def get_font_caracs(doc_as_dict:List[Dict]) -> List[Dict]:
        """Get the caracteristics of fonts used in the document
        output is of format [{'font': 'Calibri-Bold', 'size': 24.0, 'occurence': 1}, ...]

        Args:
            doc_as_dict (Dict): the document text as a dict. ouput from read_pdf_document

        Returns:
            Dict: list of dicts of font caracteristics
        """
        # Get the font and fontsize of each line of text
        font_caracs = []
        for page in doc_as_dict:
            for block in page["blocks"]:
                if block["type"] == 0: # text
                    for line in block["lines"]:
                        for span in line["spans"]:
                            if len(span["text"].split()) > 0: # if text is not empty
                                font_caracs.append({"font": span["font"], "size": span["size"]})
        # counts the unique dicts to get occurences of each font-fontsize 
        occurences = Counter(frozenset(d.items()) for d in font_caracs)
        # add the occurence to the dicts
        font_caracs = [dict(k,occurence=v) for k,v in occurences.items()]
        font_caracs = sorted(font_caracs, key=lambda x: x["size"], reverse=True)
        
        return font_caracs


    def map_font_to_doc_structure(font_caracs):

        # We assume that the body's font size is the font size of the most used font
        body_fontsize = max(font_caracs, key=lambda x: x["occurence"])["size"]
        # the fonts of the body are the fonts which have the size = bodysize
        body_fonts = set([d["font"] for d in font_caracs if d["size"] == body_fontsize])
        # We assume title font are the ones with size > body font
        titles_fonts = set([d["font"] for d in font_caracs if d["size"] > body_fontsize])
        # also save other small fonts
        other_fonts = set([d["font"] for d in font_caracs if d["size"] < body_fontsize])

        return {"body_fonts": body_fonts, "titles_fonts": titles_fonts, "other_fonts": other_fonts}
    

    def chunk_document(self) -> List[Dict]:
        """Chunks the document.
        If a TOC has been detected, it will try to do a chunk per title.
        If not, it will consider the fontsize : every line the font size increases, we consider it is a new part

        Raises:
            ValueError: if the chunk method specified is "titles" but no TOC was detected

        Returns:
            List[Dict]: the chunks
        """
        match self.chunking_method.lower():
            case "auto":
                # if a table of content has been found, split by title
                if self.toc:
                    merge_idxes = self.merge_lines_by_titles(self.doc_as_lines)
                    # if chunking didn't work because titles were not found in text
                    # then fallback to merging by fontsize
                    if len(set(merge_idxes)) == 1:
                        print("Table of content was found be chunking by title didn't work")
                        merge_idxes = WikitPDFParser.merge_lines_by_fontsize(self.doc_as_lines)
                else:
                    merge_idxes = WikitPDFParser.merge_lines_by_fontsize(self.doc_as_lines)
            case "fontsize":
                merge_idxes = WikitPDFParser.merge_lines_by_fontsize(self.doc_as_lines)
            case "titles":
                if self.toc:
                    merge_idxes = self.merge_lines_by_titles(self.doc_as_lines)
                else:
                    raise ValueError("No table of content detected. Can't perform chunking using the 'titles' method.")
        chunks = WikitPDFParser.merge_lines_together(self.doc_as_lines, merge_idxes)

        return chunks
   

    def consolidate_lines(self) -> List[Dict]:
        """Regroups the elements that belong to the same line by:
        - merging texts together
        - listing all fonts, sizes etc
        - build the bouning box of the line
        - cleaning lianes that are headers, or pages number
        Returns:
            List[Dict]: the consolidated lines
        """
        doc_as_lines = []
        for page_id, page in enumerate(self.doc_as_dict):
            for block in page["blocks"]:
                if block["type"] == 0:
                    for line in block["lines"]:
                        text = "".join([s["text"] for s in line["spans"]])
                        consolidated_line = {
                            "text": WikitPDFParser.cleanup_text(text),
                            "fonts": list(set([s["font"] for s in line["spans"]])),
                            "sizes": list(set([s["size"] for s in line["spans"]])),
                            "bbox": line["bbox"],
                            "page_id": page_id + 1
                        }
                        if len(consolidated_line["text"].split()) > 0\
                        and consolidated_line["sizes"][0] > 6:
                            doc_as_lines.append(consolidated_line)
        
        doc_as_lines = WikitPDFParser.remove_header_and_footer(doc_as_lines, self.document.page_count)
        doc_as_lines = WikitPDFParser.remove_page_tags(doc_as_lines)

        return doc_as_lines


    @staticmethod
    def remove_header_and_footer(lines, page_count:int):
        """
        Removes elements that appear mutliple times inthe the pages
        as they are likely to be a header or footer
        """
        if page_count >= 3 :
            # Count each occurences of the texts of each line
            cntr = {}
            for i, l in enumerate(lines):
                if l["text"] not in cntr:
                    cntr[l["text"]] = {"count":0, "ids":[]}
                cntr[l["text"]]["count"] += 1
                cntr[l["text"]]["ids"].append(i)
            # if occurence is more that n_pages // 2 + 1, remove those lines
            indexes2remove = [
                v["ids"] for v in cntr.values()
                if v["count"] >= page_count//2 + 1
                ]
            indexes2remove = [i for j in indexes2remove for i in j]
            lines = [l for i, l in enumerate(lines) if i not in indexes2remove]
        
        return lines
    

    @staticmethod
    def remove_page_tags(lines:List[Dict]) -> List[Dict]:
        """Removes a line if it starts with 'page'
        or if it is only a number as they are likely to be page numbers
        Args:
            lines (List[Dict]): _description_

        Returns:
            List[Dict]: the lines with page related stuff removed
        """
        return [l for l in lines 
                if not l["text"].isnumeric() 
                and not l["text"].lower().startswith("page")]


    def merge_lines_by_titles(self, lines:List[Dict]) -> List[int]:
        """Group lines based on titles. Lines in between 2 titles
        will be grouped

        A line is a title when its normalized text is equal to the
        normalized text of an entry of self.toc, and it sits on the page
        given by that entry or on the following one (for documents whose
        front page counts as page 0).

        Args:
            lines (List[Dict]): the lines, as returned by consolidate_lines()

        Returns:
            List[int]: a list of indexes telling to which group each line belongs
        """
        def normalize(text:str) -> str:
            # Same operations in the same order for the titles and the lines:
            # transliterating can output uppercase letters, so lowercase last
            return unidecode(text).lower()

        # normalized title -> pages on which a line with that text is a title
        title_pages = {}
        for toc_entry in self.toc:
            title_pages.setdefault(normalize(toc_entry["title"]), set()).update(
                (toc_entry["page_id"], toc_entry["page_id"] + 1)
            )

        merged_idxes = []
        merge_id = 0
        prev_line_was_title = False
        for line in lines:
            line_is_title = line["page_id"] in title_pages.get(normalize(line["text"]), ())
            # Consecutive title lines belong to the same group: only the first one opens it
            if line_is_title and not prev_line_was_title:
                merge_id += 1
            prev_line_was_title = line_is_title
            merged_idxes.append(merge_id)

        return merged_idxes


    @staticmethod
    def merge_lines_by_fontsize(lines:List[Dict]) -> List[int]:
        """Groupe lines using the font size. We assume that if the font size
        increases from a line to another, we must have hit a new title

        Args:
            lines (List[Dict]): the lines, as returned by the consolidate_lines()

        Returns:
            List[Dict]: a list of indexes telling to which group each line belongs
        """
        # first we build a list of idxes indicating to which group each block belongs to
        merge_idxes = []
        merge_id = 0
        last_size = 500 # initialize with large fontsize
        for line in lines:
            # if the block's fontsize is bigger than the previous block's fontsize, it is a new header
            # except if it is the title of a figure 
            if line["sizes"][0] > last_size\
            and not line["text"].lower().startswith("figure"):
                merge_id += 1
            # if the text is entirely in uppercase, it is likely to be a header
            #if block["text"].isupper():
            #    merge_id += 1
            merge_idxes.append(merge_id)
            last_size = line["sizes"][0]

        return merge_idxes


    @staticmethod
    def merge_lines_together(lines:List[Dict], merge_idxs:List[int]):
        """Uses a list of indexes to merge each group of blocks
        which have the same indexes

        Args:
            blocks (List[Dict]): the blocksas returned by consolidate_blocks()
            merge_idx (List[int]): a list of indexes telling to which group belongs each block

        Returns:
            (List[Dict]): The blocks merged
        """
        chunks = []
        for i in set(merge_idxs):
            lines_to_merge = [l for idx, l in zip(merge_idxs, lines) if idx == i]
            text = " ".join([b["text"] for b in lines_to_merge])
            new_chunk = {
                "text": text,
                "word_count": len(text.split()),
                "fonts": set([f for b in lines_to_merge for f in b["fonts"]]),
                "sizes": set([s for b in lines_to_merge for s in b["sizes"]]),
                "bboxes": WikitPDFParser._get_bbox(lines_to_merge),
                "page_ids": set([l["page_id"] for l in lines_to_merge]),
                }
            chunks.append(new_chunk)

        return chunks


    @staticmethod
    def _get_bbox(lines:List[Dict]):
        """From a list of lines, gets the bbox that contains them all.
        The box may be multipage

        Args:
            lines (List[List[float]]): List of lines, as returned by consolidate_lines()
        """
        bboxes = []
        page_ids = set([l["page_id"] for l in lines])
        for pid in page_ids:
            lines_of_that_page = [l for l in lines if l["page_id"] == pid]
            bboxes_of_that_page = [l["bbox"] for l in lines_of_that_page]

            x1 = min([box[0] for box in bboxes_of_that_page])
            y1 = min([box[1] for box in bboxes_of_that_page])
            x2 = max([box[2] for box in bboxes_of_that_page])
            y2 = max([box[3] for box in bboxes_of_that_page])

            bboxes.append((x1, y1, x2, y2))

        return bboxes


    @staticmethod
    def cleanup_text(text:str) -> str:
        """Just applies unidecode to the text and cleanup few things

        Args:
            text (str): the text to cleanup

        Returns:
            (str) : clean text
        """
        # remove invalid chars
        def is_invalid_char(char):
            return unicodedata.category(char) == 'Co'
        text = "".join([char for char in text if not is_invalid_char(char)])
        # remove double spaces and trailing spaces
        text = " ".join(list(filter(None, text.split(" "))))
            
        return text

In [62]:
%time
#filename = "./data/raw/Wikit_Charte Teletravail_01 08 2023.pdf"
# filename = "./data/raw/1701871800.43557-Guide GDF Utilisateur.pdf"
#filename = "./data/raw/1702463429.7771-Vade-mecum i-parapheur - v1.3.pdf"
#filename = "./data/raw/1701963097.697041-Prise en main et installation de Microsoft Teams.pdf"
#filename = "./data/raw/1702465761.691289-Organigramme_DSIN_04.01.2024.pdf"
#filename = "./data/raw/1701963094.115837-Client Printer Logic (1).pdf"
filename = "./data/raw/1701926796.668725-Guide_Manager CHRONOTIME.pdf"
#filename = "./data/raw/1701939792.419826-ENVIRONNEMENT COLLABORATEUR (02 11 23).pdf"
#filename = "./data/raw/FAH - Affecter un avoir à une facture.pdf"
#filename = "./data/raw/FAH DIVERS_Catégorie tarifaire.pdf"
#filename = "./data/raw/INSA_groupe_1A_2023.pdf"
#filename = "./data/raw/Reglement_interieur_de_la_collectivite_en_vigueur_au_23_01_2020.pdf"
#filename = "./data/raw/MSTeams_QuickStartGuide_FR.pdf"
pdf_parser = WikitPDFParser(filename)

CPU times: total: 0 ns
Wall time: 0 ns
Table of content found !


In [11]:
import fitz

# TODO : see if reading the pdf as an html wouldn't be a better start
# NOTE(review): absolute path of a local machine, to be replaced by a path
# relative to the project data folder
pdf_path = "C:/Users/mathi/Wikit/Data/Miscelaneous/raw_pdfs/Reglement_interieur_de_la_collectivite_en_vigueur_au_23_01_2020.pdf"

# `pdf` only ever holds the opened document, the path keeps its own name
pdf = fitz.open(pdf_path)

In [14]:
def build_doc_as_dict(pdf, page_index:int=5):
    """Extract one page of the document as html.

    Args:
        pdf: the opened document; its pages() method is iterated over
        page_index (int, optional): 0-based index of the page to extract.
            Defaults to 5.

    Returns:
        list: a list holding the html of the requested page, or an empty
            list when the document has no page at that index
    """
    for i, page in enumerate(pdf.pages()):
        if i == page_index:
            return [page.get_text("html")]

    # The document is too short: return an empty list rather than an implicit None
    return []

out = build_doc_as_dict(pdf)

if out:
    # Explicit encoding: the platform default one may not be able to encode
    # every character of the extracted html
    with open("out.html", "w", encoding="utf-8") as f:
        f.write(out[0])
else:
    print("No page was extracted, out.html was not written")

In [63]:
# Inspect the table of content entries inferred by the parser
pdf_parser.toc

[{'title': '1. Mon Equipe',
  'page_id': 3,
  'fonts': ['Verdana-Bold', 'Verdana'],
  'sizes': [11.029000282287598, 15.0],
  'x_offset': 59.5,
  'level': 0,
  'parents': []},
 {'title': 'Mes profils',
  'page_id': 4,
  'fonts': ['Verdana'],
  'sizes': [11.029000282287598, 12.0],
  'x_offset': 123.3,
  'level': 1,
  'parents': ['1. Mon Equipe']},
 {'title': 'Personnalisation d’une population',
  'page_id': 5,
  'fonts': ['Verdana'],
  'sizes': [11.029000282287598, 12.0],
  'x_offset': 123.3,
  'level': 1,
  'parents': ['1. Mon Equipe']},
 {'title': 'Personnalisation des briques d’accueil « Mon Equipe »',
  'page_id': 11,
  'fonts': ['Verdana'],
  'sizes': [11.029000282287598, 12.0],
  'x_offset': 123.3,
  'level': 1,
  'parents': ['1. Mon Equipe']},
 {'title': '2. Présentation et fonctionnalités',
  'page_id': 12,
  'fonts': ['Verdana-Bold', 'Verdana'],
  'sizes': [11.029000282287598, 15.0],
  'x_offset': 59.5,
  'level': 0,
  'parents': []},
 {'title': 'Brique « Demandes à valider »',


In [64]:
# Draw the box of each chunk on the pages and save the annotated pdf
pdf_parser.show_page_elements("chunks")

Document saved as:  WikitPDFParser_Output.pdf


In [65]:
# Inspect the chunks built by the parser
pdf_parser.chunks

[{'text': 'Chronotime PLAN Guide Manager CD06 2 / 44 Sommaire 1. Mon Equipe _______________________________ 3 1.1 1.2 Mes profils ___________________________________________ 4 1.3 Personnalisation d’une population _________________________ 5 1.4 Personnalisation des briques d’accueil « Mon Equipe » ________ 11 2. Présentation et fonctionnalités _______________ 12 2.1 Brique « Demandes à valider » __________________________ 12 2.1.1 Ecran Absences ________________________________________________ 14 2.1.2 Ecran Badgeages _______________________________________________ 16 2.1.3 Ecran Régularisations ____________________________________________ 18 2.2 Brique « Anomalies à lever » ____________________________ 18 2.3 Brique « Mes effectifs » ________________________________ 22 3. Planning ________________________________ 23 3.1 Présentation générale du planning ________________________ 23 3.1.1 Sélection de la population ________________________________________ 23 3.1.2 Présentation rapide _

In [57]:
# FIND TABLES
doc = fitz.open(filename)
page = doc[6]
table_finder = page.find_tables()  # detect the tables
df = None
if len(table_finder.tables) > 0:
    print("Table found !")
    tab = table_finder.tables[0]
    df = tab.to_pandas()
# Only the last top-level expression of a cell is rendered: an expression
# inside the `if` block above would not be displayed
df.head() if df is not None else None

Table found !
