In [93]:
# !pip install -U -q chromadb PyMuPDF pandas sentence-transformers gradio openai prompts

In [94]:
import pandas as pd
import pymupdf
import chromadb
from sentence_transformers import SentenceTransformer
import os
from chromadb import Settings
import numpy as np
import re
import textwrap

In [95]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts the text of a PDF file together with a title derived from its file name.

    Parameters:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - tuple: (text, title). `text` is the text of every page concatenated in
      page order, with non-breaking spaces replaced by regular spaces. `title`
      is the file's base name with a trailing '.pdf' extension removed.
      If anything goes wrong, the error is printed and (None, None) is
      returned, so callers can always unpack the result into two names.
    """
    try:
        with pymupdf.open(pdf_path) as doc:
            print(f"Extracting text from: {pdf_path}")
            # Build the text in one join instead of growing a string page by page.
            text = "".join(
                page.get_text().replace('\xa0', ' ')  # Replace non-breaking spaces
                for page in doc
            )
            title = os.path.basename(pdf_path)
            # Strip only a trailing extension; a '.pdf' in the middle of the
            # name is part of the title and must be kept.
            if title.lower().endswith('.pdf'):
                title = title[:-len('.pdf')]
            print(f"Title: {title}")
        return text, title
    except Exception as e:
        # Best-effort extraction: report the problem and signal failure with a
        # pair so that `text, title = extract_text_from_pdf(...)` never raises.
        print(f"Error extracting text from {pdf_path}: {e}")
        return None, None
    

def get_text(df, livre, paragraph):
    """
    Looks up the text stored for one paragraph of one livre.

    Parameters:
    - df (pandas.DataFrame): Table with 'livre', 'paragraph' and 'text' columns.
    - livre (int): The livre number to look for.
    - paragraph (int): The paragraph number to look for within that livre.

    Returns:
    - str: The text of the first row matching both numbers.

    Raises:
    - IndexError: If no row matches the requested livre and paragraph.
    """
    is_match = (df.livre == livre) & (df.paragraph == paragraph)
    matching_texts = df.loc[is_match, 'text']
    # Only the first match is returned, even if several rows qualify.
    return matching_texts.values[0]

In [96]:
# Load every PDF found in the 'documents' folder into `texts`, keyed by title.
# os.listdir returns entries in arbitrary order, so sort for reproducible runs.
files = sorted(os.listdir('documents'))
texts = {}
for file in files:
    if not file.endswith('.pdf'):
        continue
    result = extract_text_from_pdf(os.path.join('documents', file))
    # extract_text_from_pdf prints the error itself when extraction fails;
    # skip the file here instead of crashing on tuple unpacking.
    if result is None or result[0] is None:
        continue
    text, title = result
    texts[title] = text

Extracting text from: documents/Pensees_moi_meme.pdf
Title: Pensees_moi_meme


In [97]:
# First cleaning : remove [number] references

def remove_references(text):
    """
    Strips bracketed numeric reference markers such as [1] or [23].

    Only brackets that contain one or more digits and nothing else are
    removed; everything else in the text is left untouched.

    Parameters:
    - text (str): The text to clean.

    Returns:
    - str: The text with every bracketed number deleted.
    """
    reference_pattern = re.compile(r'\[\d+\]')
    return reference_pattern.sub('', text)

def display_text(txt, width=150):
    """
    Prints a text re-wrapped so that no line exceeds the given width.

    Parameters:
    - txt (str): The text to print.
    - width (int): Maximum line length, 150 by default.
    """
    print(textwrap.fill(txt, width=width))

# Raw text of the book. `txt` was not defined by any earlier cell, so take it
# from `texts` using the title printed during extraction.
txt = texts['Pensees_moi_meme']
cleaned_text = remove_references(txt)

# Extract the avant-propos, as it's written by a different author than Marcus Aurelius
first_livre_match = re.search(r'LIVRE PREMIER', cleaned_text)
if first_livre_match is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise ValueError("'LIVRE PREMIER' heading not found; cannot separate the avant-propos")
end_avant_propos_idx = first_livre_match.start()
avant_propos = cleaned_text[:end_avant_propos_idx]
cleaned_text = cleaned_text[end_avant_propos_idx:]

# Remove page numbers: over the whole text, for each expected page number from 11 to 382 (in increasing order), remove the first occurrence of that number standing alone on a line

def remove_page_numbers(text, start_page=11, end_page=382):
    """
    Deletes page numbers that stand alone on their own line.

    For each number from start_page to end_page (inclusive, ascending), only
    the first occurrence of that number between two newlines is removed; the
    two surrounding newlines collapse into one.

    Parameters:
    - text (str): The text to clean.
    - start_page (int): First page number to remove.
    - end_page (int): Last page number to remove.

    Returns:
    - str: The text without those page-number lines.
    """
    for number in range(start_page, end_page + 1):
        page_marker = re.compile(r'\n{}\n'.format(number))
        text = page_marker.sub('\n', text, count=1)
    return text

In [98]:
# Strip the stand-alone page numbers 11 through 382 from the book text.
cleaned_text = remove_page_numbers(cleaned_text, start_page=11, end_page=382)

In [101]:
# Chunking

# First split: one chunk per book. The text is cut at every "LIVRE <CAPITALS>"
# heading; whatever precedes the first heading is dropped by the [1:] slice.
livres = re.compile(r'LIVRE [A-Z]+').split(cleaned_text)[1:]

# New split : by paragraph (I, II etc...)
def get_paragraphs(text):
    """
    Breaks a text into paragraphs at its heading lines.

    A heading is any line consisting solely of capital letters A-Z (intended
    for Roman numerals such as I, II, ...). Whatever comes before the first
    heading is discarded.

    Parameters:
    - text (str): The text to split.

    Returns:
    - list: The text found after each heading, in order (empty if the text
      contains no heading).
    """
    heading_pattern = re.compile(r'\n[A-Z]+\n')
    pieces = heading_pattern.split(text)
    return pieces[1:]

def get_paragraphs_split_footers(livres):
    """
    Splits each book into paragraphs and separates the footnotes that trail
    its last paragraph.

    The footnote section is taken to start at the first "1. " marker (the
    digit 1, a literal dot and a space) found in the last paragraph of a book.
    The marker itself is dropped.

    Parameters:
    - livres (list): Text of each book, in order.

    Returns:
    - tuple: (paragraphs, footers), two dicts keyed by the 1-based book
      number. paragraphs[i] is the list of paragraphs of book i with the
      footnotes cut off the last one; footers[i] is the footnote text.
      Returns None (after printing a hint) as soon as a book has no
      paragraphs or no footnote marker.
    """
    footers = {}
    paragraphs = {}
    for i, livre_text in enumerate(livres, start=1):
        current_paragraphs = get_paragraphs(livre_text)
        # The dot is escaped on purpose: an unescaped '.' matches any
        # character, so a number such as "10 " would be mistaken for the
        # footnote marker. A book without any paragraph has nothing to split.
        splits = (
            re.split(r'\b1\. ', current_paragraphs[-1], maxsplit=1)
            if current_paragraphs
            else []
        )
        if len(splits) < 2:
            print("No footer found for one of the books.")
            print('Retry after better cleaning?')
            return None
        current_paragraphs[-1] = splits[0]
        footers[i] = splits[1]
        paragraphs[i] = current_paragraphs
    return paragraphs, footers

# Create dataframe to store paragraph, with column linked to livre and paragraph number



split_result = get_paragraphs_split_footers(livres)
if split_result is None:
    # The splitter returns None (after printing why) when a book cannot be
    # split; stop here with a clear error rather than an unpacking TypeError.
    raise ValueError("Could not split the books into paragraphs and footers")
paragraphs, footers = split_result

# Also remove capitalization ?
# paragraphs = {k: [para.lower() for para in v] for k, v in paragraphs.items()}

# Remove \n within paragraphs
paragraphs = {k: [para.replace('\n', ' ') for para in v] for k, v in paragraphs.items()}

# One row per paragraph. Collect all rows first and build the frame once:
# concatenating inside the loop copies the whole frame on every iteration.
records = [
    {'livre': livre_num, 'paragraph': para_num, 'text': para_text}
    for livre_num, paras in paragraphs.items()
    for para_num, para_text in enumerate(paras, start=1)
]
df = pd.DataFrame(records, columns=['livre', 'paragraph', 'text'])

In [102]:
# Save df

# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs('cleaned', exist_ok=True)
df.to_csv('cleaned/marcus_aurelius_paragraphs.csv', index=False)