In [3]:
import os
import csv
from pathlib import Path
from PyPDF2 import PdfReader
import tiktoken  # Make sure to install this package: pip install tiktoken

# Load the tiktoken encoding registered under the name "cl100k_base" once at
# module level, so it is available as `tokenizer` to the code below.
tokenizer = tiktoken.get_encoding("cl100k_base")

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A list of ``(page_number, text)`` tuples with 1-based page numbers,
        in document order. If anything goes wrong while opening or reading
        the file, the error is printed and the pages collected up to that
        point (possibly none) are returned.
    """
    pages = []
    try:
        document = PdfReader(pdf_path)
        # Append page by page (rather than building the list in one go) so
        # that a failure midway still keeps the pages already extracted.
        for number, page in enumerate(document.pages, start=1):
            pages.append((number, page.extract_text()))
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return pages

def process_pdf_files(pdf_folder, output_csv_path):
    """Write the text of every non-empty page of every PDF in a folder to a CSV.

    Each non-empty page becomes one row ("chunk") with the columns
    ``source`` (PDF file name), ``page_number`` (1-based page in the PDF),
    ``chunk_number`` (1-based counter of emitted rows, restarted for each
    file) and ``text`` (page text with surrounding whitespace stripped).

    Args:
        pdf_folder: Directory searched (non-recursively) for ``*.pdf`` files.
        output_csv_path: Path of the CSV file to create or overwrite.

    Returns:
        None. The result is the CSV file plus progress messages on stdout.
    """
    fieldnames = ["source", "page_number", "chunk_number", "text"]
    chunk_data = []

    # glob() yields entries in arbitrary order; sort so the CSV row order is
    # reproducible between runs and machines.
    for pdf_path in sorted(Path(pdf_folder).glob("*.pdf")):
        print(f"Processing: {pdf_path.name}")

        chunk_counter = 0  # Restarted for each file
        for page_num, page_text in extract_text_from_pdf(pdf_path):
            stripped = page_text.strip() if page_text else ""
            if not stripped:
                continue  # skip pages with no extractable text

            # The page text is stored as-is. It is deliberately not run
            # through the tokenizer here: the encoded result was never used,
            # and encoding can raise on text containing special-token strings.
            chunk_counter += 1
            chunk_data.append({
                "source": pdf_path.name,
                "page_number": page_num,
                "chunk_number": chunk_counter,
                "text": stripped,
            })

    # newline="" lets the csv module control line endings itself.
    with open(output_csv_path, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(chunk_data)

    print(f"\n Output saved to: {output_csv_path}")

# Driver: convert every PDF in the source folder into one CSV of page texts.
# The output path is relative, so the CSV lands in the current working directory.
pdf_folder = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files"
output_csv = "pdf_chunks_text.csv"
process_pdf_files(pdf_folder=pdf_folder, output_csv_path=output_csv)

Processing: new-approaches-and-procedures-for-cancer-treatment.pdf
Processing: Ocean_ecogeochemistry_A_review.pdf
Processing: The-Alchemist.pdf
Processing: The_Plan_of_the_Giza_Pyramids.pdf

 Output saved to: pdf_chunks_text.csv
