In [1]:
!pip install PyPDF2 pdfplumber pdf2image pycryptodome



In [2]:
import os
import json
from PyPDF2 import PdfReader
import pdfplumber
from pdf2image import convert_from_path

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path to the PDF file; passed straight to ``PdfReader``.

    Returns:
        str: The text of each page that yielded any, in page order, with a
        newline between consecutive pages. Pages whose ``extract_text()``
        returns ``None`` or an empty string are skipped, so the result is
        ``""`` when no page has extractable text.
    """
    reader = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; str.join also avoids repeated string copying.
    return "\n".join(chunk for chunk in page_texts if chunk)

In [4]:
# Collect every table pdfplumber finds in a PDF, tagged with its page number
def extract_tables_from_pdf(pdf_path):
    """List the tables found on each page of a PDF.

    Args:
        pdf_path: Path to the PDF file; passed to ``pdfplumber.open``.

    Returns:
        list[dict]: One ``{"page": <1-based page number>, "table": <table>}``
        entry per table, in page order. ``<table>`` is whatever
        ``page.extract_tables()`` yields for that table, stored unmodified.
        Pages without tables contribute nothing.
    """
    with pdfplumber.open(pdf_path) as document:
        return [
            {"page": number, "table": rows}
            for number, sheet in enumerate(document.pages, start=1)
            for rows in sheet.extract_tables()
        ]

In [5]:
def extract_images_from_pdf(pdf_path, output_directory, batch_size=10):
    """Render every page of a PDF to an image file and describe the files written.

    Despite the name, this rasterises whole pages with ``convert_from_path``;
    it does not pull embedded images out of the document.

    Pages are converted ``batch_size`` at a time rather than all at once, so
    at most one batch of rendered pages is held in memory regardless of how
    long the PDF is.

    Args:
        pdf_path: Path to the PDF file.
        output_directory: Root output folder. Images are written to a
            sub-folder named after the PDF (file name without extension),
            created if missing.
        batch_size: Maximum number of pages rendered per ``convert_from_path``
            call. Must be at least 1.

    Returns:
        list[dict]: One ``{"page": <1-based page number>, "image_file": <path>}``
        entry per page, in page order, where ``<path>`` is the saved file's
        path relative to ``output_directory``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    images = []
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    image_dir = os.path.join(output_directory, pdf_stem)
    os.makedirs(image_dir, exist_ok=True)

    first_page = 1
    while True:
        # pdf2image clamps last_page to the document's page count and returns
        # an empty list once first_page is past the end, so a short (or empty)
        # batch marks the end of the document.
        batch = convert_from_path(
            pdf_path,
            first_page=first_page,
            last_page=first_page + batch_size - 1,
        )
        for page_num, page in enumerate(batch, start=first_page):
            image_filename = f"page_{page_num}.jpg"  # Adjust extension as needed
            image_path = os.path.join(image_dir, image_filename)
            page.save(image_path)
            images.append({
                "page": page_num,
                "image_file": os.path.relpath(image_path, output_directory)
            })
        if len(batch) < batch_size:
            break
        first_page += batch_size
    return images

In [6]:
def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON, keeping non-ASCII characters as-is."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)


def main(pdf_directory="manuals", output_directory="extracted_content"):
    """Extract text, tables and page images from every PDF in a directory.

    For each PDF ``<name>.pdf`` found directly inside ``pdf_directory`` this
    writes ``<name>_text.json``, ``<name>_tables.json`` and
    ``<name>_images.json`` into ``output_directory`` (created if missing) and
    prints one progress line per file written. Page images themselves are
    written by ``extract_images_from_pdf``.

    Args:
        pdf_directory: Folder scanned (non-recursively) for PDF files.
        output_directory: Folder that receives the JSON files and images.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Sorted so the processing order (and the log) is the same on every run;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(pdf_directory)):
        pdf_path = os.path.join(pdf_directory, filename)
        # Match the extension case-insensitively (".PDF" is common) and skip
        # directories that merely happen to be named like a PDF.
        if not (filename.lower().endswith(".pdf") and os.path.isfile(pdf_path)):
            continue

        stem = os.path.splitext(filename)[0]

        # Extract and save text
        text = extract_text_from_pdf(pdf_path)
        text_output_file = os.path.join(output_directory, f"{stem}_text.json")
        _write_json(text_output_file, {"filename": filename, "text": text})
        print(f"Text extracted from {pdf_path} and saved to {text_output_file}")

        # Extract and save tables
        tables = extract_tables_from_pdf(pdf_path)
        tables_output_file = os.path.join(output_directory, f"{stem}_tables.json")
        _write_json(tables_output_file, {"filename": filename, "tables": tables})
        print(f"Tables extracted from {pdf_path} and saved to {tables_output_file}")

        # Extract and save images
        images = extract_images_from_pdf(pdf_path, output_directory)
        images_output_file = os.path.join(output_directory, f"{stem}_images.json")
        _write_json(images_output_file, {"filename": filename, "images": images})
        print(f"Images extracted from {pdf_path} and saved to {images_output_file}")

# Run the extraction only when this code is executed as the top-level module.
if __name__ == "__main__":
    main()

Text extracted from manuals/nexon.pdf and saved to extracted_content/nexon_text.json
Tables extracted from manuals/nexon.pdf and saved to extracted_content/nexon_tables.json
Images extracted from manuals/nexon.pdf and saved to extracted_content/nexon_images.json
Text extracted from manuals/Verna.pdf and saved to extracted_content/Verna_text.json
Tables extracted from manuals/Verna.pdf and saved to extracted_content/Verna_tables.json
Images extracted from manuals/Verna.pdf and saved to extracted_content/Verna_images.json
Text extracted from manuals/exter.pdf and saved to extracted_content/exter_text.json
Tables extracted from manuals/exter.pdf and saved to extracted_content/exter_tables.json
Images extracted from manuals/exter.pdf and saved to extracted_content/exter_images.json
Text extracted from manuals/punch.pdf and saved to extracted_content/punch_text.json
Tables extracted from manuals/punch.pdf and saved to extracted_content/punch_tables.json
Images extracted from manuals/punch.p