In [None]:
#Step1: Check the PDF Document Type
# Image-based PDFs: Use Qwen-VL to extract text and visual data. 
# Text-based PDFs: Use fitz (PyMuPDF) or similar libraries to extract text directly.

import os
import fitz  # PyMuPDF
from pathlib import Path

def check_pdf_type_and_extract(pdf_path, output_dir="pdf_images"):
    """
    Check the type of PDF (text-based or image-based) and extract data accordingly.
    
    Args:
        pdf_path (str): Path to the input PDF file.
        output_dir (str): Directory to save images if the PDF is image-based.
    
    Returns:
        dict: Extracted data (either text chunks or image paths).
    """
    # Open the PDF document
    doc = fitz.open(pdf_path)
    
    # Check if the PDF contains selectable text
    is_text_based = False
    for page in doc:
        if page.get_text("text").strip():  # Check if the page contains selectable text
            is_text_based = True
            break

    #return "text" if is_text_based else "image"
    
    if is_text_based:
        # Case 1: Text-Based PDF
        print("✅ Detected text-based PDF. Extracting text...")
        return extract_text_with_page_numbers(doc)
    else:
        # Case 2: Image-Based PDF
        print("✅ Detected image-based PDF. Converting pages to images...")
        os.makedirs(output_dir, exist_ok=True)  # Create the output directory
        return convert_pdf_to_images(doc, output_dir)


def extract_text_with_page_numbers(doc):
    """
    Extract text from a text-based PDF with page numbers.
    
    Args:
        doc (fitz.Document): Opened PDF document.
    
    Returns:
        list: List of dictionaries containing page numbers and extracted text.
    """
    chunks = []
    for page_num in range(len(doc)):
        text = doc[page_num].get_text("text")
        if text.strip():
            chunks.append({
                "page": page_num + 1,  # Store page number (1-based index)
                "text": text
            })
    return {"type": "text", "data": chunks}



def convert_pdf_to_images(doc, output_dir):
    """
    Convert each page of an image-based PDF to images.
    
    Args:
        doc (fitz.Document): Opened PDF document.
        output_dir (str): Directory to save the images.
    
    Returns:
        dict: Dictionary containing the paths to the saved images.
    """

    # Open the PDF document
    #doc = fitz.open(doc)

    image_paths = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        pix = page.get_pixmap()  # Convert the page to a pixmap (image)
        image_path = os.path.join(output_dir, f"page{page_num + 1}.png")
        pix.save(image_path)  # Save the image as a PNG file
        image_paths.append(image_path)
        print(f"Saved: {image_path}")
    print("✅ PDF pages successfully converted to PNG!")
    return {"type": "image", "data": image_paths}

In [61]:
import os
import fitz  # PyMuPDF
from pathlib import Path

#Step 1: Convert PDF to images 

def check_pdf_type_and_convert_to_images(pdf_path, output_dir="pdf_images"):
    """
    Convert all pages of a PDF (text-based or image-based) into PNG images.
    
    Args:
        pdf_path (str): Path to the input PDF file.
        output_dir (str): Directory to save the generated PNG images.
    
    Returns:
        dict: Dictionary containing the paths to the saved images.
    """
    # Open the PDF document
    doc = fitz.open(pdf_path)
    
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    print("✅ Converting all PDF pages to PNG images...")


    base_filename = os.path.splitext(os.path.basename(pdf_path))[0]
    
    # Convert each page to an image
    image_paths = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        pix = page.get_pixmap()  # Convert the page to a pixmap (image)
        image_path = os.path.join(output_dir, f"{base_filename}_page{page_num + 1}.png")
        #image_path = f"{base_filename}_page{page_num + 1}.png"

        pix.save(image_path)  # Save the image as a PNG file
        image_paths.append(image_path)
        print(f"Saved: {image_path}")
    
    print("✅ All PDF pages successfully converted to PNG!")
    return {"type": "image", "data": image_paths}

In [60]:
# Input PDF path
pdf_path = "/Users/sunnyjovita/Desktop/Puninar/pdf_raw/BC_file.pdf"

# Output directory for images (if needed)
output_dir = "pdf_images"

# Check PDF type and extract data
check_pdf_type_and_convert_to_images(pdf_path, output_dir)

✅ Converting all PDF pages to PNG images...
Saved: pdf_images/BC_file_page1.png
Saved: pdf_images/BC_file_page2.png
Saved: pdf_images/BC_file_page3.png
Saved: pdf_images/BC_file_page4.png
Saved: pdf_images/BC_file_page5.png
Saved: pdf_images/BC_file_page6.png
Saved: pdf_images/BC_file_page7.png
✅ All PDF pages successfully converted to PNG!


{'type': 'image',
 'data': ['pdf_images/BC_file_page1.png',
  'pdf_images/BC_file_page2.png',
  'pdf_images/BC_file_page3.png',
  'pdf_images/BC_file_page4.png',
  'pdf_images/BC_file_page5.png',
  'pdf_images/BC_file_page6.png',
  'pdf_images/BC_file_page7.png']}

In [None]:
#Step 2 Extracting text using Qwen VL

In [62]:
query = """

Strictly extract solely the text in detailed within the document in json format

"""

In [35]:
query2 = """


# Role
You are a meticulous document processing expert, skilled in extracting and organizing structured information from unstructured documents. Your role is to identify sections within a document, extract their content, and format the output as specified.

## Skills
### Skill 1: Section Identification
- Identify all sections within the provided document.
- Extract the names of the sections for use in the output.

### Skill 2: Content Extraction
- Extract all information contained within each identified section in detail.
- Ensure that the extracted content is complete and unmodified.

### Skill 3: JSON Formatting
- Format the extracted section names and their corresponding content into a JSON structure.
- Ensure the output strictly adheres to the specified JSON format without any additional explanation or modification.

## Limitations
- Only process the sections and content present in the document; do not add, remove, or modify any information.
- Output must be in JSON format as specified, with no additional text or explanations.
- If the document structure is unclear or ambiguous, make reasonable assumptions but ensure consistency in extraction.

## Output Format
json
{
    "&lt;section&gt;": "&lt;information inside the section&gt;",
    "&lt;section2&gt;": "&lt;information inside the section2&gt;",
    ...
}


"""

In [28]:
query3 = """You are a helpful assistant. 

Read all the text in the image. 

"""

In [63]:
import os
import re
import json
from openai import OpenAI
import base64

# Function to encode an image file into base64
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Function to get response from the API for a single image
def get_response(image_path):
    base64_image = encode_image(image_path)
    client = OpenAI(
        api_key="put your apikey here",
        base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
    )
    
    completion = client.chat.completions.create(
        model="qwen-vl-plus",
        messages=[
            {
              "role": "user",
              "content": [
                {"type": "text", "text": f"{query}"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
              ]
            }
          ],
          stream=False
        )
    
    return json.loads(completion.model_dump_json())

def process_images_in_folder(folder_path, output_json_path):
    extracted_data = {}

    for filename in os.listdir(folder_path):
        if filename.lower().endswith((".png", ".jpg", ".jpeg")):
            image_path = os.path.join(folder_path, filename)
            print(f"Processing: {filename}")
            
            try:
                # Get raw response
                response = get_response(image_path)
                raw_text = response["choices"][0]["message"]["content"]
                
                # Step 1: Clean markdown and prepare JSON
                cleaned_json = raw_text.replace('```json', '').replace('```', '').strip()
                
                # Step 2: Process lines to add missing keys
                lines = cleaned_json.split('\n')
                processed_lines = []
                unstructured_counter = 0

                for line in lines:
                    stripped = line.strip()
                    # Detect lines that start with a value but no key
                    if stripped.startswith('"') and ':' not in stripped:
                        # Add default key with counter
                        new_line = f'  "unstructured_{unstructured_counter}": {stripped},'
                        processed_lines.append(new_line)
                        unstructured_counter += 1
                    else:
                        processed_lines.append(line)

                # Rebuild JSON and fix trailing commas
                cleaned_json = '\n'.join(processed_lines)
                cleaned_json = re.sub(r',\s*([}\]])', r'\1', cleaned_json)
                
                # Step 3: Validate and parse JSON
                try:
                    dict_output = json.loads(cleaned_json)
                except json.JSONDecodeError as e:
                    print(f"  JSON structure error in {filename}: {e}")
                    print("  Attempting to save partial data...")
                    dict_output = {"error": str(e), "raw_content": raw_text}
                
                # Store results with filename as key
                extracted_data[filename] = dict_output

            except Exception as e:
                print(f"  Error processing {filename}: {e}")
                extracted_data[filename] = {"error": str(e)}

    # Save to JSON file
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(extracted_data, json_file, indent=4, ensure_ascii=False)

# Set folder path and output JSON file path
image_folder = "/Users/sunnyjovita/Desktop/Puninar/pdf_images"
output_json = "/Users/sunnyjovita/Desktop/Puninar/extracted_datafromquery1.json"

# Process images and save JSON
process_images_in_folder(image_folder, output_json)
print(f"Extraction completed! JSON saved at {output_json}")

Processing: BC_file_page1.png
Processing: BC_file_page3.png
Processing: BC_file_page2.png
Processing: BC_file_page6.png
Processing: BC_file_page7.png
Processing: BC_file_page5.png
Processing: BC_file_page4.png
Extraction completed! JSON saved at /Users/sunnyjovita/Desktop/Puninar/extracted_datafromquery1.json
