In [2]:
import logging
from typing import List
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from pydantic import BaseModel
from models.invoice_schema import PageTextData, ElaboratedPageTextData
import os
import logging 
logger = logging.getLogger(__name__)

In [3]:
from pathlib import Path
from pdf2image import convert_from_path

# Absolute location of the invoice PDF processed by this notebook.
file_path = Path("d:/AI_Inovice/app/invoice_3_five_pages_different_layout.pdf")

if file_path.exists():
    # Convert the PDF into a list of page images at 300 DPI and report the count.
    images = convert_from_path(str(file_path), dpi=300)
    print(f"Total pages converted: {len(images)}")
else:
    # Stop here with an explicit message instead of failing inside the converter.
    raise FileNotFoundError(f"File not found: {file_path}")

Total pages converted: 4


In [4]:
# Label attached to every page record created below.
filename = 'Invoice_8'

# Run OCR over the page images in order; page numbers are 1-based.
pages = []
for page_number, image in enumerate(images, 1):
    logger.info(f"Running OCR on page {page_number}...")
    text = pytesseract.image_to_string(image)
    pages.append(
        PageTextData(
            page_number=page_number,
            filename=filename,
            text=text,
        )
    )

In [5]:
# Display the per-page OCR records held in `pages`.
pages

[PageTextData(page_number=1, filename='Invoice_8', text='DEF Tech Solutions\n\nInvoice #: INV-005\nClient: XYZ Corporation\nDate: 2025-07-20\n\nItem Qty Unit Price Total\n\nProduct 0 3 $15.50 $46.50\nProduct 1 3 $16.50 $49.50\nProduct 2 3 $17.50 $52.50\nProduct 3 3 $18.50 $55.50\nProduct 4 3 $19.50 $58.50\nProduct 5 3 $20.50 $61.50\nProduct 6 3 $21.50 $64.50\nProduct 7 3 $22.50 $67.50\nProduct 8 3 $23.50 $70.50\nProduct 9 3 $24.50 $73.50\nProduct 10 3 $15.50 $46.50\nProduct 11 3 $16.50 $49.50\nProduct 12 3 $17.50 $52.50\nProduct 13 3 $18.50 $55.50\nProduct 14 3 $19.50 $58.50\nProduct 15 3 $20.50 $61.50\nProduct 16 3 $21.50 $64.50\nProduct 17 3 $22.50 $67.50\nProduct 18 3 $23.50 $70.50\nProduct 19 3 $24.50 $73.50\nProduct 20 3 $15.50 $46.50\nProduct 21 3 $16.50 $49.50\nProduct 22 3 $17.50 $52.50\nProduct 23 3 $18.50 $55.50\nProduct 24 3 $19.50 $58.50\nProduct 25 3 $20.50 $61.50\nProduct 26 3 $21.50 $64.50\nProduct 27 3 $22.50 $67.50\nProduct 28 3 $23.50 $70.50\nProduct 29 3 $24.50 $73.5

In [6]:
from langchain_groq import ChatGroq
# Read the Groq API key from the environment (None when the variable is unset).
key = os.environ.get("GROQ_API_KEY")

# Chat model client stored in the global `llm`.
llm = ChatGroq(
    model="gemma2-9b-it",
    groq_api_key=key,
)

In [7]:
from typing import List, Dict
from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
import os

def generate_elaborated_texts(pages: List[PageTextData]) -> List[ElaboratedPageTextData]:
    """
    Ask the LLM for a natural-language description of each OCR'd page.

    One request is sent to the module-level ``llm`` per page, in input order.

    :param pages: OCR results, one ``PageTextData`` per page.
    :return: One ``ElaboratedPageTextData`` per input page, carrying the page
             number, filename and raw text of the input plus the stripped LLM reply.
    """
    # Fixed instruction header shared by every per-page prompt.
    instructions = "\n".join([
        "You are an expert in reading semi-structured invoices.",
        "Below is the raw OCR text from one page of a PDF invoice.",
        "Write a clear and complete elaboration of what this invoice page contains in natural language.",
    ])

    results = []
    for ocr_page in pages:
        message = HumanMessage(
            content=f"{instructions}\n\nPage {ocr_page.page_number}:\n{ocr_page.text}"
        )
        reply = llm.invoke([message])
        enriched = ElaboratedPageTextData(
            page_number=ocr_page.page_number,
            filename=ocr_page.filename,
            text=ocr_page.text,
            elaborated_text=reply.content.strip(),
        )
        results.append(enriched)
    return results

In [8]:
# Run the per-page LLM elaboration over the OCR results and display the returned list.
d= generate_elaborated_texts(pages= pages)
d

[ElaboratedPageTextData(page_number=1, filename='Invoice_8', text='DEF Tech Solutions\n\nInvoice #: INV-005\nClient: XYZ Corporation\nDate: 2025-07-20\n\nItem Qty Unit Price Total\n\nProduct 0 3 $15.50 $46.50\nProduct 1 3 $16.50 $49.50\nProduct 2 3 $17.50 $52.50\nProduct 3 3 $18.50 $55.50\nProduct 4 3 $19.50 $58.50\nProduct 5 3 $20.50 $61.50\nProduct 6 3 $21.50 $64.50\nProduct 7 3 $22.50 $67.50\nProduct 8 3 $23.50 $70.50\nProduct 9 3 $24.50 $73.50\nProduct 10 3 $15.50 $46.50\nProduct 11 3 $16.50 $49.50\nProduct 12 3 $17.50 $52.50\nProduct 13 3 $18.50 $55.50\nProduct 14 3 $19.50 $58.50\nProduct 15 3 $20.50 $61.50\nProduct 16 3 $21.50 $64.50\nProduct 17 3 $22.50 $67.50\nProduct 18 3 $23.50 $70.50\nProduct 19 3 $24.50 $73.50\nProduct 20 3 $15.50 $46.50\nProduct 21 3 $16.50 $49.50\nProduct 22 3 $17.50 $52.50\nProduct 23 3 $18.50 $55.50\nProduct 24 3 $19.50 $58.50\nProduct 25 3 $20.50 $61.50\nProduct 26 3 $21.50 $64.50\nProduct 27 3 $22.50 $67.50\nProduct 28 3 $23.50 $70.50\nProduct 29 3 $2

In [9]:
import json

def map_invoice_pages(elaborated_pages: List[ElaboratedPageTextData]) -> Dict[str, List[int]]:
    """
    Ask the LLM which pages belong to which invoice.

    Pages are presented to the model labelled ``Page 1``, ``Page 2``, ... by their
    position in ``elaborated_pages`` (1-based), so the page numbers in the result
    are positions in that list.

    :param elaborated_pages: Pages whose ``elaborated_text`` is sent to the LLM.
    :return: Mapping of invoice number -> list of 1-based page positions. An empty
             dict is returned when there are no pages, when the reply is not valid
             JSON, or when the reply is valid JSON but not an object.
    """
    # Nothing to group: skip the LLM round-trip entirely.
    if not elaborated_pages:
        return {}

    # Send only the elaborated text of each page. Interpolating the page object
    # itself would embed its full repr (raw OCR text included) in the prompt.
    formatted = "\n\n".join(
        f"Page {position}:\n{page.elaborated_text}"
        for position, page in enumerate(elaborated_pages, start=1)
    )

    # Plain concatenated literals: the examples are valid JSON and the prompt
    # carries no stray quote or escape fragments.
    system_prompt = (
        "You are a document analysis assistant.\n"
        "You will be given a list of pages (with their page numbers) that may contain "
        "one or more invoice documents.\n"
        "Your task is to identify which pages belong to the same invoice by recognizing "
        "invoice numbers and group them.\n\n"
        "Return ONLY a valid JSON object whose keys are invoice numbers and whose values "
        "are lists of the integer page numbers belonging to that invoice.\n"
        "Examples:\n"
        '{"INV-003": [2]}\n'
        '{"INV-005": [4, 5]}\n'
        '{"344256": [1, 2, 3, 4]}\n'
    )

    user_prompt = (
        "Here are the elaborated texts for all pages:\n\n"
        f"{formatted}\n\n"
        "Now identify and group the pages by invoice number."
    )

    response = llm.invoke([
        HumanMessage(content=system_prompt),
        HumanMessage(content=user_prompt)
    ])

    # Parse the reply; any markdown code fence is stripped first.
    raw_content = response.content.strip()
    try:
        invoice_map = json.loads(clean_llm_json(raw_content))
    except json.JSONDecodeError:
        print("LLM response was not valid JSON:\n", response.content)
        return {}

    # Valid JSON that is not an object (e.g. a bare list) cannot be used as a mapping.
    if not isinstance(invoice_map, dict):
        print("LLM response was not a JSON object:\n", response.content)
        return {}

    return invoice_map

def clean_llm_json(text: str) -> str:
    """
    Cleans the LLM response to extract only the JSON content.

    If the response contains a complete fenced block (``` or ```json, tag matched
    case-insensitively) anywhere in the text, the content of the first such block
    is returned, so prose before or after the fence is discarded. Otherwise a lone
    leading or trailing fence marker is removed. Text without fences is returned
    stripped of surrounding whitespace.

    :param text: Raw LLM output, possibly wrapped in ```json ... ```
    :return: Cleaned JSON string
    """
    import re  # local import: only this helper needs it

    stripped = text.strip()

    # Preferred path: a complete fenced block, wherever it appears in the reply.
    fenced = re.search(
        r"```(?:json)?\s*(.*?)\s*```",
        stripped,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if fenced:
        return fenced.group(1).strip()

    # Fallback: an opening fence that was never closed...
    if stripped.startswith("```"):
        stripped = re.sub(r"^```(?:json)?", "", stripped, flags=re.IGNORECASE).strip()
    # ...or a closing fence without an opening one.
    if stripped.endswith("```"):
        stripped = stripped[:-3].strip()
    return stripped


In [10]:
# Ask the LLM to group the elaborated pages by invoice number and display the mapping.
a = map_invoice_pages(d)
a

{'INV-005': [1, 2, 3, 4]}

In [11]:
import yaml
from typing import Dict, List

def save_invoices_to_yaml_txt(
    invoice_map: Dict[str, List[int]],
    elaborated_pages: List[ElaboratedPageTextData],
    output_path: str = './output/'
):
    """
    Write one YAML-formatted ``.txt`` file per invoice.

    Each file contains two keys: ``invoice_number`` and ``content``, where
    ``content`` is the raw ``text`` of the invoice's pages joined by blank lines.

    :param invoice_map: Invoice number -> 1-based positions into ``elaborated_pages``.
    :param elaborated_pages: Pages to pull the raw text from.
    :param output_path: Directory to write into; created if missing.
    :return: None
    """
    import re  # local import: only needed to sanitise file names

    # Ensure the output directory exists
    os.makedirs(output_path, exist_ok=True)

    page_count = len(elaborated_pages)

    for invoice_number, page_indices in invoice_map.items():
        # The mapping originates from an LLM, so page numbers may arrive as
        # numeric strings or point outside the document. Keep what is usable
        # and report the rest instead of crashing or dropping it silently.
        usable, rejected = [], []
        for raw_page in page_indices:
            try:
                page_no = int(raw_page)
            except (TypeError, ValueError):
                rejected.append(raw_page)
                continue
            if 1 <= page_no <= page_count:
                usable.append(page_no)
            else:
                rejected.append(raw_page)

        if rejected:
            print(f"Skipped invalid page numbers for {invoice_number}: {rejected}")

        # Combine only the raw text of the mapped pages (1-based to 0-based)
        combined_text = "\n\n".join(elaborated_pages[n - 1].text for n in usable)

        yaml_data = {
            "invoice_number": invoice_number,
            "content": combined_text
        }
        # Dump to YAML string
        yaml_string = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True)

        # The invoice number becomes a file name: replace path separators and
        # characters that are illegal on Windows so the file cannot land
        # outside ``output_path`` or fail to be created.
        safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", str(invoice_number)).strip(" .")
        if not safe_name:
            safe_name = "unknown_invoice"

        # Write YAML string to .txt file
        file_path = os.path.join(output_path, f"{safe_name}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(yaml_string)

        # Report the file actually written, not just the directory.
        print(f"✅ YAML saved to {file_path}")


In [12]:
# Write one YAML-formatted .txt file per grouped invoice into the default ./output/ directory.
save_invoices_to_yaml_txt(a,d)

✅ YAML saved to ./output/


In [24]:
# Show each invoice number next to the pages mapped to it. Print the loop
# variable `page_indices`, not the stale global `page_number` from the OCR loop.
for invoice_number, page_indices in a.items():
    print(invoice_number, page_indices)

INV-001 2
