In [1]:
import logging
from typing import List
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from pydantic import BaseModel
from models.invoice_schema import PageTextData, ElaboratedPageTextData
import os
import dotenv
from dotenv import load_dotenv
load_dotenv()
import logging 
logger = logging.getLogger(__name__)

In [2]:
from pathlib import Path
from pdf2image import convert_from_path

# Location of the invoice PDF to process. The INVOICE_PDF_PATH environment
# variable overrides the default, so the notebook is not tied to a single
# machine's directory layout.
file_path = Path(os.getenv("INVOICE_PDF_PATH", "d:/AI_Inovice/app/BYK INVOICE.pdf"))

# Fail fast with a clear message. is_file() also rejects a directory path,
# which exists() would have let through to the converter.
if not file_path.is_file():
    raise FileNotFoundError(f"File not found: {file_path}")

# Render every PDF page to an image at 300 DPI; the OCR cell consumes `images`.
images = convert_from_path(str(file_path), dpi=300)
print(f"Total pages converted: {len(images)}")

Total pages converted: 2


In [3]:
# Run OCR over each rendered page image and keep the text together with its
# 1-based page number and the source file name.
filename = file_path.name
pages = []
for index, page_image in enumerate(images):
    page_number = index + 1
    logger.info(f"Running OCR on page {page_number}...")
    ocr_text = pytesseract.image_to_string(page_image)
    pages.append(
        PageTextData(page_number=page_number, filename=filename, text=ocr_text)
    )

pages

[PageTextData(page_number=1, filename='BYK INVOICE.pdf', text='Internal Invoice\n\nPage 1 von 1\n\nPlease always specify\nCustomer No.: 8000310000\n\nInternal invoice 9019015804 Date: 28.07.2025\n\nBYK USA Inc., 524 South Cherry Street, Wallingford, CT 06492\n\nBYK-Chemie GmbH\n\nAbelstr. 45\n\n46483 Wesel\n\nFederal Republic of Germany\n\nforwarding address\n\nBYK Chemie GmbH\nDistribution Center\nEmmelsumer Str. 221\n\n46485 Wesel\n\nFederal Republic of Germany\n\nYour order\n\nDate Mr. Gunjal, Pravin\n\nDelivery terms:\n\n04.06.2025\nCPT Rotterdam (Incoterms 2020)\n\nTel: 22 6820 4765 Fax: -4771\nOperations.BYK.India@altana.com\n\nShipping type: Stock Transfers\nPlanned goods movement date:\ninternal order No.: 4501547983\nMaterial Qty. Packing total Kg Netto Product name Total value/INR\n122530 400 Bags 1,0 400,0 RAW.200306 599,74 239.896,00\nSum items 239.896 ,00\n400 Units Kg Netto 400,0\nKg Brutto 420,0\nTotal amount INR 239.896 ,00\n\nPayment conditions:\nWithin 30 days Due net

In [4]:
from langchain_groq import ChatGroq
# Groq credential, read from the process environment.
key = os.environ.get("GROQ_API_KEY")
# Chat model instance referenced as `llm` by the LLM-backed functions below.
llm = ChatGroq(
    groq_api_key=key,
    model="gemma2-9b-it",
)

In [5]:
from typing import List, Dict
from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
import os

def generate_elaborated_texts(pages: List[PageTextData]) -> List[ElaboratedPageTextData]:
    """
    Ask the LLM for a natural-language elaboration of each OCR'd page.

    Args:
        pages: Page records; ``page_number``, ``filename`` and ``text`` are read
            from each one.

    Returns:
        One ``ElaboratedPageTextData`` per input page, in input order, carrying
        the original fields plus the stripped LLM reply as ``elaborated_text``.
    """
    # Fixed instruction block; the page-specific part is appended per page.
    preamble = (
        "You are an expert in reading semi-structured invoices.\n"
        "Below is the raw OCR text from one page of a PDF invoice.\n"
        "Write a clear and complete elaboration of what this invoice page contains in natural language.\n\n"
    )

    def _elaborate(source_page):
        # One LLM call per page, sent as a single human message.
        request = preamble + f"Page {source_page.page_number}:\n{source_page.text}"
        reply = llm.invoke([HumanMessage(content=request)])
        return ElaboratedPageTextData(
            page_number=source_page.page_number,
            filename=source_page.filename,
            text=source_page.text,
            elaborated_text=reply.content.strip(),
        )

    return [_elaborate(source_page) for source_page in pages]

In [6]:
# Elaborate every OCR'd page with the LLM and display the result.
d = generate_elaborated_texts(pages=pages)
d

[ElaboratedPageTextData(page_number=1, filename='BYK INVOICE.pdf', text='Internal Invoice\n\nPage 1 von 1\n\nPlease always specify\nCustomer No.: 8000310000\n\nInternal invoice 9019015804 Date: 28.07.2025\n\nBYK USA Inc., 524 South Cherry Street, Wallingford, CT 06492\n\nBYK-Chemie GmbH\n\nAbelstr. 45\n\n46483 Wesel\n\nFederal Republic of Germany\n\nforwarding address\n\nBYK Chemie GmbH\nDistribution Center\nEmmelsumer Str. 221\n\n46485 Wesel\n\nFederal Republic of Germany\n\nYour order\n\nDate Mr. Gunjal, Pravin\n\nDelivery terms:\n\n04.06.2025\nCPT Rotterdam (Incoterms 2020)\n\nTel: 22 6820 4765 Fax: -4771\nOperations.BYK.India@altana.com\n\nShipping type: Stock Transfers\nPlanned goods movement date:\ninternal order No.: 4501547983\nMaterial Qty. Packing total Kg Netto Product name Total value/INR\n122530 400 Bags 1,0 400,0 RAW.200306 599,74 239.896,00\nSum items 239.896 ,00\n400 Units Kg Netto 400,0\nKg Brutto 420,0\nTotal amount INR 239.896 ,00\n\nPayment conditions:\nWithin 30 da

In [7]:
import json

def map_invoice_pages(elaborated_pages: List[ElaboratedPageTextData]) -> Dict[str, List[int]]:
    """
    Ask the LLM which pages belong to which invoice.

    Each page is sent under its own ``page_number`` (not its position in the
    list), so the returned page numbers can be looked up against the same page
    objects later. The elaborated text is sent; the raw OCR text is used only
    when a page has no elaborated text.

    Args:
        elaborated_pages: Pages with ``page_number``, ``text`` and
            ``elaborated_text`` attributes.

    Returns:
        Mapping of invoice number (str) to the list of page numbers (int) that
        belong to it. Returns ``{}`` when the LLM reply is not valid JSON or is
        not a JSON object. Entries whose page list cannot be converted to
        integers are skipped.
    """
    # Label every page with its real page number and send only its text,
    # rather than the string form of the whole model object.
    formatted = "\n\n".join(
        f"Page {page.page_number}:\n{page.elaborated_text or page.text}"
        for page in elaborated_pages
    )

    system_prompt = (
        "You are a document analysis assistant.\n"
        "You will be given a list of pages (with their page numbers) that may contain "
        "one or more invoice documents.\n"
        "Your task is to identify which pages belong to the same invoice by recognizing "
        "invoice numbers and group them.\n\n"
        "Return ONLY valid JSON. The JSON must have the invoice number as key and the "
        "list of integer page numbers belonging to that invoice as value.\n"
        'For example: {"INV-003": [2]} or {"INV-005": [4, 5]} or {"344256": [1, 2, 3, 4]}'
    )

    user_prompt = (
        "Here are the elaborated texts for all pages:\n\n"
        f"{formatted}\n\n"
        "Now identify and group the pages by invoice number."
    )

    response = llm.invoke([
        HumanMessage(content=system_prompt),
        HumanMessage(content=user_prompt)
    ])

    # Parse the reply; fenced code blocks are unwrapped by clean_llm_json.
    raw_content = response.content.strip()
    try:
        parsed = json.loads(clean_llm_json(raw_content))
    except json.JSONDecodeError:
        print("LLM response was not valid JSON:\n", response.content)
        return {}

    if not isinstance(parsed, dict):
        print("LLM response was not a JSON object:\n", response.content)
        return {}

    # Enforce the declared Dict[str, List[int]] shape so later lookups by
    # integer page number do not silently miss string page numbers.
    invoice_map: Dict[str, List[int]] = {}
    for invoice_number, page_numbers in parsed.items():
        if not isinstance(page_numbers, list):
            page_numbers = [page_numbers]
        try:
            invoice_map[str(invoice_number)] = [int(number) for number in page_numbers]
        except (TypeError, ValueError):
            print(f"Skipping invoice {invoice_number}: page list is not numeric: {page_numbers}")
    return invoice_map

def clean_llm_json(text: str) -> str:
    """
    Extract the JSON payload from a raw LLM reply.

    Handles replies that wrap the JSON in a Markdown code fence
    (```json ... ``` or ``` ... ```), including when the fence is preceded by
    whitespace or introductory prose, followed by trailing prose, or left
    unclosed. Only the first fenced block is returned. Replies without a fence
    are returned with surrounding whitespace removed.

    :param text: Raw LLM output, possibly wrapped in ```json ... ```
    :return: Cleaned JSON string
    """
    import re  # local import keeps this helper self-contained

    stripped = text.strip()
    # Opening fence with optional "json" tag, then the shortest body up to the
    # closing fence (or end of text when the fence was never closed).
    fenced = re.search(
        r"```(?:json)?\s*(.*?)\s*(?:```|\Z)",
        stripped,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if fenced:
        return fenced.group(1)
    return stripped


In [8]:
# Group the elaborated pages by invoice number and display the mapping.
a = map_invoice_pages(elaborated_pages=d)
a

{'9019015804': [1, 2]}

In [9]:
def combine_invoice_text(
    page_mapping: Dict[str, List[int]],
    elaborated_text_pages: "List[ElaboratedPageTextData]",
) -> Dict[str, str]:
    """
    Combines the page text of each invoice based on a given page mapping.

    The ``text`` attribute of each page is what gets concatenated.

    Args:
        page_mapping (Dict[str, List[int]]): Dictionary mapping invoice IDs to a list of page numbers.
            Example: {"INV001": [1, 2], "INV002": [3]}

        elaborated_text_pages (List[ElaboratedPageTextData]): Page objects; only their
            ``page_number`` and ``text`` attributes are read.

    Returns:
        Dict[str, str]: A dictionary mapping every invoice ID in ``page_mapping`` to its
        pages' text joined with newlines, in the order the page numbers are listed.
        Page numbers with no matching page are skipped with a warning. An empty
        mapping yields an empty dictionary.
        Example: {"INV001": "Full text from page 1 and 2...", "INV002": "Text from page 3"}

    Raises:
        Exception: Any error raised while combining is logged and re-raised.
    """
    try:
        # Convert list to dict for quick lookup
        page_text_lookup = {page.page_number: page.text for page in elaborated_text_pages}
        logging.info("Page text lookup table created with %d entries.", len(page_text_lookup))

        combined_texts = {}

        for invoice_id, pages in page_mapping.items():
            combined_text = []
            for page_num in pages:
                text = page_text_lookup.get(page_num)
                if text is None:
                    # %s rather than %d: page numbers come from LLM output and may not be ints.
                    logging.warning(
                        "Page number %s for invoice %s not found in provided elaborated text data.",
                        page_num, invoice_id
                    )
                elif text:
                    combined_text.append(text)
            combined_texts[invoice_id] = "\n".join(combined_text).strip()
            logging.info("Combined %d pages for invoice %s.", len(combined_text), invoice_id)

        # Return only after every invoice has been processed.
        return combined_texts

    except Exception as e:
        logging.exception("Error occurred while combining invoice texts: %s", e)
        raise


In [10]:
# Merge the page texts of each invoice into one string per invoice and display it.
f = combine_invoice_text(a, d)
f

{'9019015804': 'Internal Invoice\n\nPage 1 von 1\n\nPlease always specify\nCustomer No.: 8000310000\n\nInternal invoice 9019015804 Date: 28.07.2025\n\nBYK USA Inc., 524 South Cherry Street, Wallingford, CT 06492\n\nBYK-Chemie GmbH\n\nAbelstr. 45\n\n46483 Wesel\n\nFederal Republic of Germany\n\nforwarding address\n\nBYK Chemie GmbH\nDistribution Center\nEmmelsumer Str. 221\n\n46485 Wesel\n\nFederal Republic of Germany\n\nYour order\n\nDate Mr. Gunjal, Pravin\n\nDelivery terms:\n\n04.06.2025\nCPT Rotterdam (Incoterms 2020)\n\nTel: 22 6820 4765 Fax: -4771\nOperations.BYK.India@altana.com\n\nShipping type: Stock Transfers\nPlanned goods movement date:\ninternal order No.: 4501547983\nMaterial Qty. Packing total Kg Netto Product name Total value/INR\n122530 400 Bags 1,0 400,0 RAW.200306 599,74 239.896,00\nSum items 239.896 ,00\n400 Units Kg Netto 400,0\nKg Brutto 420,0\nTotal amount INR 239.896 ,00\n\nPayment conditions:\nWithin 30 days Due net\n\nOur general terms and conditions of sale an

In [11]:
def extract_invoice_table_from_dict(invoices_dict: dict) -> list[dict]:
    """
    Turn a ``{invoice_id: invoice_text}`` dictionary into LLM-extracted field dictionaries.

    For each invoice the full text is sent to the LLM with a request to return
    every available field as JSON. The reply is unwrapped with
    ``clean_llm_json``, parsed, and tagged with the invoice's ID.

    Parameters
    ----------
    invoices_dict : dict
        Key is the invoice ID, value is the full invoice text.

    Returns
    -------
    list[dict]
        One dictionary per successfully parsed invoice, holding the fields
        returned by the LLM plus ``'Invoice_ID'`` set to the dictionary key.
        Invoices whose reply fails to parse, is not a JSON object, or raises
        any other error are logged and left out. An empty list is returned for
        empty or non-dict input.

    Exceptions
    ----------
    Errors are logged, never raised.
    """
    extracted = []

    try:
        # Empty or non-dict input: nothing to process.
        if not (invoices_dict and isinstance(invoices_dict, dict)):
            logging.warning("⚠ No valid invoice dictionary provided.")
            return []

        for invoice_id, invoice_text in invoices_dict.items():
            try:
                prompt = f"""
                    You are an expert in reading invoices.  
                    Below is the full text of one invoice.  
                    Extract **all available fields** such as invoice number, seller name, buyer name, address,
                    item details, amount, tax, total, etc.  
                    Output as **valid JSON** with keys as field names and values as strings.  
                    Do NOT skip any field present in the text.  
                    If a field is missing, do not include it.

                    Invoice Text:
                    {invoice_text}
                    """

                llm_reply = llm.invoke([HumanMessage(content=prompt)])
                parsed = json.loads(clean_llm_json(llm_reply.content))

                # Only JSON objects can be tagged with the invoice ID.
                if not isinstance(parsed, dict):
                    logging.warning(f"⚠ LLM did not return a valid dict for invoice {invoice_id}")
                    continue

                parsed["Invoice_ID"] = invoice_id
                extracted.append(parsed)

            except json.JSONDecodeError as err:
                logging.error(f"❌ JSON parsing failed for invoice {invoice_id}: {err}")
            except Exception as err:
                # One failing invoice must not stop the remaining ones.
                logging.error(f"❌ Error processing invoice {invoice_id}: {err}")

        return extracted

    except Exception as err:
        logging.exception(f"Unexpected error in extract_invoice_table_from_dict: {err}")
        return []


In [12]:
# Extract structured fields from each combined invoice text and display them.
q = extract_invoice_table_from_dict(f)
q

[{'Internal Invoice': '9019015804',
  'Date': '28.07.2025',
  'Seller Name': 'BYK-Chemie GmbH',
  'Seller Address': 'Abelstr. 45\n46483 Wesel\nFederal Republic of Germany',
  'Seller Forwarding Address': 'BYK Chemie GmbH\nDistribution Center\nEmmelsumer Str. 221\n46485 Wesel\nFederal Republic of Germany',
  'Buyer Name': 'BYK India Pvt. Ltd.',
  'Buyer Address': '403, 4th Floor, Quantum\nHiranandani Business Park\nGhodbunder Road, Thane\nMumbai 400 607, India',
  'Customer No.': '8000310000',
  'Order Date': '04.06.2025',
  'Delivery Terms': 'CPT Rotterdam (Incoterms 2020)',
  'Shipping Type': 'Stock Transfers',
  'Internal Order No.': '4501547983',
  'Material': [{'Product Name': 'RAW.200306',
    'Qty': '400',
    'Packing': 'Bags',
    'Net': '1.0',
    'Total Kg Netto': '400.0',
    'Total Value/INR': '239.896,00'}],
  'Sum Items': '239.896,00',
  'Total Kg Netto': '400.0',
  'Total Kg Brutto': '420.0',
  'Total Amount INR': '239.896,00',
  'Payment Conditions': 'Within 30 days Due

In [13]:
import pandas as pd

def flatten_invoice_data(invoices: list[dict]) -> pd.DataFrame:
    """
    Flattens invoice JSON data into one continuous DataFrame.

    For every invoice, in input order, the result contains:
    - one header row with all fields that are not line-item sections,
    - one row per entry of each non-empty list-of-dict field, with a
      ``Section`` column naming the field the entry came from,
    - one empty row separating it from the next invoice.

    Lists that are empty or contain non-dict entries are kept in the header
    row rather than dropped.

    Parameters
    ----------
    invoices : list[dict]
        Extracted invoice dictionaries.

    Returns
    -------
    pd.DataFrame
        The flattened rows; an empty DataFrame when ``invoices`` is empty.
    """
    records = []

    for invoice in invoices:
        header_data = {}
        list_fields = {}
        for field_name, value in invoice.items():
            is_section = (
                isinstance(value, list)
                and len(value) > 0
                and all(isinstance(item, dict) for item in value)
            )
            if is_section:
                list_fields[field_name] = value
            else:
                header_data[field_name] = value

        # Each invoice gets its own header row, followed by its line items.
        records.append(header_data)
        for field_name, list_items in list_fields.items():
            # "Section" goes first so it precedes the item columns it labels.
            records.extend({"Section": field_name, **item} for item in list_items)

        # Blank row after each invoice's details to separate visually
        records.append({})

    # Build the frame once from all records instead of concatenating per invoice.
    return pd.DataFrame(records)


In [14]:
# Flatten the extracted invoice dictionaries into one table and display it.
h = flatten_invoice_data(q)
h

Unnamed: 0,Internal Invoice,Date,Seller Name,Seller Address,Seller Forwarding Address,Buyer Name,Buyer Address,Customer No.,Order Date,Delivery Terms,...,Packing List Date of Dispatch ex Warehouse,Packing List Mode of Shipment,Packing List NO OF PALLETS,Invoice_ID,Section,Product Name,Qty,Packing,Net,Total Value/INR
0,9019015804.0,28.07.2025,BYK-Chemie GmbH,Abelstr. 45\n46483 Wesel\nFederal Republic of ...,BYK Chemie GmbH\nDistribution Center\nEmmelsum...,BYK India Pvt. Ltd.,"403, 4th Floor, Quantum\nHiranandani Business ...",8000310000.0,04.06.2025,CPT Rotterdam (Incoterms 2020),...,23.08.2025,Stock Transfers,1.0,9019015804.0,,,,,,
1,,,,,,,,,,,...,,,,,Material,RAW.200306,400.0,Bags,1.0,"239.896,00"
2,,,,,,,,,,,...,,,,,,,,,,
