In [None]:
# !pip uninstall fitz

In [2]:
from langgraph.graph import Graph
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain.chains import LLMChain
from langchain.document_loaders.image import UnstructuredImageLoader
import os
import base64
import openai
from dotenv import load_dotenv
load_dotenv()
OPENAI_KEY = os.getenv("OPENAI_API_KEY")
TEMPERATURE = os.getenv("TEMPERATURE")
MODEL_NAME = os.getenv("MODEL_NAME")

from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated
import operator
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage
from langchain.chat_models import ChatOpenAI
from pdf2image import convert_from_path


In [3]:
class AgentState(TypedDict):
    messages: Annotated[list[AnyMessage], operator.add]
    

In [4]:
class Agent:

    def __init__(self,model,system="",filepath=None):
        self.system=system
        self.model = model
        self.filepath = filepath    
        graph = StateGraph(AgentState)
        graph.add_node("classify_file_type",self.classify_file_type)
        graph.add_node("extract_from_pdf",self.extract_pdf_data)
        graph.add_node("extract_from_image",self.extract_data_from_image)
        graph.add_node("extract_attributes",self.extract_attributes_from_text)

        graph.add_conditional_edges("classify_file_type",self.file_type_check,{"PDF":"extract_from_pdf","IMAGE":"extract_from_image"})
        graph.add_edge("extract_from_pdf","extract_attributes")
        graph.add_edge("extract_from_image","extract_attributes")
        
        graph.set_entry_point("classify_file_type")
        graph.set_finish_point("extract_attributes")  # Either can be final
        self.graph = graph.compile()



    def file_type_check(self, state: AgentState):
        print(state['messages'][-1]['file_type'])
        return state['messages'][-1]['file_type']
        # if state["messages"][-1].content.startswith("data:application/pdf;base64,"):


    def classify_file_type(self, state: AgentState):
        messages = state['messages']
        if self.system:
            messages = [SystemMessage(content=self.system)] + messages
        message = self.model.invoke(messages)
        print(message.content)
        print("")
        if message.content.upper() == "PDF":
            print("in pdf check")
            dictn = {"file_type":"PDF"}
                
        elif message.content.upper() == "IMAGE" :
            print("in image check")
            dictn = {"file_type":"IMAGE"} 
        else:
            dictn = {"file_type":"None"} 
        
        new_messages = state["messages"]+[dictn]
        # print(new_messages)
        return {**state, "messages": new_messages}


    def extract_pdf_data(self, state: AgentState):
        print("Extracting data from PDF")
        try:
            loader = PyPDFLoader(self.filepath)
            docs = loader.load()
        except Exception as e:
            print(f"Error loading PDF: {e}")
            return None 
        
        pdf_content = "\n".join([doc.page_content for doc in docs])
        print(pdf_content.upper().split())
        print(type(pdf_content))

        if(('INVOICE' not in pdf_content.upper().split()) or ('TOTAL' not in pdf_content.upper().split())):
            print("PDF does not contain invoice information, converting to image")
            self.convert_pdf_to_image(self)
            pdf_content = self.extract_data_from_image(self)['messages'][0]
            print(pdf_content)


        
        if not pdf_content:
            print("No content in the pdf")
            return None
        
        return {'messages': [pdf_content]}

    def convert_pdf_to_image(self, state: AgentState):
        """Extract text from scanned (image-based) PDF using OCR."""
        print("Extracting text from scanned PDF")
        print(self.filepath)
        try:

            images = convert_from_path(self.filepath, dpi=300)
            
            if images:
                image = images[0]
                directory = os.path.dirname(self.filepath)
                base_name = os.path.basename(self.filepath)             
                file_root = os.path.splitext(base_name)[0]         
                output_path = os.path.join(directory, file_root + ".png")
                output_path = output_path.replace("\\", "/")
                images[0].save(output_path, "PNG")
                print(f"Saved pdf as image: {output_path}")  
                self.filepath = output_path 
                print("New file path:", self.filepath)                
                return None              

        except Exception as e:
            print(f"Error converting PDF to image: {e}")
            return None
        

   
    

    def encode_image(self, state: AgentState):
        with open(self.filepath, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")


    def extract_data_from_image(self,state: AgentState):

        print("Extracting data from image")
        print(self.filepath)
        base64_image = self.encode_image(self.filepath)

        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    },
                    {
                        "type": "text",
                        "text": "Extract all visible text from this image. Do not summarize or omit anything."
                    }
                ]
            }
        ], max_tokens=1000, temperature=0.0
            )

        # print(response['choices'][0]['message']['content'])
        extracted_text = response.choices[0].message.content
        return {'messages': [extracted_text]}

    def extract_attributes_from_text(self,state: AgentState):
        print("Extracting attributes from text")

        prompt = PromptTemplate(template = """You are an AI assistant that extracts structured data from invoices.Extract the following information :
                        invoice number, 
                        invoice date, 
                        vendor information, 
                        line items, 
                        total due, 
                        tax details, 
                        payment terms
                    from Text : {text}. 
                    If any of the field cannot be found , keep it blank rather than putting wrong value.
                    Final output should be json file with above attribute. Line items should be nested json with main json.",input_variables=["text"]""")

        chain = LLMChain(llm=self.model,prompt=prompt)
        response = chain.run(text = state['messages'][-1])
        print(response)

        return {'messages': [response]}

In [5]:
# prompt = """You are an analyst who analyses the invoices raised by the vendors. The invoices can be variour types including images,pdfs,scanned pdfs. \
# By the name of the file, figure out the type of file and return the file type. The answer should be one word e.g. PDF, IMAGE, SCANNED_PDF.etc \
# """

# model=ChatOpenAI(openai_api_key=OPENAI_KEY,model_name = MODEL_NAME,temperature = TEMPERATURE)        
# abot = Agent(model, system=prompt,filepath='./data/Freelancer.pdf')

# human_query_prompt = """Extract the following fields:
#                     invoice number, 
#                     invoice date, 
#                     vendor information, 
#                     line items, 
#                     total due, 
#                     tax details, 
#                     payment terms from the file at below location filepath = {filepath}"""
# formatted_prompt = human_query_prompt.format(filepath=abot.filepath)

# messages = [HumanMessage(content=formatted_prompt)]
# result = abot.graph.invoke({"messages": messages})
# 'INVOICE' not in ['BIOPLEX', 'WE', 'LOVE', 'CHEMISTRY', '5', 'RUE', 'BADER', 'NARBONNE,', 'AUDE,', '11100', 'PHONE:', '+33', '140', '260294', 'INVOICE', 'INVOICE', '#', 'BPXINV-00550', 'DATE:', '23.05.2021', 'TO:', 'ROGER', 'BIGOT', 'BONBONO', '4', 'RUE', 'DES', 'CITES', 'AUBERVILLIERS', ',', 'SEINE', '-SAINT-DENIS,', '93300', 'PHONE:', '+33', '148', '340990', 'SHIP', 'TO:', 'ROGER', 'BIGOT', 'BONBONO', '4', 'RUE', 'DES', 'CITES', 'AUBERVILLIERS', ',', 'SEINE', '-SAINT-DENIS,', '93300', 'PHONE:', '+33', '148', '340990', 'COMMENTS', 'OR', 'SPECIAL', 'INSTRUCTIONS:', 'NONE', 'SALESPERSON', 'P.O.', 'NUMBER', 'TERMS', 'MARIANNE', 'DE', 'LA', 'GUILLAUME', 'BPXPO-00536', 'DUE', 'AFTER', '30', 'DAYS', 'QUANTITY', 'DESCRIPTION', 'UNIT', 'PRICE', 'TOTAL', '10', 'DEXTROMETHORPHAN', 'POLISTIREX', 'BPXPN', '-00057', '12.45', '124.50', '25', 'VENLAFAXINE', 'HYDROCHLORIDE', 'BPXPN', '-00012', '16.00', '400.00', '25', 'METOCLOPRAMIDE', 'HYDROCHLORIDE', '(BPXPO', '-00537)', 'BPXPN', '-00002', '9.99', '249.75', '10', 'AVOBENZONE,', 'OCTINOXATE', '(BPXPO', '-00538)', 'BPXPN', '-00027', '4.45', '44.50', '10', 'VERAPAMIL', 'HYDROCHLORIDE', 'BPXPN', '-00066', '7.89', '78.90', '15', 'TIAGABINE', 'HYDROCHLORIDE', 'BPXPN', '-00017', '10.25', '153.75', '10', 'ZIPRASIDONE', 'HYDROCHLORIDE', '(BPXPO', '-00537)', 'BPXPN', '-00044', '34.99', '349.90', '10', 'RISPERIDONE', 'BPXPN', '-00023', '34.99', '349.90', '10', 'METOPROLOL', 'SUCCINATE', 'BPXPN', '-00067', '34.99', '349.90', '10', 'ACETAMINOPHEN', 'BPXPN', '-00045', '34.99', '349.90', '15', 'SORAFENIB', 'BPXPN', '-00018', '16.00', '240.00', '15', 'TELMISARTAN', 'BPXPN', '-00022', '9.99', '149.85', '15', 'FAMOTIDINE', 'BPXPN', '-00068', '4.45', '66.75', '15', 'METHYLPHENIDATE', 'HYDROCHLORIDE', 'BPXPN', '-00005', '7.89', '118.35', '100', 'IBUPROFEN', '(BPXPO', '-00538)', '0.99', '99.00', '2', 'BPXPN', '-00052', '15', 'METFORMIN', 'HYDROCHLORIDE', '(BPXPO', '-00538)', 'BPXPN', '-00046', '2.15', '32.25', '15', 'AVOBENZONE,', 'OCTISALATE', 'AND', 'OCTOCRYLENE', 'BPXPN', '-00069', '16.99', '254.85', '10', 'CARISOPRODOL', 'BPXPN', '-00070', '34.99', '349.90', '10', 'LOSARTAN', 'POTASSIUM', 'BPXPN', '-00047', '34.99', '349.90', '10', 'PENTAZOCINE', 'HYDROCHLORIDE', 'AND', 'NALOXONE', 'HYDROCHLORIDE', 'BPXPN', '-00051', '34.99', '349.90', '25', 'OMEPRAZOLE', 'BPXPN', '-00071', '9.99', '249.75', '25', 'LOSARTAN', 'POTASSIUM', 'BPXPN', '-00019', '4.45', '111.25', '10', 'SALINE', 'BPXPN', '-00048', '7.89', '78.90', '25', 'TITANIUM', 'DIOXIDE', 'BPXPN', '-00021', '10.25', '256.25', '25', 'BICALUTAMIDE', '(BPXPO', '-00538)', 'BPXPN', '-00049', '2.15', '53.75', '15', 'AMPICILLIN', 'SODIUM', 'BPXPN', '-00050', '16.99', '254.85', '15', 'OCTINOXATE,', 'TITANIUM', 'DIOXIDE,', 'OCTISALATE', 'BPXPN', '-00004', '12.45', '186.75', '25', 'CAVIA', 'PORCELLUS', 'HAIR', 'AND', 'CAVIA', 'PORCELLUS', 'SKIN', 'BPXPN', '-00020', '12.45', '311.25', 'SUBTOTAL', '5964.50', 'SALES', 'TAX', '596.45', 'SHIPPING', '&', 'HANDLING', '50.00', 'TOTAL', 'DUE', '6610.95', 'MAKE', 'ALL', 'CHECKS', 'PAYABLE', 'TO', 'BIOPLEX', 'IF', 'YOU', 'HAVE', 'ANY', 'QUESTIONS', 'CONCERNING', 'THIS', 'INVOICE', ',', 'CONTACT', 'MARIANNE', 'DE', 'LA', 'GUILLAUME', ',', '+33', '140', '260294', ',', 'MARIANNE.GUILLAUME', '@BIOPLEX.FR', 'THANK', 'YOU', 'FOR', 'YOUR', 'BUSINESS!']


In [6]:
directory_path = './data'
count = 0

for filename in os.listdir(directory_path):
    # if (count ==3):
    #     break
    # count+= 1

    file_path = os.path.join(directory_path, filename)
    file_path= file_path.replace("\\", "/")  # Normalize path for consistency
    if os.path.isfile(file_path):
        print(file_path)
    prompt = """You are an analyst who analyses the invoices raised by the vendors. The invoices can be variour types including images,pdfs,scanned pdfs. \
    By the name of the file, figure out the type of file and return the file type. The answer should be one word e.g. PDF, IMAGE, SCANNED_PDF.etc \
    """

    model=ChatOpenAI(openai_api_key=OPENAI_KEY,model_name = MODEL_NAME,temperature = TEMPERATURE)        
    abot = Agent(model, system=prompt,filepath=file_path)

    human_query_prompt = """Extract the following fields:
                        invoice number, 
                        invoice date, 
                        vendor information, 
                        line items, 
                        total due, 
                        tax details, 
                        payment terms from the file at below location filepath = {filepath}"""
    formatted_prompt = human_query_prompt.format(filepath=abot.filepath)

    messages = [HumanMessage(content=formatted_prompt)]
    result = abot.graph.invoke({"messages": messages})

./data/bioplex.pdf


  model=ChatOpenAI(openai_api_key=OPENAI_KEY,model_name = MODEL_NAME,temperature = TEMPERATURE)


PDF

in pdf check
PDF
Extracting data from PDF
['BIOPLEX', 'WE', 'LOVE', 'CHEMISTRY', '5', 'RUE', 'BADER', 'NARBONNE,', 'AUDE,', '11100', 'PHONE:', '+33', '140', '260294', 'INVOICE', 'INVOICE', '#', 'BPXINV-00550', 'DATE:', '23.05.2021', 'TO:', 'ROGER', 'BIGOT', 'BONBONO', '4', 'RUE', 'DES', 'CITES', 'AUBERVILLIERS', ',', 'SEINE', '-SAINT-DENIS,', '93300', 'PHONE:', '+33', '148', '340990', 'SHIP', 'TO:', 'ROGER', 'BIGOT', 'BONBONO', '4', 'RUE', 'DES', 'CITES', 'AUBERVILLIERS', ',', 'SEINE', '-SAINT-DENIS,', '93300', 'PHONE:', '+33', '148', '340990', 'COMMENTS', 'OR', 'SPECIAL', 'INSTRUCTIONS:', 'NONE', 'SALESPERSON', 'P.O.', 'NUMBER', 'TERMS', 'MARIANNE', 'DE', 'LA', 'GUILLAUME', 'BPXPO-00536', 'DUE', 'AFTER', '30', 'DAYS', 'QUANTITY', 'DESCRIPTION', 'UNIT', 'PRICE', 'TOTAL', '10', 'DEXTROMETHORPHAN', 'POLISTIREX', 'BPXPN', '-00057', '12.45', '124.50', '25', 'VENLAFAXINE', 'HYDROCHLORIDE', 'BPXPN', '-00012', '16.00', '400.00', '25', 'METOCLOPRAMIDE', 'HYDROCHLORIDE', '(BPXPO', '-00537)

  chain = LLMChain(llm=self.model,prompt=prompt)
  response = chain.run(text = state['messages'][-1])


{
"invoice_number": "BPXINV-00550",
"invoice_date": "23.05.2021",
"vendor_information": {
    "name": "Bioplex",
    "address": "5 Rue Bader, Narbonne, Aude, 11100",
    "phone": "+33 140 260294",
    "contact_person": "Marianne de la Guillaume",
    "contact_email": "marianne.guillaume@bioplex.fr"
},
"line_items": [
    {"quantity": 10, "description": "Dextromethorphan polistirex", "unit_price": 12.45, "total": 124.50},
    {"quantity": 25, "description": "Venlafaxine Hydrochloride", "unit_price": 16.00, "total": 400.00},
    {"quantity": 25, "description": "Metoclopramide Hydrochloride", "unit_price": 9.99, "total": 249.75},
    {"quantity": 10, "description": "Avobenzone, octinoxate", "unit_price": 4.45, "total": 44.50},
    {"quantity": 10, "description": "Verapamil hydrochloride", "unit_price": 7.89, "total": 78.90},
    {"quantity": 15, "description": "Tiagabine hydrochloride", "unit_price": 10.25, "total": 153.75},
    {"quantity": 10, "description": "Ziprasidone hydrochloride",