In [None]:
! pip -q install ollama-ocr
! pip -q install autogen

In [2]:
from autogen import AssistantAgent, UserProxyAgent
from autogen import register_function
from ollama_ocr import OCRProcessor

In [3]:
def doc_parser(file_path: str) -> str:
    """Run OCR on the document at ``file_path`` and return the processor's result.

    A new ``OCRProcessor`` using the ``granite3.2-vision`` model is created on
    every call; the file is processed with ``format_type="text"`` and
    ``language="eng"``.

    Args:
        file_path: Path of the document to process.

    Returns:
        Whatever ``OCRProcessor.process_file`` returns for the file
        (annotated as ``str``).
    """
    processor = OCRProcessor(model_name='granite3.2-vision')
    ocr_options = {"format_type": "text", "language": "eng"}
    return processor.process_file(input_file_path=file_path, **ocr_options)


In [4]:
# Single model endpoint handed to the agents. The localhost URL and the
# "ollama" key suggest a local Ollama server -- confirm for your setup.
config_list = [
    dict(
        model="llama3.2",
        base_url="http://localhost:11434/v1",
        api_key="ollama",
    )
]

# Bundle the endpoint list with a fixed cache seed of 42.
llm_config = dict(config_list=config_list, cache_seed=42)

In [5]:
def _is_termination_msg(msg):
    """Return True when the message has non-None content containing "TERMINATE"."""
    content = msg.get("content")
    if content is None:
        return False
    return "TERMINATE" in content


# Proxy for the human side of the chat: no LLM attached, human input mode
# "NEVER", and code execution disabled.
user = UserProxyAgent(
    name="human",
    human_input_mode="NEVER",
    is_termination_msg=_is_termination_msg,
    llm_config=False,
    code_execution_config=False,
)

In [6]:
# LLM-backed agent that is instructed to call the 'doc_parser' tool, summarize
# the extracted text, and reply 'TERMINATE' when done.
# Adjacent string literals are joined with no separator, so every sentence
# except the last must end with an explicit trailing space.
assistant = AssistantAgent(
    name="OCR_Agent",
    system_message=(
        "You are an expert OCR assistant. "
        "Your primary task is to extract text from documents using the 'doc_parser' tool. "
        "You should call the 'doc_parser' tool with the correct file path. "
        "Once you have extracted the text, summarize the document in no more than 50 words. "
        "Return 'TERMINATE' when the task is done."
    ),
    llm_config=llm_config,
    code_execution_config=False,
)

In [7]:
# Register doc_parser as a tool: `assistant` is the caller that suggests the
# tool call and `user` is the executor that runs it.
register_function(
    doc_parser,
    name="doc_parser",
    description="Extract text from a document and returns complete extracted text.",
    caller=assistant,
    executor=user,
)

In [None]:
# Start the conversation from the user proxy. Left as a bare expression so the
# cell still displays the call's return value.
user.initiate_chat(
    recipient=assistant,
    message="Hello, I have a document that I need help extracting text from 'panel_ui.pdf' ",
)

[33mhuman[0m (to OCR_Agent):

Hello, I have a document that I need help extracting text from 'panel_ui.pdf' 

--------------------------------------------------------------------------------
[33mOCR_Agent[0m (to human):


[32m***** Suggested tool call (call_abf74q3z): doc_parser *****[0m
Arguments: 
{"file_path":"panel_ui.pdf"}
[32m***********************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING FUNCTION doc_parser...
Call ID: call_abf74q3z
Input arguments: {'file_path': 'panel_ui.pdf'}[0m
No. of pages in the PDF 1
Using default prompt: Extract all visible text from this image in eng **without any changes**.
                                - **Do not summarize, paraphrase, or infer missing text.**
                                - Retain all spacing, punctuation, and formatting exactly as in the image.
                                - If text is unclear or partially visibl