In [19]:
# One-time environment setup (run from a shell before starting the kernel):
#   pip install langchain duckduckgo-search
#   pip install langchain-google-community
#   pip install pip-system-certs
#   pip install -qU duckduckgo-search langchain-community
import os
import warnings

# Working directory at the time this cell runs; the download tool stores the
# datasheet folder beneath it.
cwd_path = os.getcwd()

# Suppress all warnings emitted after this point.
warnings.filterwarnings('ignore')

In [70]:
import streamlit as st
from openai import AzureOpenAI
from langchain.prompts import PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain import tools
from langchain_core.tools import tool, Tool
import langchain_community
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import AIMessage
from langchain.agents import AgentExecutor, create_tool_calling_agent, initialize_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
import requests
# from langchain_community.document_loaders import WebBaseLoader, OnlinePDFLoader
# from langchain_community.tools import DuckDuckGoSearchRun
from googlesearch import search
from PyPDF2 import PdfReader
from pathlib import Path
import random

In [71]:
# Azure OpenAI connection settings come from environment variables, so no
# credential is stored in the notebook. A missing variable raises KeyError.
ENDPOINT = os.environ['OPENAI_ENDPOINT']
API_KEY = os.environ['OPENAI_API_KEY']
AZURE_DEPLOYMENT = os.environ['AZURE_DEPLOYMENT']
AZURE_OPENAI_VERSION = os.environ['AZURE_OPENAI_VERSION']

# Chat model shared by the agent defined further down.
llm = AzureChatOpenAI(
    azure_deployment=AZURE_DEPLOYMENT,
    azure_endpoint=ENDPOINT,
    openai_api_version=AZURE_OPENAI_VERSION,
    openai_api_key=API_KEY,
)

In [78]:
@tool
def SearchWebForPdf(query):
    """ Search for pdf dataset in the web. Return list of urls of pdf documents. The input
    to the tool will be a query text to perform the search of the datasheet. For example, if
    query is 'lm741 datasheet pdf' then you need to return all the links of the search results
    in a format of list. For example, if there are 3 search results of Link1, Link2 and Link3, 
    then this function should return [Link1, Link2, Link3]
    """
    # The docstring above is left verbatim: LangChain's `@tool` hands it to the
    # model as the tool description, so its wording is part of runtime behaviour.
    # `search` yields result URLs lazily; materialise them so a plain list is returned.
    hits = search(query, tld="co.in", num=2, stop=2, pause=2)
    return list(hits)

@tool
def DownloadFileTool(search_list):
    """Downloads pdf files from URLs and saves them to a local folder. The input
    to the tool will be a list of links. All the links will be checked one by one if the link
    contains pdf file or not. If the link type is pdf, then it will download the pdf
    file into the 'Datasheet_Folder' folder under the current working directory.

    Args:
        search_list (list): URLs to try, in order. A single URL string is also accepted.

    Returns:
        str: Path of the folder that holds the downloaded PDFs, or a message
        starting with 'Error:' when none of the URLs yielded a PDF.
    """
    # A lone URL string would otherwise be iterated character by character.
    if isinstance(search_list, str):
        search_list = [search_list]

    # Built with os.path.join instead of "\\"-concatenation so the path is valid
    # on every platform and contains no invalid escape sequences.
    dwnld_path = os.path.join(cwd_path, "Datasheet_Folder")
    saved_files = 0

    for url in search_list or []:
        try:
            # The context manager releases the streamed connection even when
            # the URL is rejected; the timeout keeps a dead host from hanging the agent.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()  # Raise an exception for bad status codes

                # Check if the content type is PDF
                if 'application/pdf' not in response.headers.get('content-type', '').lower():
                    print("Error: URL does not point to a PDF file.")
                    continue

                os.makedirs(dwnld_path, exist_ok=True)
                file_name = os.path.join(dwnld_path, f"datasheet_pdf_{saved_files + 1}.pdf")
                with open(file_name, 'wb') as pdf_file:
                    for chunk in response.iter_content(chunk_size=8192):
                        pdf_file.write(chunk)
            print("File downloaded successfully")
            saved_files += 1
        except (requests.exceptions.RequestException, OSError) as e:
            # Network failures and local write failures both skip this URL only.
            print(f"Error downloading file: {e}")

    if saved_files == 0:
        print("No pdf file is found downloadable")
        # Returned (not raised) so the agent receives a readable tool result.
        return "Error: no downloadable PDF file was found"

    print("pdf file(s) saved successfully")
    return dwnld_path
# Get the pdf file list
@tool
def get_pdf_files(folder_path):
    """
    Retrieves a list of PDF file paths from a specified folder.

    Args:
        folder_path (str): The path to the folder to search.

    Returns:
        list: A list of Path objects representing the PDF files found.
    """
    # Docstring kept verbatim: `@tool` exposes it to the model as the tool description.
    # The "*.pdf" pattern is not recursive, so sub-folders are not searched.
    return list(Path(folder_path).glob("*.pdf"))

@tool
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    # Best effort: on any failure the error is printed and whatever text was
    # gathered up to that point is still returned.
    collected = ""
    try:
        with open(pdf_path, 'rb') as pdf_stream:
            for page in PdfReader(pdf_stream).pages:
                collected += page.extract_text()
    except Exception as err:
        print(f"Error extracting text: {err}")
    return collected

@tool
def classify_document_with_openai(pdf_path, text_content, microchip_name):
    """Classifies the document using Azure OpenAI.

    Asks the Azure OpenAI deployment whether the supplied text is a datasheet
    for the given microchip, prints the verdict and returns it.

    Args:
        pdf_path: Path of the PDF the text was extracted from. Only its file
            name is used, to label the verdict message.
        text_content (str): Text extracted from the PDF. Only the first 8000
            characters are sent to the model.
        microchip_name (str): Name of the microchip the datasheet should describe.

    Returns:
        str: The verdict message that was printed.
    """
    client = AzureOpenAI(
        api_key=API_KEY,
        azure_endpoint=ENDPOINT,
        api_version=AZURE_OPENAI_VERSION
    )

    # Truncate very large documents. The note lives out here because a '#'
    # inside the f-string is not a comment and would be sent to the model.
    document_excerpt = (text_content or "")[:8000]

    prompt = f"""Analyze the following document text and determine if it is a datasheet for the microchip '{microchip_name}'.
    A microchip datasheet typically includes details like:
    - Electrical characteristics (voltage, current, frequency)
    - Pin configurations and descriptions
    - Operating conditions and environmental ratings
    - Package dimensions and types
    - Block diagrams or functional diagrams
    - Part numbers and ordering information
    - Application notes or typical usage circuits.

    Based on these criteria, is this document a microchip datasheet? Respond with 'YES' or 'NO' and a brief explanation.

    Document Text:
    {document_excerpt}
    """

    response = client.chat.completions.create(
        model=AZURE_DEPLOYMENT,
        messages=[
            {"role": "system", "content": "You are an expert document classifier."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150
    )
    # `content` can be None, so fall back to an empty answer.
    answer = response.choices[0].message.content or ""
    # Path(...).name copes with the platform's separator; split('/') did not
    # extract the file name from Windows-style paths.
    file_name = Path(str(pdf_path)).name

    # The prompt asks for a reply that starts with YES or NO. Read the first
    # word with markdown/punctuation stripped, and fall back to the bold
    # markers that used to be the only accepted form.
    words = answer.split()
    leading_word = words[0].strip("*_`'\"#.,:;!()[]-").upper() if words else ""
    if leading_word in ("YES", "NO"):
        is_datasheet = leading_word == "YES"
    elif '**YES**' in answer:
        is_datasheet = True
    elif '**NO**' in answer:
        is_datasheet = False
    else:
        is_datasheet = None

    if is_datasheet is None:
        verdict = f"Could not determine whether {file_name} is the datasheet: {answer}"
    elif is_datasheet:
        verdict = f"{file_name} is the datasheet"
    else:
        verdict = f"{file_name} is not the datasheet"

    print(verdict)
    # Returned as well as printed so the calling agent receives the result.
    return verdict

In [None]:
# Tools the agent may call. Note that this name rebinds the `tools` module
# imported from langchain in the import cell.
tools = [SearchWebForPdf,DownloadFileTool,get_pdf_files,extract_text_from_pdf,classify_document_with_openai]
# NOTE(review): the list order is randomised and no seed is set, so it differs
# between runs. Presumably this checks that the agent does not rely on tool
# order; confirm it is intentional.
random.shuffle(tools)

# System prompt describing the intended workflow and the role of each tool.
system_message = """
You are a web search and datasheet validation agent. Your job is to search and download datasheets in pdf for a specified
components and then read them from a folder and validate how many of them are valid datasheet. You have five helper functions 
'SearchWebForPdf', 'DownloadFileTool', 'get_pdf_files', 'extract_text_from_pdf' and 'classify_document_with_openai'. 
'SearchWebForPdf' will help you to get the all list of urls related to the human query. 'DownloadFileTool' will help you 
to download all valid pdfs from the list one by one and store in local disk in a specified location. 'get_pdf_files' will 
help to get the list of all pdf files from the targeted folder. 'extract_text_from_pdf' will help to get the content of 
pdf file into text format. Finally, 'classify_document_with_openai' will help to decide if a pdf file is the datasheet or not.
While calling 'classify_document_with_openai', you need to pass the microchip name from the human query message as one of the
parameters.
"""

# `{input}` is filled with the user's request as text; `agent_scratchpad`
# receives the agent's intermediate tool calls and their results.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])

agent = create_tool_calling_agent(llm, tools, prompt)

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)


human_query = '''
can you do google search and download datasheet of LF260 and verify if the downloaded file(s) is the datasheet of LF260? 
Do not print anything on the console unless printed by agent. 
'''
# Message-list form of the query. It is retained as a module-level name but is
# not what the executor receives (see below).
messages = [
    ("user", human_query)
]

# The prompt's "{input}" slot is a string template. Passing the list above
# would insert the repr of a list of tuples into the user message, so the raw
# query text is passed instead.
result = agent_executor.invoke({"input": human_query})

Test Functions

In [61]:
# Folder the download tool writes to ('Datasheet_Folder' under the working
# directory). Defined here so the cell does not depend on a variable left over
# from an earlier kernel session.
dwld_path = os.path.join(cwd_path, "Datasheet_Folder")

# The helpers are LangChain tools, so they are run through `.invoke` with a
# dict of named arguments instead of being called like plain functions.
pdf_list = get_pdf_files.invoke({"folder_path": dwld_path})
for element in pdf_list:
    file_content = extract_text_from_pdf.invoke({"pdf_path": element})
    classify_document_with_openai.invoke(
        {"pdf_path": element, "text_content": file_content, "microchip_name": 'LCO 7A chip'}
    )

datasheet_pdf_1.pdf is not the datasheet
datasheet_pdf_2.pdf is not the datasheet
datasheet_pdf_3.pdf is not the datasheet
datasheet_pdf_4.pdf is not the datasheet
datasheet_pdf_5.pdf is not the datasheet
datasheet_pdf_6.pdf is not the datasheet
datasheet_pdf_7.pdf is not the datasheet
datasheet_pdf_8.pdf is not the datasheet


In [45]:
# NOTE(review): `pdf_file_exmple` is not defined in any visible cell. It is
# assumed to hold the path of one downloaded PDF; confirm, or define it first.
# The classifier takes the path, the extracted text and the chip name, so the
# text is extracted here before classifying.
example_text = extract_text_from_pdf.invoke({"pdf_path": pdf_file_exmple})
response = classify_document_with_openai.invoke(
    {"pdf_path": pdf_file_exmple, "text_content": example_text, "microchip_name": 'LCO 7A chip'}
)