In [1]:
!python3.11 -m pip install --upgrade pip
!pip3 install langchain
!pip3 install pypdf
!pip3 install vertexai
!pip3 install chromadb
!pip3 install -U langchain-together
!pip3 install lark
# !pip3 install google-cloud-aiplatform

# !pip3 install OpenAI
# !pip3 install pypdf
# !pip3 install rapidocr-onnxruntime
# !pip3 install sentence-transformers
# !pip3 install chroma
# !pip3 install tiktoken
# !pip3 install llama-cpp-python
# !pip3 install jq

# !pip3 install pdfminer.six
# !pip3 install Cython




In [2]:
import time
import chromadb
from langchain.llms import Together
from langchain.document_loaders import (PyPDFLoader, JSONLoader)
from langchain.text_splitter import CharacterTextSplitter
from langchain_together.embeddings import TogetherEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.agents import AgentType, Tool, initialize_agent
from langchain.retrievers.self_query.base import SelfQueryRetriever



In [3]:

llm = Together(
    together_api_key="73d9504f61ce7f7b552568845901023587a3728c25cc04849d0f0c5e276a5d34", 
    model="mistralai/Mistral-7B-Instruct-v0.2"
    # model="mistralai/Mixtral-8x7B-Instruct-v0.1"
    # model="togethercomputer/Llama-2-7B-32K-Instruct"
    # model="openchat/openchat-3.5-1210"

)

vectorstore = chromadb.PersistentClient()

embeddings = TogetherEmbeddings(
    together_api_key="73d9504f61ce7f7b552568845901023587a3728c25cc04849d0f0c5e276a5d34",
    model="mistralai/Mistral-7B-v0.1"
    # model="togethercomputer/Llama-2-7B-32K-Instruct"
    # model="openchat/openchat-3.5-1210"
)


In [4]:

def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["chapterId"] = record.get("chapterId")
    metadata["chapterTitle"] = record.get("chapterTitle")
    metadata["requirementId"] = record.get("requirementId")
    return metadata


docs = JSONLoader(
    file_path='./standards/reporting-standard.json',
    jq_schema='.requirements[]',
    content_key="requirementText",
    metadata_func=metadata_func
).load()



for doc in docs:
    # esrs_store = Chroma.from_documents(client=vectorstore, collection_name="reporting-standard", documents=[doc], embedding=embeddings)
    esrs_store = Chroma.from_documents(documents=[doc], embedding=embeddings)
    time.sleep(1)

# esrs_retriever = SelfQueryRetriever.from_llm(
#     llm=llm, 
#     vectorstore=esrs_store, 
#     document_contents="One of the requirements.",
#     metadata_field_info=[
#         {"name": "chapterId", "description": "A numerical ID to group all requirements that are part of the same chapter.", "type": "string"},
#         {"name": "chapterTitle", "description": "The title of the chapter that the requirement is part of.", "type": "string"},
#         {"name": "requirementId", "description": "An ID to refer to a requirement. The ID consists of the chapterId followed by a letter.", "type": "string"},
#     ]
# )
esrs_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=esrs_store.as_retriever(search_kwargs={"k": 1}))



In [5]:
# esrs_chain("""
#     You are a helpful assistant who looks up the requested requirements based on the provided requirementId and checks if the provided text meets that requirement.

#     Assess whether the text below meets the requirement with the requirementId 1a.
#     "We will do these three things tomorrow to make the world a better place."
# """)
# esrs_chain(
#     {
#         "query": "Assess whether the text below meets the requirement with the requirementId 1a: 'We will do these three things tomorrow to make the world a better place. This will help the environment'", 
#         "context": "You are a helpful assistant who looks up the requested requirements based on the provided requirementId and checks if the provided text meets that requirement."
#     }
# )

In [6]:
# pdfLoader = PyPDFLoader("./SUSTAINABILITY_REPORT_2022-1-12.pdf")
# pages = pdfLoader.load()


with open("./reports/EcoCorp_test_report.txt", "r") as f:
    text = f.read()
texts = CharacterTextSplitter(
    # separators=["\n\n"],
    chunk_size = 500,
    chunk_overlap  = 20
).split_text(text)

for text in texts:
    report_store = Chroma.from_texts(client=vectorstore, collection_name="ecocorp-report", texts=[text], embedding=embeddings)
    time.sleep(1)

report_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=report_store.as_retriever())


HTTPError: 524 Server Error:  for url: https://api.together.xyz/api/v1/embeddings

In [None]:
tools = [
    Tool(
        name="Custom Palau Reporting Standard",
        func=lambda x: esrs_chain.run({
            "context": "You are a helpful assistant who answers any questions related to the 'Custom Palau Reporting Standard'. The standard consists of a list of requirements that are stored in the vectorstore. You have access to this store via your retriever.", 
            "query": x
        }),
        description="An LLM that has access to the requirements listed in the 'Custom Palau Reporting Standard' document. You can ask it questions about the requirements.",
    ),
    Tool(
        name="EcoCorp report",
        func=lambda x: report_chain.run({"context": "You are a helpful assistant who answers any questions related to the EcoCorp sustainability report.", "query": x}),
        description="An LLM that has access to the EcoCorp sustainability report. You can ask it questions about the contents of the report.",
    ),
]
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)


In [None]:
agent.invoke({
    "input": """
        You are a sustainability consultant and assess the compliance of a piece of text with a provided set of requirements. The piece of text is provided by the "EcoCorp report" Tool which has access to a retriever with the content. The set of requirements is provided to you by the "Custom Palau Reporting Standard" Tool which has access to a retriever with the requirements. 
        You should use the tools below to answer the question posed of you:
        - Custom Palau Reporting Standard
        - EcoCorp report

        Always use the following output format:

        Question: the input question you must answer
        Thought: you should always think about what to do
        Action: the action to take, should be one of [Custom Palau Reporting Standard, EcoCorp report]
        Action Input: the input to the action
        Observation: the result of the action
        ... (this Thought/Action/Action Input/Observation can repeat up to 3 times)
        Thought: I now know the final answer
        Final Answer: the final answer to the original input question

        Begin!
    """
})




[1m> Entering new AgentExecutor chain...[0m


ValueError: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse LLM output: `I should look at the EcoCorp report to see if it meets the requirements of the Custom`

In [None]:
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["chapterTitle"] = record.get("chapterTitle")
    metadata["reference"] = record.get("reference")
    return metadata


data = JSONLoader(
    file_path='./ESRS_E1.json',
    jq_schema='.requirements[]',
    content_key="content",
    metadata_func=metadata_func
).load()

print(data)