In [1]:
import PyPDF2

In [2]:
!wget "https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_LongerReport.pdf" -O "ipcc_report.pdf"

--2024-08-21 13:12:30--  https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_LongerReport.pdf
Resolving www.ipcc.ch (www.ipcc.ch)... 2606:4700:8de4:eefd:38fd:71:6814:fe03, 104.20.255.3, 104.20.254.3, ...
Connecting to www.ipcc.ch (www.ipcc.ch)|2606:4700:8de4:eefd:38fd:71:6814:fe03|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5218936 (5.0M) [application/pdf]
Saving to: ‘ipcc_report.pdf’


2024-08-21 13:12:50 (328 KB/s) - ‘ipcc_report.pdf’ saved [5218936/5218936]



In [3]:
text=""
with open("./ipcc_report.pdf", 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    num_pages = len(reader.pages)
    for page_num in range(num_pages):
        text += reader.pages[page_num].extract_text()

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=200, separators=[".", "\n"], length_function=len)

In [8]:
chunks = text_splitter.create_documents([text])


In [18]:
from langchain_chroma import Chroma
from langchain_google_vertexai import VertexAI
from langchain_community.embeddings import VertexAIEmbeddings
from langchain_google_vertexai import VertexAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)


In [10]:
Chroma.from_documents(documents=chunks, embedding=VertexAIEmbeddings(),persist_directory="./db")

  warn_deprecated(
Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


<langchain_chroma.vectorstores.Chroma at 0x7d1cdcd6f4c0>

In [12]:
retriever = Chroma(persist_directory="./db", embedding_function=VertexAIEmbeddings()).as_retriever()

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


In [19]:
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


In [40]:
safety_settings: [
    {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_NONE"
    },
    {
      "category": "HARM_CATEGORY_HATE_SPEECH",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    },
    {
      "category": "HARM_CATEGORY_HARASSMENT",
      "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    },
    {
      "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
      "threshold": "BLOCK_ONLY_HIGH"
    }
  ]


In [43]:
session_id = "xyz300"

CHAT_MODEL=VertexAI(model_name="gemini-pro")


history_aware_retriever = create_history_aware_retriever(
    CHAT_MODEL, retriever, contextualize_q_prompt
)

question_answer_chain = create_stuff_documents_chain(CHAT_MODEL, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

store = {}
store[session_id] = ChatMessageHistory()

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)



In [21]:
conversational_rag_chain.invoke(
    {"input": "What was the global surface temperature"},
    config={
        "configurable": {"session_id": session_id}
    })["answer"]

'The global surface temperature has increased by 1.1°C since the pre-industrial period (1850-1900). This warming is primarily due to human activities, such as the burning of fossil fuels. The most recent decade (2011-2020) was around 1.1°C warmer than 1850-1900. This is the highest average global surface temperature for at least 125,000 years. '

In [24]:
from giskard.rag import KnowledgeBase, generate_testset, QATestset
import pandas as pd

In [49]:
import os
import giskard

import google.generativeai as genai

from giskard.llm.client.gemini import GeminiClient

genai.configure(api_key="api key")

giskard.llm.set_default_client(GeminiClient(model="gemini-pro"))

In [48]:
GeminiClient?

[0;31mInit signature:[0m [0mGeminiClient[0m[0;34m([0m[0mmodel[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'gemini-pro'[0m[0;34m,[0m [0m_client[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Helper class that provides a standard way to create an ABC using
inheritance.
[0;31mFile:[0m           ~/source/repos/BIMPoC/pythonenv/lib/python3.10/site-packages/giskard/llm/client/gemini.py
[0;31mType:[0m           ABCMeta
[0;31mSubclasses:[0m     

In [50]:
knowledge_base_df = pd.DataFrame([node.page_content for node in chunks], columns=["text"])

In [51]:
knowledge_base = KnowledgeBase(knowledge_base_df,embedding_model=VertexAIEmbeddings())



In [52]:
testset = generate_testset(knowledge_base,
                           num_questions=120,
                           agent_description="A chatbot answering questions about the IPCC report")


2024-08-21 14:05:12,393 pid:182292 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-08-21 14:05:48,315 pid:182292 MainThread giskard.rag  INFO     Found 9 topics in the knowledge base.


Generating questions:   0%|          | 0/120 [00:00<?, ?it/s]

2024-08-21 14:05:48,646 pid:182292 MainThread giskard.rag  ERROR    Encountered error in question generation: 400 Please use a valid role: user, model.. Skipping.
2024-08-21 14:05:48,647 pid:182292 MainThread giskard.rag  ERROR    400 Please use a valid role: user, model.
Traceback (most recent call last):
  File "/home/debian/source/repos/BIMPoC/pythonenv/lib/python3.10/site-packages/giskard/rag/question_generators/base.py", line 57, in generate_questions
    yield self.generate_single_question(knowledge_base, *args, **kwargs)
  File "/home/debian/source/repos/BIMPoC/pythonenv/lib/python3.10/site-packages/giskard/rag/question_generators/simple_questions.py", line 96, in generate_single_question
    generated_qa = self._llm_complete(messages=messages)
  File "/home/debian/source/repos/BIMPoC/pythonenv/lib/python3.10/site-packages/giskard/rag/question_generators/base.py", line 42, in _llm_complete
    out = self._llm_client.complete(
  File "/home/debian/source/repos/BIMPoC/pythonenv/li

KeyError: "None of ['id'] are in the columns"

In [53]:
testset = QATestset.load("ipcc_testset.jsonl")


In [55]:
testset.to_pandas().head(5)


Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
450623f7-e644-4bfa-88d5-90f31dd15d99,What are the consequences of global warming ex...,Climate resilient development will not be poss...,Document 196: Accelerated and equitable mitiga...,[],"{'question_type': 'simple', 'seed_document_id'..."
79f98d3d-766b-4cbf-800f-03e87966e3e5,What is the projected decline in coral reefs w...,Coral reefs are projected to decline by a furt...,Document 123: 71\nLong-Term Climate and Develo...,[],"{'question_type': 'simple', 'seed_document_id'..."
1ee224a2-62af-4877-b172-baec006512e6,What is the expected uncertainty range in the ...,The uncertainty in the total potential is typi...,Document 251: Where a gradual colour transitio...,[],"{'question_type': 'simple', 'seed_document_id'..."
16264bd2-510a-4368-a9d6-0a5fef7feb65,What is the effect of increasing cumulative ne...,The proportion of emissions taken up by land a...,Document 166: While \nnatural land and ocean c...,[],"{'question_type': 'simple', 'seed_document_id'..."
c31c6857-c505-45ef-98e5-aa524c4b05e7,What does hatching represent on the maps depic...,Hatching indicates areas where less than 70% o...,Document 135: Interquartile ranges of WGLs by ...,[],"{'question_type': 'simple', 'seed_document_id'..."


In [56]:
from giskard.rag import evaluate, RAGReport
from giskard.rag.metrics.ragas_metrics import ragas_context_recall, ragas_context_precision


In [57]:
def answer_fn(question, history=None):
    if history:
        answer = chat_engine.chat(question, chat_history=[ChatMessage(role=MessageRole.USER if msg["role"] =="user" else MessageRole.ASSISTANT,
                                                          content=msg["content"]) for msg in history])
    else:
        answer = chat_engine.chat(question, chat_history=[])
    return str(answer)

report = evaluate(answer_fn,
                testset=testset,
                knowledge_base=knowledge_base,
                metrics=[ragas_context_recall, ragas_context_precision])


Asking questions to the agent:   0%|          | 0/120 [00:00<?, ?it/s]

NameError: name 'chat_engine' is not defined