In [8]:
from langchain_community.llms import Ollama
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import giskard
from openai import OpenAI
from giskard.llm.client.openai import OpenAIClient
from giskard.llm.embeddings.openai import OpenAIEmbedding
from giskard.rag import KnowledgeBase, QATestset, generate_testset

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
_client = OpenAI(base_url="http://localhost:11434/v1/", api_key="ollama")

llm_client = OpenAIClient(model="llama3", client=_client)
embed_client = OpenAIEmbedding(model="nomic-embed-text", client=_client)

giskard.llm.set_default_client(llm_client)
giskard.llm.embeddings.set_default_embedding(embed_client)

In [3]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=20,
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
)
loader = PyPDFLoader('test/chap1/chap1.pdf')
docs = loader.load()
all_splits = text_splitter.split_documents(docs)

In [4]:
df = pd.DataFrame([d.page_content for d in docs], columns=["text"])
df.head(10)

Unnamed: 0,text
0,
1,Prescribed by National Curriculum and Textboo...
2,Published by\nNational Curriculum and Textbook...
3,PREFACE\nEducation is the pre-requisite for th...
4,One PHYSICAL QUANTITIES AND MEASUREMENT 1-25\n...
5,Physics 1 \nChapter one \nPHYSICAL QUANTITIES...
6,2 Physics \n1.1 Physics \nThe branch of scie...
7,Physics 3 \nthis time West European civilizat...
8,4 Physics \ndetermination of relations among ...
9,Physics 5 \nbomb and nuclear reactor are inve...


In [5]:
knowledge_base = KnowledgeBase(df)

In [7]:
testset = generate_testset(
    knowledge_base,
    num_questions=5,
    agent_description="A chatbot answering questions from physics textbook",
)

2024-10-29 00:32:27,749 pid:50069 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-10-29 00:37:37,091 pid:50069 MainThread giskard.rag  INFO     Found 3 topics in the knowledge base.


Generating questions:   0%|          | 0/5 [00:00<?, ?it/s]

2024-10-29 00:42:43,816 pid:50069 MainThread giskard.rag  ERROR    Encountered error in question generation: Expecting value: line 1 column 1 (char 0). Skipping.
2024-10-29 00:42:43,817 pid:50069 MainThread giskard.rag  ERROR    Expecting value: line 1 column 1 (char 0)
Traceback (most recent call last):
  File "/home/tamim-ishrak/.config/jupyterlab-desktop/jlab_server/lib/python3.12/site-packages/giskard/rag/question_generators/base.py", line 57, in generate_questions
    yield self.generate_single_question(knowledge_base, *args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tamim-ishrak/.config/jupyterlab-desktop/jlab_server/lib/python3.12/site-packages/giskard/rag/question_generators/simple_questions.py", line 96, in generate_single_question
    generated_qa = self._llm_complete(messages=messages)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tamim-ishrak/.config/jupyterlab-desktop/jlab_server/lib/python3.12

In [9]:
testset.save("testset_chapter1.jsonl")

loaded_testset = QATestset.load("testset_chapter1.jsonl")

In [10]:
df_chap1 = loaded_testset.to_pandas()

In [11]:
df_chap1.to_csv('chapter1_test.csv')

In [12]:
df_chap1.shape

(3, 5)

In [15]:
df_chap1["reference_answer"].to_list()

['Max Planck discovered the quantum theory of radiation.',
 'The standard value of `g` at sea level altitude 45 ° is accepted as 9.80665ms-2, and taken to be 9.8ms-2 or 9.81ms-2 for convenience.',
 'The unit of speed is meter/second (ms-1) and average speed is calculated as total distance divided by time.']