In [1]:
!pip3 install -q "giskard[llm]>=2.0.0b" --upgrade

In [2]:
!pip3 install -q langchain faiss-cpu pypdf openai tiktoken langchain-openai langchain_chroma

In [3]:
!pip3 install PYPDF2



In [4]:
from dotenv import load_dotenv
load_dotenv()

True

In [33]:
import os
from pathlib import Path

import openai
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.chains.base import Chain
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA, load_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
import PyPDF2

from giskard import Dataset, Model, scan, GiskardClient


In [34]:
template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""


In [35]:
def read_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = reader.pages
        text = ''
        for page_num in num_pages:
            text += page_num.extract_text()
    return text

def chunk(document):
  text_splitter = RecursiveCharacterTextSplitter(
    separators=[".", "\n"],
    chunk_size=1024,
    chunk_overlap=200,
    length_function=len,
  )
  splits = text_splitter.create_documents([document])
  return splits

docs = read_pdf("./BIM.pdf")
splits = chunk(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(),persist_directory="./chroma_db")

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [8]:
LLM_NAME="gpt-3.5-turbo"
TEXT_COULUMN_QUERY="query"

In [9]:
from langchain.prompts import ChatPromptTemplate

llm = ChatOpenAI(temperature=0)
prompt = ChatPromptTemplate.from_template(template)
base_retriever = vectorstore.as_retriever()

In [10]:
from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

In [23]:
class RAGModel(Model):
    def model_predict(self, df: pd.DataFrame) -> pd.DataFrame:
        return df[TEXT_COULUMN_QUERY].apply(lambda x: self.model.invoke({"question": x}))

    def save_model(self, path: str):
        out_dest = Path(path)
        # Save the chain object
        self.model.save(out_dest.joinpath("model.json"))

        # Save the FAISS-based retriever
        db = self.model.retriever.vectorstore
        db.save_local(out_dest.joinpath("chroma"))

    @classmethod
    def load_model(cls, path: str) -> Chain:
        src = Path(path)

        db = Chroma(persist_directory=src.joinpath("chroma"), embedding_function=OpenAIEmbeddings())

        chain = load_chain(src.joinpath("model.json"), retriever=db.as_retriever())
        return chain


In [24]:
giskard_model = RAGModel(
    model=retrieval_augmented_qa_chain,  # A prediction function that encapsulates all the data pre-processing steps and that could be executed with the dataset used by the scan.
    model_type="text_generation",  # Either regression, classification or text_generation.
    name="Question Answering",  # Optional.
    description="This model answers any question about BIM",  # Is used to generate prompts during the scan.
    feature_names=[TEXT_COULUMN_QUERY]  # Default: all columns of your dataset.
)

# Optional: Wrap a dataframe of sample input prompts to validate the model wrapping and to narrow specific tests' queries.
giskard_dataset = Dataset(pd.DataFrame({
    TEXT_COULUMN_QUERY: [
        "What is BIM?",
        "What are different maturity levels of BIM"
    ]
}))


2024-06-19 09:40:34,312 pid:40819 MainThread giskard.datasets.base INFO     Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.




In [25]:
print(giskard_model.predict(giskard_dataset).prediction)

2024-06-19 09:40:35,054 pid:40819 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'query': 'object'} to {'query': 'object'}
2024-06-19 09:40:37,568 pid:40819 MainThread giskard.utils.logging_utils INFO     Predicted dataset with shape (2, 1) executed in 0:00:02.518350
[{'response': AIMessage(content='Answer: BIM stands for Building Information Modelling.', response_metadata={'token_usage': {'completion_tokens': 11, 'prompt_tokens': 887, 'total_tokens': 898}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-8788e251-7a16-42d1-a31f-3c69b66ce83e-0', usage_metadata={'input_tokens': 887, 'output_tokens': 11, 'total_tokens': 898}), 'context': [Document(page_content='. SNÆBJÖRNSSON, \nI. KJARTANSDOTTIR, P. NOWAK) \n \nThis manual is about a new approach to design, construction, and facility \nmanagement called building information modelling (BIM). It provides an \ninsight into BIM technologies, the busines

In [26]:
results = scan(giskard_model, giskard_dataset, only="hallucination")


🔎 Running scan…
Estimated calls to your model: ~30
Estimated LLM calls for evaluation: 22

2024-06-19 09:40:58,299 pid:40819 MainThread giskard.scanner.logger INFO     Running detectors: ['LLMImplausibleOutputDetector', 'LLMBasicSycophancyDetector']
Running detector LLMImplausibleOutputDetector…
2024-06-19 09:41:09,308 pid:40819 MainThread httpx        INFO     HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-06-19 09:41:09,326 pid:40819 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'query': 'object'} to {'query': 'object'}
2024-06-19 09:41:09,718 pid:40819 ThreadPoolExecutor-8_0 httpx        INFO     HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 09:41:10,742 pid:40819 ThreadPoolExecutor-10_0 httpx        INFO     HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-06-19 09:41:11,560 pid:40819 ThreadPoolExecutor-11_0 httpx        INFO     HTTP Request:

In [27]:
display(results)

In [28]:
test_suite = results.generate_test_suite("Test suite generated by scan")
test_suite.run()

2024-06-19 09:46:18,240 pid:40819 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'query': 'object'} to {'query': 'object'}
2024-06-19 09:46:18,242 pid:40819 MainThread giskard.utils.logging_utils INFO     Predicted dataset with shape (10, 1) executed in 0:00:00.008270
Executed 'Output plausibility' with arguments {'model': <__main__.RAGModel object at 0x70051df16c80>, 'dataset': <giskard.datasets.base.Dataset object at 0x70051df47eb0>}: 
               Test failed
               Metric: 8
               
               
2024-06-19 09:46:45,835 pid:40819 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'query': 'object'} to {'query': 'object'}
2024-06-19 09:46:45,840 pid:40819 MainThread giskard.utils.logging_utils INFO     Predicted dataset with shape (10, 1) executed in 0:00:00.021899
2024-06-19 09:46:45,853 pid:40819 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'query': 'object'} to {'query': 'object'}
2

In [29]:
full_results = scan(giskard_model, giskard_dataset)

🔎 Running scan…
Estimated calls to your model: ~365
Estimated LLM calls for evaluation: 148

2024-06-19 09:51:56,197 pid:40819 MainThread giskard.scanner.logger INFO     Running detectors: ['LLMBasicSycophancyDetector', 'LLMCharsInjectionDetector', 'LLMHarmfulContentDetector', 'LLMImplausibleOutputDetector', 'LLMInformationDisclosureDetector', 'LLMOutputFormattingDetector', 'LLMPromptInjectionDetector', 'LLMStereotypesDetector', 'LLMFaithfulnessDetector']
Running detector LLMBasicSycophancyDetector…
2024-06-19 09:52:22,301 pid:40819 MainThread httpx        INFO     HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-06-19 09:52:22,312 pid:40819 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'query': 'object'} to {'query': 'object'}
2024-06-19 09:52:22,711 pid:40819 ThreadPoolExecutor-98_0 httpx        INFO     HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 09:52:24,059 pid:40819 ThreadPool

Downloading builder script: 100%|█████████████████████| 7.95k/7.95k [00:00<00:00, 18.4MB/s]


2024-06-19 09:54:32,202 pid:40819 MainThread matplotlib.font_manager INFO     Failed to extract font properties from /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)
2024-06-19 09:54:32,378 pid:40819 MainThread matplotlib.font_manager INFO     generated new fontManager
2024-06-19 09:55:20,485 pid:40819 MainThread giskard.scanner.logger INFO     LLMCharsInjectionDetector: Tested `query` for special char injection `\r`	Fail rate = 0.000	Vulnerable = False
2024-06-19 09:55:20,510 pid:40819 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'query': 'object'} to {'query': 'object'}
2024-06-19 09:55:21,097 pid:40819 ThreadPoolExecutor-180_0 httpx        INFO     HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 09:55:22,119 pid:40819 ThreadPoolExecutor-182_0 httpx        INFO     HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 400 Bad Reques





In [30]:
display(full_results)

In [32]:
## Generation Metrics

## ROGUE evaluates how good a machine is at summarization by considering common words and sequences between the llm and reference summaries

## BLEU evaluates how the generated text matches the reference it mainly considers the precision of word overlap and could score the summary a
## bit lower because it is only considering the word overlap

## METEOR is more comprehensive as it considers more factors like stemming and synonyms and it considers the overall quality of the sentence
## Might give a higher score because its more flexible when considering the synonyms in an example 