In [1]:
!pip install langchain langchain_community langchain_huggingface langchain_openai langgraph langchain_chroma langchain_anthropic langgraph-checkpoint-sqlite pypdf datasets ragas ragatouille

Collecting langchain
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting langgraph
  Downloading langgraph-0.2.37-py3-none-any.whl.metadata (13 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting langchain_anthropic
  Downloading langchain_anthropic-0.2.3-py3-none-any.whl.metadata (2.3 kB)
Collecting langgraph-checkpoint-sqlite
  Downloading langgraph_checkpoint_sqlite-2.0.0-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting ragas
  Down

In [2]:
%cd "/content/drive/MyDrive/Documents/General reference/J/Job/Cathay Pacific/"

/content/drive/MyDrive/Documents/General reference/J/Job/Cathay Pacific


In [3]:
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

True

In [4]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "./general-conditions-of-carriage-for-passengers-baggage-en.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()

print(len(docs))

35


In [5]:
print(docs[0].page_content[0:100])
print(docs[0].metadata)

 
 
 1 
 GENERAL CONDITIONS OF 
CARRIAGE FOR PASSENGERS  
AND BAGGAGE  
 
 
Effective date from 7 Ju
{'source': './general-conditions-of-carriage-for-passengers-baggage-en.pdf', 'page': 0}


In [5]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")

In [7]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=700)
splits = text_splitter.split_documents(docs)
vectorstore = InMemoryVectorStore.from_documents(
    documents=splits, embedding=OpenAIEmbeddings()
)

retriever = vectorstore.as_retriever()

In [8]:
len(splits)

205

In [9]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

results = rag_chain.invoke({"input": "What's the luggage policy of Cathay Pacific?"})

results

{'input': "What's the luggage policy of Cathay Pacific?",
 'context': [Document(id='d1b9ab01-6c08-4d6d-b66a-687d10088e90', metadata={'source': './general-conditions-of-carriage-for-passengers-baggage-en.pdf', 'page': 18}, page_content='free of charge, either into the cabin or having it placed for carriage in the \naircraft hold. Your Baggage allowance will depend on the terms and  \nconditions of your Ticket and is also subject to certain limitations as to the \nnumber of items of luggage, the size and weight.  \nFor your allowance please refer to our dedicated Baggage page for more \ndetails at:  \nhttps://www.cathaypacific.com/cx/en_MY/baggage.html  \n9.2 EXCESS BAGGAGE  \nYou will be required to pay a charge for carriage of Baggage in excess of your \nfree Bagga ge allowance under your Ticket. You can purchase  an excess \nBaggage allowance in advance at a discount. Higher charges will apply if \npurchased at the airport. Our charges for excess Baggage are available at the \nairport

In [10]:
print(results["context"][0].page_content)

free of charge, either into the cabin or having it placed for carriage in the 
aircraft hold. Your Baggage allowance will depend on the terms and  
conditions of your Ticket and is also subject to certain limitations as to the 
number of items of luggage, the size and weight.  
For your allowance please refer to our dedicated Baggage page for more 
details at:  
https://www.cathaypacific.com/cx/en_MY/baggage.html  
9.2 EXCESS BAGGAGE  
You will be required to pay a charge for carriage of Baggage in excess of your 
free Bagga ge allowance under your Ticket. You can purchase  an excess 
Baggage allowance in advance at a discount. Higher charges will apply if 
purchased at the airport. Our charges for excess Baggage are available at the 
airport , from our Authorised Agents or from ou r website at:  
https://www.cathaypacific.com/cx/en_US/baggage/extra -baggage -
charges/travel -on-after -01082019.html  
9.3 ITEMS UNACCEPTABLE AS BAGGAGE


In [11]:
print(results["context"][0].metadata)

{'source': './general-conditions-of-carriage-for-passengers-baggage-en.pdf', 'page': 18}


In [12]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets

pd.set_option("display.max_colwidth", None)

## Set up agents for question generation

In [13]:
from huggingface_hub import InferenceClient


repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

llm_client = InferenceClient(
    model=repo_id,
    timeout=120,
)


def call_llm(inference_client: InferenceClient, prompt: str):
    response = inference_client.post(
        json={
            "inputs": prompt,
            "parameters": {"max_new_tokens": 1000},
            "task": "text-generation",
        },
    )
    return json.loads(response.decode())[0]["generated_text"]


call_llm(llm_client, "This is a test context")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


'This is a test context for the `@mui/material` library.\n\n## Installation\n\n```sh\nnpm install @mui/material\n```\n\n## Usage\n\n```jsx\nimport React from \'react\';\nimport { Button } from \'@mui/material\';\n\nfunction App() {\n  return (\n    <div className="App">\n      <Button variant="contained" color="primary">\n        Hello World\n      </Button>\n    </div>\n  );\n}\n\nexport default App;\n```\n\n## Documentation\n\n- [Material-UI](https://material-ui.com/)\n- [Material Design](https://material.io/)'

In [14]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [15]:
from tqdm import tqdm

print(f"Generating {len(splits)} QA couples...")

outputs = []
for context in tqdm(splits):
    # Generate QA couple
    output_QA_couple = call_llm(llm_client, QA_generation_prompt.format(context=context.page_content))
    try:
        question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
        answer = output_QA_couple.split("Answer: ")[-1]
        assert len(answer) < 300, "Answer is too long"
        outputs.append(
            {
                "context": context.page_content,
                "question": question,
                "answer": answer,
                "source_doc": context.metadata["source"],
            }
        )
    except:
        continue

Generating 205 QA couples...


100%|██████████| 205/205 [04:28<00:00,  1.31s/it]


In [16]:
display(pd.DataFrame(outputs).head(5))

Unnamed: 0,context,question,answer,source_doc
0,"1 \n GENERAL CONDITIONS OF \nCARRIAGE FOR PASSENGERS \nAND BAGGAGE \n \n \nEffective date from 7 June 2024 \nFor tickets purchased before 7 June 2024, please click here .",When do the general conditions of carriage for passengers and baggage become effective?\n,The general conditions of carriage for passengers and baggage become effective from 7 June 2024.,./general-conditions-of-carriage-for-passengers-baggage-en.pdf
1,"i \n TABLE OF CONTENTS \nARTICLE Page \nARTICLE 1: DEFINITIONS ................................ ................................ ................................ ... 2 \nARTICLE 2: APPLICABILITY ................................ ................................ .............................. 5 \nARTICLE 3: TICKETS ................................ ................................ ................................ ............ 6 \nARTICLE 4: OUR NAME AND ADDRESS ................................ ................................ ...... 10 \nARTICLE 5: FARES, TAXES, FEES AND CHARGES ................................ ..................... 10 \nARTICLE 6: RESERVATIONS ................................ ................................ ............................ 11 \nARTICLE 7: CHECK -IN AND BOARDING ................................ ................................ ..... 15 \nARTICLE 8: REFUSAL OF AND LIMITATION ON CARRIAGE ................................ . 15",What is the title of Article 6 in the table of contents?\n,Reservations,./general-conditions-of-carriage-for-passengers-baggage-en.pdf
2,"ARTICLE 3: TICKETS ................................ ................................ ................................ ............ 6 \nARTICLE 4: OUR NAME AND ADDRESS ................................ ................................ ...... 10 \nARTICLE 5: FARES, TAXES, FEES AND CHARGES ................................ ..................... 10 \nARTICLE 6: RESERVATIONS ................................ ................................ ............................ 11 \nARTICLE 7: CHECK -IN AND BOARDING ................................ ................................ ..... 15 \nARTICLE 8: REFUSAL OF AND LIMITATION ON CARRIAGE ................................ . 15 \nARTICLE 9: BAGGAGE ................................ ................................ ................................ ....... 18 \nARTICLE 10: SCHEDULES, CANCELLATION OF FLIGHTS ................................ ...... 23",What is the title of article 9?\n,Baggage,./general-conditions-of-carriage-for-passengers-baggage-en.pdf
3,"ARTICLE 5: FARES, TAXES, FEES AND CHARGES ................................ ..................... 10 \nARTICLE 6: RESERVATIONS ................................ ................................ ............................ 11 \nARTICLE 7: CHECK -IN AND BOARDING ................................ ................................ ..... 15 \nARTICLE 8: REFUSAL OF AND LIMITATION ON CARRIAGE ................................ . 15 \nARTICLE 9: BAGGAGE ................................ ................................ ................................ ....... 18 \nARTICLE 10: SCHEDULES, CANCELLATION OF FLIGHTS ................................ ...... 23 \nARTICLE 11: REFUNDS ................................ ................................ ................................ ...... 24 \nARTICLE 12: CONDUCT ABOARD AIRCRAFT ................................ ............................ 26 \nARTICLE 13: ARRANGEMENTS FOR ADDITIONAL SERVICES .............................. 27",What is the title of Article 13?\n,Arrangements for Additional Services,./general-conditions-of-carriage-for-passengers-baggage-en.pdf
4,"ARTICLE 8: REFUSAL OF AND LIMITATION ON CARRIAGE ................................ . 15 \nARTICLE 9: BAGGAGE ................................ ................................ ................................ ....... 18 \nARTICLE 10: SCHEDULES, CANCELLATION OF FLIGHTS ................................ ...... 23 \nARTICLE 11: REFUNDS ................................ ................................ ................................ ...... 24 \nARTICLE 12: CONDUCT ABOARD AIRCRAFT ................................ ............................ 26 \nARTICLE 13: ARRANGEMENTS FOR ADDITIONAL SERVICES .............................. 27 \nARTICLE 14: ADMINISTRATIVE FORMALITIES ................................ ......................... 27 \nARTICLE 15: SUCCESSIVE CARRIERS ................................ ................................ ............ 28 \nARTICLE 16: LIABILITY FOR DAMAGE ................................ ................................ ......... 29",What is the title of Article 12 in the given context?\n,"The title of Article 12 in the given context is ""CONDUCT ABOARD AIRCRAFT"".",./general-conditions-of-carriage-for-passengers-baggage-en.pdf


In [17]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to flight passengers.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the definition of "PASSENGER" according to the context?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [18]:
print("Generating critique for each QA couple...")
for output in tqdm(outputs):
    evaluations = {
        "groundedness": call_llm(
            llm_client,
            question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]),
        ),
        "relevance": call_llm(
            llm_client,
            question_relevance_critique_prompt.format(question=output["question"]),
        ),
        "standalone": call_llm(
            llm_client,
            question_standalone_critique_prompt.format(question=output["question"]),
        ),
    }
    try:
        for criterion, evaluation in evaluations.items():
            score, eval = (
                int(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )
            output.update(
                {
                    f"{criterion}_score": score,
                    f"{criterion}_eval": eval,
                }
            )
    except Exception as e:
        continue

Generating critique for each QA couple...


100%|██████████| 168/168 [21:21<00:00,  7.63s/it]


In [19]:
import pandas as pd

pd.set_option("display.max_colwidth", None)

generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)
generated_questions = generated_questions.loc[
    (generated_questions["groundedness_score"] >= 4)
    & (generated_questions["relevance_score"] >= 4)
    & (generated_questions["standalone_score"] >= 4)
]
print("============================================")
print("Final evaluation dataset:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)

eval_dataset = datasets.Dataset.from_pandas(generated_questions, split="train", preserve_index=False)

Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,When do the general conditions of carriage for passengers and baggage become effective?\n,The general conditions of carriage for passengers and baggage become effective from 7 June 2024.,5.0,,
1,What is the title of Article 6 in the table of contents?\n,Reservations,5.0,1.0,5.0
2,What is the title of article 9?\n,Baggage,5.0,1.0,1.0
3,What is the title of Article 13?\n,Arrangements for Additional Services,5.0,1.0,5.0
4,What is the title of Article 12 in the given context?\n,"The title of Article 12 in the given context is ""CONDUCT ABOARD AIRCRAFT"".",5.0,,
...,...,...,...,...,...
163,What is the time limit for notifying the airline of damage to checked-in baggage?\n,The time limit for notifying the airline of damage to checked-in baggage is seven (7) days of receipt of the baggage.,5.0,,
164,How long do you have to notify the airline of checked baggage damage?\n,You have 21 days from the date the baggage ought to have been delivered to you to notify the airline of checked baggage damage.,,,
165,How long do I have to notify Cathay Pacific about a claim for compensation?\n,"According to the passage, you have to notify Cathay Pacific in writing within the applicable time frame, otherwise they reserve the right to deny you compensation.",3.0,4.0,5.0
166,How long do I have to bring an action for compensation for damages?\n,"You have two years from the date of arrival at the destination, or the date on which the aircraft ought to have arrived, or the date on which the carriage stopped.",3.0,4.0,5.0


Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
9,What is an Authorised Agent?\n,An Authorised Agent is a passenger sales agent who is permitted to sell air passenger transportation tickets and other services of the airline and/or other carriers to passengers.,5.0,4.0,5.0
15,What is a Conjunction Ticket?\n,"A Conjunction Ticket is when a passenger purchases two separate tickets from the same airline or Authorized Agent at the same time for continuous travel, and both the passenger and the airlines consider it to be a single operation and contract.",5.0,5.0,5.0
17,What is a Tariff?\n,"A Tariff is the published fares, charges and/or related conditions of carriage of an airline filed, which have been filed where required, with the appropriate authorities.",5.0,4.0,5.0
23,What is a codeshare arrangement in the context of air travel?\n,A codeshare arrangement in air travel is a commercial and operational agreement between two carriers where one carrier operates a flight under the flight number of the other carrier.,5.0,5.0,4.0
28,Who is the contract between for a ticket?\n,The contract is between the airline and the passengers named on the ticket.,5.0,4.0,5.0
29,What is the policy on transferring a ticket to another person?\n,You cannot transfer your Ticket to another person.,5.0,5.0,5.0
33,How long is a ticket valid for if no part of the ticket has been used?\n,A ticket is valid for 12 months from the date first issued if no part of the ticket has been used.,5.0,4.0,5.0
40,How are changes to the sequence of flights on a ticket with multiple flights and sectors handled before travel?\n,"Changes to the sequence of flights on a ticket with multiple flights and sectors are subject to the passenger paying the difference of the recalculated fare (if higher), any taxes, and the applicable change fees.",5.0,5.0,5.0
42,How long do I have to request to maintain the validity of my ticket if I miss a flight in Italy?\n,You have 24 hours from the scheduled departure time of the missed flight or at least two hours before the departure of the subsequent flight if it is within 24 hours of the missed flight.,5.0,4.0,5.0
46,What is the condition for waiving re-booking fees for a ticket to Spain?\n,"The re-booking fees for a ticket to Spain will be waived if the reservation system shows that the ticket was issued in Spain, or if the passenger presents a valid Spanish passport or Spanish resident ID card, or if the origin of the entire ticket is in Spain.",5.0,4.0,5.0




In [56]:
# Save the final generated questions to save the generation process in the future
if not os.path.exists("./data/datasets/"):
    os.mkdir("./data/datasets/")

with open('./data/datasets/generated_questions.json', 'w') as f:
    json.dump(generated_questions.to_dict(), f, indent=4)

In [67]:
with open('./data/datasets/generated_questions.json', 'r') as f:
    generated_questions = json.load(f)
generated_questions = pd.DataFrame(generated_questions)
eval_dataset = datasets.Dataset.from_pandas(generated_questions, split="train", preserve_index=False)

# Build RAG System

## Indexing

In [20]:
# Load
# The loading step is already done when we use a text splitter to split
# our pdf document into small chunks/documents with 1000 characters each

# Split
# Here we will further split each small document into snippets for retrieval
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: str,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of size `chunk_size` characters and return a list of documents.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

# Store
# Embed the snippets in vectors and store them in a vectorestore
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import os

def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model_name: Optional[str] = "thenlper/gte-small",
) -> FAISS:
    """
    Creates a FAISS index from the given embedding model and documents. Loads the index directly if it already exists.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model_name: name of the embedding model to use

    Returns:
        FAISS index
    """
    # load embedding_model
    embedding_model = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        model_kwargs={"device": "cuda"},
        encode_kwargs={"normalize_embeddings": True},  # set True to compute cosine similarity
    )

    # Check if embeddings already exist on disk
    index_name = f"index_chunk:{chunk_size}_embeddings:{embedding_model_name.replace('/', '~')}"
    index_folder_path = f"./data/indexes/{index_name}/"
    if os.path.isdir(index_folder_path):
        return FAISS.load_local(
            index_folder_path,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
            allow_dangerous_deserialization=True
        )

    else:
        print("Index not found, generating it...")
        docs_processed = split_documents(
            chunk_size,
            langchain_docs,
            embedding_model_name,
        )
        knowledge_index = FAISS.from_documents(
            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
        knowledge_index.save_local(index_folder_path)
        return knowledge_index

## Retrieval and Generation

In [21]:
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [22]:
from langchain_community.llms import HuggingFaceHub

repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_MODEL_NAME = "zephyr-7b-beta"

READER_LLM = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)

  READER_LLM = HuggingFaceHub(


In [23]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM

def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 5,
    num_docs_final: int = 3,
) -> Tuple[str, List[LangchainDocument]]:
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Redact an answer
    answer = llm(final_prompt)

    return answer, relevant_docs

In [25]:
for item in tqdm(eval_dataset):
    print(item['context'])
    break

  0%|          | 0/54 [00:00<?, ?it/s]

before continuing your journey with us, from that agreed stopping place to your final 
destination, on the same Ticket.  
"AIRLINE DESIGNATOR CODE" means two -characters (IATA) or three letters 
(ICAO) which identify particular Carriers (such as CX or CP A for Cathay Pacific or 
UO or HKE for Hong Kong Express.  
"AUTHORISED AGENT" means a passenger  sales agent who is permitted to sell air 
passenger transportation tickets and other of our and/or other Carriers’ services to 
you.  
"BAGGAGE" means your personal property accompanying you on your journey with 
us, including your Checked -In Baggage and Cabin  Baggage.  
"BAGGAGE IDENTIFICATION TAG" means a document issued by us to identify 
either your Checked -In Baggage or other Baggage that you hand to us for our care, 
custody and control in the cabin or elsewhere on board the aircraft.  
"BANNING NOTICE" means a  notice in writing in which we inform you that you 
have been banned from our flights and services.





# Evaluate RAG System

In [26]:
from langchain_core.language_models import BaseChatModel


def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file."""
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except:
        outputs = []

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in [output["question"] for output in outputs]:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index, reranker=reranker)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "context": example["context"],
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": [doc for doc in relevant_docs],
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)

        with open(output_file, "w") as f:
            json.dump(outputs, f, indent=4)

In [27]:
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

In [28]:
from langchain_openai import ChatOpenAI

eval_chat_model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
evaluator_name = "GPT4"


def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing."""
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        answers = json.load(open(answer_path, "r"))

    for experiment in tqdm(answers):
        if f"eval_score_{evaluator_name}" in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)
        feedback, score = [item.strip() for item in eval_result.content.split("[RESULT]")]
        experiment[f"eval_score_{evaluator_name}"] = score
        experiment[f"eval_feedback_{evaluator_name}"] = feedback

        with open(answer_path, "w") as f:
            json.dump(answers, f, indent=4)

In [42]:
if not os.path.exists("./output"):
    os.mkdir("./output")

for chunk_size in [200, 400, 600]:  # Add other chunk sizes (in tokens) as needed
    for embeddings in ["thenlper/gte-small"]:  # Add other embeddings as needed
        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"Running evaluation for {settings_name}:")

            print("Loading knowledge base embeddings...")
            knowledge_index = load_embeddings(
                splits,
                chunk_size=chunk_size,
                embedding_model_name=embeddings
            )

            print("Running RAG...")
            reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0") if rerank else None
            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )

            print("Running evaluation...")
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )

Running evaluation for chunk:200_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...



`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884


`torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.


`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 64.36it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 70.85it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 67.96it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00

Running evaluation...


100%|██████████| 51/51 [03:16<00:00,  3.84s/it]


Running evaluation for chunk:200_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...


100%|██████████| 54/54 [05:10<00:00,  5.75s/it]


Running evaluation...


100%|██████████| 51/51 [03:24<00:00,  4.00s/it]


Running evaluation for chunk:400_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...



`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884


`torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.


`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 68.10it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 67.59it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 70.22it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00

Running evaluation...


100%|██████████| 51/51 [03:30<00:00,  4.13s/it]


Running evaluation for chunk:400_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...


100%|██████████| 54/54 [05:19<00:00,  5.92s/it]


Running evaluation...


100%|██████████| 51/51 [03:02<00:00,  3.59s/it]


Running evaluation for chunk:600_embeddings:thenlper~gte-small_rerank:True_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...



`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884


`torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.


`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 70.85it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 71.28it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00:00, 67.13it/s]

`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.


100%|██████████| 1/1 [00:00<00

Running evaluation...


100%|██████████| 51/51 [03:42<00:00,  4.35s/it]


Running evaluation for chunk:600_embeddings:thenlper~gte-small_rerank:False_reader-model:zephyr-7b-beta:
Loading knowledge base embeddings...
Running RAG...


100%|██████████| 54/54 [03:56<00:00,  4.39s/it]


Running evaluation...


100%|██████████| 51/51 [03:36<00:00,  4.24s/it]


In [43]:
import glob

outputs = []
for file in glob.glob("./output/*.json"):
    output = pd.DataFrame(json.load(open(file, "r")))
    output["settings"] = file
    outputs.append(output)
result = pd.concat(outputs)

In [44]:
result["eval_score_GPT4"] = result["eval_score_GPT4"].apply(lambda x: int(x) if isinstance(x, str) else 1)
result["eval_score_GPT4"] = (result["eval_score_GPT4"] - 1) / 4

In [45]:
average_scores = result.groupby("settings")["eval_score_GPT4"].mean()
settings = []
for setting in average_scores.index:
    chunk_size = setting.split('rag_chunk:')[1].split('_embeddings')[0]
    rerank = setting.split('rerank:')[1].split('_reader')[0]
    if rerank == 'True':
        settings.append('+'.join([chunk_size, 'reranker']))
    else:
        settings.append(chunk_size)
average_scores = pd.Series(average_scores.values, index=settings, name='eval_score_GPT4')
average_scores = average_scores * 100
average_scores.sort_values(inplace=True)
average_scores

Unnamed: 0,eval_score_GPT4
400,89.215686
200,89.705882
600+reranker,89.705882
400+reranker,90.686275
600,91.176471
200+reranker,92.156863


In [47]:
import plotly.express as px
fig = px.bar(
    average_scores,
    color=average_scores,
    labels={
        "value": "Accuracy",
        "settings": "Configuration",
    },
    color_continuous_scale="bluered",
)
fig.update_layout(
    width=1000,
    height=600,
    barmode="group",
    yaxis_range=[0, 100],
    title="<b>Accuracy of different RAG configurations</b>",
    xaxis_title="RAG settings",
    font=dict(size=15),
)
fig.layout.yaxis.ticksuffix = "%"
fig.update_coloraxes(showscale=False)
fig.update_traces(texttemplate="%{y:.1f}", textposition="outside")
fig.show()

In [15]:
from ragas import SingleTurnSample
from ragas import evaluate
# from ragas.metrics import LLMContextPrecisionWithReference
from ragas.metrics import context_precision
from datasets import Dataset

# context_precision = LLMContextPrecisionWithReference()

# sample = SingleTurnSample(
#     user_input="Where is the Eiffel Tower located?",
#     reference="The Eiffel Tower is located in Paris.",
#     retrieved_contexts=["The Eiffel Tower is located in Paris."],
# )
data = {
"user_input":"Where is the Eiffel Tower located?",
"reference": "The Eiffel Tower is located in Paris.",
"retrieved_contexts":"The Eiffel Tower is located in Paris.",
}
dataset = Dataset.from_dict(data)
evaluate(dataset, [context_precision])
# await context_precision.single_turn_ascore(sample)

ArrowInvalid: Column 1 named reference expected length 34 but got length 37

In [None]:
from ragas.metrics import LLMContextPrecisionWithoutReference
from ragas.llms import LangchainLLMWrapper
from ragas import evaluate

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
metrics = [LLMContextRecall(), FactualCorrectness(), Faithfulness()]
results = evaluate(dataset=sample, metrics=metrics, llm=evaluator_llm,)

In [11]:
# from ragas import SingleTurnSample
# from ragas.metrics import LLMContextPrecisionWithoutReference

# context_precision = LLMContextPrecisionWithoutReference(llm=llm)

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    reference="The Eiffel Tower is located in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris."],
)
# await context_precision.single_turn_ascore(sample)

from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas.llms import LangchainLLMWrapper
from ragas import evaluate
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
metrics = [LLMContextRecall(), FactualCorrectness(), Faithfulness()]
results = evaluate(dataset=sample, metrics=metrics, llm=evaluator_llm,)

AttributeError: 'SingleTurnSample' object has no attribute 'get_sample_type'