In [None]:
!pip install langchain-google-genai
!pip install langchain-chroma

Loading evaluation dataset

In [2]:
import pandas as pd
eval_data = pd.read_parquet("datasets/output_files/critiqued_qna_pairs_filtered.parquet")

Loading embedding model

In [3]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os

os.environ["GOOGLE_API_KEY"] = "<gemini api key>"

try:
    doc_embeddings_model = GoogleGenerativeAIEmbeddings(
        model="models/text-embedding-004", task_type="RETRIEVAL_DOCUMENT"
    )
    query_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004", task_type="RETRIEVAL_QUERY"
    )
    print("Gemini Embeddings model initialized successfully.")
except Exception as e:
    print(f"Error initializing Gemini Embeddings model: {e}")

Gemini Embeddings model initialized successfully.


In [4]:
from langchain_chroma import Chroma
PERSIST_DIRECTORY ="../assets/chroma_langchain_db"

vector_db_load = Chroma(persist_directory=PERSIST_DIRECTORY ,
    embedding_function=doc_embeddings_model,
    collection_name="rag_collection")

Generate RAG responses for evaluation dataset

In [5]:
RAG_PROMPT_TEMPLATE="""
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know. Answer should be based on concise piece of factual information from the context.
Question: {question}
Context: {context}
Answer:
"""

In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

def call_llm(prompt):
    response = llm.invoke(prompt)
    return response

def get_RAG_response(question):
    retrieval_result = vector_db_load.similarity_search_by_vector(
        embedding=query_embeddings.embed_query(question), k=1
    )
    context_text=retrieval_result[0].page_content
    prompt_txt=RAG_PROMPT_TEMPLATE.format(question=question, context=context_text)
    response=call_llm(prompt_txt)
    return context_text, response.content

In [7]:
eval_data.iloc[0]

Unnamed: 0,0
context,UK youth 'interested' in politics\n\nThe major...
question,What percentage of 16 to 20-year-olds in the U...
answer,81%
source_category,politics
groundedness_score,5
groundedness_eval,The context directly answers the question. It ...
standalone_score,5
standalone_eval,The question is self-contained and does not de...


In [39]:
get_RAG_response(eval_data.iloc[1]["question"])

('"We\'ve just had our heads down really, we\'ve never actually broken up, or parted, or whatever. "So for us it just seems, even though we haven\'t played for years, we still see each other and write - we just haven\'t released a record for a long time." Portishead will not play any new material at next month\'s concert, which will feature singer Beth Gibbons playing with an acoustic backing. Other acts appearing include Liverpool band The Coral and former Led Zeppelin singer Robert Plant. Portishead became an international success and a deeply influential band despite their scant recorded output. Dummy was a critics\' favourite in 1994, hailed for its blend of menacing sounds and hip-hop beats married to old soul samples. The follow up, Portishead, was released three years later.',
 'The provided text does not state when Portishead won the Mercury Music Prize.')

In [9]:
rag_eval_set=eval_data[["question", "answer", "source_category"]]

In [10]:
rag_eval_set["retrieved_context"],rag_eval_set["rag_answer"]= zip(*rag_eval_set["question"].apply(lambda x: get_RAG_response(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rag_eval_set["retrieved_context"],rag_eval_set["rag_answer"]= zip(*rag_eval_set["question"].apply(lambda x: get_RAG_response(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rag_eval_set["retrieved_context"],rag_eval_set["rag_answer"]= zip(*rag_eval_set["question"].apply(lambda x: get_RAG_response(x)))


In [11]:
rag_eval_set

Unnamed: 0,question,answer,source_category,retrieved_context,rag_answer
0,What percentage of 16 to 20-year-olds in the U...,81%,politics,UK youth 'interested' in politics\n\nThe major...,81%
1,When did Portishead win the Mercury Music Prize?,1995,entertainment,"""We've just had our heads down really, we've n...",The provided text does not state when Portishe...
2,What animals are covered by Texas hunting laws...,"State laws on hunting only covered ""regulated ...",tech,". ""Animals hit but not killed would without do...",Texas hunting laws cover regulated animals suc...
3,What is the slogan for the Conservative Party'...,"""It's not racist to impose limits on immigration""",politics,The Tories have promised an upper limit on the...,"""It's not racist to impose limits on immigration"""
4,Who is the chairman of Wada?,Dick Pound,sport,Wada will appeal against ruling\n\nThe World A...,The provided text does not name the chairman o...
...,...,...,...,...,...
292,Who will sing America the Beautiful at the Sup...,Alicia Keys,entertainment,Alicia Keys to open US Super Bowl\n\nR&B star ...,Alicia Keys
294,What quote is attributed to Thomas Jefferson r...,A nation that limits freedom in the name of se...,politics,Frederic Forsythe's comments seem to me to be ...,"""A nation that limits freedom in the name of s..."
296,Which regions in the UK experienced the larges...,The Midlands and South.,business,The bank said that the recent pattern of house...,Wales experienced the largest house price fall...
298,What is Graham Cluley's job title?,Graham Cluley is a senior technology consultan...,tech,But Mr Curry's technology and broadcast intere...,This question cannot be answered from the give...


In [12]:
rag_eval_set.to_parquet("datasets/output_files/rag_generated_answers.parquet")