In [None]:
import os

from dotenv import load_dotenv
from giskard.rag import generate_testset, evaluate
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from chroma_utils import load_split_document, index_document_to_chroma

In [None]:
# Build the retrieval side of the RAG pipeline: embeddings -> persistent Chroma
# store -> indexed PDF chunks -> top-5 retriever.

# Load variables from a local .env file *before* the first os.getenv() lookup.
# Without this, a fresh kernel reads the embeddings key before any cell has
# loaded .env, and the key comes back as None.
load_dotenv()

# NOTE(review): `textsplitter` is not passed to load_split_document() below, so
# these settings only matter if that helper (or another cell) picks them up —
# confirm which splitter configuration is actually in effect.
textsplitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 200
)

# Fail early with an actionable message instead of an opaque error from the
# embeddings client when the key is missing.
hf_api_key = os.getenv('api_key')
if not hf_api_key:
    raise RuntimeError(
        "Environment variable 'api_key' (Hugging Face Inference API token) is not set; "
        "export it or add it to your .env file."
    )

embedding_model = HuggingFaceInferenceAPIEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    api_key=hf_api_key
)

vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding_model
)

# The source PDF can be overridden with PUMA_PDF_PATH so the notebook is not
# tied to one machine; the default keeps the original location.
pdf_path = os.getenv(
    'PUMA_PDF_PATH',
    '/Users/vasstavkumarchava/Desktop/AI/RAG-project/data/puma.pdf'
)

# assumes load_split_document returns a list of Documents — TODO confirm
splits = load_split_document(pdf_path)

# Give every chunk a deterministic id. The store is persisted on disk, so
# re-running this cell with auto-generated ids would insert the same chunks
# again and the retriever would return duplicates among its top-k results.
# With stable ids a re-run overwrites the existing entries instead.
# Chunks inserted by earlier runs under random ids are NOT removed; delete
# ./chroma_db once to start from a clean index.
split_ids = [f"{pdf_path}::chunk-{position}" for position in range(len(splits))]
vectorstore.add_documents(splits, ids=split_ids)

# Retrieve the 5 most similar chunks per query.
retriever = vectorstore.as_retriever(search_kwargs = {'k': 5})

In [None]:
# Install/upgrade langchain-groq into the kernel's environment; it provides the
# ChatGroq class imported in the following cell (-q: quiet, -U: upgrade).
# NOTE(review): the version is unpinned, so a re-run may pull a newer release —
# consider pinning (e.g. langchain-groq==<version>) for reproducibility.
%pip install -qU langchain-groq

In [None]:
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import os
from dotenv import load_dotenv
# Load .env so the Groq key lookup below can see it.
load_dotenv()


groq_api_key = os.getenv('groq_api_key')
# Define LLM (temperature=0 to keep generations as repeatable as possible)
llm = ChatGroq(model_name="llama3-70b-8192", temperature=0, api_key=groq_api_key)

# Define prompt template
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use two sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

prompt = ChatPromptTemplate.from_template(template)


def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    The retriever yields Document objects; placing that list directly into the
    prompt would render their repr (metadata, escaped text) rather than the
    passage text, so only `page_content` is kept, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Setup RAG pipeline: the incoming question is sent to the retriever (whose
# results are flattened to text) and also passed through unchanged, then the
# filled prompt goes to the LLM and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
from datasets import Dataset

# Evaluation set: each entry pairs a question with its single reference answer,
# kept side by side so the two cannot drift out of alignment.
_qa_pairs = [
    (
        "What percentage of PUMA products met sustainability criteria in 2022?",
        "70% of PUMA products were made with more sustainable materials in 2022.",
    ),
    (
        "How does PUMA power its offices, stores, and warehouses?",
        "PUMA sources 100% renewable electricity for its offices, stores, and warehouses.",
    ),
    (
        "What initiative did PUMA launch for recycling polyester jerseys?",
        "PUMA launched the RE:JERSEY project for garment-to-garment polyester recycling.",
    ),
    (
        "By how much did PUMA reduce its own carbon emissions compared to 2017?",
        "PUMA reduced its carbon emissions by 86% compared to 2017.",
    ),
    (
        "What step did PUMA take to reduce its transport emissions?",
        "PUMA invested in electrifying its car fleet and introduced its first electric truck in the USA.",
    ),
    (
        "What is the average payment above minimum wages for PUMAs core suppliers?",
        "The average payment is 13.4% above minimum wages for PUMA’s core Tier 1 factories.",
    ),
    (
        "How did PUMA reduce plastic waste in its stores?",
        "PUMA eliminated consumer-facing plastic bags from its owned and operated stores.",
    ),
    (
        "How many factory workers were trained on workplace harassment in 2022?",
        "160,000 factory workers were trained on workplace harassment.",
    ),
    (
        "How does PUMA ensure its leather sourcing is sustainable?",
        "PUMA uses only Leather Working Group-certified leather to avoid deforestation.",
    ),
    (
        "How many hours did PUMA employees contribute to community work?",
        "PUMA employees contributed 43,000 hours of community engagement.",
    ),
]

# Flat list of questions, and one single-element list of references per question.
questions = [question for question, _ in _qa_pairs]
ground_truths = [[reference] for _, reference in _qa_pairs]

answers = []
contexts = []

# For every question, record the chain's generated answer and, from a separate
# retriever call, the text of the chunks retrieved for that same question.
for query in questions:
    answers.append(rag_chain.invoke(query))
    retrieved_docs = retriever.invoke(query)
    contexts.append([doc.page_content for doc in retrieved_docs])

# Column-oriented layout consumed by Dataset.from_dict below.
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

print(answers)
print(contexts)

dataset = Dataset.from_dict(data)

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
)

# Metrics to score the generated answers and retrieved contexts with.
ragas_metrics = [
    context_recall,
    faithfulness,
    answer_relevancy,
    context_precision,
]

# `evaluate` is the ragas function imported at the top of this cell; that import
# rebinds the name previously imported from giskard.rag.
result = evaluate(dataset=dataset, metrics=ragas_metrics)

# Convert the evaluation result via its to_pandas() method for inspection.
df = result.to_pandas()