In [2]:
from chroma_utils import load_split_document, index_document_to_chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_chroma import Chroma
import os

In [3]:
textsplitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 200
)

embedding_model = HuggingFaceInferenceAPIEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    api_key= os.getenv('api_key')
)

vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding_model
)

splits = load_split_document('/Users/vasstavkumarchava/Desktop/AI/RAG-project/data/puma.pdf')

vectorstore.add_documents(splits)

retriever = vectorstore.as_retriever(search_kwargs = {'k': 3})

In [4]:
print(len(splits))

240


In [5]:
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import os
from dotenv import load_dotenv
load_dotenv()


groq_api_key = os.getenv('groq_api_key')

llm = ChatGroq(model_name="llama3-70b-8192", temperature=0, api_key=groq_api_key)

template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use two sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [6]:
questions = [
    "How does PUMA power its offices, stores, and warehouses?", 
    "What initiative did PUMA launch for recycling polyester jerseys?", 
    "By how much did PUMA reduce its own carbon emissions compared to 2017?", 
    "What step did PUMA take to reduce its transport emissions?", 
]

ground_truths = [
    "PUMA sources 100% renewable electricity for its offices, stores, and warehouses.",
    "PUMA launched the RE:JERSEY project for garment-to-garment polyester recycling.",
    "PUMA reduced its carbon emissions by 86% compared to 2017.",
    "PUMA invested in electrifying its car fleet and introduced its first electric truck in the USA.",
]

answers = []
contexts = []


for query in questions:
  answers.append(rag_chain.invoke(query))
  contexts.append([docs.page_content for docs in retriever.invoke(query)])



In [8]:
data = [
    {
        "question": questions[i],
        "ground_truth": ground_truths[i],
        "answer": answers[i],
        "context": contexts[i],
    }
    for i in range(len(questions))
]


formatted_context = "\n".join(
    f"Q: {entry['question']}\nGT: {entry['ground_truth']}\nCTX: {', '.join(entry['context'])}\nANS: {entry['answer']}\n"
    for entry in data
)



In [9]:
print(formatted_context)

Q: How does PUMA power its offices, stores, and warehouses?
GT: PUMA sources 100% renewable electricity for its offices, stores, and warehouses.
CTX: are operated by a third party.
** A location-based method reflects the average emissions intensity of grids on which energy consumption occurs. 
*** A market-based method reflects emissions from electricity that companies have purposefully chosen. It derives emission 
factors from contractual instruments, which include any type of contract between two parties for the sale and purchase of 
energy bundled with attributes about the energy generation, or for unbundled attribute claims. 
1. PUMA’s greenhouse gas reporting is in line with the GHG Protocol International Accounting Standard. Fugitive emissions 
(emissions from unintentional releases or leaks) are not included in Scope 1 emissions. 
2. Methodological changes over the last three years have influenced results. In 2020 updated emission factors were applied 
and the consolidated struc

In [23]:

llm_evalution = ChatGroq(model_name="llama3-70b-8192", temperature=0, api_key=groq_api_key)

template = """

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) model. Given a dataset with `question`, `answer`, `ground_truths`, and `contexts`, evaluate both the retrieval and generation components of the model. 

### **1. Evaluate the Retrieval Quality:**  
For each `question`, the model retrieves a `context`. Assess how well the retrieved `context` matches the `ground_truths` using the following metrics:

- **Recall@k**: Measure the percentage of times the correct `ground_truths` appear in the top-k retrieved `contexts`. Compute Recall@1, Recall@3, and Recall@5.
- **Mean Reciprocal Rank (MRR)**: Compute the MRR score to rank the relevance of retrieved documents.
- **Normalized Discounted Cumulative Gain (NDCG)**: Compute NDCG@3 and NDCG@5 to evaluate ranking quality of retrieval results.
- **BLEU / ROUGE Similarity (Retrieval)**: Compare the similarity between retrieved `contexts` and `ground_truths` using ROUGE-L or BLEU.

### **2. Evaluate the Generation Quality:**  
For each `question`, the model generates an `answer`. Assess the quality of the generated `answer` compared to `ground_truths` using:

- **Exact Match (EM)**: Percentage of answers that exactly match the `ground_truths`.
- **F1 Score**: Compute the word overlap between the generated answer and `ground_truths`.
- **ROUGE-L**: Compare the longest common subsequence between generated and ground truth answers.
- **BLEU Score**: Compute n-gram overlap to measure the fluency and relevance of the generated response.
- **BERTScore**: Use contextual embeddings to measure similarity between generated answers and ground truths.

### **3. Compute Overall Scores:**  
For a holistic evaluation, compute an **aggregated performance score** by combining retrieval and generation scores. Provide a detailed breakdown of the scores and highlight areas of improvement.

based on all the above scores give me the overall score of the model. Tell me the overall score out of 100. if the score is below 50 result should just be BAD MODEL, if the score is in between 50 and 70 result should just be Average MODEL, if the score is between 70 and 80 result should just be Good MODEL, if the score is above 90 result should just be Excellent MODEL

I just need the bucketting output. I don't need any scores for generation and retrieval.

based on the dataset: {context}
"""

prompt = ChatPromptTemplate.from_template(template)

rag_chain = (
    {"context": lambda x: formatted_context} 
    | prompt
    | llm_evalution
    | StrOutputParser()
)

result = rag_chain.invoke({"context": formatted_context})


In [24]:
print(result)

Based on the provided dataset, I evaluated the Retrieval-Augmented Generation (RAG) model and computed the overall score. Here is the result:

**Overall Score: 82**

Based on this score, I would categorize the model as **Good MODEL**.


In [12]:
con = contexts[2]

print(con)

['Annual Report 2022     ↗ Sustainability \n \n \n102 \n↗ T.19 PUMA’S SCOPE 3 CATEGORY-1 CO2e EMISSIONS FROM SELECTED VALUE CHAIN \nACTIVITIES \nScope 3 emissions (category -1)   \n2017 \n(baseline) 2021 2022 \n% change \n2017/2021 \nAbsolute GHG emissions (tCO2 eq)  1,409,265 1,242,468 1,278,758 -9%  \n     \nNote: Scope 3 category 1 estimation includes GHG emissions associated with goods and services purchased \nby PUMA from its suppliers related to PUMA products and associated packaging. This excludes emissions \nassociated with other goods and services acquired by PUMA offices, stores and warehouses. \nWe can see that our absolute scope 3 emissions from the category, purchased goods and services have \ndecreased by 9% from 2017 to 2022, while material consumption has increased by 27% during the same \nperiod. Due to energy efficiency improvements and the use of renewable electricity at factory level, as well \nas the usage of more sustainable materials, our absolute emissions have 

***Manual Evaluation***

Cosine Similarity


In [15]:
from sentence_transformers import SentenceTransformer, util

def evaluate_retrieval_and_answers(questions, answers, contexts, ground_truths):
    total_relevant_retrieved = 0
    total_similarity_ans = 0
    total_retrieved = 0
    model = SentenceTransformer("all-MiniLM-L6-v2")
    
    for i in range(len(questions)): 
        ques = questions[i]
        ans = answers[i]
        con = contexts[i]
        gt = ground_truths[i]

        gt_embed = model.encode(gt)
        con_embed = [model.encode(ctx) for ctx in con]
        ans_embed = model.encode(ans)

        similarities_context = [util.pytorch_cos_sim(gt_embed, emb)[0].item() for emb in con_embed]
        similarity_ans = util.pytorch_cos_sim(gt_embed, ans_embed).item()

        threshold = 0.7
        relevant_retrieved = sum(1 for sim in similarities_context if sim >= threshold)
        total_retrieved  = total_retrieved + len(con)
        
        total_relevant_retrieved = total_relevant_retrieved + relevant_retrieved
        total_similarity_ans = total_similarity_ans + similarity_ans


        print(f'Evaluating question: {ques}')
        print("These are the cosine similarity values of chunks:", similarities_context)
        print("No of relevant retrieved chunks:", relevant_retrieved)
        print('This is the cosine similarity of the ground truth and answer generated:', similarity_ans)
        print('--------------------------')

    print("The average cosine similarity of relevant retrieved chunks : ", total_relevant_retrieved/len(questions))
    print("The average cosine similarity of the ground truth and answer generates: ", total_similarity_ans/len(questions))

    # context_precision = total_relevant_retrieved/total_retrieved
    # context_recall = total_relevant_retrieved/240
    # f1_score = 2 * (context_precision * context_recall) / (context_precision + context_recall)

    # print('Context precision of the model is : ', context_precision)
    # print('context recall of the model is : ', context_recall)
    # print('F1 score of the model is : ', f1_score)

evaluate_retrieval_and_answers(questions=questions, answers=answers, contexts=contexts, ground_truths=ground_truths)


Evaluating question: How does PUMA power its offices, stores, and warehouses?
These are the cosine similarity values of chunks: [0.59479820728302, 0.59479820728302, 0.534403383731842]
No of relevant retrieved chunks: 0
This is the cosine similarity of the ground truth and answer generated: 0.7839676737785339
--------------------------
Evaluating question: What initiative did PUMA launch for recycling polyester jerseys?
These are the cosine similarity values of chunks: [0.8139957189559937, 0.8139957189559937, 0.6340339183807373]
No of relevant retrieved chunks: 2
This is the cosine similarity of the ground truth and answer generated: 0.9183568954467773
--------------------------
Evaluating question: By how much did PUMA reduce its own carbon emissions compared to 2017?
These are the cosine similarity values of chunks: [0.7125610113143921, 0.7125610113143921, 0.6776577234268188]
No of relevant retrieved chunks: 2
This is the cosine similarity of the ground truth and answer generated: 0.7

Jaccard Similarity

In [16]:
def jaccard_similarity(questions, text1, text2):

    average_jaccard_similarity = 0

    for i in range(len(answers)):
        text1 = ground_truths[i]
        text2 = answers[i]
        question = questions[i]

        set1, set2 = set(text1.split()), set(text2.split())
        intersection = len(set1 & set2)
        union = len(set1 | set2)
        result = intersection / union if union != 0 else 0
        print("Evaluating question : ", question )
        print('The jaccard similarity of this question is :', result)
        print('-----------------------')
        average_jaccard_similarity = average_jaccard_similarity + result
    print("The average jaccard similarity of ground_truth and answer is :", average_jaccard_similarity/len(answers))
jaccard_similarity(questions, ground_truths, answers)

Evaluating question :  How does PUMA power its offices, stores, and warehouses?
The jaccard similarity of this question is : 0.11627906976744186
-----------------------
Evaluating question :  What initiative did PUMA launch for recycling polyester jerseys?
The jaccard similarity of this question is : 0.21428571428571427
-----------------------
Evaluating question :  By how much did PUMA reduce its own carbon emissions compared to 2017?
The jaccard similarity of this question is : 0.12903225806451613
-----------------------
Evaluating question :  What step did PUMA take to reduce its transport emissions?
The jaccard similarity of this question is : 0.20833333333333334
-----------------------
The average jaccard similarity of ground_truth and answer is : 0.1669825938627514


BLEU Score

In [17]:
%pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mNote: you may need to restart the kernel to use updated packages.


In [18]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu(questions, references, candidates):
    for i in range(len(questions)):
        reference = [references[i].split()]  
        candidate = candidates[i].split()

        bleu_score = sentence_bleu(reference, candidate)

        print("Evaluating question:", questions[i])
        print("BLEU Score:", bleu_score)
        print("--------------------------")

calculate_bleu(questions=questions, references=ground_truths, candidates=answers)


Evaluating question: How does PUMA power its offices, stores, and warehouses?
BLEU Score: 1.2714599839721324e-78
--------------------------
Evaluating question: What initiative did PUMA launch for recycling polyester jerseys?
BLEU Score: 0.0860132510739358
--------------------------
Evaluating question: By how much did PUMA reduce its own carbon emissions compared to 2017?
BLEU Score: 3.951829566952597e-155
--------------------------
Evaluating question: What step did PUMA take to reduce its transport emissions?
BLEU Score: 2.9857029691673486e-78
--------------------------


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


ROUGR Score

In [19]:
%pip install rouge-score


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mNote: you may need to restart the kernel to use updated packages.


In [20]:
from rouge_score import rouge_scorer

def calculate_rouge(questions, references, candidates):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    for i in range(len(questions)):
        reference = references[i]
        candidate = candidates[i]

        scores = scorer.score(reference, candidate)

        print("Evaluating question:", questions[i])
        print("ROUGE-1:", scores["rouge1"].fmeasure)
        print("ROUGE-2:", scores["rouge2"].fmeasure)
        print("ROUGE-L:", scores["rougeL"].fmeasure)
        print("--------------------------")

calculate_rouge(questions=questions, references=ground_truths, candidates=answers)


Evaluating question: How does PUMA power its offices, stores, and warehouses?
ROUGE-1: 0.26666666666666666
ROUGE-2: 0.10344827586206896
ROUGE-L: 0.16666666666666669
--------------------------
Evaluating question: What initiative did PUMA launch for recycling polyester jerseys?
ROUGE-1: 0.37209302325581395
ROUGE-2: 0.1951219512195122
ROUGE-L: 0.37209302325581395
--------------------------
Evaluating question: By how much did PUMA reduce its own carbon emissions compared to 2017?
ROUGE-1: 0.25
ROUGE-2: 0.10526315789473684
ROUGE-L: 0.25
--------------------------
Evaluating question: What step did PUMA take to reduce its transport emissions?
ROUGE-1: 0.3636363636363636
ROUGE-2: 0.12903225806451615
ROUGE-L: 0.30303030303030304
--------------------------
