In [34]:
import nltk.translate.bleu_score as bleu
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from rouge_score import rouge_scorer
import importlib
import chatbot
import models.instances.chatgpt
import models.instances.mistral
importlib.reload(chatbot)
importlib.reload(models.instances.chatgpt)
importlib.reload(models.instances.mistral)
from chatbot import chatbot_instance
from models.instances.mistral import mistral_chat_instance
from models.instances.chatgpt import openai_chat_instance

In [26]:
from dotenv import load_dotenv
load_dotenv()

True

### OpenAI

In [35]:
# Query the chatbot
response = openai_chat_instance.query("What is the purpose of life?")
print("Response: ", response["messages"][-1]["content"])


Response:  Sanskrit Transliteration:
यत्र योगेश्वरः कृष्णो यत्र पार्थो धनुर्धरः।
तत्र श्रीर्विजयो भूतिर्ध्रुवा नीतिर्मतिर्मम।।

English Translation (BG 18.78):
Wherever there is Lord Krishna, the master of yoga, and wherever there is Arjuna, the supreme archer, there will also certainly be opulence, victory, extraordinary power, and morality. This is my opinion.

Explanation:
The purpose of life, according to the Bhagavad Gita, is to establish a loving relationship with Lord Krishna, the Supreme Personality of Godhead. When we engage in devotional service to Krishna, following His instructions as given in the Gita, we can achieve true success and fulfillment in life. Just like Arjuna followed Krishna's guidance on the battlefield of Kurukshetra and attained victory, we too can achieve success in our lives by surrendering to Krishna and following His teachings.

Practical Guidance:
To apply this teaching in our daily lives, we should strive to develop a personal relationship with Krishn

### Mistral

In [36]:
# Query the chatbot
response = mistral_chat_instance.query("What is the purpose of life?")
print("Response: ", response["messages"][-1]["content"])


Response:   The purpose of life is enlightened by Lord Krishna in Bhagavad-gita (4.16):
> arjuna vishada-vasam
> buddhi-yogam samatikramate
> naitat samniedayamyaham
> sarva-sammohasudhau bhrata

Translated as: "Arjuna, when the intelligence is thus purified, one attains to My understanding through intelligence alone without delusion, O Arjun."

In simpler terms, the purpose of life is to develop intelligence and purify it so that we can understand God (Krishna) and our relationship with Him. This is achieved by engaging in the practice of Bhakti-yoga, or devotional service, which helps us cultivate love for God through various spiritual practices such as chanting the Hare Krishna mantra, reading scriptures, serving others selflessly, etc.

One common misconception is that material accomplishments or personal happiness are the purpose of life. However, Bhagavad-gita clearly teaches (2.45) that even the most exalted material achievements are transient and temporary: "na te viduh sarva e

### Generating answers for all the questions

In [57]:
# Load the CSV
data = pd.read_csv("evaluation_data/q&a.csv")

def generate_answers(instance):
    generated_answers = []
    for question in data["question"]:
        response = instance.query(question)
        generated_answers.append(response["messages"][-1]["content"])

    return generated_answers

# Add generated answers to the dataframe
data["open_ai_generated_answer"] = generate_answers(openai_chat_instance)
data["mistral_generated_answer"] = generate_answers(mistral_chat_instance)

In [58]:
data

Unnamed: 0,question,answer,open_ai_generated_answer,mistral_generated_answer
0,Why is the Bhagavad-Gita the perfect theistic ...,"""It is the perfect theistic science because it...",Sanskrit Transliteration:\nश्रीभगवानुवाच |\nइम...,"The Bhagavad-Gita is described as the ""perfec..."
1,Why was Dhritarashtra fearful about the outcom...,"""Because the battle was arranged to be fought ...",Sanskrit Transliteration:\nधृतराष्ट्र उवाच |\n...,In response to your question about Dhritarash...
2,What is Duryodhana's purpose in speaking to Dr...,Duryodhana pointed out that Drona trained Dris...,Sanskrit Transliteration:\nअश्रुपूर्णाकुलेक्षण...,In response to your question about Duryodhana...
3,4.Name four powerful fighters each on the side...,Kaurava's Side:\n-Drona\n-Bhisma\n-Karna\n-Kri...,"In the Bhagavad Gita, Lord Krishna explains th...",In response to your question about four power...
4,Describe the significance of the blowing of co...,"-""The sounding of the transcendental conchshel...",Sanskrit Transliteration:\nअथ व्यवस्थितान्दृष्...,In response to your question about the signif...
...,...,...,...,...
134,What was Arjuna's question to Krishna towards ...,What is the destination of the transcendentali...,\nThe question that Arjuna asked Krishna towar...,Arjuna's question towards the end of Chapter ...
135,What happens to one who falls down after pract...,After enjoying life on the higher planets for ...,Sanskrit Transliteration:\nअप्राप्य योगसंसिद्ध...,"BG 6.41 (Chapter 6, Verse 41):\n> arjuna uvāc..."
136,What happens to one who falls down after pract...,He achieves the rare opportunity of taking bir...,Sanskrit Transliteration:\nyoginām api sarveṣā...,The verse you are referring to is Bhagavad-gī...
137,Define yoga. Define various types of yogas in ...,a) Yoga: linking our consciousness with the Su...,Sanskrit Transliteration:\nयोगः कर्मसु कौशलं (...,"In the sacred text of Bhagavad-gita (6.46), Y..."


In [59]:
# Save the updated CSV
data.to_csv("evaluation_data/questions_and_answers_with_predictions_5_shot_template.csv", index=False)

In [60]:
data = pd.read_csv("evaluation_data/questions_and_answers_with_predictions_5_shot_template.csv")

### Rouge and BLEU Score

In [63]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import pandas as pd

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate BLEU and ROUGE
def calculate_scores(reference, hypothesis):
    # Calculate BLEU
    bleu_score = sentence_bleu([reference.split()], hypothesis.split())

    # Calculate ROUGE
    rouge_scores = scorer.score(reference, hypothesis)

    return bleu_score, rouge_scores

# Compute scores
bleu_scores = []
rouge_scores = []

for _, row in data.iterrows():
    reference = row["answer"]
    hypothesis = row["open_ai_generated_answer"]

    bleu, rouge = calculate_scores(reference, hypothesis)
    bleu_scores.append(bleu)
    rouge_scores.append(rouge)

# Add scores to the dataframe
data["bleu_score"] = bleu_scores
data["rouge1"] = [r["rouge1"].fmeasure for r in rouge_scores]
data["rouge2"] = [r["rouge2"].fmeasure for r in rouge_scores]
data["rougeL"] = [r["rougeL"].fmeasure for r in rouge_scores]

# Save the results
data.to_csv("open_ai_evaluation_results.csv", index=False)

# Display average scores
print(f"Average BLEU Score: {data['bleu_score'].mean():.4f}")
print(f"Average ROUGE-1 F1 Score: {data['rouge1'].mean():.4f}")
print(f"Average ROUGE-2 F1 Score: {data['rouge2'].mean():.4f}")
print(f"Average ROUGE-L F1 Score: {data['rougeL'].mean():.4f}")


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.0058
Average ROUGE-1 F1 Score: 0.1336
Average ROUGE-2 F1 Score: 0.0336
Average ROUGE-L F1 Score: 0.0905


In [64]:
# Compute scores
bleu_scores = []
rouge_scores = []

for _, row in data.iterrows():
    reference = row["answer"]
    hypothesis = row["mistral_generated_answer"]

    bleu, rouge = calculate_scores(reference, hypothesis)
    bleu_scores.append(bleu)
    rouge_scores.append(rouge)

# Add scores to the dataframe
data["bleu_score"] = bleu_scores
data["rouge1"] = [r["rouge1"].fmeasure for r in rouge_scores]
data["rouge2"] = [r["rouge2"].fmeasure for r in rouge_scores]
data["rougeL"] = [r["rougeL"].fmeasure for r in rouge_scores]

# Save the results
data.to_csv("mistral_evaluation_results.csv", index=False)

# Display average scores
print(f"Average BLEU Score: {data['bleu_score'].mean():.4f}")
print(f"Average ROUGE-1 F1 Score: {data['rouge1'].mean():.4f}")
print(f"Average ROUGE-2 F1 Score: {data['rouge2'].mean():.4f}")
print(f"Average ROUGE-L F1 Score: {data['rougeL'].mean():.4f}")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.0017
Average ROUGE-1 F1 Score: 0.1218
Average ROUGE-2 F1 Score: 0.0220
Average ROUGE-L F1 Score: 0.0776
