## Basic Langchain Based Prompt with Auto Evaluation

In [1]:
# Importing Necessary libraries
import os
import bs4
from langchain.vectorstores import FAISS
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.evaluation.qa import QAEvalChain
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


1. Setting Ollama Model and Embeddings

In [2]:
embeddings = OllamaEmbeddings(model="llama2:latest")
llm = OllamaLLM(model="llama2:latest")

2. Loading Documents based on URL

In [32]:
loader = WebBaseLoader(
    web_paths=("https://en.wikipedia.org/wiki/India",),
    bs_kwargs=dict(
        parse_only = bs4.SoupStrainer(
            # class_=("post-content", "post-title", "post-header")
        )
    )
)
documents = loader.load()

In [33]:
# Splitting the documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents= documents)

3. Creatting Vector Databases using FAISS

In [34]:
vectordb = FAISS.from_documents(texts, embedding= embeddings)

4. Creating Retrival Question and Answer Chain

In [35]:
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = vectordb.as_retriever()
)

5.1. Generating answer for Question 1

In [36]:
question = "What is the official name India?"
ground_truth = "The official name of India is Republic of India"


answer = qa_chain.invoke({"query":question})['result']
print(f"Question: {question}\n" + "-"*100)
print(f"Answer: {answer}\n" + "-"*100)
print(f"Ground Truth: {ground_truth}\n" + "-"*100)

Question: What is the official name India?
----------------------------------------------------------------------------------------------------
Answer: The official name of India is Bhārat Gaṇarājya (ISO).
----------------------------------------------------------------------------------------------------
Ground Truth: The official name of India is Republic of India
----------------------------------------------------------------------------------------------------


6.1. Evaluating The Answer for Question 1

In [37]:
# Creating Evaluation Chain
llm_eval = OllamaLLM(model='llama2:latest')

eval_prompt = PromptTemplate(
    input_variables=["question", "answer", "ground_truth"],
    template="""
            You are a helpful assistant that evaluates how well the given answer matches the ground truth.
            Question: {question}
            Answer: {answer}
            Ground Truth: {ground_truth}

            Please provide a score from 1 to 5, where 1 means the answer is completely irrelevant, and 5 means the answer is perfectly aligned with the ground truth.
            Also, provide a short explanation for your score.

            Score:
            Explanation:
    """,
)

# Preparing evaluation data
eval_examples = [{
    "query": question,
    "answer": answer,
    "result": ground_truth
    }]

# Creating eval chain
eval_chain = QAEvalChain.from_llm(
    llm= llm_eval
)

# Prediction text
predictions = [{'result': answer}]

# Run the evaluation
evaluated_result = eval_chain.evaluate(eval_examples, predictions=predictions)

print("\nEvaluation Results:")
print(evaluated_result)



Evaluation Results:
[{'results': 'CORRECT. The student answer accurately reflects the official name of India, which is "Bhārat Gaṇarājya" according to the ISO standard.'}]


7.1. Evaluating Answer using ROUGE method for Question 1

In [38]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = True)
rouge_scores = scorer.score(ground_truth, answer)
print("ROUGE Scores:\n", "rouge1: ", rouge_scores['rouge1'], "\nrouge2: ", rouge_scores['rouge2'], "\nrougeL: ", rouge_scores['rougeL'],)

ROUGE Scores:
 rouge1:  Score(precision=0.5, recall=0.6666666666666666, fmeasure=0.5714285714285715) 
rouge2:  Score(precision=0.45454545454545453, recall=0.625, fmeasure=0.5263157894736842) 
rougeL:  Score(precision=0.5, recall=0.6666666666666666, fmeasure=0.5714285714285715)


8.1. Evaluating Answer using BLUE method for Question 1

In [39]:
import evaluate

# Load the BLEU metric
bleu = evaluate.load("bleu")

# Calculate the BLEU score
results = bleu.compute(predictions=[answer], references=[ground_truth])

# Print the results
print(results['bleu'])

0.41722614486115056


5.2. Generating ANswer for question 2

In [41]:

question = "How many states India have?"
ground_truth = "India have 29 states."


answer = qa_chain.invoke({"query":question})['result']
print(f"Question: {question}\n" + "-"*100)
print(f"Answer: {answer}\n" + "-"*100)
print(f"Ground Truth: {ground_truth}\n" + "-"*100)

Question: How many states India have?
----------------------------------------------------------------------------------------------------
Answer: Based on the given context, India has 29 states.
----------------------------------------------------------------------------------------------------
Ground Truth: India have 29 states.
----------------------------------------------------------------------------------------------------


6.2. Evaluating The Answer for Question 2

In [42]:
# Preparing evaluation data
eval_examples = [{
    "query": question,
    "answer": answer,
    "result": ground_truth
    }]

# Creating eval chain
eval_chain = QAEvalChain.from_llm(
    llm= llm_eval
)

# Prediction text
predictions = [{'result': answer}]

# Run the evaluation
evaluated_result = eval_chain.evaluate(eval_examples, predictions=predictions)

print("\nEvaluation Results:")
print(evaluated_result)



Evaluation Results:
[{'results': 'GRADE: CORRECT'}]


7.2. Evaluating Answer using ROUGE method for Question 2

In [43]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = True)
rouge_scores = scorer.score(ground_truth, answer)
print("ROUGE Scores:\n", "rouge1: ", rouge_scores['rouge1'], "\nrouge2: ", rouge_scores['rouge2'], "\nrougeL: ", rouge_scores['rougeL'],)

ROUGE Scores:
 rouge1:  Score(precision=0.3333333333333333, recall=0.75, fmeasure=0.46153846153846156) 
rouge2:  Score(precision=0.125, recall=0.3333333333333333, fmeasure=0.18181818181818182) 
rougeL:  Score(precision=0.3333333333333333, recall=0.75, fmeasure=0.46153846153846156)


8.2. Evaluating Answer using BLUE method for Question 2

In [44]:
import evaluate

# Load the BLEU metric
bleu = evaluate.load("bleu")

# Calculate the BLEU score
results = bleu.compute(predictions=[answer], references=[ground_truth])

# Print the results
print(results['bleu'])

0.0
