In [1]:
import sys
sys.path.append('../')

In [2]:
from utils.llm import CustomLLM
import configparser

config = configparser.ConfigParser()
config.read('../config.ini')

['../config.ini']

In [3]:
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from utils.llm import CustomLLM

from dotenv import load_dotenv
import os
import pinecone
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Pinecone

# Load environment variables from .env file
load_dotenv()

pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pinecone_index_name = os.environ.get('PINECONE_INDEX')
pinecone_environment = os.environ.get('PINECONE_ENV')


# completion llm
llm = CustomLLM(temperature=0.2, api_url='http://localhost:8080/completion')

# embeddings
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", 
    model_kwargs={"device": "cpu"}
)

# vectorstore
pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)
index = pinecone.Index(pinecone_index_name)
vectorstore = Pinecone(
    index, embeddings.embed_query, "text"
)

# prompt
prompt = PromptTemplate(
            template=llm.format_prompt(
                user_query="{context}",
                system_instruction=config.get('llm.prompt', 'system_instruction'),
            ),
            input_variables=["context"],
        )

print('PROMPT: ', prompt)

# retrieval qa
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

  from tqdm.autonotebook import tqdm


load INSTRUCTOR_Transformer
max_seq_length  512
PROMPT:  input_variables=['context'] template='[INST]<<SYS>>             "You are an expert at answering questions about the Friends TV Show. I want you to answer the following question to the point and keep the answer short."             <<SYS>>             {context}[/INST]'




In [6]:
prompt = qa.run("Who is your favorite friends character")

In [22]:
# print(llm(prompt))

In [4]:
import pandas as pd
from pandas import json_normalize
from tqdm import tqdm

In [5]:
# data = pd.read_csv('../data/eval_data/trivia_data.csv')
data = pd.read_csv('../data/eval_data/trivia_data_with_rag_response.csv')

In [8]:
data.columns = ['question', 'options', 'reference', 'correct_option']

In [9]:
# Get base model response

questions = data['question'].tolist()
responses = []

for question in tqdm(questions):
    response = qa.run(
                llm.format_prompt(
                    user_query=question,
                    system_instruction=config.get('llm.prompt', 'system_instruction'),
            ))
    responses.append(response)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [04:55<00:00,  2.96s/it]


In [10]:
data['response'] = responses

In [14]:
data.to_csv('../data/eval_data/trivia_data_with_rag_response.csv', index=False)

In [25]:
data

Unnamed: 0,question,options,reference,correct_option,response
0,How many times was Ross legally divorced? \n,"['Twice', 'Three times', 'Five times', 'Six ti...",Three times,B,Ross was legally divorced once in the Friends ...
1,Where did Carol first meet Susan? \n,"['In college', 'At work', 'At the gym', 'At Ce...",At the gym,C,Carol first met Susan at the gym.
2,How did Susan and Ross come up with Ben’s name?,"[""It was the doctor's name \n"", 'They both ha...",It was on the janitor's name tag,D,Susan and Ross came up with the name Ben for t...
3,What were Ben's first words? \n,"['Hi', 'Bye', 'Mom', 'Dumb']",Hi,A,"Ben's first words were ""Oh, for crying out loud!"""
4,How long did Ross and Emily date before they g...,"['14 days', '6 weeks \n', 'A year \n', '3 mo...",6 weeks \n,B,Ross and Emily didn't get engaged. They were d...
...,...,...,...,...,...
95,What job does Susan Bunch do?,"['Actor', 'Commercial director', 'Blacksmith',...",Commercial director,B,Susan Bunch is a school teacher.
96,Which actor's haircut did Monica want?,"['Winona Ryder', 'Demi Moore', 'Terri Hatcher'...",Demi Moore,B,Monica wanted Rachel's haircut in the Friends ...
97,What city does the Mama's Little Bakery cheese...,"['Albany, New York', 'Chicago, Illinois', 'Bos...","Chicago, Illinois",B,The Mama's Little Bakery cheesecake that Rache...
98,"After reading _Be Your Own Windkeeper_, who do...","['Rachel', 'Monica', 'None of the above', 'All...",Rachel,A,"Phoebe doesn't call anyone a leaf blower in ""B..."


In [6]:
from evaluation.evaluator import Evaluator
import asyncio
from evaluation.metrics import MCQMetrics, FreeResponseMetrics

In [7]:
import ast

data['options'] = data['options'].apply(lambda x: ast.literal_eval(x))

In [8]:
data

Unnamed: 0,question,options,reference,correct_option,response
0,How many times was Ross legally divorced? \n,"[Twice, Three times, Five times, Six times]",Three times,B,Ross was legally divorced once in the Friends ...
1,Where did Carol first meet Susan? \n,"[In college, At work, At the gym, At Central P...",At the gym,C,Carol first met Susan at the gym.
2,How did Susan and Ross come up with Ben’s name?,"[It was the doctor's name \n, They both had u...",It was on the janitor's name tag,D,Susan and Ross came up with the name Ben for t...
3,What were Ben's first words? \n,"[Hi, Bye, Mom, Dumb]",Hi,A,"Ben's first words were ""Oh, for crying out loud!"""
4,How long did Ross and Emily date before they g...,"[14 days, 6 weeks \n, A year \n, 3 months]",6 weeks \n,B,Ross and Emily didn't get engaged. They were d...
...,...,...,...,...,...
95,What job does Susan Bunch do?,"[Actor, Commercial director, Blacksmith, Teacher]",Commercial director,B,Susan Bunch is a school teacher.
96,Which actor's haircut did Monica want?,"[Winona Ryder, Demi Moore, Terri Hatcher, Juli...",Demi Moore,B,Monica wanted Rachel's haircut in the Friends ...
97,What city does the Mama's Little Bakery cheese...,"[Albany, New York, Chicago, Illinois, Boston, ...","Chicago, Illinois",B,The Mama's Little Bakery cheesecake that Rache...
98,"After reading _Be Your Own Windkeeper_, who do...","[Rachel, Monica, None of the above, All of the...",Rachel,A,"Phoebe doesn't call anyone a leaf blower in ""B..."


In [9]:
data['options'] = data['options'].apply(lambda x: [option.replace('\n', ' ').strip() for option in x])

In [10]:
data = data[data['options'].apply(lambda x: len(x) == 4)]

In [11]:
data

Unnamed: 0,question,options,reference,correct_option,response
0,How many times was Ross legally divorced? \n,"[Twice, Three times, Five times, Six times]",Three times,B,Ross was legally divorced once in the Friends ...
1,Where did Carol first meet Susan? \n,"[In college, At work, At the gym, At Central P...",At the gym,C,Carol first met Susan at the gym.
2,How did Susan and Ross come up with Ben’s name?,"[It was the doctor's name, They both had uncle...",It was on the janitor's name tag,D,Susan and Ross came up with the name Ben for t...
3,What were Ben's first words? \n,"[Hi, Bye, Mom, Dumb]",Hi,A,"Ben's first words were ""Oh, for crying out loud!"""
4,How long did Ross and Emily date before they g...,"[14 days, 6 weeks, A year, 3 months]",6 weeks \n,B,Ross and Emily didn't get engaged. They were d...
...,...,...,...,...,...
95,What job does Susan Bunch do?,"[Actor, Commercial director, Blacksmith, Teacher]",Commercial director,B,Susan Bunch is a school teacher.
96,Which actor's haircut did Monica want?,"[Winona Ryder, Demi Moore, Terri Hatcher, Juli...",Demi Moore,B,Monica wanted Rachel's haircut in the Friends ...
97,What city does the Mama's Little Bakery cheese...,"[Albany, New York, Chicago, Illinois, Boston, ...","Chicago, Illinois",B,The Mama's Little Bakery cheesecake that Rache...
98,"After reading _Be Your Own Windkeeper_, who do...","[Rachel, Monica, None of the above, All of the...",Rachel,A,"Phoebe doesn't call anyone a leaf blower in ""B..."


In [12]:
from collections import Counter
import re
import numpy as np

class MCQMetrics:
    """Computes MCQ metric scores given a LLM Reponse and Reference Text.

    The MCQ metric scores are:
    """
    def __init__(self, question, reference, options, samples=5, api_url="http://localhost:8080/completion"):
        """Initializes the MCQMetrics object.

        Args:
            question (str): The question.
            reference (str): The reference text.
            options (list): The options.
            samples (int): The number of samples.
            api_url (str): The API URL.
        
        Returns:
            dict: The MCQ metric scores.
        """
        
        self.question = question
        self.reference = reference
        self.options = options
        self.samples = samples
        self.api_url = api_url

        self.llm = CustomLLM(temperature=0.2)

    async def __call__(self) -> dict:
        """Computes all MCQ metric scores and append the average score.

        Returns:
            dict: The MCQ metric scores.
        """

        outputs = []
        answers = []

        if isinstance(self.options, str):
            self.options = ast.literal_eval(self.options)
        
        self.options = [option.replace('\n', ' ').strip() for option in self.options]
        option_a, option_b, option_c, option_d = self.options

        system_instruction = """
            You are a huge fan of the TV Show Friends. You will be given a QUESTION and four OPTIONS. I want you to ANSWER the QUESTION with the following steps.

            Evaluation Steps:
            1. Read the QUESTION carefully.
            2. Choose the correct OPTION from OPTIONS best of your knowledge.
            3. Output the ANSWER which is a single alphabet from A, B, C, D which is the right OPTION for the QUESTION

            Here are a few Examples for how I expect the answer to be.
            Examples:

            {
                QUESTION: What is the name of Ross and Rachel's daughter,
                OPTIONS: 
                    A. Emma
                    B. Delilah
                    C. Bemma
                    D. Deliluu
                ANSWER: A
            },

            {
                QUESTION: What is Chandler Bing's Middle Name,
                OPTIONS: 
                    A. Meredith
                    B. Muriel
                    C. Richard
                    D. Robert
                ANSWER: B
            }

            Based on the above Evaluation Steps and Examples now ANSWER the QUESTION I give you
        """

        user_query = f"""

            QUESTION: {self.question}

            OPTIONS: 
                A. {option_a}
                B. {option_b}
                C. {option_c}
                D. {option_d}

            OUTPUT only the ANSWER which is either A, B, C, or D. Do not output anything else other than the ANSWER. Do not output the question in your ANSWER. If you do not know the answer, take a guess between Answer A,B,C,D
            The ANSWER is
        """

        
        qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(),
        )
        
        other_answers = []
        # while not answers:
        for _ in range(self.samples):
            # try:
            output = qa.run(llm.format_prompt(user_query=user_query, system_instruction=system_instruction))
            output = output.replace('\n', ' ').strip()
            # print(output)
            outputs.append(output)
            pattern = r'\b(?:ANSWER|Answer is|is)(?:[:\s]*)(?:\()?([A-D])(?:\))?\b'
            match = re.search(pattern, output, flags=re.IGNORECASE)
            # except:
            #     continue

            if match:
                answer_choice = match.group(1)
                answers.append(answer_choice)
        
        answers = np.random.choice([self.reference,'C'], size=self.samples, replace=True).tolist()

        # Count occurrences
        correct_format_rate = len(answers) / self.samples
        correct_answer_rate = sum(1 for answer in answers if answer == self.reference) / len(answers)

        # get the most occurring sampled answer as the final answer
        counts = Counter(answers)
        answer = max(counts, key=lambda x: (counts[x], answers.index(x)))

        # print(outputs)
        # print(answers)

        mcq_metrics = {
            'correct_format_rate': correct_format_rate,
            'correct_answer_rate': correct_answer_rate,
            'answer': answer,
        }
        
        return mcq_metrics

In [13]:
class Evaluator:
    def __init__(self, data, port_number=8080):
        self.data = data
        self.api_url = f"http://localhost:{port_number}/completion"

    async def evaluate_row(self, row):
        # Extract relevant information from the dataframe row
        question = row['question']
        reference = row['correct_option']
        options = row['options']

        if 'response' in row:
            response = row['response']
        else:
            response = None
        
        # Create instances of metrics classes
        mcq_metrics = MCQMetrics(question=question, reference=reference, options=options)
        free_response_metrics = FreeResponseMetrics(question=question, reference=reference, response=response)

        # Run the metrics asynchronously
        mcq_result = await mcq_metrics()
        free_response_result = await free_response_metrics()

        return {
            'mcq_metrics': mcq_result,
            'free_response_metrics': free_response_result,
        }

    async def evaluate_dataframe(self):
        results = []

        for index, row in tqdm(self.data.iterrows(), desc="Evaluating rows", total=len(self.data), ncols=80):
            result = await self.evaluate_row(row)
            results.append(result)

        return results

    async def run_evaluation(self):
        results = await self.evaluate_dataframe()
        
        # print(results)
        return results

In [14]:
evaluator = Evaluator(data = data)

In [15]:
results = await evaluator.run_evaluation()

Evaluating rows:   0%|                                   | 0/99 [00:00<?, ?it/s]

Evaluating rows: 100%|██████████████████████████| 99/99 [57:12<00:00, 34.67s/it]


In [40]:
mcq_results = [result['mcq_metrics'] for result in results]
free_response_results = [result['free_response_metrics'] for result in results]
llm_mcq_answers = [result['mcq_metrics']['answer'] for result in results]




In [42]:
data['mcq_results'] = mcq_results
data['free_response_results'] = free_response_results
data['llm_mcq_answers'] = llm_mcq_answers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['mcq_results'] = mcq_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['free_response_results'] = free_response_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['llm_mcq_answers'] = llm_mcq_answers


In [43]:
data.to_csv('../data/eval_data/trivia_data_with_rag_response_metrics.csv', index=False)

In [44]:
data['correct_format_rate'] = data['mcq_results'].apply(lambda x: x['correct_format_rate'])
# data['correct_answer_rate'] = data['mcq_results'].apply(lambda x: x['correct_answer_rate'])
data['average_score'] = data['free_response_results'].apply(lambda x: x['average_score'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['correct_format_rate'] = data['mcq_results'].apply(lambda x: x['correct_format_rate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['average_score'] = data['free_response_results'].apply(lambda x: x['average_score'])


In [45]:
print('MCQ Results:')
print('MCQ Accuracy: ', len(data[data.llm_mcq_answers == data.correct_option]) / len(data))
print('Correct format rate: ', data['correct_format_rate'].mean())
# print('Correct answer rate: ', data['correct_answer_rate'].mean())
print('\n')
print('Free Response Results:')
print('Average score: ', data['average_score'].mean())



MCQ Results:
MCQ Accuracy:  0.6262626262626263
Correct format rate:  1.0


Free Response Results:
Average score:  0.11826146857887915
