In [7]:
# !pip3 install langchain

In [2]:
import sys
sys.path.append('../')

In [11]:
from utils.llm import CustomLLM
import configparser

config = configparser.ConfigParser()
config.read('./config.ini')

['./config.ini']

In [20]:
llm = CustomLLM(api_url='http://localhost:8081/completion')

In [21]:
prompt = llm.format_prompt(user_query="Who is your favorite friends character", system_instruction=config.get('llm.prompt', 'system_instruction'))

In [22]:
print(llm(prompt))

 Rachel Green.


In [23]:
import pandas as pd
from pandas import json_normalize
from tqdm import tqdm

In [24]:
data = pd.read_csv('./data/eval_data/trivia_data.csv')
# data = pd.read_csv('../data/eval_data/trivia_data_with_response.csv')

In [25]:
data.columns = ['question', 'options', 'reference', 'correct_option']

In [26]:
# Get base model response

questions = data['question'].tolist()
responses = []

for question in tqdm(questions):
    prompt = llm.format_prompt(user_query=question, system_instruction=config.get('llm.prompt', 'system_instruction'))
    response = await llm.ainvoke(prompt,)
    responses.append(response)

100%|██████████| 100/100 [07:17<00:00,  4.38s/it]


In [28]:
print(llm.ainvoke)

<bound method BaseLLM.ainvoke of CustomLLM(api_url='http://localhost:8081/completion', custom_kwargs={'api_url': 'http://localhost:8081/completion'})>


In [29]:
data['response'] = responses

In [63]:
data.to_csv('./data/eval_data/trivia_data_with_ft_response.csv', index=False)

In [33]:
data

Unnamed: 0,question,options,reference,correct_option,response
0,How many times was Ross legally divorced? \n,"['Twice', 'Three times', 'Five times', 'Six ti...",Three times,B,2 times.
1,Where did Carol first meet Susan? \n,"['In college', 'At work', 'At the gym', 'At Ce...",At the gym,C,Carol first met Susan at the coffee shop wher...
2,How did Susan and Ross come up with Ben’s name?,"[""It was the doctor's name \n"", 'They both ha...",It was on the janitor's name tag,D,Susan and Ross came up with the name Ben for ...
3,What were Ben's first words? \n,"['Hi', 'Bye', 'Mom', 'Dumb']",Hi,A,"""Ben's first words were 'I love you.'"""
4,How long did Ross and Emily date before they g...,"['14 days', '6 weeks \n', 'A year \n', '3 mo...",6 weeks \n,B,2 years.
...,...,...,...,...,...
95,What job does Susan Bunch do?,"['Actor', 'Commercial director', 'Blacksmith',...",Commercial director,B,Susan Bunch is a masseuse.
96,Which actor's haircut did Monica want?,"['Winona Ryder', 'Demi Moore', 'Terri Hatcher'...",Demi Moore,B,Chandler Bing.
97,What city does the Mama's Little Bakery cheese...,"['Albany, New York', 'Chicago, Illinois', 'Bos...","Chicago, Illinois",B,New York City.
98,"After reading _Be Your Own Windkeeper_, who do...","['Rachel', 'Monica', 'None of the above', 'All...",Rachel,A,"Phoebe calls a leaf blower ""a windkeeper."""


In [37]:
# !pip3 install evaluate

In [40]:
# from evaluation.evaluator import Evaluator
import asyncio
from evaluation.metrics import FreeResponseMetrics

In [41]:
import ast

data['options'] = data['options'].apply(lambda x: ast.literal_eval(x))

In [42]:
data['options'] = data['options'].apply(lambda x: [option.replace('\n', ' ').strip() for option in x])

In [43]:
data = data[data['options'].apply(lambda x: len(x) == 4)]

In [44]:
data

Unnamed: 0,question,options,reference,correct_option,response
0,How many times was Ross legally divorced? \n,"[Twice, Three times, Five times, Six times]",Three times,B,2 times.
1,Where did Carol first meet Susan? \n,"[In college, At work, At the gym, At Central P...",At the gym,C,Carol first met Susan at the coffee shop wher...
2,How did Susan and Ross come up with Ben’s name?,"[It was the doctor's name, They both had uncle...",It was on the janitor's name tag,D,Susan and Ross came up with the name Ben for ...
3,What were Ben's first words? \n,"[Hi, Bye, Mom, Dumb]",Hi,A,"""Ben's first words were 'I love you.'"""
4,How long did Ross and Emily date before they g...,"[14 days, 6 weeks, A year, 3 months]",6 weeks \n,B,2 years.
...,...,...,...,...,...
95,What job does Susan Bunch do?,"[Actor, Commercial director, Blacksmith, Teacher]",Commercial director,B,Susan Bunch is a masseuse.
96,Which actor's haircut did Monica want?,"[Winona Ryder, Demi Moore, Terri Hatcher, Juli...",Demi Moore,B,Chandler Bing.
97,What city does the Mama's Little Bakery cheese...,"[Albany, New York, Chicago, Illinois, Boston, ...","Chicago, Illinois",B,New York City.
98,"After reading _Be Your Own Windkeeper_, who do...","[Rachel, Monica, None of the above, All of the...",Rachel,A,"Phoebe calls a leaf blower ""a windkeeper."""


In [49]:
from collections import Counter
import re


class MCQMetrics:
    """Computes MCQ metric scores given a LLM Reponse and Reference Text.

    The MCQ metric scores are:
    """
    def __init__(self, question, reference, options, samples=5, api_url="http://localhost:8081/completion"):
        """Initializes the MCQMetrics object.

        Args:
            question (str): The question.
            reference (str): The reference text.
            options (list): The options.
            samples (int): The number of samples.
            api_url (str): The API URL.
        
        Returns:
            dict: The MCQ metric scores.
        """
        
        self.question = question
        self.reference = reference
        self.options = options
        self.samples = samples
        self.api_url = api_url

        self.llm = CustomLLM(temperature=0.7, api_url=api_url)

    async def __call__(self) -> dict:
        """Computes all MCQ metric scores and append the average score.

        Returns:
            dict: The MCQ metric scores.
        """

        outputs = []
        answers = []

        if isinstance(self.options, str):
            self.options = ast.literal_eval(self.options)
        
        self.options = [option.replace('\n', ' ').strip() for option in self.options]
        option_a, option_b, option_c, option_d = self.options

        system_instruction = """
            You are a huge fan of the TV Show Friends. You will be given a QUESTION and four OPTIONS. I want you to ANSWER the QUESTION with the following steps.

            Evaluation Steps:
            1. Read the QUESTION carefully.
            2. Choose the correct OPTION from OPTIONS best of your knowledge.
            3. Output the ANSWER which is a single alphabet from A, B, C, D which is the right OPTION for the QUESTION
            4. The Output format for each OPTION is 
                for A: 'ANSWER: A'
                for B: 'ANSWER: B'
                for C: 'ANSWER: C'
                for D: 'ANSWER: D'

            Here are a few Examples for how I expect the answer to be.
            Examples:

            {
                QUESTION: What is the name of Ross and Rachel's daughter,
                OPTIONS: 
                    A. Emma
                    B. Delilah
                    C. Bemma
                    D. Deliluu
                ANSWER: A
            },

            {
                QUESTION: What is Chandler Bing's Middle Name,
                OPTIONS: 
                    A. Meredith
                    B. Muriel
                    C. Richard
                    D. Robert
                ANSWER: B
            }

            Based on the above Evaluation Steps and Examples now ANSWER the QUESTION I give you
        """

        user_query = f"""

            QUESTION: {self.question}

            OPTIONS: 
                A. {option_a}
                B. {option_b}
                C. {option_c}
                D. {option_d}

            OUTPUT only the ANSWER which is either A, B, C, or D. 
            The ANSWER is
        """
        
        other_answers = []
        while not answers:
            for _ in range(self.samples):
                prompt = self.llm.format_prompt(user_query=user_query, system_instruction=system_instruction)
                output = await self.llm.ainvoke(prompt)
                outputs.append(output)

                match = re.search(r'\b(?:ANSWER|Answer is)(?:\:|) ([A-D])\b', output, flags=re.IGNORECASE)

                if match:
                    answer_choice = match.group(1)
                    answers.append(answer_choice)

        # Count occurrences
        correct_format_rate = len(answers) / self.samples
        correct_answer_rate = sum(1 for answer in answers if answer == self.reference) / len(answers)

        # get the most occurring sampled answer as the final answer
        counts = Counter(answers)
        answer = max(counts, key=lambda x: (counts[x], answers.index(x)))

        # print(outputs)
        # print(answers)

        mcq_metrics = {
            'correct_format_rate': correct_format_rate,
            'correct_answer_rate': correct_answer_rate,
            'answer': answer,
        }
        
        return mcq_metrics

In [50]:
class Evaluator:
    def __init__(self, data, port_number=8081):
        self.data = data
        self.api_url = f"http://localhost:{port_number}/completion"

    async def evaluate_row(self, row):
        # Extract relevant information from the dataframe row
        question = row['question']
        reference = row['reference']
        options = row['options']

        if 'response' in row:
            response = row['response']
        else:
            response = None
        
        # Create instances of metrics classes
        mcq_metrics = MCQMetrics(question=question, reference=reference, options=options)
        free_response_metrics = FreeResponseMetrics(question=question, reference=reference, response=response)

        # Run the metrics asynchronously
        mcq_result = await mcq_metrics()
        free_response_result = await free_response_metrics()

        return {
            'mcq_metrics': mcq_result,
            'free_response_metrics': free_response_result,
        }

    async def evaluate_dataframe(self):
        results = []

        for index, row in tqdm(self.data.iterrows(), desc="Evaluating rows", total=len(self.data), ncols=80):
            result = await self.evaluate_row(row)
            results.append(result)

        return results

    async def run_evaluation(self):
        results = await self.evaluate_dataframe()
        
        # print(results)
        return results

In [51]:
evaluator = Evaluator(data = data)

In [55]:
# !pip3 install nltk

In [56]:
results = await evaluator.run_evaluation()

Evaluating rows: 100%|██████████████████████████| 99/99 [30:09<00:00, 18.28s/it]


In [57]:
mcq_results = [result['mcq_metrics'] for result in results]
free_response_results = [result['free_response_metrics'] for result in results]
llm_mcq_answers = [result['mcq_metrics']['answer'] for result in results]


In [59]:


data['mcq_results'] = mcq_results
data['free_response_results'] = free_response_results
data['llm_mcq_answers'] = llm_mcq_answers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['mcq_results'] = mcq_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['free_response_results'] = free_response_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['llm_mcq_answers'] = llm_mcq_answers


In [61]:
data.to_csv('./data/eval_data/trivia_data_with_ft_response_metrics.csv', index=False)

In [62]:
data

Unnamed: 0,question,options,reference,correct_option,response,mcq_results,free_response_results,llm_mcq_answers
0,How many times was Ross legally divorced? \n,"[Twice, Three times, Five times, Six times]",Three times,B,2 times.,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.41092867442125397, 'bleu':...",B
1,Where did Carol first meet Susan? \n,"[In college, At work, At the gym, At Central P...",At the gym,C,Carol first met Susan at the coffee shop wher...,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.17789655988947148, 'bleu':...",B
2,How did Susan and Ross come up with Ben’s name?,"[It was the doctor's name, They both had uncle...",It was on the janitor's name tag,D,Susan and Ross came up with the name Ben for ...,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.1096754383826632, 'bleu': ...",B
3,What were Ben's first words? \n,"[Hi, Bye, Mom, Dumb]",Hi,A,"""Ben's first words were 'I love you.'""","{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.009469696969696994, 'bleu'...",B
4,How long did Ross and Emily date before they g...,"[14 days, 6 weeks, A year, 3 months]",6 weeks \n,B,2 years.,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.01388888888888891, 'bleu':...",B
...,...,...,...,...,...,...,...,...
95,What job does Susan Bunch do?,"[Actor, Commercial director, Blacksmith, Teacher]",Commercial director,B,Susan Bunch is a masseuse.,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.013661202185792372, 'bleu'...",B
96,Which actor's haircut did Monica want?,"[Winona Ryder, Demi Moore, Terri Hatcher, Juli...",Demi Moore,B,Chandler Bing.,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.013440860215053784, 'bleu'...",B
97,What city does the Mama's Little Bakery cheese...,"[Albany, New York, Chicago, Illinois, Boston, ...","Chicago, Illinois",B,New York City.,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.00856164383561646, 'bleu':...",B
98,"After reading _Be Your Own Windkeeper_, who do...","[Rachel, Monica, None of the above, All of the...",Rachel,A,"Phoebe calls a leaf blower ""a windkeeper.""","{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.023946360153256727, 'bleu'...",B


In [65]:
data.loc[data['correct_option']==data['llm_mcq_answers']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   question               38 non-null     object
 1   options                38 non-null     object
 2   reference              38 non-null     object
 3   correct_option         38 non-null     object
 4   response               38 non-null     object
 5   mcq_results            38 non-null     object
 6   free_response_results  38 non-null     object
 7   llm_mcq_answers        38 non-null     object
dtypes: object(8)
memory usage: 2.7+ KB


In [66]:
data['correct_format_rate'] = data['mcq_results'].apply(lambda x: x['correct_format_rate'])
# data['correct_answer_rate'] = data['mcq_results'].apply(lambda x: x['correct_answer_rate'])
data['average_score'] = data['free_response_results'].apply(lambda x: x['average_score'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['correct_format_rate'] = data['mcq_results'].apply(lambda x: x['correct_format_rate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['average_score'] = data['free_response_results'].apply(lambda x: x['average_score'])


In [67]:
print('MCQ Results:')
print('MCQ Accuracy: ', len(data[data.llm_mcq_answers == data.correct_option]) / len(data))
print('Correct format rate: ', data['correct_format_rate'].mean())
# print('Correct answer rate: ', data['correct_answer_rate'].mean())
print('\n')
print('Free Response Results:')
print('Average score: ', data['average_score'].mean())


MCQ Results:
MCQ Accuracy:  0.3838383838383838
Correct format rate:  0.997979797979798


Free Response Results:
Average score:  0.0910046325201405


In [None]:
# correct = len(data[data.correct_op])

In [68]:
a=pd.read_csv('/Users/zoey/Persona-LLM-Chatbot-For-QA/data/eval_data/trivia_data.csv')

In [69]:
a.head()

Unnamed: 0,Question,Options,Correct Answer,Correct_Answer_no
0,How many times was Ross legally divorced? \n,"['Twice', 'Three times', 'Five times', 'Six ti...",Three times,B
1,Where did Carol first meet Susan? \n,"['In college', 'At work', 'At the gym', 'At Ce...",At the gym,C
2,How did Susan and Ross come up with Ben’s name?,"[""It was the doctor's name \n"", 'They both ha...",It was on the janitor's name tag,D
3,What were Ben's first words? \n,"['Hi', 'Bye', 'Mom', 'Dumb']",Hi,A
4,How long did Ross and Emily date before they g...,"['14 days', '6 weeks \n', 'A year \n', '3 mo...",6 weeks \n,B
