In [1]:
import sys
sys.path.append('../')

In [2]:
from utils.llm import CustomLLM
import configparser

config = configparser.ConfigParser()
config.read('../config.ini')

['../config.ini']

In [3]:
llm = CustomLLM()

In [4]:
prompt = llm.format_prompt(user_query="Who is your favorite friends character", system_instruction=config.get('llm.prompt', 'system_instruction'))

In [5]:
print(llm(prompt))

  My favorite Friends character is Joey Tribbiani. His funny one-liners and lovable personality make him a standout character in the show.


In [6]:
import pandas as pd
from pandas import json_normalize
from tqdm import tqdm

In [7]:
data = pd.read_csv('../data/eval_data/trivia_data.csv')
# data = pd.read_csv('../data/eval_data/trivia_data_with_response.csv')

In [8]:
data.columns = ['question', 'options', 'reference', 'correct_option']

In [9]:
# Get base model response

questions = data['question'].tolist()
responses = []

for question in tqdm(questions):
    prompt = llm.format_prompt(user_query=question, system_instruction=config.get('llm.prompt', 'system_instruction'))
    response = await llm.ainvoke(prompt)
    responses.append(response)

  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [04:16<00:00,  1.28s/it]


In [10]:
data['response'] = responses

In [27]:
data.to_csv('../data/eval_data/trivia_data_with_base_response.csv', index=False)

In [26]:
data

Unnamed: 0,question,options,reference,correct_option,response
0,How many times was Ross legally divorced? \n,"[Twice, Three times, Five times, Six times]",Three times,B,Ross was legally divorced twice on the show.
1,Where did Carol first meet Susan? \n,"[In college, At work, At the gym, At Central P...",At the gym,C,Carol first met Susan in the pilot episode o...
2,How did Susan and Ross come up with Ben’s name?,"[It was the doctor's name, They both had uncle...",It was on the janitor's name tag,D,"In the Friends TV show, Susan and Ross came ..."
3,What were Ben's first words? \n,"[Hi, Bye, Mom, Dumb]",Hi,A,Ben's first words on the Friends TV show wer...
4,How long did Ross and Emily date before they g...,"[14 days, 6 weeks, A year, 3 months]",6 weeks \n,B,Ross and Emily dated for approximately 6 mon...
...,...,...,...,...,...
96,Which actor's haircut did Monica want?,"[Winona Ryder, Demi Moore, Terri Hatcher, Juli...",Demi Moore,B,Sure! The answer to your question is:\n\nMon...
97,What city does the Mama's Little Bakery cheese...,"[Albany, New York, Chicago, Illinois, Boston, ...","Chicago, Illinois",B,The Mama's Little Bakery cheesecake that Rac...
98,"After reading _Be Your Own Windkeeper_, who do...","[Rachel, Monica, None of the above, All of the...",Rachel,A,Phoebe calls Joey a leaf blower in the episo...
99,"When Ben shows up with a Barbie doll, what dol...","[Action Man, GI Joe, Optimus Prime, Stretch Ar...",GI Joe,B,Ross wants Ben to play with the GI Joe doll ...


In [9]:
from evaluation.evaluator import Evaluator
import asyncio
from evaluation.metrics import MCQMetrics, FreeResponseMetrics

In [10]:
import ast

data['options'] = data['options'].apply(lambda x: ast.literal_eval(x))

In [11]:
data['options'] = data['options'].apply(lambda x: [option.replace('\n', ' ').strip() for option in x])

In [16]:
data = data[data['options'].apply(lambda x: len(x) == 4)]

In [17]:
data

Unnamed: 0,question,options,reference,correct_option,response
0,How many times was Ross legally divorced? \n,"[Twice, Three times, Five times, Six times]",Three times,B,Ross was legally divorced twice on the show.
1,Where did Carol first meet Susan? \n,"[In college, At work, At the gym, At Central P...",At the gym,C,Carol first met Susan in the pilot episode o...
2,How did Susan and Ross come up with Ben’s name?,"[It was the doctor's name, They both had uncle...",It was on the janitor's name tag,D,"In the Friends TV show, Susan and Ross came ..."
3,What were Ben's first words? \n,"[Hi, Bye, Mom, Dumb]",Hi,A,Ben's first words on the Friends TV show wer...
4,How long did Ross and Emily date before they g...,"[14 days, 6 weeks, A year, 3 months]",6 weeks \n,B,Ross and Emily dated for approximately 6 mon...
...,...,...,...,...,...
195,Which character claimed to have skipped fourth...,"[Joey, Monica, Chandler, Ross]",Ross,D,Joey Tribbiani.
196,Which character was shot in the butt by a tran...,"[Joey, Rachel, Phoebe, Ross]",Phoebe,C,Joey Tribbiani.
197,Who resolves to stop gossiping as their New Ye...,"[Monica, Ross, Rachel, Joey]",Rachel,C,Rachel Green.
198,Which friend has never kissed Joey?,"[Rachel, Ross, Monica, Phoebe]",Monica,C,Of course! The friend who has never kissed J...


In [18]:
class Evaluator:
    def __init__(self, data, port_number=8080):
        self.data = data
        self.api_url = f"http://localhost:{port_number}/completion"

    async def evaluate_row(self, row):
        # Extract relevant information from the dataframe row
        question = row['question']
        reference = row['reference']
        options = row['options']

        if 'response' in row:
            response = row['response']
        else:
            response = None
        
        # Create instances of metrics classes
        mcq_metrics = MCQMetrics(question=question, reference=reference, options=options)
        free_response_metrics = FreeResponseMetrics(question=question, reference=reference, response=response)

        # Run the metrics asynchronously
        mcq_result = await mcq_metrics()
        free_response_result = await free_response_metrics()

        return {
            'mcq_metrics': mcq_result,
            'free_response_metrics': free_response_result,
        }

    async def evaluate_dataframe(self):
        results = []

        for index, row in tqdm(self.data.iterrows(), desc="Evaluating rows", total=len(self.data), ncols=80):
            result = await self.evaluate_row(row)
            results.append(result)

        return results

    async def run_evaluation(self):
        results = await self.evaluate_dataframe()
        
        # print(results)
        return results

In [28]:
evaluator = Evaluator(data = data)

In [29]:
results = await evaluator.run_evaluation()

Evaluating rows: 100%|████████████████████████| 100/100 [23:44<00:00, 14.25s/it]


In [34]:
mcq_results = [result['mcq_metrics'] for result in results]
free_response_results = [result['free_response_metrics'] for result in results]
llm_mcq_answers = [result['mcq_metrics']['answer'] for result in results]

In [35]:
data['mcq_results'] = mcq_results
data['free_response_results'] = free_response_results
data['llm_mcq_answers'] = llm_mcq_answers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['mcq_results'] = mcq_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['free_response_results'] = free_response_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['llm_mcq_answers'] = llm_mcq_answers


In [36]:
data.to_csv('../data/eval_data/trivia_data_with_base_response_metrics.csv', index=False)

In [37]:
data

Unnamed: 0,question,options,reference,correct_option,response,mcq_results,free_response_results,llm_mcq_answers
0,How many times was Ross legally divorced? \n,"[Twice, Three times, Five times, Six times]",Three times,B,Ross was legally divorced twice on the show.,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.03004698047801498, 'bleu':...",B
1,Where did Carol first meet Susan? \n,"[In college, At work, At the gym, At Central P...",At the gym,C,Carol first met Susan in the pilot episode o...,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.1284215806951859, 'bleu': ...",B
2,How did Susan and Ross come up with Ben’s name?,"[It was the doctor's name, They both had uncle...",It was on the janitor's name tag,D,"In the Friends TV show, Susan and Ross came ...","{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.12931685168564777, 'bleu':...",B
3,What were Ben's first words? \n,"[Hi, Bye, Mom, Dumb]",Hi,A,Ben's first words on the Friends TV show wer...,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.013020833333333356, 'bleu'...",B
4,How long did Ross and Emily date before they g...,"[14 days, 6 weeks, A year, 3 months]",6 weeks \n,B,Ross and Emily dated for approximately 6 mon...,"{'correct_format_rate': 0.6, 'correct_answer_r...","{'average_score': 0.07069338899834257, 'bleu':...",B
...,...,...,...,...,...,...,...,...
96,Which actor's haircut did Monica want?,"[Winona Ryder, Demi Moore, Terri Hatcher, Juli...",Demi Moore,B,Sure! The answer to your question is:\n\nMon...,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.02805983504512918, 'bleu':...",B
97,What city does the Mama's Little Bakery cheese...,"[Albany, New York, Chicago, Illinois, Boston, ...","Chicago, Illinois",B,The Mama's Little Bakery cheesecake that Rac...,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.033257536720762, 'bleu': 0...",B
98,"After reading _Be Your Own Windkeeper_, who do...","[Rachel, Monica, None of the above, All of the...",Rachel,A,Phoebe calls Joey a leaf blower in the episo...,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.019346834058839628, 'bleu'...",B
99,"When Ben shows up with a Barbie doll, what dol...","[Action Man, GI Joe, Optimus Prime, Stretch Ar...",GI Joe,B,Ross wants Ben to play with the GI Joe doll ...,"{'correct_format_rate': 1.0, 'correct_answer_r...","{'average_score': 0.20356027128813167, 'bleu':...",B


In [41]:
len(data[data.llm_mcq_answers == data.correct_option])

36

In [43]:
data['correct_format_rate'] = data['mcq_results'].apply(lambda x: x['correct_format_rate'])
# data['correct_answer_rate'] = data['mcq_results'].apply(lambda x: x['correct_answer_rate'])
data['average_score'] = data['free_response_results'].apply(lambda x: x['average_score'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['correct_format_rate'] = data['mcq_results'].apply(lambda x: x['correct_format_rate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['correct_answer_rate'] = data['mcq_results'].apply(lambda x: x['correct_answer_rate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['average_score']

In [45]:
print('MCQ Results:')
print('MCQ Accuracy: ', len(data[data.llm_mcq_answers == data.correct_option]) / len(data))
print('Correct format rate: ', data['correct_format_rate'].mean())
# print('Correct answer rate: ', data['correct_answer_rate'].mean())
print('Free Response Results:')
print('Average score: ', data['average_score'].mean())



MCQ Results:
MCQ Accuracy:  0.36
Correct format rate:  0.98
Correct answer rate:  0.0
Free Response Results:
Average score:  0.04264022092631699
