In [None]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm

from scipy.stats import pearsonr, spearmanr, kendalltau

In [None]:
# Renders a number as a string with exactly three decimal places.
float_formatter = "{:.3f}".format

# Directory the input QA CSV files are read from.
csvPath = './data'
# Directory the result CSV files are written to.
resultPath = './results'
# Generate endpoint that every LLM request is POSTed to
# (presumably a locally running Ollama server -- confirm).
url = "http://localhost:11434/api/generate"

In [None]:
def ollama_post_request(url: str, prompt: str, model: str = "llama3",
                        temperature: float = 1.0, timeout: float = 600.0):
    """Send one non-streaming generation request to the given endpoint.

    Args:
        url: Full URL of the generate endpoint.
        prompt: Prompt text placed in the request payload.
        model: Model name placed in the request payload.
        temperature: Sampling temperature placed under ``options``.
        timeout: Seconds ``requests`` waits for the server. Without a
            timeout a stalled server would block the whole run forever.

    Returns:
        The ``requests.Response`` object of the POST.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    headers = {"Content-Type": "application/json; charset=utf-8"}
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": temperature,
        },
    }
    res = requests.post(url, json=payload, headers=headers, timeout=timeout)
    # Fail loudly here instead of letting callers hit a confusing KeyError
    # when they look for the 'response' field in an error payload.
    res.raise_for_status()

    return res
def ollama_generate(prompt='', n=1):
    """Query the model ``n`` times with the same prompt.

    Each call goes through ``ollama_post_request`` against the module-level
    ``url``; the ``'response'`` field of every JSON reply is collected.

    Args:
        prompt: Prompt text sent on every call.
        n: Number of independent requests to issue.

    Returns:
        A list with the ``n`` ``'response'`` values, in request order.
    """
    return [
        ollama_post_request(url=url, prompt=prompt).json()['response']
        for _ in range(n)
    ]

In [None]:
def averageScores(arr):
    """Average the valid rating scores found in a list of model responses.

    A response is valid when it is a string that, after stripping
    surrounding whitespace, consists only of decimal digits and denotes an
    integer in the range 0-3. Everything else is counted as an error.

    Args:
        arr: Iterable of raw model responses.

    Returns:
        The mean of the valid scores rounded to three decimals, or ``-1``
        when no response was valid.
    """
    valid_scores = []
    error = 0
    for a in arr:
        # Model output frequently carries a trailing newline or spaces.
        text = a.strip() if isinstance(a, str) else ''
        # isdecimal() (unlike isnumeric()) only accepts characters that
        # int() can parse, so inputs such as '²' or '½' no longer raise.
        if text.isdecimal() and 0 <= int(text) <= 3:
            valid_scores.append(int(text))
        else:
            error += 1
    if error > 2:
        print('error counter: ', error)
    if not valid_scores:
        return -1
    return np.round(sum(valid_scores) / len(valid_scores), 3)

## Demonstration examples

### Simple samples

In [None]:
# Standard demonstration examples for QA evaluation.
# Each sample pairs a product question and a generated answer with its
# rating score only (no reasoning); there is one sample per score 0-3.
simpleSample1 = """
    **Product Question**: Is this difficult to assemble? If yes, what kind of tools are required for assembly?
    **Model Generated Answer**: We typically ship within 1-2 business days.
    **Rating Score**: 0
"""
simpleSample2 = """
    **Product Question**: How difficult is it to put the roll of paper towels on and off the roller? Mine will be placed on the side of my clothes dryer, in a cramped space.
    **Model Generated Answer**: it's not that difficult in a small kitchen.
    **Rating Score**: 1     
"""
simpleSample3 = """
    **Product Question**: How wide is this seat?
    **Model Generated Answer**: it is less than 10" wide.
    **Rating Score**: 2
"""
simpleSample4 = """
    **Product Question**: Is this difficult to clean?
    **Model Generated Answer**: it can be a little tedious to clean.
    **Rating Score**: 3
"""

### ELECT samples

In [None]:
# These demonstration examples were originally generated by ChatGPT.
# We refined them with a more detailed thinking process specific to each product question.
# See our paper for more details.
# Each sample reuses the question/answer pair of the matching simple sample
# and adds a four-step chain of thought that ends in the rating (0-3).
electSample1 = """
    **Product Question**: Is this difficult to assemble? If yes, what kind of tools are required for assembly?
    **Model Generated Answer**: We typically ship within 1-2 business days.
    **Chain of Thought**:
    ***1. Understanding the Product Question***
    The question asks whether the product is difficult to assemble and, if so, what kind of tools are required for assembly. The user is looking for specific information regarding the assembly process.
    ***2. Analyzing the Model Generated Answer***
    The model generated answer states, "We typically ship within 1-2 business days."
    The model generated answer is completely irrelevant to the question about assembly difficulty and tools needed. It addresses shipping information instead of the assembly process.
    ***3. Rating the Model Generated Answer Based on the Evaluation Criteria***
    Score 0: The model generated answer does not relate to the question at all. It fails to address assembly difficulty or the tools required for assembly.
    ***4. Conclusion***
    The model generated answer does not answer the user's question and provides unrelated information about shipping.
    Rating: 0 (Irrelevant to the question).
"""
electSample2 = """
    **Product Question**: How difficult is it to put the roll of paper towels on and off the roller? Mine will be placed on the side of my clothes dryer, in a cramped space.
    **Model Generated Answer**: it's not that difficult in a small kitchen.
    **Chain of Thought**:
    ***1. Understanding the Product Question***
    The question asks about the difficulty of placing a roll of paper towels on and off the roller, particularly in a cramped space next to a clothes dryer. The user is looking for specific insights into the ease of use of the product in a limited area.
    ***2. Analyzing the Model Generated Answer***
    The model generated answer states, "it's not that difficult in a small kitchen."
    While the model generated answer suggests that the task is relatively easy, it does not specifically address the user's context of a cramped space next to a clothes dryer.
    The mention of a "small kitchen" does not directly correlate to the user's question about difficulty in a cramped space, which could potentially be different.
    ***3. Rating the Model Generated Answer Based on the Evaluation Criteria***
    Score 1: The model generated answer is related to the question but does not directly answer it. It suggests ease of use but fails to consider the specific context of the user (a cramped space next to a dryer) and does not provide a definitive answer regarding the difficulty in that situation.
    ***4. Conclusion***
    The model generated answer is somewhat relevant but lacks specificity and does not adequately address the user’s unique context or concern.
    Rating: 1 (Related but didn’t answer the question).    
"""
electSample3 = """
    **Product Question**: How wide is this seat?
    **Model Generated Answer**: it is less than 10" wide.
    **Chain of Thought**:
    ***1. Understanding the Product Question***
    The question asks specifically about the width of the seat. The user is looking for a clear measurement indicating how wide the seat is.
    ***2. Analyzing the Model Generated Answer***
    The model generated answer states, "it is less than 10" wide."
    The model generated answer provides a direct measurement related to the user's question, indicating that the seat width is under 10 inches. However, it does not give a specific measurement, which might leave the user wanting more precise information.    
    ***3. Rating the Model Generated Answer Based on the Evaluation Criteria***
    Score 2: The response somewhat answers the question by giving an indication of the width. While it does not provide a specific width, the phrase "less than 10 inches" gives a general idea of the size, which is still helpful.
    ***4. Conclusion***
    The model generated answer is relevant and provides useful information about the seat width, even if it lacks a specific measurement.
    Rating: 2 (Somehow answered the question).
"""
electSample4 = """
    **Product Question**: Is this difficult to clean?
    **Model Generated Answer**: it can be a little tedious to clean.
    **Chain of Thought**:
    ***1. Understanding the Product Question***
    The question asks, "Is this difficult to clean?" The user is seeking information about the ease or difficulty of cleaning the product. The expected answer should clearly address whether the product is easy or difficult to clean.
    ***2. Analyzing the Model Generated Answer***
    The model generated answer says, "it can be a little tedious to clean."
    The model generated answer indicates that cleaning the product may not be easy and can take some effort, which aligns with the user's inquiry about difficulty.
    The phrase "a little tedious" implies some difficulty, though not extreme, and provides a direct answer to the question.
    ***3. Rating the Model Generated Answer Based on the Evaluation Criteria***
    Score 3: The model generated answer directly answers the question by explaining that cleaning the product is somewhat difficult or tedious. It addresses the key aspect of the user's question.
    ***4. Conclusion***
    The model generated answer is relevant, clear, and directly addresses the question about difficulty in cleaning.
    Rating: 3 (Directly answers the question).
"""

## CoT and prompting templates

### Generating CoT steps

In [None]:
# The CoT steps were created by ChatGPT in 2024.
# The following prompt was used to create the general CoT steps.
# It takes no placeholders: it states the task and the 0-3 rating criteria
# and asks for the Chain of Thought steps.
cot_gen_v1 = """
    You are a product question answering task evaluator.
    You will be given a product question and a model generated answer.
    Your task is to rate how well the model generated answer can answer the question based on the evaluation criteria.

    Evaluation Criteria: Rating Score (0-3)
    0: the answer was irrelevant to the question;
    1: the answer was related but didn't answer the question;
    2: the answer somehow answered the question;
    3: the answer directly answer the question.
    
    Please generate the Chain of Thought steps for the evaluation task
"""

# General CoT steps: four question-independent evaluation steps
# (understand the question, analyse the answer, rate it, conclude).
general_cot = """
    1. Understanding the Product Question
    Clearly define the product-related question being asked.
    Identify any specific details or requirements the question is asking for (e.g., features, specifications, price, availability).

    2. Analyzing the Model Generated Answer
    Check whether the model generated answer addresses the key aspects of the question.
    Determine if the model generated answer provides the necessary information (or if something is missing).
    Verify if the model generated answer is directly related to the product in question.
    Evaluate if the model generated answer includes irrelevant or extraneous information.

    3. Rating the Model Generated Answer Based on the Evaluation Criteria
    Score 0: If the model’s response is entirely irrelevant or doesn’t address any part of the question.
    Score 1: If the model’s response is somewhat related to the question, but it doesn’t answer it.
    Score 2: If the model’s response partially answers the question but is incomplete or lacks clarity.
    Score 3: If the model’s response fully and directly answers the question with clear and relevant information.

    4. Conclusion
    Based on the analysis, assign a score (0–3) to the model generated answer.
"""

# The following prompt can be used to create the CoT steps for a specific question.
# Placeholders, in order of appearance:
# Parameter 1: the Chain of Thought steps to follow (presumably general_cot -- confirm)
# Parameter 2: a question (e.g. from the samples above)
# Parameter 3: the generated answer (e.g. from the samples above)
cot_gen_v2 = """
    You are a product question answering task evaluator.
    You will be given a product question and a model generated answer.
    Your task is to rate how well the model generated answer can answer the question based on the evaluation criteria.

    Evaluation Criteria: Rating Score (0-3)
    0: the answer was irrelevant to the question;
    1: the answer was related but didn't answer the question;
    2: the answer somehow answered the question;
    3: the answer directly answer the question.
    
    Please break down the evaluation process using the Chain of Thought steps for the given question and generated answer:
    
    Chain of Thought:
    {}
    
    Question:
    {}
    
    Generated answer:
    {}
"""

# CoT steps for a specific QA pair: the four steps worked through for the
# "Is this difficult to clean?" question, ending in a rating of 3.
cot_sample = """
    1. Understanding the Product Question
    The question asks, "Is this difficult to clean?" The user is seeking information about the ease or difficulty of cleaning the product. The expected answer should clearly address whether the product is easy or difficult to clean.
    
    2. Analyzing the Model Generated Answer
    The model-generated answer says, "it can be a little tedious to clean."
    The phrase "a little tedious" suggests that cleaning is not extremely difficult but requires some effort.

    3. Rating the Model Generated Answer Based on the Evaluation Criteria
    Score 3: The model generated answer directly answers the question by explaining that cleaning the product is somewhat difficult or tedious. It addresses the key aspect of the user's question.
    
    4. Conclusion
    The model generated answer is relevant, clear, and directly addresses the question about difficulty in cleaning.
    Rating: 3 (Directly answers the question).
"""

### Prompt template

In [None]:
# Output-format instruction: asks the model to reply with a bare number
# between 0 and 3 and nothing else.
llama_output = "Please do not generate any opening, closing, and explanations. The response should be one word with a number between 0 and 3 only."

# Wraps CoT steps under a "Chain of Thought:" heading.
# input: general CoT steps
cotTemplete = """
    Chain of Thought:
    {}
"""

# Evaluation prompt shared by all prompting variants.
# Placeholders, in order of appearance:
# Parameter 1. desired output format
# Parameter 2. CoT steps or demonstration examples (may be an empty string)
# Parameter 3. additional demonstration examples (may be an empty string)
# Parameter 4. input: product question
# Parameter 5. input: model generated answer

generic_prompt = """
    You are a product question answering task evaluator.
    You will be given a product question, and a model generated answer.
    Your task is to rate how well the model generated answer can answer the question based on the evaluation criteria.
    {}

    Evaluation Criteria: Rating Score (0-3)
    0: the answer was irrelevant to the question;
    1: the answer was related but didn't answer the question;
    2: the answer somehow answered or partially answered the question;
    3: the answer directly answer the question.
    
    {}
    
    {}
    
    Product Question:
    {}

    Model Generated Answer:
    {}
"""

## Experiments -- Llama

In [None]:
# Pick the product category and read its tab-separated QA file.
product = 'garden'
csvFile = f'QA_{product}'
llamaDF = pd.read_csv(f'{csvPath}/{csvFile}.csv', sep='\t')
print('result original:', len(llamaDF))

# Questions and generated answers as two parallel Python lists.
questions, answers = (
    llamaDF[column].tolist() for column in ('question', 'answer')
)

In [None]:
# Raw model responses, one list per question.
results = []
# Averaged score per question, filled from averageScores.
all_scores = []
# Endpoint read by ollama_generate for every request.
url = "http://localhost:11434/api/generate"
# Number of responses requested per question.
agents = 10 # 20
# Flag used by the scoring loop to print only the first rendered prompt.
i = 0
    
# General CoT steps wrapped under the "Chain of Thought:" heading.
cot = cotTemplete.format(general_cot)

# Disabled variant: template with four ELECT examples (electSample1-4).
# examplesTemplete = """
#     Here are some examples using chain of thought for the evaluation process:
    
#     ###Example One###:
#     {}
#     ###Example Two###:
#     {}
#     ###Example Three###:
#     {}
#     ###Example Four###:
#     {}
# """
# electExamples = examplesTemplete.format(electSample1, electSample2, electSample3, electSample4)

# Active variant: a single ELECT example (electSample4).
examplesTemplete = """
    Here are some examples using chain of thought for the evaluation process:
    
    ###Example One###:
    {}
"""
electExamples = examplesTemplete.format(electSample4)

# Disabled variant: template with four simple examples (simpleSample1-4).
# simpleExamplesTemplete = """
#     Here are some examples:
    
#     ###Example One###:
#     {}
#     ###Example Two###:
#     {}
#     ###Example Three###:
#     {}
#     ###Example Four###:
#     {}
# """
# simpleExamples = simpleExamplesTemplete.format(simpleSample1, simpleSample2, simpleSample3, simpleSample4)

# Active variant: a single simple example (simpleSample4).
simpleExamplesTemplete = """
    Here are some examples:
    
    ###Example One###:
    {}
"""
simpleExamples = simpleExamplesTemplete.format(simpleSample4)

### Table 2
# Prompt variants; swap the `prompt = ...` line in the loop to run another one:
# ELECT: prompt = generic_prompt.format(llama_output, electExamples, "", q, a) ## using electSample4
# G-Eval: prompt = generic_prompt.format(llama_output, cot, "", q, a)
# DP(direct prompt): prompt = generic_prompt.format(llama_output, "" , "", q, a)
# One-shot DP: prompt = generic_prompt.format(llama_output, simpleExamples, "", q, a) ## using simpleSample4

# zip() has no length, so the total is passed explicitly; otherwise the
# progress bar cannot show a percentage or a remaining-time estimate.
for q, a in tqdm(zip(questions, answers), total=len(questions)):
    prompt = generic_prompt.format(llama_output, simpleExamples, "", q, a)
    if i == 0:
        # Print the first fully rendered prompt once as a sanity check.
        print(prompt)
        i += 1
    # Ask the model `agents` times, then keep both the raw responses and
    # their averaged numeric score.
    scores = ollama_generate(prompt=prompt, n=agents)
    all_scores.append(averageScores(scores))
    results.append(scores)

In [None]:
# Column name for this run: raw responses go under the name itself,
# averaged scores under "<name>_score".
fixedStr = 'llama_dp4'
llamaDF[fixedStr] = results
llamaDF[f'{fixedStr}_score'] = all_scores
llamaDF.columns

In [None]:
print('=======> ', product)
# human judgement, rounded to two decimals
human_judgements = list(llamaDF['annotated'])
human_judgements = [float(float_formatter(num)) for num in np.round(human_judgements, 2)]
for metric in [
    'llama_elect4_score',
    'llama_geval_score',
    'llama_dp_score',
    'llama_dp4_score'
    ]:
    # A single run only adds one of these columns; skip the ones that are
    # absent instead of aborting the whole report with a KeyError.
    if metric not in llamaDF.columns:
        print('%s: column not found, skipped' % metric)
        continue
    # NOTE(review): rows where averageScores returned -1 (no valid response)
    # are included in the correlations as-is -- confirm this is intended.
    metric_score = list(llamaDF['{}'.format(metric)])

    pScore, pvpScore = pearsonr(metric_score, human_judgements)
    sScore, pvsScore = spearmanr(metric_score, human_judgements)
    kScore, pvkScore = kendalltau(metric_score, human_judgements)
    print('%s: spearman %.3f pv %.3f, pearson %.3f pv %.3f, kendalltau %.3f pv %.3f' % 
          (metric, sScore, pvsScore, pScore, pvpScore, kScore, pvkScore)
    )

In [None]:
import os

# Result file name: model, product, experiment table and number of agents.
csvFile = 'llama3_{}_{}_{}.csv'.format(product, 'Table2', agents)
# Create the output directory first so a missing folder cannot discard the
# results of the (slow) LLM run at the very last step.
os.makedirs(resultPath, exist_ok=True)
llamaDF.to_csv('{}/{}'.format(resultPath, csvFile), index=None, sep='\t')

print(csvFile)