### Extract Claims

In [1]:
import ast
import json

from metrics.extract_claims import extract_claim


# Selects which results file is read; the value is substituted into the path below.
linguistic_control = "short"
# Read the stored results for that setting and decode the JSON text.
with open(f'eval_data/results_{linguistic_control}_queries.json') as f:
    results = json.loads(f.read())


In [5]:
for entity_with_spec in results:
    print(f"Entity with spec: {entity_with_spec}")
    for result in results[entity_with_spec]:
        # The extractor receives the query and the model's answer as a single text.
        text = "Query:" + result['query'] + "\n" + "Answer:" + result['gpt_response']
        claims_in_response = extract_claim([text])
        # SECURITY: this value was previously passed to eval(), which executes any
        # expression it is given. It is derived from model output, so it is parsed
        # with ast.literal_eval instead, which accepts only Python literals and
        # raises ValueError/SyntaxError on anything else.
        # Presumably extract_claim returns a string holding a list literal of
        # claims - confirm against metrics.extract_claims.
        result['claims_in_gpt_response'] = ast.literal_eval(claims_in_response)

Entity with spec: Project_select1
Entity with spec: Project_select2
Entity with spec: Employee_select1
Entity with spec: Company
Entity with spec: Employee_select2


### RAGAS-Fact

In [21]:
from typing import Any, List
from indo_eval.llm import OpenAILLMAgent, AnyOpenAILLM, gpt4_llm
import numpy as np


class RagasFact:
    """Ask an LLM for one verdict per claim against a context and score the verdicts.

    Counters kept on the instance:
        num_claims: number of claims passed through ``eval``.
        num_eval_instances: number of completed ``eval`` calls.
        failure_gen: number of LLM replies that were not one of ``valid_gen``.
    """

    # The prompt used to be an indented multi-line string literal, so the
    # indentation became part of the prompt text. The template below reproduces
    # that exact text (padding and trailing spaces included) so prompts stay
    # identical to earlier runs, independent of how this code is indented.
    _PROMPT_PAD = " " * 16
    _PROMPT_TEMPLATE = (
        "Natural language inference. Use only {valid_gen_str} as verdict. \n"
        + _PROMPT_PAD + "\ncontext: {context} \n"
        + _PROMPT_PAD + "\nstatement: {statement}\n"
        + _PROMPT_PAD + "\nverdict:"
    )

    def __init__(self, valid_gen=['Yes', 'No', 'Null']) -> None:
        """
        Args:
            valid_gen: the verdict strings accepted from the LLM.
        """
        self.num_claims = 0
        self.num_eval_instances = 0
        self.failure_gen = 0
        self.llm = gpt4_llm
        # Copy so the instance never aliases the shared default list (or the
        # caller's own list); a later mutation cannot leak between instances.
        self.valid_gen = list(valid_gen)

    def valid_gen_str(self):
        """Render the accepted verdicts for the prompt, e.g. ``"Yes", "No" and "Null"``."""
        quoted = ['"' + s + '"' for s in self.valid_gen]
        return ', '.join(quoted[:-1]) + ' and ' + quoted[-1]

    def eval(self, context: str, facts: List[str], max_retries=None):
        """Obtain one verdict per fact and compute the faithfulness score.

        Args:
            context: text inserted into the prompt as the context.
            facts: statements to judge, one LLM query (plus retries) each.
            max_retries: maximum number of extra attempts per fact after an
                invalid reply. ``None`` (the default) retries without limit,
                which is the historical behaviour.

        Returns:
            ``(verdicts, score)``: the list of verdicts in the order of
            ``facts`` and the value of ``compute_faithfulness_score``.

        Raises:
            RuntimeError: if ``max_retries`` is set and a fact still has no
                valid verdict after ``max_retries + 1`` attempts.
        """
        results = []

        for fact in facts:
            self.num_claims += 1
            # The prompt does not change between retries, so build it once per fact.
            prompt = self._PROMPT_TEMPLATE.format(
                valid_gen_str=self.valid_gen_str(), context=context, statement=fact
            )
            failed_attempts = 0
            while True:
                verdict = self.llm(prompt)
                # Replies such as " Yes\n" are valid verdicts with stray
                # whitespace; normalise them instead of burning a retry.
                if isinstance(verdict, str):
                    verdict = verdict.strip()
                if verdict in self.valid_gen:
                    break
                self.failure_gen += 1
                failed_attempts += 1
                if max_retries is not None and failed_attempts > max_retries:
                    raise RuntimeError(
                        "No valid verdict after {} attempts; last reply: {!r}".format(
                            failed_attempts, verdict
                        )
                    )
            results.append(verdict)
        self.num_eval_instances += 1

        return results, self.compute_faithfulness_score(results)

    def compute_faithfulness_score(self, output: List[str]):
        """Return the fraction of "Yes" among the non-"Null" verdicts.

        Returns ``None`` when there is no non-"Null" verdict (including an
        empty ``output``).
        """
        assert self.valid_gen == ["Yes", "No", "Null"]
        # "Null" verdicts are excluded from both numerator and denominator.
        decided = [validation for validation in output if validation != "Null"]
        if not decided:
            return None

        verdict_score_map = {"Yes": 1, "No": 0}
        faithful_statements = sum(verdict_score_map.get(validation) for validation in decided)
        return faithful_statements / len(decided)

    def reset(self):
        """Clear the invalid-generation counter.

        Only ``failure_gen`` is reset; ``num_claims`` and
        ``num_eval_instances`` keep accumulating.
        """
        self.failure_gen = 0

In [22]:
num_queries = 0
ragas_fact = RagasFact()
for entity_with_spec in results:
    print(f"Entity with spec: {entity_with_spec}")
    for result in results[entity_with_spec]:
        num_queries += 1
        # Results that already carry a score are left untouched, so re-running
        # the cell only evaluates what is still missing.
        if "raga_faithfulness_score" in result:
            continue
        result["raga_judgement"], result["raga_faithfulness_score"] = ragas_fact.eval(
            result['retrieved_documents'], result['claims_in_gpt_response']
        )
        print(result["raga_judgement"])

print("Number of queries: ", num_queries)
# failure_gen counts invalid LLM replies, and every evaluated claim ends with
# exactly one valid reply, so the total number of generations is
# num_claims + failure_gen. Dividing by that (instead of by the number of
# queries) yields an actual rate in [0, 1]. Skip the line when nothing was
# generated to avoid dividing by zero.
num_generations = ragas_fact.num_claims + ragas_fact.failure_gen
if num_generations:
    print("Failure rate: ", ragas_fact.failure_gen / num_generations)

Entity with spec: Project_select1
['Yes']
['Yes']
['Null']
['Yes', 'Yes', 'Null']
['Yes']
['Yes', 'Yes', 'Yes']
['Yes', 'Yes']
['Yes']
['Yes']
['Yes']
['Yes']
['Yes']
['Null']
['Null', 'Null']
['Yes', 'Yes', 'Null', 'Null']
['Yes', 'Yes', 'Null']
['Yes']
['Yes']
['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes']
['Yes']
['Yes', 'Yes', 'Null']
['Null']
['Null']
['Yes']
Entity with spec: Project_select2
['Yes']
['Null', 'Null', 'Null']
['Null', 'Null', 'Null']
['Yes']
['Yes']
['Yes', 'Yes']
['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes']
['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes']
['Yes', 'Yes']
['Null', 'Yes', 'Null', 'Null']
['Yes', 'Null']
['Yes', 'Yes', 'Yes']
Entity with spec: Employee_select1
['Yes', 'Yes', 'Yes', 'Null', 'Yes']
['Yes']
['Yes']
['Yes', 'Yes']
['Yes', 'Yes', 'No']
['Yes']
['Yes']
['Yes']
['Yes']
['Yes']
['Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes']
['Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Null', 'Null', 'No', 'No', 'Null']
['Yes']
['Yes', 'Null']

In [45]:
# Write `results` (including any keys added to it in earlier cells) to a separate
# "_raga" file as indented JSON.
with open(f'eval_data/results_{linguistic_control}_queries_raga.json', mode='w') as f:
    json.dump(obj=results, fp=f, indent=4)

Entity with spec: Project_select1
Entity with spec: Project_select2
Entity with spec: Employee_select1
Entity with spec: Company
Entity with spec: Employee_select2
Number of faithful examples:  58
Number of correct examples in faithful examples:  11
Percentage of correct examples in faithful examples:  0.1896551724137931


### SelfCheck

In [64]:
# NOTE(review): as written this cell only prints an empty line; the prompt template
# stored as its output appears to come from an earlier version of the cell -
# confirm and remove the cell if it is no longer needed.
print()

Query: {query}            
Answer A: {answer}       
Answer B: {stochastic_answer}       
Do both answers address the query with equivalent meaning?       
Use only "Yes" or "No" for your evaluation:


In [70]:
from typing import Any, List
from indo_eval.llm import OpenAILLMAgent, AnyOpenAILLM, gpt4_llm
import numpy as np


class SelfCheck:
    """Ask an LLM whether sampled answers agree with a reference answer and score the disagreement.

    Counters kept on the instance:
        total_gen: number of LLM calls made by ``eval``.
        failure_gen: number of LLM replies that were not one of ``valid_gen``.
    """

    # The prompt used to be an indented string literal with backslash line
    # continuations, which left the continuation lines' indentation inside the
    # prompt text. The template reproduces that text so prompts stay identical
    # to earlier runs; only the accepted-verdict list is now taken from
    # ``valid_gen_str`` instead of being hard-coded (same text for the default).
    _PROMPT_PAD = " " * 22
    _PROMPT_TEMPLATE = (
        "Query: {query} "
        + _PROMPT_PAD + "\nAnswer A: {answer} "
        + _PROMPT_PAD + "\nAnswer B: {stochastic_answer} "
        + _PROMPT_PAD + "\nDo both answers address the query with equivalent meaning? "
        + _PROMPT_PAD + "\nUse only {valid_gen_str} for your evaluation:"
    )

    def __init__(self, valid_gen=['Yes', 'No']) -> None:
        """
        Args:
            valid_gen: the verdict strings accepted from the LLM.
        """
        self.total_gen = 0
        self.failure_gen = 0
        self.llm = gpt4_llm
        # Copy so the instance never aliases the shared default list (or the
        # caller's own list).
        self.valid_gen = list(valid_gen)

    def valid_gen_str(self):
        """Render the accepted verdicts for the prompt, e.g. ``"Yes" or "No"``."""
        quoted = ['"' + s + '"' for s in self.valid_gen]
        return ', '.join(quoted[:-1]) + ' or ' + quoted[-1]

    def eval(self, answer: str, query: str, stochastic_answers: List[str], max_retries=None):
        """Compare ``answer`` with every sampled answer and compute the inconsistency score.

        Args:
            answer: the answer shown to the LLM as "Answer A".
            query: the query both answers respond to.
            stochastic_answers: answers shown one at a time as "Answer B".
            max_retries: maximum number of extra attempts per comparison after
                an invalid reply. ``None`` (the default) retries without
                limit, which is the historical behaviour.

        Returns:
            ``(verdicts, score)``: one verdict per sampled answer, in order,
            and the value of ``compute_inconsistency_score``.

        Raises:
            RuntimeError: if ``max_retries`` is set and a comparison still has
                no valid verdict after ``max_retries + 1`` attempts.
        """
        results = []

        for stochastic_answer in stochastic_answers:
            # The prompt does not change between retries, so build it once.
            prompt = self._PROMPT_TEMPLATE.format(
                query=query,
                answer=answer,
                stochastic_answer=stochastic_answer,
                valid_gen_str=self.valid_gen_str(),
            )
            failed_attempts = 0
            while True:
                verdict = self.llm(prompt)
                self.total_gen += 1
                # Accept verdicts that only differ by surrounding whitespace.
                if isinstance(verdict, str):
                    verdict = verdict.strip()
                if verdict in self.valid_gen:
                    results.append(verdict)
                    break
                self.failure_gen += 1
                failed_attempts += 1
                if max_retries is not None and failed_attempts > max_retries:
                    raise RuntimeError(
                        "No valid verdict after {} attempts; last reply: {!r}".format(
                            failed_attempts, verdict
                        )
                    )

        assert len(results) == len(stochastic_answers)
        return results, self.compute_inconsistency_score(results)

    def compute_inconsistency_score(self, output: List[str]):
        """Return ``1 - (number of "Yes" / number of verdicts)``, or ``None`` for an empty list."""
        assert self.valid_gen == ["Yes", "No"]

        num_samples = len(output)
        if not num_samples:
            return None

        verdict_score_map = {"Yes": 1, "No": 0}
        num_consistent_samples = sum(verdict_score_map.get(validation) for validation in output)
        return 1 - (num_consistent_samples / num_samples)

    def reset(self):
        """Clear the invalid-generation counter.

        NOTE(review): ``total_gen`` is not reset here, so a failure rate
        computed as ``failure_gen / total_gen`` after a reset mixes periods -
        confirm whether that is intended.
        """
        self.failure_gen = 0

In [73]:
# N is part of the results file name (presumably the number of sampled answers
# per query - confirm against the script that produced the file).
N = 4
file_name = f'eval_data/results_{linguistic_control}_queries_selfcheck_N{N}.json'
# file_name is reused by the scoring cell below to write the results back.
with open(file_name) as f:
    results_selfcheck = json.load(fp=f)


In [78]:
import time

selfcheck = SelfCheck()
try:
    for entity_with_spec in results_selfcheck:
        print(f"Entity with spec: {entity_with_spec}")
        for result in results_selfcheck[entity_with_spec]:
            # Already-scored results are skipped so the cell can be re-run to resume.
            if "selfcheck_inconsistency_score" in result:
                print('Exists~')
                continue

            # Pause before each evaluation (presumably to stay under an API
            # rate limit - confirm).
            time.sleep(5)
            result["selfcheck_judgement"], result["selfcheck_inconsistency_score"] = selfcheck.eval(
                result['gpt_response'], result['query'], result["stochastic_answers"]
            )
            print(result["selfcheck_judgement"])
            print(result["selfcheck_inconsistency_score"])
finally:
    # Runs on success, on an exception and on a kernel interrupt alike, so the
    # scores computed so far are written back instead of being lost; the skip
    # check above then picks up where the previous run stopped.
    if selfcheck.total_gen:
        print("Failure rate: ", selfcheck.failure_gen/selfcheck.total_gen)
    with open(file_name, 'w') as f:
        json.dump(results_selfcheck, f, indent=4)

Entity with spec: Project_select1
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Entity with spec: Project_select2
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Exists~
Entity with spec: Employee_select1
Exists~
['No', 'No', 'No']
1.0
['Yes', 'Yes', 'No']
0.33333333333333337
['Yes', 'Yes', 'No']
0.33333333333333337
['Yes', 'No', 'No']
0.6666666666666667
['Yes', 'Yes', 'No']
0.33333333333333337
['Yes', 'Yes', 'No']
0.33333333333333337
['Yes', 'Yes', 'Yes']
0.0
['Yes', 'Yes', 'Yes']
0.0
['Yes', 'No', 'No']
0.6666666666666667
['No', 'No', 'No']
1.0
['Yes', 'No', 'Yes']
0.33333333333333337
['Yes', 'Yes', 'Yes']
0.0
['Yes', 'Yes', 'Yes']
0.0
['Yes', 'Yes', 'Yes']
0.0
['Yes', 'Yes', 'Yes']
0.0
['Yes', 'Yes', 'Yes']
0.0
['Yes', 'Yes', 'Yes']
0.0
['Yes', 'No', 'No']
0.6666666666666667
['Yes', 'Yes', 'No']
0.33

### Evaluating Reliability Of Reference-free Evaluation Protocols

In [86]:
def precision_recall_reference_free_eval(results, score_name, score_evaluated_as_correct_pred):
    """Print precision and recall of a reference-free score against the "judgement" labels.

    A result counts as "evaluated as correct" when ``result[score_name]`` equals
    ``score_evaluated_as_correct_pred``, and as "correctly predicted" when
    ``result["judgement"] == "Correct"``.

    Args:
        results: dict mapping each entity spec to a list of result dicts; every
            result dict must contain the ``score_name`` key and a "judgement" key.
        score_name: str, key of the score to evaluate.
        score_evaluated_as_correct_pred: the score value that marks a prediction
            as evaluated correct (compared with ``==``).

    Returns:
        None. The counts, precision and recall are printed. Precision (or
        recall) is printed as ``None`` when its denominator is zero.
    """
    num_correctly_predicted = 0
    num_evaluated_as_correct_pred = 0
    num_correctly_evaluated_examples = 0
    for entity_with_spec in results:
        print(f"Entity with spec: {entity_with_spec}")
        for result in results[entity_with_spec]:
            evaluated_as_correct = result[score_name] == score_evaluated_as_correct_pred
            judged_correct = result["judgement"] == "Correct"

            if evaluated_as_correct:
                num_evaluated_as_correct_pred += 1
            if judged_correct:
                num_correctly_predicted += 1
            if evaluated_as_correct and judged_correct:
                num_correctly_evaluated_examples += 1

    # Guard both ratios: with no example evaluated as correct (or none judged
    # correct) the ratio is undefined rather than an error.
    precision = (
        num_correctly_evaluated_examples / num_evaluated_as_correct_pred
        if num_evaluated_as_correct_pred else None
    )
    recall = (
        num_correctly_evaluated_examples / num_correctly_predicted
        if num_correctly_predicted else None
    )

    print("Number of examples evaluated as correct predictions: ", num_evaluated_as_correct_pred)
    print("Number of correctly-evaluated examples: ", num_correctly_evaluated_examples)
    print("Percentage of correctly-evaluated examples in examples evaluated as correct predictions (Precision): ", precision)
    print("Percentage of correctly-evaluated examples in examples that are correctly predicted (Recall): ", recall)

# Reload the saved RAGAS results and evaluate them; a faithfulness score equal to 1
# is the value treated as "evaluated as correct".
with open(f'eval_data/results_{linguistic_control}_queries_raga.json') as f:
    results = json.loads(f.read())
precision_recall_reference_free_eval(
    results=results,
    score_name="raga_faithfulness_score",
    score_evaluated_as_correct_pred=1,
)


Entity with spec: Project_select1
Entity with spec: Project_select2
Entity with spec: Employee_select1
Entity with spec: Company
Entity with spec: Employee_select2
Number of examples evaluated as correct predictions:  58
Number of correctly-evaluated examples:  11
Percentage of correctly-evaluated examples in examples evaluated as correct predictions (Precision):  0.1896551724137931
Percentage of correctly-evaluated examples in examples that are correctly predicted (Recall):  0.9166666666666666


In [87]:
# NOTE(review): this reads the "_scores" file, which is a different path from the
# file written by the SelfCheck scoring cell - confirm that this file is produced
# elsewhere and carries the "judgement" labels.
file_name = f'eval_data/results_{linguistic_control}_queries_selfcheck_N{N}_scores.json'
with open(file_name) as f:
    results = json.load(fp=f)

# An inconsistency score equal to 0 is the value treated as "evaluated as correct".
precision_recall_reference_free_eval(results, "selfcheck_inconsistency_score", 0)

Entity with spec: Project_select1
Entity with spec: Project_select2
Entity with spec: Employee_select1
Entity with spec: Company
Entity with spec: Employee_select2
Number of examples evaluated as correct predictions:  48
Number of correctly-evaluated examples:  7
Percentage of correctly-evaluated examples in examples evaluated as correct predictions (Precision):  0.14583333333333334
Percentage of correctly-evaluated examples in examples that are correctly predicted (Recall):  0.5
