In [1]:
import json
import os
import sys
import random
from tqdm import tqdm
sys.path.append('..')
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'
import logging
from typing import List, Union, Tuple
from rag_systems.retrieval.retrieval import Qdrant, KeywordMatching
logging.basicConfig(filename='log.txt', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the retrieval corpus from <root_dir>/retrieval_database/documents.json.
root_dir = "aurp/"
# Selects which QA file a later cell loads (validation set vs. the default set).
for_validation = True
# Use a context manager so the file handle is closed once the JSON is parsed.
with open(f'{root_dir}/retrieval_database/documents.json') as documents_file:
    documents = json.load(documents_file)

In [8]:
# Module-level retriever built once from the loaded documents; retrieve() below
# calls its search / rank / format_sources methods.
retrieval = KeywordMatching(documents)
def retrieve(query: str, answer:  Union[List, Tuple] = None, true_document_ids:List[int]=None, retrieved_documents=None):
    """Retrieve sources for ``query`` and judge the retrieval against the expected ids.

    Args:
        query: Natural-language question handed to the module-level ``retrieval``.
        answer: Expected answer; stored in the result untouched.
        true_document_ids: Ids of the documents that should be retrieved. ``None``
            is treated as "no expected documents" (an empty set).
        retrieved_documents: Optional pre-formatted sources. When given (truthy),
            no search is performed, ``retrieved_document_ids`` is an empty set and
            the judgement keeps its default value of 1 because there is nothing
            to compare against.

    Returns:
        dict with the keys ``query``, ``answer``, ``true_document_ids`` (set),
        ``retrieved_documents``, ``gpt_response`` (always ``None`` here),
        ``retrieval_judgement`` (1 = correct, 0 = incorrect) and
        ``retrieved_document_ids`` (set holding at most the top-ranked id).
    """
    # Keep the caller's order so the id type is taken from the first element,
    # and tolerate a missing list instead of crashing in set(None).
    expected_ids = list(true_document_ids) if true_document_ids is not None else []
    result = {
        "query": query, 
        "answer": answer, 
        "true_document_ids": set(expected_ids),
        "retrieved_documents": [], 
        "gpt_response": None, 
        "retrieval_judgement": 1 # correct
    }
    # Always present, so callers can read / serialize it on every code path.
    result['retrieved_document_ids'] = set()

    # Caller supplied the sources: skip retrieval and the id comparison.
    if retrieved_documents:
        result["retrieved_documents"] = retrieved_documents
        return result

    # 1 - Retrieval
    items = retrieval.search(query)
    ranked = retrieval.rank(items)
    if len(ranked) > 0:
        top_id = ranked[0].id
        if expected_ids:
            # Cast the retrieved id to the type of the expected ids so that
            # e.g. "7" and 7 compare equal; keep the raw id if it cannot be cast.
            id_type = type(expected_ids[0])
            try:
                top_id = id_type(top_id)
            except (TypeError, ValueError):
                pass
        result['retrieved_document_ids'] = {top_id}
    result["retrieved_documents"] = retrieval.format_sources(items)

    # 2 - The retrieval is correct only if the top-ranked id set equals the expected id set
    if result['retrieved_document_ids'] != result['true_document_ids']:
        result['retrieval_judgement'] = 0 # incorrect
    return result

from grammar.generator import Generator
from grammar.llm import OpenAILLMAgent, role
# Shared LLM agent; RetrievalAugGen below falls back to it when no llm is passed.
gen_model = OpenAILLMAgent(model_name = "chatgpt-16k", role=role)
# Temperature pinned to 0 — presumably for reproducible generations; confirm against OpenAILLMAgent.
gen_model.temperature = 0
class RetrievalAugGen(Generator):
    """Generator that answers a question from retrieved sources with an LLM.

    The prompt is the retrieved context under a ``### Sources:`` header followed
    by the query under a ``### Question:`` header.
    """
    # Both verbalization modes map to an empty string here.
    verbalizer = {
        "short": "",
        "long": ""
    }
    def __init__(self, llm=None, verbalize_attrs=''):
        """Create the generator.

        Args:
            llm: Callable LLM agent; defaults to the module-level ``gen_model``.
            verbalize_attrs: Forwarded unchanged to ``Generator.__init__``.
        """
        llm = llm or gen_model
        # Forward the resolved llm (previously gen_model was always passed,
        # silently discarding a caller-supplied model).
        super().__init__( llm=llm, verbalize_attrs=verbalize_attrs)
        # Remember the model on a private attribute so _generate uses the same one.
        self._rag_llm = llm

    def _generate(self, context_query:tuple, num_generations=None, verbose=False):
        """Return a one-element list with the LLM response for ``(context, query)``.

        ``num_generations`` and ``verbose`` are accepted but not used by this method.
        """
        context, query = context_query
        prompt = """### Sources:\n"""+ context + "\n\n### Question:\n" + query
        # Fall back to gen_model if the instance was created without running __init__.
        llm = getattr(self, "_rag_llm", None) or gen_model
        return [llm(prompt, temperature=0)]

# Verbalization mode; also selects which QA file is evaluated.
linguistic_control = "long"
ragen = RetrievalAugGen.from_file(root_dir=root_dir, verbalize_attrs=linguistic_control)

# Choose the evaluation file: the validation split carries document ids.
if not for_validation:
    file_path = f'{root_dir}/QADataGenerator/{linguistic_control}.json'
else:
    print("Loading validation data")
    file_path =  f'{root_dir}/QADataGenerator/{linguistic_control}_with_ids_unbalanced.json' 

with open(file_path) as qa_file:
    answers_to_text_queries = json.load(qa_file)

# Flatten the per-SQL-template lists of (answer, queries) pairs into a single
# mapping; a later pair with the same answer replaces the earlier one.
semantics_groups = dict(
    (answer, query_list)
    for template_groups in answers_to_text_queries
    for answer, query_list in template_groups
)
print(f"Number of semantics groups: {len(semantics_groups)}")

Loading validation data
Number of semantics groups: 157


In [9]:
# Run retrieval (and, when retrieval is judged correct, generation) for every
# query of every semantics group and collect one result dict per query.
results = [] 

for group_tag, (answer, query_list) in enumerate(semantics_groups.items()):
    # NOTE(review): eval() executes whatever expression the QA file contains.
    # If the keys are plain Python literals, ast.literal_eval would be the
    # safe replacement — confirm the key format before switching.
    answer = eval(answer)[0]  # only 1 answer; no query multiplicity
    answer_txt = answer[0] 
    print(f"{len(query_list)} queries in group {group_tag}")

    # Without document ids the retrieval cannot be judged. Skip the group
    # instead of failing on an unbound name or silently reusing the ids of
    # the previous group.
    if len(answer) < 2:
        print(f"Skipping group {group_tag}: no true document ids")
        continue

    # Prefer integer ids; fall back to the raw values when they are not numeric.
    raw_document_ids = answer[1:]
    try:
        true_document_ids = [int(i) for i in raw_document_ids]
    except (TypeError, ValueError):
        true_document_ids = list(raw_document_ids)

    # Every query in the group is evaluated (no random subsampling).
    for query in query_list:
        result = retrieve(query, answer_txt, true_document_ids=true_document_ids)
        if result['retrieval_judgement'] == 0:
            # Wrong document retrieved: do not spend an LLM call on it.
            result['gpt_response'] = None
        else:
            result["gpt_response"] =  ragen.generate((result["retrieved_documents"], result["query"]), verbose=True)[0]
        result['query_tag'] = group_tag
        results.append(result)    
        print(result['retrieved_document_ids'])
        print(result['true_document_ids'])
        print(result['retrieval_judgement'])
        print(result['gpt_response'])


1 queries in group 0
{'Blue Lagoon Luxury Resort'}
{'Blue Horizon Hotels'}
0
None
1 queries in group 1
{'Central City Medical Hub'}
{'EcoSpace Real Estate'}
0
None
1 queries in group 2
{'Innovative Tech Park'}
{'Future Tech Innovations'}
0
None
1 queries in group 3
{'Eco Retreat Development'}
{'Greenworld Resorts'}
0
None
1 queries in group 4
{'Central City Medical Hub'}
{'Pinnacle Health Group'}
0
None
1 queries in group 5
{'Quantum Data Center Expansion'}
{'Quantum Communications Corp'}
0
None
1 queries in group 6
{'Central City Medical Hub'}
{'Serene Stays Hospitality'}
0
None
1 queries in group 7
{'Dubai Skyline Tower'}
{'Skyline Developers'}
0
None
1 queries in group 8
{'Central City Medical Hub'}
{'Urban Oasis Developments'}
0
None
1 queries in group 9
{'Advanced Health Research Facility'}
{'Vista Healthcare Solutions'}
0
None
1 queries in group 10
{'Blue Lagoon Luxury Resort'}
{'Blue Horizon Hotels'}
0
None
1 queries in group 11
{'Dubai Skyline Tower'}
{'EcoSpace Real Estate'}
0

In [6]:
# Persist the generator under root_dir; override=True is passed through to save().
ragen.save(override=True, root_dir=root_dir)
# Write out the usage tracked on the shared LLM agent.
# NOTE(review): 'chatgpt16k' is spelled differently from the "chatgpt-16k" name
# used to construct gen_model — confirm this is the key write_usage expects.
usage_record = gen_model.gpt_usage_record
usage_record.write_usage(model_name='chatgpt16k' )

In [10]:
# ensure json serializable
for result in results:
    result['true_document_ids'] = list(result['true_document_ids'])
    result['retrieved_document_ids'] = list(result['retrieved_document_ids'])


# save results
with open(f'{root_dir}/eval_results/results_{linguistic_control}.json', 'w') as f:
    json.dump(results, f, indent=4)