## Populate Task

In [None]:
import sys
from pathlib import Path

# Path to the project root directory (machine-specific absolute path).
# It is added to sys.path once so project-local packages can be imported.
project_root = Path('C:/Users/ADMIN/Desktop/DATN/Extract_Information')
_project_root_entry = str(project_root)
if _project_root_entry not in sys.path:
    sys.path.append(_project_root_entry)

In [None]:
# Experiment configuration.
# `task` is interpolated into the file names of the populated task and of the saved result.
task = 'generated_knowledge_one_shot'
# `model_name` selects the backend: 'GPT' uses ChatOpenAI, any other value uses the local Llama pipeline.
model_name = 'Llama-3'

In [45]:
# Directory that receives the populated task file; the trailing slash matters because
# the file name is appended by plain string concatenation.
PATH_FILE_TASK = 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/tasks/task_template/'
# JSON task template that is loaded and then filled with examples.
# NOTE(review): this path contains 'tasks/tasks/' unlike the sibling paths - confirm it is intended.
PATH_FILE_TEMPLATE = 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/tasks/tasks/task_template.json'
# JSONL data file handed to update_task_dict_from_file to generate the examples.
PATH_FILE_DATA = 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/data/mave_filtered_test.jsonl'

# The result location depends on the selected backend.
# NOTE(review): unlike PATH_FILE_TASK these values have no trailing slash, so any code that
# builds a file name from PATH_FILE_RESULT has to insert the path separator itself.
if model_name == 'GPT':
    PATH_FILE_RESULT = 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/tasks/result/GPT'
else:
    PATH_FILE_RESULT = 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/tasks/result/Llama'

In [None]:
from dotenv import load_dotenv

# Load environment variables (notably OPENAI_API_KEY, needed by the GPT backend) from a .env file.
load_dotenv()

In [48]:
import json
from pieutils.pieutils import combine_example, calculate_recall_precision_f1, update_task_dict_from_file

# Load the task template. The encoding is given explicitly so the read does not depend on
# the platform default (cp1252 on Windows) and matches the UTF-8 write below.
with open(PATH_FILE_TEMPLATE, 'r', encoding='utf-8') as f:
    task_dict = json.load(f)

# Instruction that is later used as the system message of every prompt.
task_dict['task_prefix'] = (
    "Extract all attribute-value pairs from the product title. "
    "Afterwards, return only the exact attribute value of the requested attribute. "
    "Your answer must only contain the attribute value. If the attribute is not applicable "
    "or the attribute value cannot be derived from the product title, respond with 'I do not know.'."
)
# Number of demonstrations per prompt (one-shot).
task_dict['shots'] = 1

# Generate the examples from the data file.
# NOTE(review): the meaning of the third positional argument (True) is defined by
# pieutils.update_task_dict_from_file - confirm against that helper.
task_dict = update_task_dict_from_file(PATH_FILE_DATA, task_dict, True)

# Persist the populated task next to the template, named after the task.
with open(PATH_FILE_TASK + 'task_{}.json'.format(task), 'w', encoding='utf-8') as f:
    json.dump(task_dict, f, indent=4)

## Evaluate Task

In [None]:
# Initialize processing
from tqdm import tqdm
import torch
import transformers
from langchain import LLMChain, HuggingFacePipeline
from langchain.chat_models import ChatOpenAI

from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    AIMessagePromptTemplate
)
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.callbacks import get_openai_callback

In [None]:
# Build `llm`: the OpenAI chat model when model_name is 'GPT', otherwise a local
# Hugging Face text-generation pipeline wrapped for LangChain.
if model_name == 'GPT':
    llm = ChatOpenAI(model_name='gpt-3.5-turbo-0125', temperature=0)
else:
    # Local snapshot directory of the Llama checkpoint (machine-specific absolute path).
    # NOTE(review): the path names Meta-Llama-3-8B rather than an -Instruct variant; confirm
    # its tokenizer ships a chat template, since apply_chat_template is used at inference time.
    Llama_checkpoint = '/ceph/alebrink/cache/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6'
    tokenizer = AutoTokenizer.from_pretrained(Llama_checkpoint)
    # Weights are loaded in float32 with automatic device placement; "save_folder" is the
    # directory given for offloaded weights.
    model = AutoModelForCausalLM.from_pretrained(
        Llama_checkpoint,
        torch_dtype=torch.float32,
        device_map="auto",
        offload_folder="save_folder",
    )
    # NOTE(review): presumably re-ties shared weights after device placement - confirm it is needed.
    model.tie_weights()

    hf_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto"
    )

    # Token ids passed as eos_token_id at generation time: the tokenizer's EOS token
    # plus the "<|eot_id|>" token.
    terminators = [
        hf_pipeline.tokenizer.eos_token_id,
        hf_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    llm = HuggingFacePipeline(pipeline=hf_pipeline)

In [49]:

# The task prefix becomes the system message.
system_message_prompt = SystemMessagePromptTemplate.from_template(task_dict['task_prefix'])

# One-shot demonstration: a human turn and the matching AI answer, filled per example.
human_example_message_prompt = HumanMessagePromptTemplate.from_template("{human_msg_0}")
ai_example_message_prompt = AIMessagePromptTemplate.from_template("{ai_msg_0}")

# The actual query for the example being evaluated.
human_message_prompt = HumanMessagePromptTemplate.from_template("{input_string}")

# Message order: system, demo human, demo AI, system again, query.
prompt_list = [
    system_message_prompt,
    human_example_message_prompt,
    ai_example_message_prompt,
    system_message_prompt,  # the instruction is repeated after the demonstration
    human_message_prompt,
]
chat_prompt = ChatPromptTemplate.from_messages(prompt_list)

llm_chain = LLMChain(llm=llm, prompt=chat_prompt)

# Record which model produced the results.
task_dict['model'] = model_name

In [50]:
def select_and_run_llm(attribute, human, system, input):
    """Query the configured LLM for one attribute of one example.

    Args:
        attribute: Name of the requested attribute; used as the key of the returned dict.
        human: Human turn of the one-shot demonstration (fills ``{human_msg_0}``).
        system: AI answer of the one-shot demonstration (fills ``{ai_msg_0}``).
        input: Prompt text of the example to label (fills ``{input_string}``).

    Returns:
        ``{attribute: response}`` holding the raw model output, or ``None`` when
        ``input`` is empty or ``None``.
    """
    if not input:
        # No input text provided.
        return None

    if model_name == 'GPT':
        response = llm_chain.run(input_string=input, human_msg_0=human, ai_msg_0=system)
    else:
        # Mirror the LangChain prompt order (system, demo human, demo AI, system, query)
        # using the roles the chat template expects: "user" / "assistant" for the
        # demonstration and "user" for the final query.
        prompt_messages = chat_prompt.messages
        messages = [
            {"role": "system", "content": prompt_messages[0].format().content},
            {"role": "user", "content": prompt_messages[1].format(human_msg_0=human).content},
            {"role": "assistant", "content": prompt_messages[2].format(ai_msg_0=system).content},
            {"role": "system", "content": prompt_messages[3].format().content},
            {"role": "user", "content": prompt_messages[4].format(input_string=input).content},
        ]

        hf_prompt = hf_pipeline.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Greedy decoding: the deterministic counterpart of temperature=0 on the GPT
        # backend. Sampling with temperature=0 is not a valid generation configuration.
        hf_outputs = hf_pipeline(
            hf_prompt,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=False,
        )
        # The pipeline returns prompt + completion; keep only the completion.
        response = hf_outputs[0]["generated_text"][len(hf_prompt):]

    return {attribute: response}

In [51]:
# General task prefix
from tqdm import tqdm
from langchain.callbacks import get_openai_callback

def _collect_predictions():
    """Run select_and_run_llm over every example of the task, with a progress bar."""
    return [
        select_and_run_llm(ex['attribute'], ex['human_0'], ex['system_0'], ex['input'])
        for ex in tqdm(task_dict['examples'])
    ]


if model_name != 'GPT':
    preds = _collect_predictions()
else:
    # Token usage is only tracked for the OpenAI backend.
    with get_openai_callback() as cb:
        preds = _collect_predictions()

        task_dict['total_tokens'] = cb.total_tokens
        print(f"Total Tokens: {cb.total_tokens}")

100%|██████████| 562/562 [05:13<00:00,  1.79it/s]

Total Tokens: 135395





In [52]:
# Dump the raw predictions (one {attribute: response} dict, or None for an empty input, per example).
print(preds)

[{'SD Format': 'SDHC'}, {'SD Format': 'Standard'}, {'SD Format': 'I do not know.'}, {'SD Format': 'Standard SD Adapter'}, {'Capacity': '4GB'}, {'Capacity': '4GB'}, {'Capacity': '32GB'}, {'SD Format': 'Standard'}, {'Capacity': '64GB'}, {'SD Format': 'Standard SD Adapter'}, {'SD Format': 'SDHC'}, {'SD Format': 'SDHC'}, {'SD Format': 'MicroSDHC'}, {'Capacity': '16 MB'}, {'Optical Zoom': '3x'}, {'Capacity': '64 MB'}, {'Capacity': '64 MB'}, {'Optical Zoom': '3x'}, {'Resolution': '2.3 MP'}, {'Optical Zoom': '3x'}, {'Capacity': '128 MB'}, {'Optical Zoom': '3x'}, {'Resolution': '3.3 MP'}, {'Optical Zoom': '4x'}, {'Optical Zoom': '10x'}, {'Capacity': '256 MB'}, {'Capacity': '128 MB'}, {'Resolution': '3.1 MP'}, {'Optical Zoom': '3x'}, {'Optical Zoom': '12x'}, {'Resolution': '4MP'}, {'Optical Zoom': '3x'}, {'Resolution': '3.2 MP'}, {'Optical Zoom': '6x'}, {'Optical Zoom': '3x'}, {'Optical Zoom': '5x'}, {'Optical Zoom': '3x'}, {'Resolution': '7.1MP'}, {'Optical Zoom': '12x'}, {'Resolution': '1024 

In [53]:
# Gather the gold annotations and metadata of every example in one pass.
targets, categories, attributes = [], [], []
for ex in task_dict['examples']:
    targets.append(ex['target_scores'])
    categories.append(ex['category'])
    attributes.append(ex['attribute'])

# The "postprocessed" prediction is simply the raw response stored under the attribute name.
# NOTE(review): a None entry in preds (returned for empty inputs) would fail on this indexing.
postprocessed_preds = [
    raw_pred[attr_name] for raw_pred, attr_name in zip(preds, attributes)
]

# Attach the raw and postprocessed prediction to each example.
task_dict['examples'] = [
    combine_example(ex, raw_pred, final_pred)
    for ex, raw_pred, final_pred in zip(task_dict['examples'], preds, postprocessed_preds)
]

# Precision / recall / F1 as computed by the pieutils helper.
results = calculate_recall_precision_f1(targets, postprocessed_preds, categories, attributes)

Attribute: Capacity - Category: Flash Memory Cards
{'precision': 100.0, 'recall': 100.0, 'f1': 100.0}
Attribute: SD Format - Category: Flash Memory Cards
{'precision': 57.14, 'recall': 48.78, 'f1': 52.63}
Attribute: Screen Size - Category: Laptops
{'precision': 100.0, 'recall': 100.0, 'f1': 100.0}
Attribute: Processor Speed - Category: Laptops
{'precision': 97.56, 'recall': 90.91, 'f1': 94.12}
Attribute: Processor Brand - Category: Laptops
{'precision': 85.42, 'recall': 93.18, 'f1': 89.13}
Attribute: Battery Life - Category: Laptops
{'precision': 100.0, 'recall': 100.0, 'f1': 100.0}
Attribute: Number of Cores - Category: Laptops
{'precision': 97.5, 'recall': 97.5, 'f1': 97.5}
Attribute: Resolution - Category: Laptops
{'precision': 100.0, 'recall': 100.0, 'f1': 100.0}
Attribute: Sensor Size - Category: Digital Cameras
{'precision': 86.67, 'recall': 92.86, 'f1': 89.66}
Attribute: Optical Zoom - Category: Digital Cameras
{'precision': 100.0, 'recall': 100.0, 'f1': 100.0}
Attribute: Sensor

In [54]:
# Save the populated task (examples, predictions and metadata) as JSON.
# PATH_FILE_RESULT is a directory without a trailing slash, so the file name must be
# joined with a path separator; plain string concatenation would glue it onto the
# directory name instead.
result_dir = Path(PATH_FILE_RESULT)
result_dir.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
result_file = str(result_dir / '{}_{}_tenth.json'.format(task, task_dict['model']))
with open(result_file, 'w', encoding='utf-8') as fp:
    json.dump(task_dict, fp, indent=4)

In [55]:
# Error analysis: list every example whose postprocessed prediction is not one of
# the accepted target values.
print('Prompts for which target and postprocessed prediction do not match.')
print('-----------')
input_texts = [example['input'] for example in task_dict['examples']]

for input_text, target, pred, post_pred in zip(input_texts, targets, preds, postprocessed_preds):
    if post_pred in target:
        # Prediction matches one of the accepted values; nothing to report.
        continue
    print(
        'Input: {}'.format(input_text),
        'Prediction: \n {}'.format(pred),
        'Prediction 2: {}'.format(post_pred),
        'Target: {}'.format(target),
        '-----------',
        sep='\n',
    )

Prompts for which target and postprocessed prediction do not match.
-----------
Input: Essential 64GB Kyocera Hydro VIBE Micro SDHC Card is custom formatted for high speed, lossless recording! Includes Standard SD Adapter. (Class 10 Certified 38MB/sec) 
 Requested Attribute: SD Format
Prediction: 
 {'SD Format': 'SDHC'}
Prediction 2: SDHC
Target: {'Micro SDHC Card': 1, 'Micro SDHC': 1, 'MicroSDHC': 1}
-----------
Input: Professional Kingston 16GB MicroSDHC LG G Stylo with custom formatting and Standard SD Adapter! (32Mbps / Class 4) 
 Requested Attribute: SD Format
Prediction: 
 {'SD Format': 'Standard'}
Prediction 2: Standard
Target: {'MicroSDHC': 1}
-----------
Input: Essential ULTRA 32GB Best Buy Insignia Flex MicroSDHC Card with custom format for Hi-Speed Lossless certified recording! With SD Adapter. (Class 10, up to 500x or 70MB/sec) 
 Requested Attribute: SD Format
Prediction: 
 {'SD Format': 'I do not know.'}
Prediction 2: I do not know.
Target: {'MicroSDHC Card': 1, 'MicroSDHC