## Populate Task

In [1]:
import sys
from pathlib import Path

# Path to the project's root directory; it is appended to sys.path (once)
# so that project-local packages can be imported from this notebook.
project_root = Path('C:/Users/ADMIN/Desktop/DATN/Extract_Information')
if all(entry != str(project_root) for entry in sys.path):
    sys.path.append(str(project_root))

In [2]:
# Run configuration.
# `task` is interpolated into the names of the populated-task and result files.
task = 'open'
# `model_name` selects the backend: 'GPT' uses the OpenAI chat model, any
# other value takes the local Hugging Face (Llama) branch.
model_name = 'GPT'

In [3]:
# Locations of the populated-task output directory, the task template and
# the evaluation data.
PATH_FILE_TASK = 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/tasks/task_template/'
PATH_FILE_TEMPLATE = 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/tasks/tasks/task_template.json'
PATH_FILE_DATA = 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/data/mave_filtered_test.jsonl'

# Result location depends on the selected model. Note that, unlike
# PATH_FILE_TASK, neither value ends with a path separator.
PATH_FILE_RESULT = (
    'C:/Users/ADMIN/Desktop/DATN/Extract_Information/tasks/result/GPT'
    if model_name == 'GPT'
    else 'C:/Users/ADMIN/Desktop/DATN/Extract_Information/tasks/result/Llama'
)

In [4]:
from dotenv import load_dotenv

# Load OPENAI_API_KEY (and any other variables) from the .env file so the
# OpenAI client used further down can authenticate.
load_dotenv()

True

In [5]:
import json
from pieutils.pieutils import update_task_dict_from_file, combine_example, calculate_recall_precision_f1, parse_gpt_response_to_dict

# Load task template.
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the populated task below is written as UTF-8 too).
with open(PATH_FILE_TEMPLATE, 'r', encoding='utf-8') as f:
    task_dict = json.load(f)

# Instruction given to the model; it is used as the system message later on.
task_dict['task_prefix'] = (
    "Extract all important attribute name : attribute value pairs mentioned in the product title "
    "below and return the attribute name : attribute value pairs. All parts of the product title "
    "should be assigned to an attribute. \n Desired format: <attribute name>: <attribute value> "
    "\n <attribute name>: <attribute value>"
)
# Zero-shot setting: no demonstrations are added to the prompt.
task_dict['shots'] = 0

# Generate examples from the data file.
# NOTE(review): the meaning of the third positional argument (False) is
# defined in pieutils.update_task_dict_from_file - confirm there.
task_dict = update_task_dict_from_file(PATH_FILE_DATA, task_dict, False)

# Persist the populated task next to the templates.
with open(PATH_FILE_TASK + 'task_{}.json'.format(task), 'w', encoding='utf-8') as f:
    json.dump(task_dict, f, indent=4)

## Evaluate Task

In [1]:
# Initialize processing
from tqdm import tqdm
import torch
import transformers
from langchain import LLMChain, HuggingFacePipeline
from langchain.chat_models import ChatOpenAI

from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.callbacks import get_openai_callback

In [None]:
# Model: build the `llm` object used by the LangChain chain.
if model_name == 'GPT':
    # temperature=0 keeps the OpenAI chat model as deterministic as possible.
    llm = ChatOpenAI(model_name='gpt-3.5-turbo-0125', temperature=0)
else:
    # Local snapshot of Meta-Llama-3-8B-Instruct; tokenizer and model must
    # come from the same directory, so the path is defined once.
    llama_snapshot_path = '/ceph/alebrink/cache/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45'

    tokenizer = AutoTokenizer.from_pretrained(llama_snapshot_path)
    model = AutoModelForCausalLM.from_pretrained(
        llama_snapshot_path,
        # Was '/cephh/alebrink/cache' (doubled 'h'), which does not match the
        # '/ceph/alebrink/cache' prefix of the snapshot path above.
        cache_dir='/ceph/alebrink/cache',
        torch_dtype=torch.float16,
        device_map="auto",
        offload_folder="save_folder",
    )
    model.tie_weights()

    hf_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto"
    )

    # Generation stops at either the regular EOS token or Llama 3's
    # end-of-turn token.
    terminators = [
        hf_pipeline.tokenizer.eos_token_id,
        hf_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    llm = HuggingFacePipeline(pipeline=hf_pipeline)

In [8]:
# The task prefix becomes the system message of the chat prompt.
system_message_prompt = SystemMessagePromptTemplate.from_template(task_dict['task_prefix'])

# The human message consists solely of the text to extract from.
human_template = "{input_string}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# System message first, then the human message.
prompt_list = [system_message_prompt, human_message_prompt]
chat_prompt = ChatPromptTemplate.from_messages(prompt_list)

llm_chain = LLMChain(prompt=chat_prompt, llm=llm)

# Record which model produced the results in the task dictionary.
task_dict['model'] = model_name

In [9]:
def select_and_run_llm(input):
    """Run the configured model on one input text and parse its answer.

    With ``model_name == 'GPT'`` the text is sent through ``llm_chain``;
    otherwise the system/human prompts are rendered with the tokenizer's chat
    template and passed to the local Hugging Face pipeline.

    Args:
        input: Text to extract attribute/value pairs from. (The name shadows
            the built-in ``input``; it is kept so existing callers still work.)

    Returns:
        The result of ``parse_gpt_response_to_dict`` for the model response,
        or ``None`` when ``input`` is empty/``None`` or parsing raises.
    """
    if not input:
        # No input text provided (also covers None, which len() would reject).
        return None

    if model_name == 'GPT':
        response = llm_chain.run(input_string=input)
    else:
        # Llama 3's chat format uses the roles system / user / assistant,
        # so the human turn must be labelled "user".
        messages = [
            {"role": "system", "content": chat_prompt.messages[0].format().content},
            {"role": "user", "content": chat_prompt.messages[1].format(input_string=input).content},
        ]

        hf_prompt = hf_pipeline.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Greedy decoding gives the deterministic behaviour that
        # temperature=0 was meant to express; sampling with temperature=0
        # is rejected by transformers (temperature must be strictly positive).
        hf_outputs = hf_pipeline(
            hf_prompt,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=False,
        )
        # The pipeline returns prompt + completion; keep only the completion.
        response = hf_outputs[0]["generated_text"][len(hf_prompt):]

    print(response)
    try:
        return parse_gpt_response_to_dict(response)
    except Exception as e:
        # Best effort: an unparseable response yields None instead of
        # aborting the whole evaluation run.
        print(e)
        print('Response: ')
        print(response)
        return None

In [None]:
# Run the model on every example. select_and_run_llm takes only the input
# text; the target attribute of each example is looked up afterwards when the
# predictions are post-processed.
if model_name == 'GPT':
    # Track OpenAI token usage for the whole run.
    with get_openai_callback() as cb:
        preds = [select_and_run_llm(example['input']) for example in tqdm(task_dict['examples'])]

    task_dict['total_tokens'] = cb.total_tokens
    print(f"Total Tokens: {cb.total_tokens}")
else:
    preds = [select_and_run_llm(example['input']) for example in tqdm(task_dict['examples'])]

In [None]:
# Dump the parsed predictions (one entry per example) for a quick visual check.
print(preds)

[{'Capacity': '64GB', 'Brand': 'Kyocera', 'Model': 'Hydro VIBE', 'Type': 'Micro SDHC Card', 'Format': 'custom formatted', 'Speed': 'high speed', 'Recording quality': 'lossless', 'Includes': 'Standard SD Adapter', 'Certification': 'Class 10', 'Speed rating': '38MB/sec'}, {'Brand': 'Kingston', 'Capacity': '16GB', 'Memory Card Type': 'MicroSDHC', 'Compatible Device': 'LG G Stylo', 'Special Feature': 'Custom formatting', 'Included Adapter': 'Standard SD Adapter', 'Speed': '32Mbps / Class 4'}, {'Capacity': '32GB', 'Brand': 'Insignia', 'Type': 'MicroSDHC Card', 'Custom Format': 'Hi-Speed Lossless certified recording', 'Included': 'SD Adapter', 'Speed Class': 'Class 10', 'Speed': 'up to 500x or 70MB/sec'}, {'Brand': 'SanDisk', 'Capacity': '32GB', 'Memory Card Type': 'MicroSDHC', 'Compatibility': 'Nokia Lumia 521 Smartphone', 'Formatting': 'Custom formatted for high speed, lossless recording', 'Included': 'Standard SD Adapter', 'Speed Class': 'UHS-1 Class 10 Certified', 'Speed': '30MB/sec'}, {

In [None]:
# # Load task template
# with open(PATH_FILE_RESULT + 'task_run_chat_open_extraction_gpt-3.5-turbo-0125.json', 'r') as f:
#     task_dict = json.load(f)

targets = [example['target_scores'] for example in task_dict['examples']]
categories = [example['category'] for example in task_dict['examples']]
attributes = [example['attribute'] for example in task_dict['examples']]

# Pick the value predicted for each example's target attribute.
# select_and_run_llm returns None for empty input or an unparseable response,
# so a missing prediction is treated like a missing attribute instead of
# raising a TypeError on `attribute in None`.
postprocessed_preds = [
    pred[attribute] if pred and attribute in pred else "I do not know."
    for pred, attribute in zip(preds, attributes)
]
#postprocessed_preds = [example['post_pred'] for example in task_dict['examples']]

# Attach the raw and the post-processed prediction to every example.
task_dict['examples'] = [combine_example(example, pred, post_pred)
                         for example, pred, post_pred in zip(task_dict['examples'], preds, postprocessed_preds)]

#print(task_dict['examples'])

results = calculate_recall_precision_f1(targets, postprocessed_preds, categories, attributes)

Attribute: Capacity - Category: Flash Memory Cards
{'precision': 97.06, 'recall': 82.5, 'f1': 89.19}
Attribute: SD Format - Category: Flash Memory Cards
{'precision': 0, 'recall': 0.0, 'f1': 0}
Attribute: Sensor Size - Category: Digital Cameras
{'precision': 0, 'recall': 0.0, 'f1': 0}
Attribute: Resolution - Category: Digital Cameras
{'precision': 100.0, 'recall': 56.1, 'f1': 71.88}
Attribute: Optical Zoom - Category: Digital Cameras
{'precision': 100.0, 'recall': 22.5, 'f1': 36.73}
Attribute: Sensor Type - Category: Digital Cameras
{'precision': 91.89, 'recall': 85.0, 'f1': 88.31}
Attribute: Screen Size - Category: Laptops
{'precision': 4.88, 'recall': 4.88, 'f1': 4.88}
Attribute: Number of Cores - Category: Laptops
{'precision': 0, 'recall': 0.0, 'f1': 0}
Attribute: Resolution - Category: Laptops
{'precision': 75.0, 'recall': 36.59, 'f1': 49.18}
Attribute: Processor Speed - Category: Laptops
{'precision': 100.0, 'recall': 4.55, 'f1': 8.7}
Attribute: Processor Brand - Category: Laptop

In [None]:
# Save populated task.
# PATH_FILE_RESULT has no trailing separator, so plain string concatenation
# would glue the directory name onto the file name (".../GPTopen_GPT.json").
# Join the components explicitly and make sure the directory exists.
result_dir = Path(PATH_FILE_RESULT)
result_dir.mkdir(parents=True, exist_ok=True)
result_file = str(result_dir / '{}_{}.json'.format(task, task_dict['model']))
with open(result_file, 'w', encoding='utf-8') as fp:
    json.dump(task_dict, fp, indent=4)

In [None]:
# Error analysis: show every example whose post-processed prediction is not
# one of the keys of its target dictionary.
print('Prompts for which target and postprocessed prediction do not match.')
print('-----------')
input_texts = [example['input'] for example in task_dict['examples']]
# presumably the 'pred' entry was attached by combine_example - verify there
preds = [example['pred'] for example in task_dict['examples']]

for input_text, target, pred, post_pred in zip(input_texts, targets, preds, postprocessed_preds):
    if post_pred in target:
        # Prediction matches one of the accepted target values.
        continue
    print(
        'Input: {}'.format(input_text),
        'Prediction: \n {}'.format(pred),
        'Prediction 2: {}'.format(post_pred),
        'Target: {}'.format(target),
        '-----------',
        sep='\n',
    )

Prompts for which target and postprocessed prediction do not match.
-----------
Input: Essential 64GB Kyocera Hydro VIBE Micro SDHC Card is custom formatted for high speed, lossless recording! Includes Standard SD Adapter. (Class 10 Certified 38MB/sec)
Prediction: 
 {'Capacity': '64GB', 'Brand': 'Kyocera', 'Model': 'Hydro VIBE', 'Type': 'Micro SDHC Card', 'Format': 'custom formatted', 'Speed': 'high speed', 'Recording quality': 'lossless', 'Includes': 'Standard SD Adapter', 'Certification': 'Class 10', 'Speed rating': '38MB/sec'}
Prediction 2: I do not know.
Target: {'Micro SDHC Card': 1, 'Micro SDHC': 1, 'MicroSDHC': 1}
-----------
Input: Professional Kingston 16GB MicroSDHC LG G Stylo with custom formatting and Standard SD Adapter! (32Mbps / Class 4)
Prediction: 
 {'Brand': 'Kingston', 'Capacity': '16GB', 'Memory Card Type': 'MicroSDHC', 'Compatible Device': 'LG G Stylo', 'Special Feature': 'Custom formatting', 'Included Adapter': 'Standard SD Adapter', 'Speed': '32Mbps / Class 4'}
P