## Prepare Task

In [25]:
import json

# Update Task Dict
def update_task_dict_from_file(file_name, task_dict):
    """Append one evaluation example per target attribute found in a JSONL file.

    Each non-blank line of ``file_name`` is parsed as a JSON record. For every
    entry of ``record['attributes']`` whose ``'key'`` equals
    ``record['attribute']`` an example is appended to ``task_dict['examples']``.

    Args:
        file_name: Path to a UTF-8 encoded JSON Lines file.
        task_dict: Dict that collects the examples. The ``'examples'`` list is
            created if it is missing.

    Returns:
        The same ``task_dict`` object, updated in place.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the fields read below.
    """
    examples = task_dict.setdefault('examples', [])

    # JSON Lines is UTF-8; relying on the locale default breaks on platforms
    # whose default codec is not UTF-8.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(line)

            for attribute in record['attributes']:
                if attribute['key'] != record['attribute']:
                    # Only the attribute this record is labelled for is used.
                    continue

                # Only evidences with pid == 0 count as targets
                # (presumably the title paragraph - TODO confirm).
                target_scores = {
                    evidence['value']: 1
                    for evidence in attribute['evidences']
                    if evidence['pid'] == 0
                }
                if not target_scores:
                    # No usable evidence: the expected answer is "no value".
                    target_scores['I do not know.'] = 1

                # A fresh dict per match, so two matching attributes never
                # end up sharing (and overwriting) one example object.
                examples.append({
                    'input': record['title'],
                    'category': record['category'],
                    'attribute': attribute['key'],
                    'target_scores': target_scores,
                })
    return task_dict

In [26]:
import json

# Build the task from scratch: an empty example list that is filled with one
# example per matching record of the JSONL file below.
task_dict = update_task_dict_from_file(
    'C:/Users/ADMIN/Desktop/DATN/Extract_Information/data/mave_filtered_test.jsonl',
    {'examples': []},
)

In [27]:
# Persist the generated task as indented JSON in the working directory.
with open('task.json', 'w', encoding='utf-8') as task_file:
    task_file.write(json.dumps(task_dict, indent=4))

## Inference

In [28]:
import os

from transformers import pipeline

# Point this at your own fine-tuned checkpoint directory.
#model_checkpoint = "C:/Users/ADMIN/Desktop/DATN/Extract_Information/baseline/deberta-v3-large-finetuned-ner-10epochs-V2/checkpoint"
model_checkpoint = "C:/Users/ADMIN/Desktop/DATN/Extract_Information/baseline/bert_fine_turn_ner/checkpoint"

# Token-classification pipeline used for every prediction further down.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)

In [29]:
# Sanity-check the pipeline on a single product title. The result is a list of
# dicts, one per detected span, with the keys 'entity_group', 'score', 'word',
# 'start' and 'end'.
token_classifier("Essential 64GB Samsung Galaxy Tab 10.1 Micro SDHC Card is custom formatted for high speed, lossless recording! Includes Standard SD Adapter. (Class 10 Certified 38MB/sec)")

[{'entity_group': 'Flash_Memory_Cards_Capacity',
  'score': 0.999712,
  'word': '64GB',
  'start': 10,
  'end': 14},
 {'entity_group': 'Flash_Memory_Cards_SD_Format',
  'score': 0.9774015,
  'word': 'Micro',
  'start': 39,
  'end': 44},
 {'entity_group': 'Flash_Memory_Cards_Format',
  'score': 0.55479825,
  'word': 'SDHC',
  'start': 45,
  'end': 49},
 {'entity_group': 'Flash_Memory_Cards_SD_Format',
  'score': 0.87198853,
  'word': 'Card',
  'start': 50,
  'end': 54}]

In [30]:
from tqdm import tqdm
preds = []

for example in tqdm(task_dict['examples']):
    # Entity groups are named "<Category>_<Attribute>" with every space
    # replaced by an underscore.
    target_entity_group = '{}_{}'.format(
        example['category'].replace(' ', '_'),
        example['attribute'].replace(' ', '_'),
    )
    # Keep the word of the first span tagged with the target group; when no
    # span carries that group, record the "no value" answer instead.
    first_match = next(
        (
            span['word']
            for span in token_classifier(example['input'])
            if span['entity_group'] == target_entity_group
        ),
        "I do not know.",
    )
    preds.append(first_match)

  0%|          | 0/562 [00:00<?, ?it/s]

100%|██████████| 562/562 [01:15<00:00,  7.41it/s]


In [31]:
# Format examples to save the predictions
def combine_example(example, pred, post_pred):
    """Store the raw and post-processed prediction on ``example`` and return it.

    The dict is modified in place; the returned object is the one passed in.
    """
    example.update(pred=pred, post_pred=post_pred)
    return example

In [32]:
from itertools import product

def calculate_recall_precision_f1(targets, preds, categories, attributes):
    """Score predictions per (attribute, category) pair and over all examples.

    Every example falls into exactly one outcome:

    * NN - no target value, and the model answered "I do not know."
    * NV - no target value, but the model predicted a value
    * VN - a target value exists, but the model answered "I do not know."
    * VC - a target value exists and the prediction is one of the accepted values
    * VW - a target value exists and the prediction is none of the accepted values

    precision = VC / (NV + VC + VW) and recall = VC / (VN + VC + VW), both as
    percentages rounded to two decimals. F1 is the harmonic mean of those
    rounded percentages. A ratio with a zero denominator is reported as 0.

    Args:
        targets: Per example, a collection whose iteration yields the accepted
            value strings (for instance a ``target_scores`` dict). Only the
            first value decides whether the example has a target at all; an
            empty collection is treated as "no target value".
        preds: Per example, the predicted string.
        categories: Per example, the category name.
        attributes: Per example, the attribute name.

    Returns:
        Dict with one ``'<attribute>_<category>'`` entry
        (``precision``/``recall``/``f1``) for every pair that has at least one
        example other than NN, plus the ``'macro'`` and ``'micro'`` summaries.
        The same figures are printed while they are computed.
    """
    no_value = "I do not know."

    def _percentage(numerator, denominator):
        # Ratio as a rounded percentage; 0 when there is nothing to divide by.
        return round((numerator / denominator) * 100, 2) if denominator > 0 else 0

    def _f1(precision, recall):
        return round(2 * precision * recall / (precision + recall), 2) if (precision + recall) > 0 else 0

    def _mean(scores):
        # Guarded average: an empty report yields 0 instead of ZeroDivisionError.
        return round(sum(scores) / len(scores), 2) if scores else 0

    # One pass over the examples, grouped by (category, attribute). A dict keeps
    # first-seen order, so the report order is the same on every run.
    counts_by_group = {}
    for target, pred, category, attribute in zip(targets, preds, categories, attributes):
        counts = counts_by_group.setdefault(
            (category, attribute), {'nn': 0, 'nv': 0, 'vn': 0, 'vc': 0, 'vw': 0}
        )

        target_values = [value if value != no_value else None for value in target]
        prediction = pred if pred != no_value else None
        first_target = target_values[0] if target_values else None

        if first_target is None:
            outcome = 'nn' if prediction is None else 'nv'
        elif prediction is None:
            outcome = 'vn'
        elif prediction in target_values:
            outcome = 'vc'
        else:
            outcome = 'vw'
        counts[outcome] += 1

    result_dict = {}
    reported = []
    totals = {'nn': 0, 'nv': 0, 'vn': 0, 'vc': 0, 'vw': 0}

    for (category, attribute), counts in counts_by_group.items():
        for outcome, count in counts.items():
            totals[outcome] += count

        predicted = counts['nv'] + counts['vc'] + counts['vw']
        expected = counts['vn'] + counts['vc'] + counts['vw']
        if predicted == 0 and expected == 0:
            # Only NN examples: nothing to score for this pair.
            continue

        precision = _percentage(counts['vc'], predicted)
        recall = _percentage(counts['vc'], expected)
        scores = {'precision': precision, 'recall': recall, 'f1': _f1(precision, recall)}

        result_dict['{}_{}'.format(attribute, category)] = scores
        reported.append(scores)

        print('Attribute: {} - Category: {}'.format(attribute, category))
        print(scores)

    # Macro scores: unweighted mean over the reported pairs.
    macro_precision = _mean([scores['precision'] for scores in reported])
    macro_recall = _mean([scores['recall'] for scores in reported])
    macro_f1 = _mean([scores['f1'] for scores in reported])

    # Micro scores: computed from the outcome counts pooled over all pairs.
    micro_precision = _percentage(totals['vc'], totals['nv'] + totals['vc'] + totals['vw'])
    micro_recall = _percentage(totals['vc'], totals['vn'] + totals['vc'] + totals['vw'])
    micro_f1 = _f1(micro_precision, micro_recall)

    result_dict['macro'] = {'macro_precision': macro_precision, 'macro_recall': macro_recall, 'macro_f1': macro_f1}
    result_dict['micro'] = {'micro_precision': micro_precision, 'micro_recall': micro_recall, 'micro_f1': micro_f1}

    print('Macro:')
    print(result_dict['macro'])
    print('Micro:')
    print(result_dict['micro'])

    return result_dict

In [33]:
# Pull the per-example fields out into parallel lists, one list per field.
targets, categories, attributes, inputs = (
    [example[field] for example in task_dict['examples']]
    for field in ('target_scores', 'category', 'attribute', 'input')
)

In [34]:
# Normalise every raw prediction: close the gap after periods, before double
# quotes and around dashes, then drop parentheses and commas. The substitutions
# run in this order. A missing prediction becomes an empty string.
postprocessed_preds = []
for pred in preds:
    if pred is None:
        postprocessed_preds.append('')
        continue
    cleaned = pred
    for old, new in (('. ', '.'), (' "', '"'), (' - ', '-'), ('(', ''), (')', ''), (',', '')):
        cleaned = cleaned.replace(old, new)
    postprocessed_preds.append(cleaned)

# Store both the raw and the cleaned prediction on each example.
combined_examples = []
for example, pred, post_pred in zip(task_dict['examples'], preds, postprocessed_preds):
    combined_examples.append(combine_example(example, pred, post_pred))
task_dict['examples'] = combined_examples

# with open('C:/Users/ADMIN/Desktop/DATN/Extract_Information/baseline/results/results.json', 'w') as file:
#     json.dump(task_dict, file, indent=4)

In [35]:
# Score the cleaned predictions against the targets, per pair and overall.
results = calculate_recall_precision_f1(
    targets=targets,
    preds=postprocessed_preds,
    categories=categories,
    attributes=attributes,
)

Attribute: Capacity - Category: Flash Memory Cards
{'precision': 97.5, 'recall': 97.5, 'f1': 97.5}
Attribute: SD Format - Category: Flash Memory Cards
{'precision': 50.0, 'recall': 34.15, 'f1': 40.58}
Attribute: Battery Life - Category: Laptops
{'precision': 100.0, 'recall': 85.71, 'f1': 92.31}
Attribute: Processor Speed - Category: Laptops
{'precision': 100.0, 'recall': 84.09, 'f1': 91.36}
Attribute: Processor Brand - Category: Laptops
{'precision': 64.1, 'recall': 56.82, 'f1': 60.24}
Attribute: Number of Cores - Category: Laptops
{'precision': 94.87, 'recall': 92.5, 'f1': 93.67}
Attribute: Screen Size - Category: Laptops
{'precision': 100.0, 'recall': 97.56, 'f1': 98.76}
Attribute: Resolution - Category: Laptops
{'precision': 85.0, 'recall': 82.93, 'f1': 83.95}
Attribute: Sensor Size - Category: Digital Cameras
{'precision': 100.0, 'recall': 100.0, 'f1': 100.0}
Attribute: Optical Zoom - Category: Digital Cameras
{'precision': 94.74, 'recall': 90.0, 'f1': 92.31}
Attribute: Resolution 

In [36]:
# Error analysis: list every example whose cleaned prediction is not one of the
# accepted target values.
print('Prompts for which target and postprocessed prediction do not match.')
print('-----------')
input_texts = [example['input'] for example in task_dict['examples']]
preds = [example['pred'] for example in task_dict['examples']]

for input_text, target, pred, post_pred in zip(input_texts, targets, preds, postprocessed_preds):
    if post_pred in target:
        # Prediction is an accepted value; nothing to report.
        continue
    #print('Prompt: {}'.format(chat_prompt.format(task_prefix=task_dict['task_prefix'], input_string= input_text, human_msg_0= human_text_0, ai_msg=system_text,)))
    print('Input: {}'.format(input_text))
    print('Prediction: \n {}'.format(pred))
    print('Prediction 2: {}'.format(post_pred))
    print('Target: {}'.format(target))
    print('-----------')

Prompts for which target and postprocessed prediction do not match.
-----------
Input: Essential 64GB Kyocera Hydro VIBE Micro SDHC Card is custom formatted for high speed, lossless recording! Includes Standard SD Adapter. (Class 10 Certified 38MB/sec)
Prediction: 
 Micro SD
Prediction 2: Micro SD
Target: {'Micro SDHC Card': 1, 'Micro SDHC': 1, 'MicroSDHC': 1}
-----------
Input: Professional Kingston 16GB MicroSDHC LG G Stylo with custom formatting and Standard SD Adapter! (32Mbps / Class 4)
Prediction: 
 MicroSD
Prediction 2: MicroSD
Target: {'MicroSDHC': 1}
-----------
Input: Professional Ultra SanDisk 64GB MicroSDXC LG G Pad 8.3 card is custom formatted for high speed, lossless recording! Includes Standard SD Adapter. (UHS-1 Class 10 Certified 30MB/sec)
Prediction: 
 I do not know.
Prediction 2: I do not know.
Target: {'MicroSDXC': 1}
-----------
Input: Professional Ultra SanDisk 64GB MicroSDXC Motorola Moto G LTE card is custom formatted for high speed, lossless recording! Includes