# <font color='orange'>Run FTS-OBP evaluation of model output file</font>

In [1]:
from util.absa_evaluator import ABSAEvaluator
from util.dictToExcel import *

# ---------------------------------------------------------------------------
# Paths: the model-output .py file to evaluate and the .xlsx report to write.
# ---------------------------------------------------------------------------
model_output_py_file_dir = '../OUTPUTS/OUTPUT_1_final_test_output_files/_demo/test_input_lora_adaptor_for_phi4mini_A46.2_train2000_R64_multitask_OE-AOPE-AOC-ASTE-ASQE_fewshot_(2025-11-06).py'
eval_xlsx_file_dir = '../OUTPUTS/OUTPUT_2a_final_test_output_FTS-0BP_evaluation/_demo/script_3a_eval_output.xlsx'

# ---------------------------------------------------------------------------
# Evaluator instance, plus the mapping that later cells fill with the
# DataFrames handed to the Excel writer.
# ---------------------------------------------------------------------------
evaluator = ABSAEvaluator(
    equal_weights=True,
    partial_category_score=0.3,
    allow_partial_category_for_unit_match=False,
)
sheetname_df_dict = {}

# Echo both paths so the run log records what was evaluated and where it went.
print(
    f"\nEval input dir:      {model_output_py_file_dir}\n"
    f"Eval output dir:     {eval_xlsx_file_dir}\n"
)


Eval input dir:      ../OUTPUTS/OUTPUT_1_final_test_output_files/_demo/test_input_lora_adaptor_for_phi4mini_A46.2_train2000_R64_multitask_OE-AOPE-AOC-ASTE-ASQE_fewshot_(2025-11-06).py
Eval output dir:     ../OUTPUTS/OUTPUT_2a_final_test_output_FTS-0BP_evaluation/_demo/script_3a_eval_output.xlsx



## <font color='cornflowerblue'>Example of using FTS-OBP core function (vs. direct Rouge-L score)</font>


In [2]:
from rouge_score import rouge_scorer

# Toy example: pred has one extra leading token ("c") compared with gold.
input_text = "a b c u v w x y z a b c"
gold = " u v w "
pred = "c u v w"

# Plain ROUGE-L F-measure between gold and pred, stemming disabled.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
rouge_l_result = scorer.score(gold, pred)
rouge_l_score = rouge_l_result['rougeL'].fmeasure

# FTS-OBP similarity for the same pair; this call also receives the input text.
fts_obp_score = evaluator.flexible_text_similarity(
    input_text=input_text,
    pred=pred,
    gold=gold,
)

# Side-by-side report (ANSI colour codes: cyan for ROUGE-L, green for FTS-OBP).
print(
    f"RougeL F:      \033[36m{rouge_l_score}\033[0m\n\n"
    f"pred: {len(pred.split())}    gold: {len(gold.split())}\n\n"
    f"FTS-OBP score: \033[32m{fts_obp_score}\033[0m\n"
)

RougeL F:      [36m0.8571428571428571[0m

pred: 4    gold: 3

FTS-OBP score: [32m0.8571428571428571[0m



In [3]:
# Second example, reusing `input_text` from the previous cell.
# NOTE(review): the tokens 'D', 'E' and 'really' do not occur in input_text;
# presumably that is why the score is 0.0 -- confirm against the evaluator.
evaluator.flexible_text_similarity(input_text=input_text, pred='B C D E', gold='really X B C D E')

0.0


# <font color='cornflowerblue'>Run eval function</font>

In [4]:
import importlib.util

def load_dict_from_py_file(filepath):
    """Execute a Python source file and return every public dict it defines.

    The file is loaded as a module named ``temp_module``. The names of all
    public attributes, and then of the dict-valued ones, are printed.

    Args:
        filepath: Path to the Python file to load.

    Returns:
        dict: ``{attribute_name: value}`` for each module attribute whose name
        does not start with ``_`` and whose value is a ``dict``.

    Raises:
        ImportError: If no import spec or loader can be created for
            ``filepath`` (e.g. the file extension is not a recognised
            Python module suffix).

    Warning:
        The file's top-level code is executed. Only load trusted files.
    """
    spec = importlib.util.spec_from_file_location("temp_module", filepath)
    # spec_from_file_location returns None when no loader matches the path;
    # fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load a Python module from {filepath!r}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Snapshot all non-private attributes once (dir() yields sorted names).
    public_attrs = {name: getattr(module, name)
                    for name in dir(module) if not name.startswith('_')}
    print("All variables:", list(public_attrs))

    # Keep only the attributes that are dictionaries.
    dicts = {name: value for name, value in public_attrs.items()
             if isinstance(value, dict)}
    print("Dictionary names:", list(dicts.keys()))

    return dicts

# Usage:
# all_dicts = load_dict_from_py_file(model_output_py_file_dir)


In [5]:
# Execute the model-output .py file and collect every public dict it defines.
all_dicts = load_dict_from_py_file(model_output_py_file_dir)


All variables: ['sft_eval_input', 'sft_input_dict']
Dictionary names: ['sft_eval_input', 'sft_input_dict']


In [6]:
# Pick the saved evaluation input out of the dicts loaded from the model-output file.
input_dict = all_dicts['sft_eval_input']
print(len(input_dict))

# Core step of the notebook: run the evaluation on the saved input dict.
output = evaluator.evaluate_from_saved_dict(input_dict)

# Show the top-level keys of the evaluation output.
output.keys()

20


dict_keys(['all_preds', 'all_labels', 'eval_input', 'results'])

In [7]:
# `output` has the keys "all_preds", "all_labels", "eval_input" and "results";
# only the "results" part is used from here on.
eval_results = output['results']

# Print the key sets of both levels for reference.
print(f"output:             {output.keys()}\n")
print(f"output['results']:  {eval_results.keys()}\n")

output:             dict_keys(['all_preds', 'all_labels', 'eval_input', 'results'])

output['results']:  dict_keys(['dataframes', 'match_details_dict'])



### <font color='magenta'>Collect evaluation DataFrames into the sheet dict</font>

In [8]:
# Work on a shallow copy of the evaluator's DataFrame mapping. Assigning into
# the original mapping would overwrite the entry inside `eval_results`, so a
# re-run of this cell would feed the already-transposed summary back into
# `_transpose_df`.
eval_dfdict = dict(eval_results.get('dataframes', {}))

print(f"{eval_dfdict.keys()}\n\n")

# Replace the task summary with its transposed form (in the copy only).
# The guard keeps the cell runnable when the summary is absent, matching the
# tolerant `.get(...)` default above.
if 'task_summary_df' in eval_dfdict:
    eval_dfdict['task_summary_df'] = evaluator._transpose_df(eval_dfdict['task_summary_df'])

# Register every DataFrame for the Excel export under an "SFT_"-prefixed key
# and report its length.
for eval_dfname, eval_df in eval_dfdict.items():
    sheetname_df_dict[f"SFT_{eval_dfname}"] = eval_df
    print(f'\033[36m{eval_dfname:<20}   len = \033[0m{len(eval_df)}')

dict_keys(['task_summary_df', 'entries_metrics_df', 'pairs_metrics_df', 'match_details_df'])


[36mtask_summary_df        len = [0m60
[36mentries_metrics_df     len = [0m20
[36mpairs_metrics_df       len = [0m90
[36mmatch_details_df       len = [0m98


In [9]:
# Display the task summary (already transposed in the previous cell).

eval_dfdict['task_summary_df']

Unnamed: 0,task_type,OE,AOPE,AOC,ASTE,ASQE
0,entry_count,4.0,4.0,4.0,4.0,4.0
1,total_pred,20.0,18.0,20.0,20.0,20.0
2,total_gold,18.0,18.0,18.0,18.0,18.0
3,total_matched,18.0,18.0,18.0,18.0,18.0
4,unit_match_TP,15.0,15.0,10.0,15.0,11.0
5,unit_match_FP,5.0,3.0,10.0,5.0,9.0
6,unit_match_FN,3.0,3.0,8.0,3.0,7.0
7,unit_match_micro_precision,0.75,0.833333,0.5,0.75,0.55
8,unit_match_micro_recall,0.833333,0.833333,0.555556,0.833333,0.611111
9,unit_match_micro_f1,0.789474,0.833333,0.526316,0.789474,0.578947


In [10]:
# Print the full column list first (the rendered table elides the middle
# columns), then display the entries metrics table itself.
entries_metrics_df = eval_dfdict['entries_metrics_df']
column_names = list(entries_metrics_df.columns.values)
print(column_names, '\n')

entries_metrics_df

['entry_id', 'task_type', 'n_pairs', 'input_text', 'gold', 'pred', 'pred_count', 'gold_count', 'matched_count', 'unit_match_tp', 'unit_match_fp', 'unit_match_fn', 'unit_match_precision', 'unit_match_recall', 'unit_match_f1', 'weighted_component_precision', 'weighted_component_recall', 'weighted_component_f1', 'aspect_tp', 'aspect_fp', 'aspect_fn', 'aspect_precision', 'aspect_recall', 'aspect_f1', 'aspect_avg_similarity', 'aspect_std_similarity', 'opinion_tp', 'opinion_fp', 'opinion_fn', 'opinion_precision', 'opinion_recall', 'opinion_f1', 'opinion_avg_similarity', 'opinion_std_similarity', 'category_tp', 'category_fp', 'category_fn', 'category_precision', 'category_recall', 'category_f1', 'category_avg_similarity', 'category_std_similarity', 'sentiment_tp', 'sentiment_fp', 'sentiment_fn', 'sentiment_precision', 'sentiment_recall', 'sentiment_f1', 'sentiment_avg_similarity', 'sentiment_std_similarity'] 



Unnamed: 0,entry_id,task_type,n_pairs,input_text,gold,pred,pred_count,gold_count,matched_count,unit_match_tp,...,category_avg_similarity,category_std_similarity,sentiment_tp,sentiment_fp,sentiment_fn,sentiment_precision,sentiment_recall,sentiment_f1,sentiment_avg_similarity,sentiment_std_similarity
0,0,OE,5,Studied for the exam literally the day of... e...,[<opn>Not as interesting as I thought it would...,"[<opn>good mark booster</opn>, <opn>not partic...",7,5,5,5,...,,,,,,,,,,
1,1,AOPE,5,Studied for the exam literally the day of... e...,[<asp>null</asp><opn>Not as interesting as I t...,[<asp>null</asp><opn>not as interesting as I t...,5,5,5,5,...,,,,,,,,,,
2,2,AOC,5,Studied for the exam literally the day of... e...,[<asp>null</asp><opn>Not as interesting as I t...,[<asp>exam</asp><opn>literally the day of</opn...,7,5,5,2,...,0.86,0.28,,,,,,,,
3,3,ASTE,5,Studied for the exam literally the day of... e...,[<asp>null</asp><opn>Not as interesting as I t...,[<asp>null</asp><opn>literally the day of</opn...,7,5,5,5,...,,,5.0,2.0,0.0,0.714286,1.0,0.833333,1.0,0.0
4,4,ASQE,5,Studied for the exam literally the day of... e...,[<asp>null</asp><opn>Not as interesting as I t...,[<asp>exam</asp><opn>literally the day of</opn...,7,5,5,2,...,0.86,0.28,5.0,2.0,0.0,0.714286,1.0,0.833333,1.0,0.0
5,5,OE,4,"The campus is very green, and a beautiful plac...","[<opn>a beautiful place to work</opn>, <opn>ve...","[<opn>very green</opn>, <opn>beautiful</opn>, ...",4,4,4,3,...,,,,,,,,,,
6,6,AOPE,4,"The campus is very green, and a beautiful plac...",[<asp>campus</asp><opn>a beautiful place to wo...,"[<asp>campus</asp><opn>very green</opn>, <asp>...",4,4,4,4,...,,,,,,,,,,
7,7,AOC,4,"The campus is very green, and a beautiful plac...",[<asp>campus</asp><opn>a beautiful place to wo...,[<asp>campus</asp><opn>very green</opn><cat>Un...,4,4,4,3,...,0.825,0.303109,,,,,,,,
8,8,ASTE,4,"The campus is very green, and a beautiful plac...",[<asp>campus</asp><opn>a beautiful place to wo...,[<asp>campus</asp><opn>very green</opn><sen>po...,4,4,4,4,...,,,4.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
9,9,ASQE,4,"The campus is very green, and a beautiful plac...",[<asp>campus</asp><opn>a beautiful place to wo...,[<asp>campus</asp><opn>very green</opn><cat>Un...,4,4,4,4,...,1.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0


In [11]:
# Display the match-details table.
eval_dfdict['match_details_df']

Unnamed: 0,entry_id,Task,PairID,is_optimal_match,is_unit_match,Input Text,Match Quality,Gold-Aspect,Pred-Aspect,Aspect Score,Gold-Opinion,Pred-Opinion,Opinion Score,Gold-Category,Pred-Category,Category Score,Gold-Sentiment,Pred-Sentiment,Sentiment Score,Overall Score
0,0,OE,0-g0-p2,True,True,Studied for the exam literally the day of... e...,Strong Match,,,,Not as interesting as I thought it would be tho,Not as interesting as I thought it would be,0.933333,,,,,,,0.933333
1,0,OE,0-g1-p3,True,True,Studied for the exam literally the day of... e...,Full Match,,,,dry,dry,1.000000,,,,,,,1.000000
2,0,OE,0-g2-p4,True,True,Studied for the exam literally the day of... e...,Full Match,,,,put me to sleep,put me to sleep,1.000000,,,,,,,1.000000
3,0,OE,0-g3-p0,True,True,Studied for the exam literally the day of... e...,Full Match,,,,good mark booster,good mark booster,1.000000,,,,,,,1.000000
4,0,OE,0-g4-p1,True,True,Studied for the exam literally the day of... e...,Full Match,,,,not particularly useful,not particularly useful,1.000000,,,,,,,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,17,AOC,17-g1-p1,True,False,"You can get a decent grade, but end up forgett...",Good Match,,,1.0,forgetting everything,forget everything,0.000000,Course - Overall,Course - Overall,1.0,,,,0.666667
94,18,ASTE,18-g0-p0,True,True,"You can get a decent grade, but end up forgett...",Strong Match,,,1.0,can get a decent grade,decent grade,0.666667,,,,positive,positive,1.0,0.888889
95,18,ASTE,18-g1-p1,True,False,"You can get a decent grade, but end up forgett...",Good Match,,,1.0,forgetting everything,forget everything,0.000000,,,,negative,negative,1.0,0.666667
96,19,ASQE,19-g0-p0,True,False,"You can get a decent grade, but end up forgett...",Good Match,,,1.0,can get a decent grade,decent grade,0.666667,Course - Difficulty,Course - Overall,0.3,positive,positive,1.0,0.741667


# <font color='gold'>Write to Excel</font>

In [12]:
# Export every DataFrame collected in `sheetname_df_dict` to the Excel file.
df_dictToExcel(
    excelfilename=eval_xlsx_file_dir,
    sheetname_df_dict=sheetname_df_dict,
    freezeheader=True,
    headerfilter=True,
    locksheet=False,
    editable_range=None,
)
print(f"\nDone!  :)")

[33mSFT_task_summary_df[0m written to [33m../OUTPUTS/OUTPUT_2a_final_test_output_FTS-0BP_evaluation/_demo/script_3a_eval_output.xlsx.[0m
[33mSFT_entries_metrics_df[0m written to [33m../OUTPUTS/OUTPUT_2a_final_test_output_FTS-0BP_evaluation/_demo/script_3a_eval_output.xlsx.[0m
[33mSFT_pairs_metrics_df[0m written to [33m../OUTPUTS/OUTPUT_2a_final_test_output_FTS-0BP_evaluation/_demo/script_3a_eval_output.xlsx.[0m
[33mSFT_match_details_df[0m written to [33m../OUTPUTS/OUTPUT_2a_final_test_output_FTS-0BP_evaluation/_demo/script_3a_eval_output.xlsx.[0m



Done!  :)
