In [25]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import numpy as np
from sklearn.neighbors import NearestNeighbors
import requests
import json
import os
import re
from sklearn.metrics import f1_score as sklearn_f1_score
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple, Dict
from collections import Counter
import string

In [37]:
#load the test answer data set
with open('true_answers.json', 'r') as f:
    true_answers = json.load(f)

In [159]:
print(true_answers)

[['Southend United F.C. (2010, 2011, 2012)', 'Stevenage F.C. (2012, 2013)', 'Crewe Alexandra F.C. (2013, 2014, 2015)', 'Port Vale F.C. (2015, 2016, 2017, 2018, 2019, 2020)'], ['Prime Minister of Ukraine (2010)', 'First Deputy Prime Minister of Ukraine (2010)', "People's Deputy of Ukraine (2012, 2013, 2014, 2015)", 'Chairman of the Verkhovna Rada (2014, 2015, 2016, 2017, 2018, 2019, 2020)', 'President of Ukraine (2014)'], ['Socialist Party (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019)', 'RISE (Ireland) (2019, 2020)'], ['Newcastle United F.C. (2010, 2011, 2019, 2020)', 'England national under-21 football team (2010)', 'England national association football team (2010, 2011, 2012)', 'Liverpool F.C. (2011, 2012, 2013)', 'West Ham United F.C. (2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019)'], ['Tom Marshall (2013, 2014)', 'Paul Davis (2014, 2015)', 'Andrew Furey (2020)'], ['University of Melbourne (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017)', 'King Saud University (2010

In [4]:
def load_predictions(file_path: str) -> List[str]:
    with open(file_path, 'r') as file:
        predictions = json.load(file)
    return predictions

In [66]:
def parse_predictions(info: List[str]) -> List[str]:
    return [entry.strip() for entry in info]

In [5]:
# parse the data set
def parse_years(years_str: str) -> List[int]:
    
    years_str = years_str.replace('–', '-').replace('—', '-').replace('−', '-')
    # 处理 "since" 关键字
    years_str = re.sub(r'since (\d{4})', r'\1-', years_str)
    years = []
    for part in years_str.split(', '):
        if '-' in part:
            try:
                start, end = map(int, part.split('-'))
                years.extend(range(start, end + 1))
            except ValueError as e:
                continue
                #print(f"Skipping invalid year range part: {part}. Error: {e}")
        else:
            try:
                years.append(int(part))
            except ValueError as e:
                #print(f"Skipping invalid year part: {part}. Error: {e}")
                continue
    return years

def parse_answer(answer: str) -> Tuple[List[str], Dict[str, List[int]]]:
    entities = []
    timelines = {}
    parts = re.split(r'\),\s*', answer)
    
    parts = [part + ')' if not part.endswith(')') else part for part in parts]

    for part in parts:
        entity_match = re.match(r'(.+?)\s+\((.+)\)', part)
        if entity_match:
            entity = entity_match.group(1).strip()
            years_str = entity_match.group(2).strip()
            years = parse_years(years_str)
            entities.append(entity)
            timelines[entity] = years
        else:
            continue
            #print(f"Skipping invalid part: {part}")
    return entities, timelines

In [6]:
# EM of entities
def evaluate_entities_em(parsed_predictions: List[Tuple[List[str], Dict[str, List[int]]]], parsed_true_answers: List[Tuple[List[str], Dict[str, List[int]]]]) -> float:
    total_questions = 0
    matching_questions = 0

    for pred, gt in zip(parsed_predictions, parsed_true_answers):
        pred_entities, _ = pred
        gt_entities, _ = gt

        if set(pred_entities) == set(gt_entities) and set(pred_entities):
            matching_questions += 1
            print(f"Matched entities: {set(pred_entities)}")
        total_questions += 1

    entity_em_score = matching_questions / total_questions if total_questions > 0 else 0
    return entity_em_score


# EM of timeline 
def evaluate_timeline_em(parsed_predictions: List[Tuple[List[str], Dict[str, List[int]]]], parsed_true_answers: List[Tuple[List[str], Dict[str, List[int]]]]) -> float:
    total_entities = 0
    matching_timelines = 0

    for pred, gt in zip(parsed_predictions, parsed_true_answers):
        pred_entities, pred_timelines = pred
        gt_entities, gt_timelines = gt

        if set(pred_entities) == set(gt_entities) and set(pred_entities):
            for entity in gt_entities:
                if entity in pred_entities and entity in pred_timelines and entity in gt_timelines:
                    if pred_timelines[entity] == gt_timelines[entity]:
                        matching_timelines += 1
                    total_entities += 1

    timeline_em_score = matching_timelines / total_entities if total_entities > 0 else 0
    return timeline_em_score




In [45]:
def evaluate_completeness(parsed_predictions: List[Tuple[List[str], Dict[str, List[int]]]], parsed_true_answers: List[Tuple[List[str], Dict[str, List[int]]]]) -> float:
    total_completeness = 0
    num_questions = len(parsed_true_answers)

    for pred, gt in zip(parsed_predictions, parsed_true_answers):
        pred_entities, pred_timelines = pred
        gt_entities, gt_timelines = gt

        
        correct_entities = set(pred_entities).intersection(set(gt_entities))
        correct_timelines = 0
        total_gt_timelines = len(gt_entities)  

        for entity in correct_entities:
            if entity in pred_timelines and entity in gt_timelines:
                if pred_timelines[entity] == gt_timelines[entity]:
                    correct_timelines += 1

        completeness = correct_timelines / total_gt_timelines if total_gt_timelines > 0 else 0
        total_completeness += completeness

    average_completeness = total_completeness / num_questions if num_questions > 0 else 0
    return average_completeness

In [49]:
def evaluate_entity_completeness(parsed_predictions: List[Tuple[List[str], Dict[str, List[int]]]], parsed_true_answers: List[Tuple[List[str], Dict[str, List[int]]]]) -> float:
    total_completeness = 0
    num_questions = len(parsed_true_answers)

    for pred, gt in zip(parsed_predictions, parsed_true_answers):
        pred_entities, _ = pred
        gt_entities, _ = gt

        correct_entities = set(pred_entities).intersection(set(gt_entities))
        total_gt_entities = len(gt_entities)  

        completeness = len(correct_entities) / total_gt_entities if total_gt_entities > 0 else 0
        total_completeness += completeness

    average_completeness = total_completeness / num_questions if num_questions > 0 else 0
    return average_completeness



In [152]:
# F1 score 

def evaluate_entity_f1(parsed_predictions: List[Tuple[List[str], Dict[str, List[int]]]], parsed_true_answers: List[Tuple[List[str], Dict[str, List[int]]]]) -> Tuple[float, float, float]:
    total_true_positives = 0
    total_predicted = 0
    total_actual = 0

    for pred, gt in zip(parsed_predictions, parsed_true_answers):
        pred_entities, _ = pred
        gt_entities, _ = gt

        pred_str = " ".join(pred_entities)
        gt_str = " ".join(gt_entities)

        common = Counter(pred_str) & Counter(gt_str)
        num_same = sum(common.values())

        total_true_positives += num_same
        total_predicted += len(pred_str)
        total_actual += len(gt_str)

    return precision_recall_f1(total_true_positives, total_predicted, total_actual)

def evaluate_timeline_f1(parsed_predictions: List[Tuple[List[str], Dict[str, List[int]]]], parsed_true_answers: List[Tuple[List[str], Dict[str, List[int]]]]) -> Tuple[float, float, float]:
    total_true_positives = 0
    total_predicted = 0
    total_actual = 0

    for pred, gt in zip(parsed_predictions, parsed_true_answers):
        pred_entities, pred_timelines = pred
        gt_entities, gt_timelines = gt

        for entity in gt_entities:
            if entity in pred_entities and entity in pred_timelines and entity in gt_timelines:
                if pred_timelines[entity] == gt_timelines[entity]:
                    total_true_positives += 1
                total_actual += 1

        total_predicted += len(pred_timelines)

    return precision_recall_f1(total_true_positives, total_predicted, total_actual)

def precision_recall_f1(true_positives, predicted, actual):
    precision = true_positives / predicted if predicted > 0 else 0
    recall = true_positives / actual if actual > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1



def process_answer(answers):
    final_answer = ""
    for answer in answers:
        final_answer += (answer + ' ')
    return final_answer.strip()

def normalize_answer(s):
    """
    Lower text and remove punctuation, articles and extra whitespace.

    Args:
        s: String to normalize.

    Returns:
        Cleaned string with lowercase, no punctuations, no articles, and
            and extraneous whitespace.
    """

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    """Calculates F1 score.

    Args:
        prediction: Predicted answer span (string).
        ground_truth: True answer span (string).

    Returns:
        F1 score.
    """
    prediction_tokens = normalize_answer(prediction).split()

    ground_truth = process_answer(ground_truth)
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [156]:
def calculate_scores(predictions: List[str], true_answers: List[List[str]]) -> Tuple[float, float, float]:
    parsed_predictions = [parse_answer(pred) for pred in predictions]
    parsed_true_answers = [
        (
            list(map(lambda x: re.match(r'(.+?)\s+\((.+)\)', x).group(1).strip(), ans)), 
            {re.match(r'(.+?)\s+\((.+)\)', x).group(1).strip(): parse_years(re.match(r'(.+?)\s+\((.+)\)', x).group(2).strip()) for x in ans}
        ) 
        for ans in true_answers
    ]
    overall_f1_score = 0
    for i in range(len(predictions)):
        overall_f1_score += f1_score(predictions[i], true_answers[i])
    overall_f1_score /= len(predictions)

    
    #_, _, entity_f1_score = evaluate_entity_f1(parsed_predictions, parsed_true_answers)
    
    #_, _, timeline_f1_score = evaluate_timeline_f1(parsed_predictions, parsed_true_answers)
    
    
    entities_em_score = evaluate_entities_em(parsed_predictions, parsed_true_answers)
    
    timeline_em_score =  evaluate_timeline_em(parsed_predictions, parsed_true_answers)


    completeness_score = evaluate_completeness(parsed_predictions, parsed_true_answers)
    entity_completeness_score = evaluate_entity_completeness(parsed_predictions, parsed_true_answers)
    
    return overall_f1_score, entities_em_score, timeline_em_score, completeness_score, entity_completeness_score


In [69]:
parsed_true_answers = [
    (
        list(map(lambda x: re.match(r'(.+?)\s+\((.+)\)', x).group(1).strip(), ans)), 
        {re.match(r'(.+?)\s+\((.+)\)', x).group(1).strip(): parse_years(re.match(r'(.+?)\s+\((.+)\)', x).group(2).strip()) for x in ans}
    ) 
    for ans in true_answers
]

In [157]:
# closed book
# withouot fine-tuning
# Evaluation of flan xl

files_xl = [
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanxl/predictions_3shots_flanxl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanxl/predictions_5shots_flanxl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanxl/predictions_7shots_flanxl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanxl/predictions_10shots_flanxl.json"
]

for file in files_xl:
    
    predictions = load_predictions(file)
    
    overall_f1, entity_em, timeline_em, completeness,entity_com = calculate_scores(predictions, true_answers)
    print(f"File: {file}\nEntity EM Score: {entity_em}\nTimeline EM Score:{timeline_em}\nOverall F1 Score: {overall_f1}\nCompleteness: {completeness}\nEntity Completness: {entity_com}\n")


Matched entities: {'Jim Prentice', 'Rachel Notley', 'Jason Kenney'}
Matched entities: {'Democratic Party', 'Republican Party'}
Matched entities: {'United States representative', 'United States senator'}
Matched entities: {'Chief of the Defence Staff', 'Chief of the General Staff'}
File: /Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanxl/predictions_3shots_flanxl.json
Entity EM Score: 0.003734827264239029
Timeline EM Score:0.0
Overall F1 Score: 0.3232543640688145
Completeness: 0.007847583477835578
Entity Completness: 0.03968470579114838

Matched entities: {'Democratic Party', 'Republican Party'}
Matched entities: {'United States representative', 'United States senator'}
Matched entities: {'Chief of the Defence Staff', 'Chief of the General Staff'}
Matched entities: {'Serhiy Arbuzov', 'Yulia Tymoshenko', 'Arseniy Yatsenyuk', 'Oleksiy Honcharuk', 'Mykola Azarov', 'Denys Shmyhal', 'Volodymyr Groysman'}
File: /Users/wangchenwei/Downloads/TLQA-main/predictions/pred

In [158]:
# Evaluation of flan large

files_large = [
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanlarge/predictions_3shot_flanlarge.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanlarge/predictions_5shot_flanlarge.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanlarge/predictions_7shot_flanlarge.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanlarge/predictions_10shot_flanlarge.json"
]

for file in files_large:
    
    predictions = load_predictions(file)
    
    overall_f1, entity_em, timeline_em, completeness,entity_com = calculate_scores(predictions, true_answers)
    print(f"File: {file}\nEntity EM Score: {entity_em}\nTimeline EM Score:{timeline_em}\nOverall F1 Score: {overall_f1}\nCompleteness: {completeness}\nEntity Completness: {entity_com}\n")


File: /Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanlarge/predictions_3shot_flanlarge.json
Entity EM Score: 0.0
Timeline EM Score:0
Overall F1 Score: 0.25901695053099344
Completeness: 0.005239132690113081
Entity Completness: 0.023889185303751122

File: /Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanlarge/predictions_5shot_flanlarge.json
Entity EM Score: 0.0
Timeline EM Score:0
Overall F1 Score: 0.3166077123691116
Completeness: 0.008092125739184563
Entity Completness: 0.02448090347250011

File: /Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_fewshot_flanlarge/predictions_7shot_flanlarge.json
Entity EM Score: 0.0
Timeline EM Score:0
Overall F1 Score: 0.27323196730469584
Completeness: 0.004668534080298785
Entity Completness: 0.024513509107346638

Matched entities: {'Libertarian Party', 'Republican Party'}
Matched entities: {'United States representative', 'United States senator'}
File: /Users/wangchenwei/Downloads/TLQ

In [160]:
files_base = [
    "predictions_3shot_flanbase.json",
    "predictions_5shot_flanbase.json",
    "predictions_7shot_flanbase.json",
    "predictions_10shot_flanbase.json"
]

for file in files_base:
    
    predictions = load_predictions(file)
    
    overall_f1, entity_em, timeline_em, completeness,entity_com = calculate_scores(predictions, true_answers)
    print(f"File: {file}\nEntity EM Score: {entity_em}\nTimeline EM Score:{timeline_em}\nOverall F1 Score: {overall_f1}\nCompleteness: {completeness}\nEntity Completness: {entity_com}\n")


Matched entities: {'United States representative', 'United States senator'}
Matched entities: {'Berkshire Hathaway', 'Lee Enterprises'}
File: predictions_3shot_flanbase.json
Entity EM Score: 0.0018674136321195146
Timeline EM Score:0.5
Overall F1 Score: 0.18581318478601164
Completeness: 0.005835667600373482
Entity Completness: 0.016535503090124942

Matched entities: {'Berkshire Hathaway', 'Lee Enterprises'}
File: predictions_5shot_flanbase.json
Entity EM Score: 0.0009337068160597573
Timeline EM Score:1.0
Overall F1 Score: 0.1808868347335365
Completeness: 0.005846783157707528
Entity Completness: 0.016800053354675205

Matched entities: {'Berkshire Hathaway', 'Lee Enterprises'}
Matched entities: {'Democratic Party', 'Republican Party'}
File: predictions_7shot_flanbase.json
Entity EM Score: 0.0018674136321195146
Timeline EM Score:0.5
Overall F1 Score: 0.18486110223649366
Completeness: 0.005135387488328665
Entity Completness: 0.014315355771938404

Matched entities: {'Libertarian Party', 'Rep

In [161]:
files_finetuned_base = [
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_base_finetund/prediction_fine_tuned.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_base_finetund/f5base_finetuned_3shot_answers.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_base_finetund/f5base_finetuned_5shot_answers.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_base_finetund/f5base_finetuned_7shot_answers.json",
    "/Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_base_finetund/f5base_finetuned_10shot_answers.json"
]

for file in files_finetuned_base:
    predictions= load_predictions(file)
    #predictions = parse_predictions(data['info'])
    overall_f1, entity_em, timeline_em, completeness,entity_com = calculate_scores(predictions, true_answers)
    print(f"File: {file}\nEntity EM Score: {entity_em}\nTimeline EM Score:{timeline_em}\nOverall F1 Score: {overall_f1}\nCompleteness: {completeness}\nEntity Completness: {entity_com}\n")


Matched entities: {'Bane NOR', 'Norwegian National Rail Administration'}
Matched entities: {'REFER', 'Infrastructures of Portugal'}
Matched entities: {'Bane NOR', 'Norwegian National Rail Administration'}
Matched entities: {'Bane NOR', 'Norwegian National Rail Administration'}
Matched entities: {'REFER', 'Infrastructures of Portugal'}
Matched entities: {'Bane NOR', 'Norwegian National Rail Administration'}
File: /Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_base_finetund/prediction_fine_tuned.json
Entity EM Score: 0.0056022408963585435
Timeline EM Score:0.0
Overall F1 Score: 0.5612880185216205
Completeness: 0.004025313829235397
Entity Completness: 0.049164480607057644

File: /Users/wangchenwei/Downloads/TLQA-main/predictions/predictions_base_finetund/f5base_finetuned_3shot_answers.json
Entity EM Score: 0.0
Timeline EM Score:0
Overall F1 Score: 0.5581449488479628
Completeness: 0.00046685340802987864
Entity Completness: 0.028454715219421098

Matched entities: {'Liberal P

In [162]:
files_rag_top5 = [
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_top5_flanxl/predictions_top5_RAG_fewshot3_flanXl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_top5_flanxl/predictions_top5_RAG_fewshot5_flanXl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_top5_flanxl/predictions_top5_RAG_fewshot7_flanXl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_top5_flanxl/predictions_top5_RAG_fewshot10_flanXl.json"
]

for file in files_rag_top5:
    data = load_predictions(file)
    predictions = parse_predictions(data['info'])
    overall_f1, entity_em, timeline_em, completeness,entity_com = calculate_scores(predictions, true_answers)
    print(f"File: {file}\nEntity EM Score: {entity_em}\nTimeline EM Score:{timeline_em}\nOverall F1 Score: {overall_f1}\nCompleteness: {completeness}\nEntity Completness: {entity_com}\n")


Matched entities: {'Democratic Labour Party', 'Australian Conservatives'}
Matched entities: {'REFER', 'Infrastructures of Portugal'}
Matched entities: {'Bane NOR', 'Norwegian National Rail Administration'}
Matched entities: {'Berkshire Hathaway', 'Lee Enterprises'}
Matched entities: {'Bane NOR', 'Norwegian National Rail Administration'}
Matched entities: {'New Centre-Right', 'The People of Freedom'}
Matched entities: {'YSR Congress Party', 'Indian National Congress'}
Matched entities: {'REFER', 'Infrastructures of Portugal'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_top5_flanxl/predictions_top5_RAG_fewshot3_flanXl.json
Entity EM Score: 0.007469654528478058
Timeline EM Score:0.5
Overall F1 Score: 0.2260824493249519
Completeness: 0.00575785869903517
Entity Completness: 0.027510263364605096

Matched entities: {'REFER', 'Infrastructures of Portugal'}
Matched entities: {'Bane NOR', 'Norwegian National Rail Administration'}
Matched entities: {'Berkshire 

In [163]:
files_rag_topk = [
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanxl/predictions_top3_RAG_flanXl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanxl/predictions_top5_RAG_flanXl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanxl/predictions_top7_RAG_flanXl.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanxl/predictions_top10_RAG_flanXl.json"
]

for file in files_rag_topk:
    data = load_predictions(file)
    predictions = parse_predictions(data['info'])
    overall_f1, entity_em, timeline_em, completeness,entity_com = calculate_scores(predictions, true_answers)
    print(f"File: {file}\nEntity EM Score: {entity_em}\nTimeline EM Score:{timeline_em}\nOverall F1 Score: {overall_f1}\nCompleteness: {completeness}\nEntity Completness: {entity_com}\n")

    

Matched entities: {'Democratic Labour Party', 'Australian Conservatives'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanxl/predictions_top3_RAG_flanXl.json
Entity EM Score: 0.0009337068160597573
Timeline EM Score:0.0
Overall F1 Score: 0.14997565146131525
Completeness: 0.0
Entity Completness: 0.01151571739807034

Matched entities: {'Democratic Labour Party', 'Australian Conservatives'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanxl/predictions_top5_RAG_flanXl.json
Entity EM Score: 0.0009337068160597573
Timeline EM Score:0.0
Overall F1 Score: 0.15857213496036798
Completeness: 0.0
Entity Completness: 0.01027077497665733

Matched entities: {'Democratic Labour Party', 'Australian Conservatives'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanxl/predictions_top7_RAG_flanXl.json
Entity EM Score: 0.0009337068160597573
Timeline EM Score:0.0
Overall F1 Score: 0.162376

In [164]:
files_rag_topk_large = [
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanlarge/predictions_top3_RAG_flanLarge.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanlarge/predictions_top5_RAG_flanLarge.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanlarge/predictions_top7_RAG_flanLarge.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanlarge/predictions_top10_RAG_flanLarge.json"
]

for file in files_rag_topk_large:
    data = load_predictions(file)
    predictions = parse_predictions(data['info'])
    overall_f1, entity_em, timeline_em, completeness,entity_com = calculate_scores(predictions, true_answers)
    print(f"File: {file}\nEntity EM Score: {entity_em}\nTimeline EM Score:{timeline_em}\nOverall F1 Score: {overall_f1}\nCompleteness: {completeness}\nEntity Completness: {entity_com}\n")


Matched entities: {'Democratic Labour Party', 'Australian Conservatives'}
Matched entities: {'Dialogue for Hungary', 'Politics Can Be Different'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanlarge/predictions_top3_RAG_flanLarge.json
Entity EM Score: 0.0018674136321195146
Timeline EM Score:0.0
Overall F1 Score: 0.14983048044345995
Completeness: 0.00023342670401493932
Entity Completness: 0.013616557734204796

Matched entities: {'Democratic Labour Party', 'Australian Conservatives'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_flanlarge/predictions_top5_RAG_flanLarge.json
Entity EM Score: 0.0009337068160597573
Timeline EM Score:0.0
Overall F1 Score: 0.15778007683773246
Completeness: 0.00023342670401493932
Entity Completness: 0.013305322128851542

Matched entities: {'Democratic Labour Party', 'Australian Conservatives'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_topk_fla

In [165]:
files_rag_topk_base = [
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_fintuned_base/predictions_finetuned_top3_RAG_flanBase.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_fintuned_base/predictions_finetuned_top5_RAG_flanBase.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_fintuned_base/predictions_finetuned_top7_RAG_flanBase.json",
    "/Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_fintuned_base/predictions_finetuned_top10_RAG_flanBase.json"
]

for file in files_rag_topk_base:
    data = load_predictions(file)
    predictions = parse_predictions(data['info'])
    overall_f1, entity_em, timeline_em, completeness,entity_com = calculate_scores(predictions, true_answers)
    print(f"File: {file}\nEntity EM Score: {entity_em}\nTimeline EM Score:{timeline_em}\nOverall F1 Score: {overall_f1}\nCompleteness: {completeness}\nEntity Completness: {entity_com}\n")


Matched entities: {'University of Southern Denmark'}
Matched entities: {'Urmas Klaas'}
Matched entities: {'Liberal Party of Australia'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_fintuned_base/predictions_finetuned_top3_RAG_flanBase.json
Entity EM Score: 0.0028011204481792717
Timeline EM Score:0.0
Overall F1 Score: 0.6083508218601764
Completeness: 0.00018674136321195146
Entity Completness: 0.09320765343174302

Matched entities: {'University of Southern Denmark'}
Matched entities: {'Urmas Klaas'}
Matched entities: {'Liberal Party of Australia'}
File: /Users/wangchenwei/Downloads/TLQA-main/RAG/predictions_result/predictions_fintuned_base/predictions_finetuned_top5_RAG_flanBase.json
Entity EM Score: 0.0028011204481792717
Timeline EM Score:0.0
Overall F1 Score: 0.6128286798317496
Completeness: 0.00018674136321195146
Entity Completness: 0.10093518889037088

Matched entities: {'University of Southern Denmark'}
Matched entities: {'Urmas Klaas'}
Matched ent