In [115]:
# read in the predictions and test data from json files
# read the json files D:\Study\q4\NLP\Timeline-based-List-Question-Answering\data\test_TLQA.json
from datasets import load_dataset
import json
import os
import sys
import numpy as np
import pandas as pd

def read_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
import re

test_data = load_dataset('json', data_files='data/test_TLQA.json')
# predictions = load_dataset('json', data_files='mid-output-update/results/onlyQA_predictions-FlanT5-large-k=10.json')

predictions = load_dataset('json', data_files='mid-output-update/results/onlyQA_predictions-FlanT5-xl-k=10.json')



Generating train split: 1071 examples [00:00, 89685.94 examples/s]


In [116]:
# print example predictions data and test data

def print_sample_data(dataset, num_samples=1):
    print(dataset['train'][:num_samples])
    print('\n')

print_sample_data(test_data, num_samples=1)
print_sample_data(predictions, num_samples=1)

print(type(test_data))

{'text': ['Birmingham City F.C.']}


{'text': ['Answer: Manchester City F.C.']}


<class 'datasets.dataset_dict.DatasetDict'>


In [117]:
print(test_data['train'][0].keys())

print(test_data)

dict_keys(['text'])
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1071
    })
})


In [118]:
# compute the entity match
# Define metric functions
def entity_match(predictions, references):
    def extract_entity(text):
        # Remove "Answer:" prefix if present
        if text.lower().startswith("answer:"):
            text = text[len("answer:"):].strip()
        # Extract the entity part before any timeline information
        entity = re.sub(r'\s*\(.*?\)', '', text).strip().lower()
        return entity

    pred_entities = [extract_entity(pred) for pred in predictions]
    ref_entities = [extract_entity(ref) for ref in references]
    
    # Convert lists to sets
    pred_entities_set = set(pred_entities)
    ref_entities_set = set(ref_entities)
    
    # Calculate matches
    matches = len(pred_entities_set & ref_entities_set)  # Intersection of sets
    
    return matches / len(ref_entities_set) if ref_entities_set else 0

test_references = [entry['text'] for entry in test_data['train']]
predictions = [entry['text'] for entry in predictions['train']]




In [119]:
# 
def timeline_match(predictions, references):
    def extract_years(text):
        matches = re.findall(r'\d{4}', text)
        if matches:
            return set(map(int, matches))
        return set()

    match_scores = []

    for pred, ref in zip(predictions, references):
        ref_years = extract_years(ref)
        pred_years = extract_years(pred)

        matches = len(pred_years & ref_years)
        match_scores.append(matches / len(ref_years) if ref_years else 0)

    return sum(match_scores) / len(match_scores) if match_scores else 0




In [120]:
def f1_metric(predictions, references):
    def compute_f1(pred_list, ref_list):
        pred_tokens = set(pred_list)
        ref_tokens = set(ref_list)

        if not ref_tokens:
            return 0.0

        true_positives = len(pred_tokens & ref_tokens)
        precision = true_positives / len(pred_tokens) if pred_tokens else 0
        recall = true_positives / len(ref_tokens) if ref_tokens else 0

        if precision + recall == 0:
            return 0.0

        return 2 * (precision * recall) / (precision + recall)

    f1_scores = []

    for pred, ref in zip(predictions, references):
        # Flatten the lists for token-level comparison
        all_preds = [token for token in re.findall(r'\w+', pred)]
        all_refs = [token for token in re.findall(r'\w+', ref)]
        f1_scores.append(compute_f1(all_preds, all_refs))

    return sum(f1_scores) / len(f1_scores) if f1_scores else 0




In [121]:

def extract_years(text):
    matches = re.findall(r'\d{4}', text)
    if matches:
        years = list(map(int, matches))
        return min(years), max(years) 
    return None, None


def time_metric(predictions, references):
    time_diffs = []

    for pred, ref in zip(predictions, references):
        pred_start, pred_end = extract_years(pred)
        ref_start, ref_end = extract_years(ref)

        # Debugging prints
        # print(f"Prediction: {pred}, Extracted Years: ({pred_start}, {pred_end})")
        # print(f"Reference: {ref}, Extracted Years: ({ref_start}, {ref_end})")

        if pred_start is not None and ref_start is not None:
            time_diff = abs(pred_start - ref_start) + abs(pred_end - ref_end)
            time_diffs.append(time_diff)
        else:
            time_diffs.append(float('inf'))  

    valid_diffs = [diff for diff in time_diffs if diff != float('inf')]
    if not valid_diffs:
        return float('inf') 
    return sum(valid_diffs) / len(valid_diffs)  



In [122]:
def completeness(predictions, references):
    scores = []
    
    for pred_list, ref_list in zip(predictions, references):
        pred_items = set([item.strip().lower() for pred in pred_list for item in pred.split(", ")])
        ref_items = set([item.strip().lower() for ref in ref_list for item in ref.split(", ")])
        
        correct_count = len(pred_items.intersection(ref_items))
        total_count = len(ref_items)
        
        scores.append(correct_count / total_count if total_count > 0 else 0)
    return sum(scores) / len(scores)

In [123]:
entity_match_score = entity_match(predictions, test_references)
print(f"Entity Match Score: {entity_match_score}")
timeline_match_score = timeline_match(predictions, test_references)
print(f"Timeline Match Score: {timeline_match_score}")
f1_score_value = f1_metric(predictions, test_references)
print(f"F1 Score: {f1_score_value}")
time_metric_value = time_metric(predictions, test_references)
print(f"Time Metric: {time_metric_value}")
completeness_score = completeness(predictions, test_references)
print(f"Completeness Score: {completeness_score}")

Entity Match Score: 0.07113543091655267
Timeline Match Score: 0.2217353944944981
F1 Score: 0.16831609640270248
Time Metric: 6.419064748201439
Completeness Score: 0.5779583311043701
