## Prediction

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import json
import requests
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Load the test dataset
test_dataset = load_dataset('json', data_files='./data/test_TLQA.json')['train']

In [3]:
# Load the generative model and tokenizer
generative_model_name = './results-FlanT5-small'
generative_tokenizer = T5Tokenizer.from_pretrained(generative_model_name)
generative_model = T5ForConditionalGeneration.from_pretrained(generative_model_name)
generative_model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [4]:
def preprocess_function(examples):
    inputs = examples['question']
    targets = ["; ".join(ans) for ans in examples['answers']]  # Join list of answers with a separator
    model_inputs = generative_tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with generative_tokenizer.as_target_tokenizer():
        labels = generative_tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs['labels'] = labels['input_ids']
    model_inputs['question'] = inputs  # Keep the original question
    return model_inputs

In [5]:
# Tokenize the test dataset
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1071 [00:00<?, ? examples/s]



In [6]:
# Load the retriever model
retriever_model_name = "facebook/contriever"
retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
retriever_model = AutoModel.from_pretrained(retriever_model_name)

In [7]:

# Function to get context from Wikidata based on the type
def get_wikidata_context(wikidata_id, type_id):
    sparql_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?item ?itemLabel ?relatedItem ?relatedItemLabel WHERE {{
      wd:{wikidata_id} wdt:{type_id} ?relatedItem .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    headers = {'Accept': 'application/json'}
    response = requests.get(sparql_url, params={'query': query}, headers=headers)

    # Debugging: Print the response
    # print(f"Wikidata ID: {wikidata_id}")
    # print(f"Response: {response.text}")

    data = response.json()

    if 'results' in data and 'bindings' in data['results'] and data['results']['bindings']:
        related_items = []
        for binding in data['results']['bindings']:
            if 'relatedItemLabel' in binding:
                related_items.append(binding['relatedItemLabel']['value'])
                # print(f"Related item: {binding['relatedItemLabel']['value']}")
        return ", ".join(related_items)
    return ''

In [8]:
# Function to embed text using the retriever model
def embed_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

In [9]:
# Function to retrieve context using the retriever model
def retrieve_context(question, contexts, top_k=1):
    question_embedding = embed_text(question, retriever_tokenizer, retriever_model)
    context_embeddings = [embed_text(context, retriever_tokenizer, retriever_model) for context in contexts]
    similarities = cosine_similarity(question_embedding.detach().numpy(), np.vstack([ce.detach().numpy() for ce in context_embeddings]))
    top_k_indices = similarities[0].argsort()[-top_k:][::-1]
    return [contexts[i] for i in top_k_indices]

In [10]:
# Function to generate answers with context using the generative model
def generate_answers_with_context(question, context):
    combined_input = question + " " + context
    inputs = generative_tokenizer(combined_input, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    outputs = generative_model.generate(
        inputs['input_ids'],
        max_length=512,
        num_beams=5,
        early_stopping=True,
        repetition_penalty=2.5,
        length_penalty=1.0,
        no_repeat_ngram_size=2,
    )
    return generative_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [14]:
import requests
# Generate predictions with context
predictions = []
for example in test_dataset:
    question = example['question']
    wikidata_id = example['wikidata_ID'] 
    type_id = example['type']  
    context = get_wikidata_context(wikidata_id, type_id)
    
    relevant_context = retrieve_context(question, [context], top_k=1)
    print(f"Context: {relevant_context}")
    prediction = generate_answers_with_context(question, " ".join(relevant_context))
    predictions.append(prediction)

Context: ['Stevenage F.C., Chelsea F.C., Luton Town F.C., Crewe Alexandra F.C., Oldham Athletic A.F.C., Port Vale F.C., Southend United F.C., Wycombe Wanderers F.C.']
Context: ["President of Ukraine, Chairman of the Verkhovna Rada, Prime Minister of Ukraine, People's Deputy of Ukraine, First Deputy Prime Minister of Ukraine, Head of the Security Service of Ukraine, Secretary of National Security and Defense Council of Ukraine"]
Context: ['Socialist Party, RISE (Ireland)']
Context: ['Amiens SC']
Context: ['Andrew Furey']
Context: ['University of Melbourne, University of Sydney, University of Toyama, Universiti Teknologi MARA, King Saud University']
Context: ['Taoiseach, deputy mayor, Teachta Dála, Tánaiste, Leader of Fine Gael, Minister for Defence, Irish Minister for Health, Minister for Jobs, Enterprise and Innovation, Minister for Social Protection, Minister for Transport, Tourism and Sport']
Context: ['Sergey Yaromko']
Context: ['Newport County A.F.C., Cardiff City F.C., Crystal Pal

In [15]:
# Save predictions to a JSON file
with open('predictions-FlanT5-small-RAG.json', 'w') as f:
    json.dump(predictions, f)

## Evaluation

In [16]:
from datasets import load_dataset
import json

In [17]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
import re

# Define metric functions
def entity_match(predictions, references):
    scores = []
    for pred_list, ref_list in zip(predictions, references):
        pred_entities = set([re.sub(r'\s*\(.*?\)', '', pred.strip().lower()) for pred in pred_list])
        ref_entities = set([re.sub(r'\s*\(.*?\)', '', ref.strip().lower()) for ref in ref_list])

        # Debug prints
        # print("Pred Entities:", pred_entities)
        # print("Ref Entities:", ref_entities)

        matches = len(pred_entities & ref_entities)  # Intersection of sets
        scores.append(matches / len(ref_entities) if ref_entities else 0)
    return sum(scores) / len(scores)


def timeline_match(predictions, references):
    def extract_years(text):
        matches = re.findall(r'\d{4}', text)
        if matches:
            return set(map(int, matches))
        return set()

    scores = []
    for pred_list, ref_list in zip(predictions, references):
        match_scores = []
        for i, ref in enumerate(ref_list):
            ref_years = extract_years(ref)
            try:
                pred_years = extract_years(pred_list[i])
                # print(f"Matching prediction timeline {i + 1} with reference timeline {i + 1}")
                # print("Pred Years:", pred_years)
                # print("Ref Years:", ref_years)
                matches = len(pred_years & ref_years)
                match_scores.append(matches / len(ref_years) if ref_years else 0)
            except IndexError:
                # print(f"No prediction timeline for reference timeline {i + 1}")
                match_scores.append(0)
        scores.append(sum(match_scores) / len(match_scores) if match_scores else 0)
    return sum(scores) / len(scores)

def f1_metric(predictions, references):
    def compute_f1(pred_list, ref_list):
        pred_tokens = set(pred_list)
        ref_tokens = set(ref_list)

        if not ref_tokens:
            return 0.0

        true_positives = len(pred_tokens & ref_tokens)
        precision = true_positives / len(pred_tokens) if pred_tokens else 0
        recall = true_positives / len(ref_tokens) if ref_tokens else 0

        if precision + recall == 0:
            return 0.0

        return 2 * (precision * recall) / (precision + recall)

    f1_scores = []

    for pred_list, ref_list in zip(predictions, references):
        # Flatten the lists for token-level comparison
        all_preds = [token for pred in pred_list for token in pred.split()]
        all_refs = [token for ref in ref_list for token in ref.split()]
        f1_scores.append(compute_f1(all_preds, all_refs))

    return sum(f1_scores) / len(f1_scores) if f1_scores else 0

def extract_years(text):
    matches = re.findall(r'\d{4}', text)
    if matches:
        years = list(map(int, matches))
        return min(years), max(years)  # Return the earliest and latest years
    return None, None

def time_metric(predictions, references):
    time_diffs = []
    
    for pred_list, ref_list in zip(predictions, references):
        for pred, ref in zip(pred_list, ref_list):
            pred_start, pred_end = extract_years(pred)
            ref_start, ref_end = extract_years(ref)
            
            # print(f"Prediction: {pred}, Extracted Years: ({pred_start}, {pred_end})")  # Debugging
            # print(f"Reference: {ref}, Extracted Years: ({ref_start}, {ref_end})")  # Debugging
            
            if pred_start is not None and ref_start is not None:
                time_diff = abs(pred_start - ref_start) + abs(pred_end - ref_end)
                time_diffs.append(time_diff)
            else:
                time_diffs.append(float('inf'))  # Invalid prediction or reference

    valid_diffs = [diff for diff in time_diffs if diff != float('inf')]
    if not valid_diffs:
        return float('inf')  # If all are invalid, return inf
    return sum(valid_diffs) / len(valid_diffs) # Average time difference

def completeness(predictions, references):
    scores = []
    
    for pred_list, ref_list in zip(predictions, references):
        pred_items = set([item.strip().lower() for pred in pred_list for item in pred.split(", ")])
        ref_items = set([item.strip().lower() for ref in ref_list for item in ref.split(", ")])
        
        correct_count = len(pred_items.intersection(ref_items))
        total_count = len(ref_items)
        
        scores.append(correct_count / total_count if total_count > 0 else 0)
    return sum(scores) / len(scores)

In [18]:
# Load the test dataset
test_dataset = load_dataset('json', data_files='./data/test_TLQA.json')['train']

In [19]:
# References
references = [ans for ans in test_dataset['answers']]  # Ensure references are lists of answers

In [20]:
# Load predictions from a JSON file
with open('predictions-FlanT5-small-RAG.json', 'r') as f:
    predictions = json.load(f)

In [21]:
# Convert predictions to the required list format
predictions = [pred.split("; ") for pred in predictions]

In [22]:
# Debug: Print some predictions and references to ensure alignment
for pred, ref in zip(predictions, references):
    print("Prediction:", pred)
    print("Reference:", ref)

Prediction: ['United F.C. (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020)']
Reference: ['Southend United F.C. (2010, 2011, 2012)', 'Stevenage F.C. (2012, 2013)', 'Crewe Alexandra F.C. (2013, 2014, 2015)', 'Port Vale F.C. (2015, 2016, 2017, 2018, 2019, 2020)']
Prediction: ['President of Ukraine (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020)']
Reference: ['Prime Minister of Ukraine (2010)', 'First Deputy Prime Minister of Ukraine (2010)', "People's Deputy of Ukraine (2012, 2013, 2014, 2015)", 'Chairman of the Verkhovna Rada (2014, 2015, 2016, 2017, 2018, 2019, 2020)', 'President of Ukraine (2014)']
Prediction: ['Labour Party (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020)']
Reference: ['Socialist Party (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019)', 'RISE (Ireland) (2019, 2020)']
Prediction: ['United States national association football team (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020)']
Refere

In [23]:
# Evaluate predictions with custom metrics
entity_score = entity_match(predictions, references)
timeline_score = timeline_match(predictions, references)
f1 = f1_metric(predictions, references)
time_metric_score = time_metric(predictions, references)
completeness_score = completeness(predictions, references)

print(f"Entity Match: {entity_score}")
print(f"Timeline Match: {timeline_score}")
print(f"F1 Score: {f1}")
print(f"Time Metric: {time_metric_score}")
print(f"Completeness: {completeness_score}")

Entity Match: 0.11079746713500209
Timeline Match: 0.41833382256468027
F1 Score: 0.5699669409151066
Time Metric: 5.1786600496277915
Completeness: 0.5661654373072899
