# Step 3: Semantic related evidence retrieval

# Readme
*This notebook focusing on utilised an encoder only transformer to find 1 more evidences that are not included in top4 selection from BM25*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [23]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization, TextVectorization, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Lambda, Dot, Reshape, GlobalAveragePooling1D, Flatten
from tensorflow.keras import Model, Input, optimizers, layers, metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from tensorflow.keras import backend as K

import spacy
import string
import time
from collections import Counter
import string
import ast

Load the csv that produced by the step 2

In [24]:
train_lemmatize = pd.read_csv("data_folder/train_k05b085_bm25_top20.csv")
dev_lemmatize = pd.read_csv("data_folder/dev_k05b085_bm25_top20.csv")
test_lemmatize = pd.read_csv("data_folder/test_k05b085_bm25_top20.csv")

Load the csv that produced by the step 1

In [25]:
train = pd.read_csv("data_folder/train.csv")
evidence = pd.read_csv("data_folder/evidence.csv")
dev = pd.read_csv("data_folder/dev.csv")
test = pd.read_csv("data_folder/test.csv")

Append the top20 evidences retrieved to step 1 dataframe

In [26]:
train['prefilter_evidence'] = train_lemmatize['prefilter_evidence']
dev['prefilter_evidence'] = dev_lemmatize['prefilter_evidence']
test['prefilter_evidence'] = test_lemmatize['prefilter_evidence']

arrange the evidence list in the structure of list of integer

In [27]:
train['evidences'] = train['evidences'].apply(lambda x: [int(e.split('-')[-1]) for e in eval(x)])
dev['evidences'] = dev['evidences'].apply(lambda x: [int(e.split('-')[-1]) for e in eval(x)])

In [28]:
dev['prefilter_evidence'] = dev['prefilter_evidence'].apply(lambda x: ast.literal_eval(x))
train['prefilter_evidence'] = train['prefilter_evidence'].apply(lambda x: ast.literal_eval(x))
test['prefilter_evidence'] = test['prefilter_evidence'].apply(lambda x: ast.literal_eval(x))

# Check peformance of top 4 evidences retrieval before we add 1 more

In [29]:
def print_retrival_result(dev):
    evidence_recall_scores = []
    evidence_precision_scores = []
    evidence_fscore_scores = []
    
    # Iterate over each row in the dev DataFrame to evaluate evidence retrieval
    for index, row in dev.iterrows():
        true_evidences = set(row['evidences'])    
        predicted_evidences = set(row['prefilter_evidence'][:4])
        # Initialize counters for correct predictions
        evidence_correct = len(true_evidences & predicted_evidences)
        
        # Calculate recall, precision, and F-score
        if len(true_evidences) > 0 and len(predicted_evidences) > 0:
            evidence_recall = evidence_correct / len(true_evidences)
            evidence_precision = evidence_correct / len(predicted_evidences)
            if evidence_recall + evidence_precision > 0:
                evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall)
            else:
                evidence_fscore = 0.0
        else:
            evidence_recall = 0.0
            evidence_precision = 0.0
            evidence_fscore = 0.0
    
        # Store the scores
        evidence_recall_scores.append(evidence_recall)
        evidence_precision_scores.append(evidence_precision)
        evidence_fscore_scores.append(evidence_fscore)
        
    # Calculate mean scores across all instances
    mean_recall = np.mean(evidence_recall_scores)
    mean_precision = np.mean(evidence_precision_scores)
    mean_fscore = np.mean(evidence_fscore_scores)
    
    # Output the aggregate performance
    print(f"Average Evidence Recall    = {mean_recall}")
    print(f"Average Evidence Precision = {mean_precision}")
    print(f"Average Evidence F-Score   = {mean_fscore}")
print_retrival_result(train)
print_retrival_result(dev)

Average Evidence Recall    = 0.14605048859934852
Average Evidence Precision = 0.11197068403908794
Average Evidence F-Score   = 0.11874967685228272
Average Evidence Recall    = 0.17034632034632033
Average Evidence Precision = 0.1266233766233766
Average Evidence F-Score   = 0.13530199958771388


Define top k of evidences you want to keep from BM25 retrieval, that is the search range of the rerank model

For example, the top20 rerank result shown in the report should be change RANK_RANGE to 20

In [30]:
TOP_K_BM25 = 4
RANK_RANGE = 10

In [31]:
dev['prefilter_evidence'] = dev['prefilter_evidence'].apply(lambda x: x[:RANK_RANGE])
train['prefilter_evidence'] = train['prefilter_evidence'].apply(lambda x: x[:RANK_RANGE])
test['prefilter_evidence'] = test['prefilter_evidence'].apply(lambda x: x[:RANK_RANGE])

In [32]:
dev['predict_evidence_bm25'] = dev['prefilter_evidence'].apply(lambda x: x[:TOP_K_BM25])
train['predict_evidence_bm25'] = train['prefilter_evidence'].apply(lambda x: x[:TOP_K_BM25])
test['predict_evidence_bm25'] = test['prefilter_evidence'].apply(lambda x: x[:TOP_K_BM25])

Append the true evidences and false evidences to a new column in our search range

In [33]:
def filter_evidence(row):
    prefilter_list = row['prefilter_evidence']
    evidence_list = row['evidences']

    # Convert to set for efficient operations
    evidence_set = set(evidence_list)

    # Create prefilter true evidence list
    prefilter_true_evidence = [ev for ev in prefilter_list if ev in evidence_set]

    # Update prefilter_evidence by removing the prefilter true evidences from the first k
    # and also ensure no element from prefilter true evidence is in the top k of the updated list
    updated_prefilter_evidence = [ev for ev in prefilter_list if ev not in prefilter_true_evidence]

    return pd.Series([prefilter_true_evidence, updated_prefilter_evidence])
train[['prefilter_true_evidence', 'prefilter_false_evidence']] = train.apply(filter_evidence, axis=1)
dev[['prefilter_true_evidence', 'prefilter_false_evidence']] = dev.apply(filter_evidence, axis=1)

Only remove punctuation

In [34]:
import nltk
def remove_punctuation(input_text):
    # Lowercase the input text to standardize it
    input_text = input_text.lower()
    
    # Remove punctuation using a translation table
    translator = str.maketrans('', '', string.punctuation)
    input_text = input_text.translate(translator)
    
    # Tokenize the text into words by splitting on whitespace
    tokens = input_text.split()
    
    # Join words back into one string and return
    return ' '.join(tokens)

# Apply the function to your dataframes
train['claim_text'] = train['claim_text'].apply(remove_punctuation)
dev['claim_text'] = dev['claim_text'].apply(remove_punctuation)
test['claim_text'] = test['claim_text'].apply(remove_punctuation)
evidence['evidence_text'] = evidence['evidence_text'].apply(remove_punctuation)

For each claim, we will concat it with each top k evidences one by one, and we will give a label 0 or 1 to each concatnated text

In [35]:
# Process each row to create new DataFrame entries
def create_new_df(df, train = True):    
    new_data = {
        'concatenated_text': [],
        'evidence_label': [],
        'instance_id': [],
        'evidence_id': [],
        'claim_label': []
    }
    
    for index, row in df.iterrows():
        # Process evidences (label 1)
        for ev_index in row['evidences'] if train else row['prefilter_true_evidence']:
            concatenated_text = "<CLS> " + row['claim_text'] + " <SEP> " + evidence.iloc[ev_index].evidence_text
            new_data['concatenated_text'].append(concatenated_text)
            new_data['evidence_label'].append(1)
            new_data['instance_id'].append(index)
            new_data['evidence_id'].append(ev_index)
            new_data['claim_label'].append(row['claim_label'])
    
        # Process prefilter_evidence (label 0)
        for ev_index in row['prefilter_false_evidence']:
            concatenated_text = "<CLS> " + row['claim_text']  + " <SEP> " + evidence.iloc[ev_index].evidence_text
            new_data['concatenated_text'].append(concatenated_text)
            new_data['evidence_label'].append(0)
            new_data['instance_id'].append(index)
            new_data['evidence_id'].append(ev_index)
            new_data['claim_label'].append('NOT_RELEVANT')
    new_df = pd.DataFrame(new_data)
    return new_df

# Create new DataFrame from processed data
train_for_retrival = create_new_df(train)
dev_for_retrival = create_new_df(dev, False)

In [36]:
def create_test_df(df):    
    new_data = {
        'concatenated_text': [],
        'instance_id': [],
        'evidence_id': []
    }
    
    for index, row in df.iterrows():   
        evidence_index = row['prefilter_evidence']
        for ev_index in evidence_index:
            concatenated_text = "<CLS> " + row['claim_text']  + " <SEP> " + evidence.iloc[ev_index].evidence_text
            new_data['concatenated_text'].append(concatenated_text)
            new_data['instance_id'].append(index)
            new_data['evidence_id'].append(ev_index)
    new_df = pd.DataFrame(new_data)
    return new_df
test_for_retrival = create_test_df(test)

In [37]:
train_for_retrival.evidence_label.value_counts()

0    11424
1     4122
Name: evidence_label, dtype: int64

Imbalance evidence label, since marjority of top evidences are fake evidences

Therefore we duplicate the true evidences data for balancing data

In [38]:
def duplicate_true(df):
    filter_label_1 = df[df['evidence_label'] == 1]
    
    # Duplicate these rows 58 times
    duplicated_rows = pd.concat([filter_label_1] * 2, ignore_index=True)
    
    # Concatenate the duplicated rows back to the original DataFrame
    new_df = pd.concat([df, duplicated_rows], ignore_index=True)
    
    return new_df
train_for_retrival_duplicate_true = duplicate_true(train_for_retrival)

In [39]:
# Define file paths
sequence_length = 96
print("Start processing!")
# Concatenate priority texts
priority_texts = pd.concat([train_for_retrival['concatenated_text'], dev_for_retrival['concatenated_text'], test_for_retrival['concatenated_text']])

# Create the TextVectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    output_mode="int",
    output_sequence_length=sequence_length
)

# Adapt the vectorization layer on priority texts first
vectorize_layer.adapt(priority_texts)
print("Vocabulary size on all claim text:", len(vectorize_layer.get_vocabulary()))
max_features = len(vectorize_layer.get_vocabulary())
# Encode texts
train_encoded = vectorize_layer(train_for_retrival_duplicate_true['concatenated_text'].to_numpy())
dev_encoded = vectorize_layer(dev_for_retrival['concatenated_text'].to_numpy())
print("Processed and encoded data.")

Start processing!
Vocabulary size on all claim text: 19141
Processed and encoded data.


In [40]:
def prepare_dataset(encoded_data, df):
    features = tf.convert_to_tensor(encoded_data)
    evidence_labels = tf.convert_to_tensor(df['evidence_label'].values, dtype=tf.float32)
    
    # Combine features and labels into a dataset
    dataset = tf.data.Dataset.from_tensor_slices((features, {"evidence_label": evidence_labels}))
    
    # Batch the dataset
    dataset = dataset.batch(32).shuffle(buffer_size=len(df))
    return dataset
train_dataset = prepare_dataset(train_encoded, train_for_retrival_duplicate_true)
dev_dataset = prepare_dataset(dev_encoded, dev_for_retrival)

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

Self implemented metrics for tracking the TP and TN rate

In [41]:
class Label1Accuracy(tf.keras.metrics.Metric):
    def __init__(self, name='label_1_acc', **kwargs):
        super(Label1Accuracy, self).__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.total_label_1 = self.add_weight(name='total', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, tf.bool)
        y_pred = tf.cast(tf.round(y_pred), tf.bool)
        values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
        self.true_positives.assign_add(tf.reduce_sum(tf.cast(values, self.dtype)))
        self.total_label_1.assign_add(tf.reduce_sum(tf.cast(y_true, self.dtype)))

    def result(self):
        return self.true_positives / (self.total_label_1 + tf.keras.backend.epsilon())

    def reset_states(self):
        self.true_positives.assign(0)
        self.total_label_1.assign(0)

class Label0Accuracy(tf.keras.metrics.Metric):
    def __init__(self, name='label_0_acc', **kwargs):
        super(Label0Accuracy, self).__init__(name=name, **kwargs)
        self.true_negatives = self.add_weight(name='tn', initializer='zeros')
        self.total_label_0 = self.add_weight(name='total', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, tf.bool)
        y_pred = tf.cast(tf.round(y_pred), tf.bool)
        values = tf.logical_and(tf.equal(y_true, False), tf.equal(y_pred, False))
        self.true_negatives.assign_add(tf.reduce_sum(tf.cast(values, self.dtype)))
        self.total_label_0.assign_add(tf.reduce_sum(tf.cast(tf.logical_not(y_true), self.dtype)))

    def result(self):
        return self.true_negatives / (self.total_label_0 + tf.keras.backend.epsilon())

    def reset_states(self):
        self.true_negatives.assign(0)
        self.total_label_0.assign(0)

In [92]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)  # Pass any extra arguments to the superclass
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate
        })
        return config

def create_embedding_model(sequence_length, max_features, num_transformer_blocks, embedding_dim, num_heads, ff_dim, rate):
    input_layer = Input(shape=(sequence_length,), dtype="int64")
    
    # Embedding layer with positional encoding
    embedding_layer = Embedding(max_features, embedding_dim)
    x = embedding_layer(input_layer)
    # Adding positional encoding
    
    position_embedding = Embedding(input_dim=sequence_length, output_dim=embedding_dim)
    positions = tf.range(start=0, limit=sequence_length, delta=1)
    x += position_embedding(positions)
    
    # Transformer blocks
    for _ in range(num_transformer_blocks):
        x = TransformerBlock(embedding_dim, num_heads, ff_dim, rate)(x)
    x = GlobalMaxPooling1D()(x)
    x = Dropout(0.1)(x)
    # Dense layers and output
    evidence_output = Dense(1, activation='sigmoid', name="evidence_label")(x)
    
    # Compile model
    model = Model(inputs=input_layer, outputs=evidence_output)
    return model

# Initialize and compile the model as before
embedding_model = create_embedding_model(sequence_length=sequence_length, max_features=max_features, num_transformer_blocks = 2,
                                         embedding_dim = 1024, num_heads=4, ff_dim=128, rate=0.1)
embedding_model.summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 96)]              0         
_________________________________________________________________
embedding_16 (Embedding)     (None, 96, 1024)          19600384  
_________________________________________________________________
tf.__operators__.add_8 (TFOp (None, 96, 1024)          0         
_________________________________________________________________
transformer_block_16 (Transf (None, 96, 1024)          17057920  
_________________________________________________________________
transformer_block_17 (Transf (None, 96, 1024)          17057920  
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 1024)              0         
_________________________________________________________________
dropout_44 (Dropout)         (None, 1024)              0   

In [93]:
embedding_model.compile(
    optimizer=optimizers.Adam(1e-5),
    loss={
        'evidence_label': 'binary_crossentropy'
    },
    metrics={
        'evidence_label': ['accuracy', Label1Accuracy(), Label0Accuracy()]
    }
)

Typically 10-20 epochs could get the following result

In [94]:
embedding_model.fit(
    train_dataset,
    validation_data=dev_dataset,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x168459fc160>

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

# Valid actual performence of dev data

In [100]:
def attached_transformer_prediction(df, df_retrival, embedding_model, text_encoded, top_add_k=5, top_remove_k = 5):
    # Predict the probabilities for each text
    prob_predictions = embedding_model.predict(text_encoded)
    df_retrival['predicted_label'] = prob_predictions.flatten()
    
    # Get the top k evidence by probability for each instance
    top_true_evidences = df_retrival.sort_values(by=['instance_id', 'predicted_label'], ascending=[True, False])
    top_true_evidences = top_true_evidences.groupby('instance_id').head(top_add_k)

    top_false_evidences = df_retrival.sort_values(by=['instance_id', 'predicted_label'], ascending=[True, True])
    top_false_evidences = top_false_evidences.groupby('instance_id').head(top_remove_k)
    
    # Aggregate the top k evidences and their probabilities into lists
    aggregated_add_evidences = top_true_evidences.groupby('instance_id')['evidence_id'].apply(list).reset_index(name='predicted_add_evidence_transformer')
    aggregated_remove_evidences = top_false_evidences.groupby('instance_id')['evidence_id'].apply(list).reset_index(name='predicted_rm_evidence_transformer')
    
    # Merge the aggregated data back into the original DataFrame using the index (instance_id)
    df = df.merge(aggregated_add_evidences, how='left', left_index=True, right_on='instance_id')
    df = df.merge(aggregated_remove_evidences, how='left', left_index=True, right_on='instance_id')

    # Cleanup the DataFrame to remove any temporary columns or duplicate index columns
    df.drop(columns=['instance_id_x', 'instance_id_y'], errors='ignore', inplace=True)
    
    return df

In [101]:
test_with_result = attached_transformer_prediction(test, test_for_retrival , embedding_model, 
                                                   vectorize_layer(test_for_retrival['concatenated_text'].to_numpy()))
dev_with_result = attached_transformer_prediction(dev, dev_for_retrival , embedding_model, 
                                                   dev_encoded)
train_with_result = attached_transformer_prediction(train, train_for_retrival , embedding_model, 
                                                   vectorize_layer(train_for_retrival['concatenated_text'].to_numpy()))

top 4 fully rerank by transformer

In [102]:
evidence_recall_scores = []
evidence_precision_scores = []
evidence_fscore_scores = []

# Iterate over each row in the dev DataFrame to evaluate evidence retrieval
for index, row in dev_with_result.iterrows():
    true_evidences = set(row['evidences'])
    # Create copies of the lists to avoid modifying the original DataFrame
    # Combine the filtered transformer evidences with BM25 predictions
    predicted_evidences = []
    for ev in row['predicted_add_evidence_transformer'][:4]:
        if ev not in predicted_evidences:
            predicted_evidences.append(ev)
    # Initialize counters for correct predictions
    evidence_correct = len(true_evidences & set(predicted_evidences))
    
    # Calculate recall, precision, and F-score
    if len(true_evidences) > 0 and len(predicted_evidences) > 0:
        evidence_recall = evidence_correct / len(true_evidences)
        evidence_precision = evidence_correct / len(predicted_evidences)
        if evidence_recall + evidence_precision > 0:
            evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall)
        else:
            evidence_fscore = 0.0
    else:
        evidence_recall = 0.0
        evidence_precision = 0.0
        evidence_fscore = 0.0

    # Store the scores
    evidence_recall_scores.append(evidence_recall)
    evidence_precision_scores.append(evidence_precision)
    evidence_fscore_scores.append(evidence_fscore)
    
# Calculate mean scores across all instances
mean_recall = np.mean(evidence_recall_scores)
mean_precision = np.mean(evidence_precision_scores)
mean_fscore = np.mean(evidence_fscore_scores)

# Output the aggregate performance
print(f"Average Evidence Recall    = {mean_recall}")
print(f"Average Evidence Precision = {mean_precision}")
print(f"Average Evidence F-Score   = {mean_fscore}")

Average Evidence Recall    = 0.14653679653679655
Average Evidence Precision = 0.1038961038961039
Average Evidence F-Score   = 0.11231704803133374


top 1 rerank model recall

In [98]:
evidence_recall_scores = []
evidence_precision_scores = []
evidence_fscore_scores = []

# Iterate over each row in the dev DataFrame to evaluate evidence retrieval
for index, row in dev_with_result.iterrows():
    true_evidences = set(row['evidences'])
    # Create copies of the lists to avoid modifying the original DataFrame
    # Combine the filtered transformer evidences with BM25 predictions
    predicted_evidences = []
    for ev in row['predicted_add_evidence_transformer'][:1]:
        if ev not in predicted_evidences:
            predicted_evidences.append(ev)
    # Initialize counters for correct predictions
    evidence_correct = len(true_evidences & set(predicted_evidences))
    
    # Calculate recall, precision, and F-score
    if len(true_evidences) > 0 and len(predicted_evidences) > 0:
        evidence_recall = evidence_correct / len(true_evidences)
        evidence_precision = evidence_correct / len(predicted_evidences)
        if evidence_recall + evidence_precision > 0:
            evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall)
        else:
            evidence_fscore = 0.0
    else:
        evidence_recall = 0.0
        evidence_precision = 0.0
        evidence_fscore = 0.0

    # Store the scores
    evidence_recall_scores.append(evidence_recall)
    evidence_precision_scores.append(evidence_precision)
    evidence_fscore_scores.append(evidence_fscore)
    
# Calculate mean scores across all instances
mean_recall = np.mean(evidence_recall_scores)
mean_precision = np.mean(evidence_precision_scores)
mean_fscore = np.mean(evidence_fscore_scores)

# Output the aggregate performance
print(f"Average Evidence Recall    = {mean_recall}")
print(f"Average Evidence Precision = {mean_precision}")
print(f"Average Evidence F-Score   = {mean_fscore}")

Average Evidence Recall    = 0.039718614718614716
Average Evidence Precision = 0.12337662337662338
Average Evidence F-Score   = 0.05670995670995672


# Final ensemble result of top 4 BM25 and top 1 rerank model

In [462]:
evidence_recall_scores = []
evidence_precision_scores = []
evidence_fscore_scores = []

# Iterate over each row in the dev DataFrame to evaluate evidence retrieval
for index, row in dev_with_result.iterrows():
    true_evidences = set(row['evidences'])
    # Create copies of the lists to avoid modifying the original DataFrame
    # Combine the filtered transformer evidences with BM25 predictions
    predicted_evidences = []
    for ev in row['predict_evidence_bm25']:
        #if ev not in row['predicted_rm_evidence_transformer'][:1]:
        predicted_evidences.append(ev)
    for ev in row['predicted_add_evidence_transformer'][:1]:
        if ev not in predicted_evidences:
            predicted_evidences.append(ev)
    # Initialize counters for correct predictions
    evidence_correct = len(true_evidences & set(predicted_evidences))
    
    # Calculate recall, precision, and F-score
    if len(true_evidences) > 0 and len(predicted_evidences) > 0:
        evidence_recall = evidence_correct / len(true_evidences)
        evidence_precision = evidence_correct / len(predicted_evidences)
        if evidence_recall + evidence_precision > 0:
            evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall)
        else:
            evidence_fscore = 0.0
    else:
        evidence_recall = 0.0
        evidence_precision = 0.0
        evidence_fscore = 0.0

    # Store the scores
    evidence_recall_scores.append(evidence_recall)
    evidence_precision_scores.append(evidence_precision)
    evidence_fscore_scores.append(evidence_fscore)
    
# Calculate mean scores across all instances
mean_recall = np.mean(evidence_recall_scores)
mean_precision = np.mean(evidence_precision_scores)
mean_fscore = np.mean(evidence_fscore_scores)

# Output the aggregate performance
print(f"Average Evidence Recall    = {mean_recall}")
print(f"Average Evidence Precision = {mean_precision}")
print(f"Average Evidence F-Score   = {mean_fscore}")

Average Evidence Recall    = 0.20086580086580083
Average Evidence Precision = 0.13084415584415585
Average Evidence F-Score   = 0.1473768295196867


The F-score imporve from 0.135 to 0.147

# The step 4 notebook will use the data that contains 1 more evidences added by transformer

In [457]:
train_with_result.to_csv("train_retrival_result_with_transformer.csv")
dev_with_result.to_csv("dev_retrival_result_with_transformer.csv")
test_with_result.to_csv("test_retrival_result_with_transformer.csv")

In [None]:
# Create test file