# Step 2: BM25 top 20 evidences selection

# Readme
*This notebook focusing on the initial top 20 evidences ranking by BM25, and output the data that contains the top 20 evidence for each claim*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import ast
import spacy
import string
import time
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

Please modify the data path that produced from the step 1

In [2]:
train = pd.read_csv("data_folder/train.csv")
evidence = pd.read_csv("data_folder/evidence.csv")
dev = pd.read_csv("data_folder/dev.csv")
test = pd.read_csv("data_folder/test.csv")

# Stop word and punct removal

In [3]:
# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

# Prepare a set of punctuation marks for removal
punctuations = string.punctuation

def spacy_lemmatize_text(input_text):
    # Remove punctuation and convert to lowercase
    translator = str.maketrans('', '', string.punctuation)
    input_text = input_text.lower().translate(translator)
    
    # Process text using SpaCy
    doc = nlp(input_text)
    
    # Lemmatize text and remove stopwords, punctuation, specified POS tags
    lemmatized_tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    # Join words back into one string
    return ' '.join(lemmatized_tokens)

# Example usage for preprocessing and timing
import time

def process_and_time(df, column_name):
    start_time = time.time()
    df[column_name] = df[column_name].apply(spacy_lemmatize_text)
    print(f"{column_name.capitalize()} lemmatize finished in {time.time() - start_time:.2f} seconds")

In [4]:
# Applying to each DataFrame
process_and_time(dev, 'claim_text')
process_and_time(test, 'claim_text')
process_and_time(train, 'claim_text')
process_and_time(evidence, 'evidence_text')

Claim_text lemmatize finished in 0.70 seconds
Claim_text lemmatize finished in 0.54 seconds
Claim_text lemmatize finished in 3.94 seconds
Evidence_text lemmatize finished in 3548.07 seconds


In [5]:
# rearrange the evidences into a list of integer
train['evidences'] = train['evidences'].apply(lambda x: [int(e.split('-')[-1]) for e in eval(x)])
dev['evidences'] = dev['evidences'].apply(lambda x: [int(e.split('-')[-1]) for e in eval(x)])

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

# Search the most similar 20 evidences for each claim by BM25

In [6]:
def tokenize_evidence(evidence_df):
    start_time = time.time()
    evidence_df['evidence_text'] = evidence_df['evidence_text'].astype(str)
    tokenized_evidence = [word_tokenize(doc.lower()) if doc != 'nan' else [] for doc in evidence_df['evidence_text']]
    print(f"Tokenized evidence finished in {time.time() - start_time:.2f} seconds")
    return tokenized_evidence

def prefilter_top_evidence_with_bm25(df, tokenized_evidence, top_k=20, k1=1.2, b=0.75):
    start_time = time.time()
    # Initialize BM25 with dynamic k1 and b values
    bm25 = BM25Okapi(tokenized_evidence, k1=k1, b=b)
    
    # Tokenize the claim texts and query the BM25 model
    df['prefilter_evidence'] = df['claim_text'].apply(lambda x: query_bm25(x, bm25, top_k))
    print(f"Retrieved top {top_k} evidence finished in {time.time() - start_time:.2f} seconds")
    return df

def query_bm25(query, bm25, top_k):
    # Tokenize the query
    tokenized_query = word_tokenize(query.lower())
    
    # Get scores and sort by scores
    doc_scores = bm25.get_scores(tokenized_query)
    return sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:top_k]

# The evaluation function is directly retrieved from the eval.py 
def print_retrieval_result(df, top_k):
    evidence_recall_scores = []
    evidence_precision_scores = []
    evidence_fscore_scores = []
    
    # Evaluate evidence retrieval
    for index, row in df.iterrows():
        true_evidences = set(row['evidences'])
        predicted_evidences = set(row['prefilter_evidence'][:top_k])
        
        evidence_correct = len(true_evidences & predicted_evidences)
        
        # Calculate recall, precision, and F-score
        evidence_recall = evidence_correct / len(true_evidences) if true_evidences else 0
        evidence_precision = evidence_correct / len(predicted_evidences) if predicted_evidences else 0
        evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall) if (evidence_recall + evidence_precision) > 0 else 0
        
        # Store the scores
        evidence_recall_scores.append(evidence_recall)
        evidence_precision_scores.append(evidence_precision)
        evidence_fscore_scores.append(evidence_fscore)
    
    # Print mean scores
    print(f"Mean Recall: {np.mean(evidence_recall_scores)}|, Mean Precision: {np.mean(evidence_precision_scores)}, Mean F-Score: {np.mean(evidence_fscore_scores)}")

In [7]:
tokenized_evidence_lemmatize = tokenize_evidence(evidence)  # Tokenize once outside the loop

Tokenized evidence finished in 56.30 seconds


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

Testing the default parameter

In [8]:
k = 1.2
b = 0.75
print(f"Testing k={k}, b={b}")
train_df = prefilter_top_evidence_with_bm25(train, tokenized_evidence_lemmatize, top_k=20, k1=k, b=b)
dev_df = prefilter_top_evidence_with_bm25(dev, tokenized_evidence_lemmatize, top_k=20, k1=k, b=b)

print(f"Result of top 20 performance of train: ")
print_retrieval_result(train_df, top_k=20)  # For top 20
print(f"Result of top 3 performance of train: ")
print_retrieval_result(train_df, top_k=3)   # For top 3

print(f"Result of top 20 performance of dev: ")
print_retrieval_result(dev_df, top_k=20)  # For top 20
print(f"Result of top 3 performance of dev: ")
print_retrieval_result(dev_df, top_k=3)   # For top 3

Testing k=1.2, b=0.75
Retrieved top 20 evidence finished in 2407.07 seconds
Retrieved top 20 evidence finished in 328.27 seconds
Result of top 20 performance of train: 
Mean Recall: 0.2607084690553746|, Mean Precision: 0.041164495114006515, Mean F-Score: 0.06922177088831954
Result of top 3 performance of train: 
Mean Recall: 0.11400651465798045|, Mean Precision: 0.11726384364820845, Mean F-Score: 0.10818985574685898
Result of top 20 performance of dev: 
Mean Recall: 0.32283549783549786|, Mean Precision: 0.04642857142857143, Mean F-Score: 0.07873704781948714
Result of top 3 performance of dev: 
Mean Recall: 0.14004329004329003|, Mean Precision: 0.12987012987012986, Mean F-Score: 0.12422696351267778


# hyperparameter search
*In BM25, higher K1 means higher weighted on term frequency, higher b means higher weighted on sentence length*

In [10]:
k_values = [0.6, 0.7, 0.8, 0.9]
b_values = [0.6, 0.675, 0.75, 0.825, 0.9]

for k in k_values:
    for b in b_values:
        if k == 0.6:
            if b ==0.6 or b == 0.675:
                continue
        print(f"Testing k={k}, b={b}")
        train_df = prefilter_top_evidence_with_bm25(train, tokenized_evidence_lemmatize, top_k=20, k1=k, b=b)
        dev_df = prefilter_top_evidence_with_bm25(dev, tokenized_evidence_lemmatize, top_k=20, k1=k, b=b)
        
        print(f"Result of top 20 performance of train: ")
        print_retrieval_result(train_df, top_k=20)  # For top 20
        print(f"Result of top 3 performance of train: ")
        print_retrieval_result(train_df, top_k=3)   # For top 3

        print(f"Result of top 20 performance of dev: ")
        print_retrieval_result(dev_df, top_k=20)  # For top 20
        print(f"Result of top 3 performance of dev: ")
        print_retrieval_result(dev_df, top_k=3)   # For top 3

Testing k=0.6, b=0.75
Retrieved top 20 evidence finished in 1989.80 seconds
Retrieved top 20 evidence finished in 264.71 seconds
Result of top 20 performance of train: 
Mean Recall: 0.28498914223669924, Mean Precision: 0.04466612377850163, Mean F-Score: 0.07514823820375442
Result of top 3 performance of train: 
Mean Recall: 0.12935667752442995, Mean Precision: 0.13083604777415853, Mean F-Score: 0.12135877152163796
Result of top 20 performance of dev: 
Mean Recall: 0.3584415584415584, Mean Precision: 0.05097402597402597, Mean F-Score: 0.08648700443166847
Result of top 3 performance of dev: 
Mean Recall: 0.1448051948051948, Mean Precision: 0.13419913419913418, Mean F-Score: 0.1283395176252319
Testing k=0.6, b=0.825
Retrieved top 20 evidence finished in 1989.44 seconds
Retrieved top 20 evidence finished in 263.99 seconds
Result of top 20 performance of train: 
Mean Recall: 0.2824104234527687, Mean Precision: 0.04429967426710098, Mean F-Score: 0.07452929664868485
Result of top 3 performanc

*By Oberseved result, we suspect k1 = 0.5, and b = 0.85 could compromise the result between train and dev*

In [12]:
k = 0.5
b = 0.85
print(f"Testing k={k}, b={b}")
train_df = prefilter_top_evidence_with_bm25(train, tokenized_evidence_lemmatize, top_k=20, k1=k, b=b)
dev_df = prefilter_top_evidence_with_bm25(dev, tokenized_evidence_lemmatize, top_k=20, k1=k, b=b)
test_df = prefilter_top_evidence_with_bm25(test, tokenized_evidence_lemmatize, top_k=20, k1=k, b=b)
print(f"Result of top 20 performance of train: ")
print_retrieval_result(train_df, top_k=20)  # For top 20
print(f"Result of top 3 performance of train: ")
print_retrieval_result(train_df, top_k=3)   # For top 3

print(f"Result of top 20 performance of dev: ")
print_retrieval_result(dev_df, top_k=20)  # For top 20
print(f"Result of top 3 performance of dev: ")
print_retrieval_result(dev_df, top_k=3)   # For top 3

Testing k=0.5, b=0.85
Retrieved top 20 evidence finished in 2473.02 seconds
Retrieved top 20 evidence finished in 347.93 seconds
Retrieved top 20 evidence finished in 320.24 seconds
Result of top 20 performance of train: 
Mean Recall: 0.28584419109663406, Mean Precision: 0.044828990228013024, Mean F-Score: 0.07541499370666628
Result of top 3 performance of train: 
Mean Recall: 0.1294516829533116, Mean Precision: 0.1305646036916395, Mean F-Score: 0.12113967736931906
Result of top 20 performance of dev: 
Mean Recall: 0.3617965367965368, Mean Precision: 0.05194805194805195, Mean F-Score: 0.08802191879266978
Result of top 3 performance of dev: 
Mean Recall: 0.14642857142857144, Mean Precision: 0.13636363636363635, Mean F-Score: 0.13019480519480517


The reason we only keep top 20

In [10]:
k = 1.2
b = 0.75
print(f"Testing k={k}, b={b}")
train_df = prefilter_top_evidence_with_bm25(train, tokenized_evidence_lemmatize, top_k=100, k1=k, b=b)
dev_df = prefilter_top_evidence_with_bm25(dev, tokenized_evidence_lemmatize, top_k=100, k1=k, b=b)
test_df = prefilter_top_evidence_with_bm25(test, tokenized_evidence_lemmatize, top_k=100, k1=k, b=b)
print(f"Result of top 100 performance of train: ")
print_retrieval_result(train_df, top_k=100)  # For top 100
print(f"Result of top 20 performance of train: ")
print_retrieval_result(train_df, top_k=20)   # For top 20

print(f"Result of top 100 performance of dev: ")
print_retrieval_result(dev_df, top_k=100)  # For top 100
print(f"Result of top 20 performance of dev: ")
print_retrieval_result(dev_df, top_k=20)   # For top 20

k = 0.5
b = 0.85
print(f"Testing k={k}, b={b}")
train_df = prefilter_top_evidence_with_bm25(train, tokenized_evidence_lemmatize, top_k=100, k1=k, b=b)
dev_df = prefilter_top_evidence_with_bm25(dev, tokenized_evidence_lemmatize, top_k=100, k1=k, b=b)
test_df = prefilter_top_evidence_with_bm25(test, tokenized_evidence_lemmatize, top_k=100, k1=k, b=b)
print(f"Result of top 100 performance of train: ")
print_retrieval_result(train_df, top_k=100)  # For top 100
print(f"Result of top 20 performance of train: ")
print_retrieval_result(train_df, top_k=20)   # For top 20

print(f"Result of top 100 performance of dev: ")
print_retrieval_result(dev_df, top_k=100)  # For top 100
print(f"Result of top 20 performance of dev: ")
print_retrieval_result(dev_df, top_k=20)   # For top 20

Testing k=1.2, b=0.75
Retrieved top 100 evidence finished in 2164.95 seconds
Retrieved top 100 evidence finished in 277.05 seconds
Retrieved top 100 evidence finished in 247.59 seconds
Result of top 100 performance of train: 
Mean Recall: 0.42498642779587414|, Mean Precision: 0.013534201954397392, Mean F-Score: 0.026056002395988546
Result of top 20 performance of train: 
Mean Recall: 0.2607084690553746|, Mean Precision: 0.041164495114006515, Mean F-Score: 0.06922177088831954
Result of top 100 performance of dev: 
Mean Recall: 0.5059523809523809|, Mean Precision: 0.01538961038961039, Mean F-Score: 0.0296558379164439
Result of top 20 performance of dev: 
Mean Recall: 0.32283549783549786|, Mean Precision: 0.04642857142857143, Mean F-Score: 0.07873704781948714
Testing k=0.5, b=0.85
Retrieved top 100 evidence finished in 2027.50 seconds
Retrieved top 100 evidence finished in 278.58 seconds
Retrieved top 100 evidence finished in 260.99 seconds
Result of top 100 performance of train: 
Mean Re

we could found that top 20 contains most of the true evidence that BM25 could found, if we want rerank base on more evidence, then it will introduce
even higher imbalanced(bring too much false evidence to model) to the training data.

# Step 3 notebook will use the dataframe that contains the top 20 evidences

In [15]:
train_df.to_csv("train_k05b085_bm25_top20.csv")
dev_df.to_csv("dev_k05b085_bm25_top20.csv")
test_df.to_csv("test_k05b085_bm25_top20.csv")