# Implementation of BM25 results as baseline

Because this is only a baseline, we use a basic Okapi BM25 implementation (the `rank_bm25` package) rather than the Lucene implementation.

## Install and import relevant libraries

In [3]:
# One-time setup: uncomment these lines to install the packages imported in the next cell.
# !pip install rank_bm25
# !pip install texthero

In [5]:
import pickle

import numpy as np
import pandas as pd
import texthero as hero
from rank_bm25 import BM25Okapi

## Import test dataset

In [6]:
# Load the test split; the path is relative to the notebook's working directory.
test_df = pd.read_csv(
    filepath_or_buffer="../0.Datasets/train_test_split/test.csv",
)

In [7]:
# Print the column names, dtypes and non-null counts of the loaded test split.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3415 entries, 0 to 3414
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3415 non-null   int64 
 1   qid         3415 non-null   int64 
 2   docid       3415 non-null   int64 
 3   question    3415 non-null   object
 4   answer      3415 non-null   object
dtypes: int64(3), object(2)
memory usage: 133.5+ KB


In [8]:
# Preview the first rows without the "Unnamed: 0" column.
test_df.head().drop("Unnamed: 0", axis="columns")

Unnamed: 0,qid,docid,question,answer
0,7817,31330,Can you have a positive return with a balance ...,Have you owned the stock for longer than 2015?...
1,6304,105557,Oversimplify it for me: the correct order of i...,Great questions -- the fact that you're thinki...
2,7115,43508,Definition of “U.S. source” for US non-residen...,The examples you provide in the question are c...
3,5716,287327,Are car buying services worth it?,I have used car buying services through Costco...
4,9016,580920,What makes a Company's Stock prices go up or d...,Here are some significant factors affect the c...


## Data cleaning

In [21]:
# Apply hero.clean to both free-text columns; the raw text is overwritten in place.
for text_column in ("question", "answer"):
    test_df[text_column] = hero.clean(test_df[text_column])

In [22]:
# Plain Python list of the answer column, in the same row order as test_df.
test_answers_list = test_df["answer"].to_list()
test_df.head().drop("Unnamed: 0", axis="columns")

Unnamed: 0,qid,docid,question,answer
0,7817,31330,positive return balance cost basis,owned stock longer stock appears grown value s...
1,6304,105557,oversimplify correct order investing,great questions fact thinking important think ...
2,7115,43508,definition u source us non resident alien capi...,examples provide question completely irrelevan...
3,5716,287327,car buying services worth,used car buying services costco usaa twice for...
4,9016,580920,makes company stock prices go,significant factors affect company stock price...


In [23]:
# Collect, for every qid, the docids that appear alongside it (row order preserved).
question_answer_index_map = {}
for _, pair in test_df.iterrows():
    question_answer_index_map.setdefault(pair["qid"], []).append(pair["docid"])

In [24]:
# One list of docids per question, in the insertion order of question_answer_index_map.
labels = list(question_answer_index_map.values())

In [25]:
# First-seen lookups: qid -> question text, and answer text -> docid.
question_map = {}
label_map = {}
for _, record in test_df.iterrows():
    # setdefault keeps the value from the first row in which each key appears.
    question_map.setdefault(record["qid"], record["question"])
    label_map.setdefault(record["answer"], record["docid"])

## Evaluate results

In [26]:
# Tokenize each answer by splitting on single spaces, then build the BM25 index over them.
tokenized_corpus = [answer_text.split(" ") for answer_text in test_answers_list]
bm25 = BM25Okapi(tokenized_corpus)

In [33]:
# Rank every answer against each question and keep the docid of the best-scoring one.
# Scores are computed once per query and the winner is resolved by corpus position,
# so answers with identical cleaned text are no longer collapsed onto a single docid.
docid_by_position = test_df["docid"].tolist()  # same row order as test_answers_list
predictions = []
for question_text in question_map.values():
    tokenized_query = question_text.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    # Same selection rule as bm25.get_top_n(..., n=1): last entry of the ascending argsort.
    top_position = int(np.argsort(doc_scores)[-1])
    # Wrap in a list so each prediction has the same shape as an entry of `labels`.
    predictions.append([docid_by_position[top_position]])

In [1]:
# Save the ground-truth labels and the BM25 predictions together in one pickle file.
# Both lists are built in first-seen qid order, so they must line up one-to-one.
assert len(labels) == len(predictions), "labels and predictions are misaligned"
results = {"labels": labels, "predictions": predictions}
with open("../7.Evaluate/bm25_baseline.pkl", "wb") as f:
    pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)