In [None]:
# !pip install llama-index llama-index-retrievers-bm25 llama-index-embeddings-huggingface xformers -q

In [None]:
import os
import numpy as np
import pandas as pd

from IPython.utils import io
from types import SimpleNamespace
from eedi_metrics import apk, mapk
from IPython.display import display
from typing import Any, Dict, List, Optional, ClassVar

from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.evaluation.retrieval.metrics import *
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever
from llama_index.core.schema import NodeWithScore, QueryBundle, MetadataMode, TextNode
from llama_index.core.evaluation.retrieval.metrics_base import BaseRetrievalMetric, RetrievalMetricResult
from llama_index.core.evaluation import RetrieverEvaluator, EmbeddingQAFinetuneDataset as RetrievalDataset

In [None]:
class MAP25(BaseRetrievalMetric):
    """
    Retrieval metric that scores a single query with `apk(..., k=25)`.

    The score is whatever `eedi_metrics.apk` returns for the expected and
    retrieved ids (presumably average precision at 25 -- confirm against
    `eedi_metrics`).

    Useful resources:
      - stackoverflow.com/questions/55748792
      - kaggle.com/code/nandeshwar/mean-average-precision-map-k-metric-explained-code
    """

    metric_name: ClassVar[str] = "map@25"

    def compute(
        self,
        query: Optional[str] = None,
        expected_ids: Optional[List[str]] = None,
        retrieved_ids: Optional[List[str]] = None,
        expected_texts: Optional[List[str]] = None,
        retrieved_texts: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> RetrievalMetricResult:
        """
        Score one query from its expected and retrieved ids.

        Only `expected_ids` and `retrieved_ids` are used; the remaining
        parameters are accepted and ignored.

        Raises:
            ValueError: if either id list is None or empty.
        """

        # None and an empty list are both falsy, so one truthiness test covers both cases
        if not (retrieved_ids and expected_ids):
            raise ValueError("Retrieved ids and expected ids must be provided")

        return RetrievalMetricResult(score=apk(expected_ids, retrieved_ids, k=25))
    

class HybridRetriever(BaseRetriever):
    """
    Retriever that merges the results of a BM25 retriever and a vector retriever.

    Each query is sent to both underlying retrievers and the two result lists
    are de-duplicated by node id.
    """

    def __init__(
        self,
        bm25_retriever: BM25Retriever,
        vector_retriever: VectorIndexRetriever,
    ) -> None:
        """Store the two underlying retrievers, then initialise the base class."""

        self._bm25_retriever = bm25_retriever
        self._vector_retriever = vector_retriever

        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """
        Retrieve nodes for a query from both retrievers and merge them.

        Returns:
            One NodeWithScore per distinct node id. BM25 hits come first, in
            BM25 order, followed by the hits only the vector retriever found.
            For a node found by both, the vector retriever's NodeWithScore
            (and therefore its score) is the one returned.
        """

        bm25_nodes = self._bm25_retriever.retrieve(query_bundle)
        vector_nodes = self._vector_retriever.retrieve(query_bundle)

        # A dict keyed by node id removes duplicates and keeps insertion order:
        # `update` overwrites the value of an existing key without moving it.
        # NOTE(review): the merged list is not re-sorted by score, so any caller
        # that truncates it favours BM25 hits over vector-only hits.
        combined_dict = {n.node.node_id: n for n in bm25_nodes}
        combined_dict.update({n.node.node_id: n for n in vector_nodes})

        return list(combined_dict.values())

In [None]:
# Notebook-wide settings read by the cells below
config = SimpleNamespace(
    # Evaluation also requires KAGGLE_IS_COMPETITION_RERUN to be unset
    run_eval=True,
    comp_dir='/kaggle/input/eedi-mining-misconceptions-in-mathematics',

    # Number of results requested from each underlying retriever
    similarity_top_k=15,
    metrics=[HitRate(), MRR(), Recall(), AveragePrecision(), NDCG(), MAP25()],

    embed_batch_size=64,
    embed_model_pth="/kaggle/input/bge-small-en-v1.5/transformers/bge/2",
    # Alternative embedding model:
    # embed_model_pth="/kaggle/input/stella-en-embedding-400m-v5/transformers/default/1",
)

# Register the embedding model on llama-index's global Settings
Settings.embed_model = HuggingFaceEmbedding(
    model_name=config.embed_model_pth,
    device='cuda:0',
    embed_batch_size=config.embed_batch_size,
    trust_remote_code=True,
)

In [None]:
# Competition tables, all read from config.comp_dir
test, train, s_submission, misconceptions = (
    pd.read_csv(f'{config.comp_dir}/{table}.csv')
    for table in ('test', 'train', 'sample_submission', 'misconception_mapping')
)

# Subject, construct and question text joined into one field, separated by blank lines
for frame in (test, train):
    frame["AllQuestionText"] = (
        frame["SubjectName"] + "\n\n" + frame["ConstructName"] + "\n\n" + frame["QuestionText"]
    )

In [None]:
# One TextNode per row of `misconceptions`: first column -> node id (as a string),
# second column -> node text
misconception_nodes = [
    TextNode(text=node_text, id_=f"{node_id}")
    for node_id, node_text in misconceptions.values
]

# Output produced while building the retrievers is captured instead of shown
with io.capture_output() as captured:

    # BM25 retriever
    bm25_retriever = BM25Retriever.from_defaults(
        nodes=misconception_nodes, similarity_top_k=config.similarity_top_k
    )

    # Embedding retriever
    vector_index = VectorStoreIndex(nodes=misconception_nodes)
    semantic_retriever = vector_index.as_retriever(similarity_top_k=config.similarity_top_k)

    # Combination of the two
    hybrid_retriever = HybridRetriever(
        bm25_retriever=bm25_retriever,
        vector_retriever=semantic_retriever,
    )


len(misconception_nodes)

In [None]:
# Columns repeated on every long-format row
keep_cols = ["QuestionId", "AllQuestionText", "CorrectAnswer"]
# One wide column per answer option ...
answer_cols = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]
# ... and the matching misconception id column for each option
misconception_cols = ["MisconceptionAId", "MisconceptionBId", "MisconceptionCId", "MisconceptionDId"]


def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape a one-row-per-question frame into one row per (question, answer).

    The answer columns are melted into `Answer` (source column name) and
    `Value` (answer text). When `df` has the misconception columns, they are
    melted the same way and attached as `Misconception` / `MisconceptionId`.

    The two melted frames are joined by position: both are sorted by
    `QuestionId` and then by source column name and re-indexed from 0, and the
    answer and misconception column names sort in the same A-D order.
    """

    def _melt_sorted(value_cols, var_name, value_name):
        # Melt one group of wide columns, then put the rows in a fixed order
        melted = df[keep_cols + value_cols].melt(
            id_vars=keep_cols, var_name=var_name, value_name=value_name
        )
        return melted.sort_values(["QuestionId", var_name]).reset_index(drop=True)

    long_df = _melt_sorted(answer_cols, 'Answer', 'Value')

    # A frame without misconception columns gets the answers only
    if misconception_cols[0] not in df.columns:
        return long_df

    labels = _melt_sorted(misconception_cols, 'Misconception', 'MisconceptionId')
    long_df[['Misconception', 'MisconceptionId']] = labels[['Misconception', 'MisconceptionId']]

    return long_df

# Rebind both frames to their long format (one row per question and answer option).
# Not idempotent: the long frames no longer have the wide answer columns.
test, train = wide_to_long(test), wide_to_long(train)

In [None]:
for frame in (test, train):
    # Retrieval query: the question text followed by the text of this answer option
    frame["AllText"] = frame["AllQuestionText"] + "\n\n" + frame["Value"]

    # 'AnswerAText' -> 'A': strip the fixed prefix and suffix of the source column name
    frame["AnswerId"] = frame["Answer"].str.replace('Answer', '').str.replace('Text', '')

# Drop the columns that are no longer needed; only train carries 'Misconception'
test.drop(columns=['AllQuestionText', 'Answer'], inplace=True)
train.drop(columns=['AllQuestionText', 'Answer', 'Misconception'], inplace=True)

In [None]:
# Preview the first rows of train that have no missing values
train.dropna().head()

In [None]:
# Evaluation dataset: one query per labelled (question, answer) row of train
queries, relevant_docs = {}, {}
corpus = { node.node_id: node.get_content(metadata_mode=MetadataMode.NONE) for node in misconception_nodes }

# Rows with a missing value (e.g. no MisconceptionId) cannot be evaluated
labelled_rows = train[['QuestionId', 'AnswerId', 'MisconceptionId', 'AllText']].dropna()

for q_id, a_id, m_id, all_text in labelled_rows.values:
    # train has one row per answer, so the key must include the answer id;
    # keyed by QuestionId alone, each answer of a question overwrote the previous one
    query_id = f'{q_id}_{a_id}'
    queries[query_id] = all_text
    # MisconceptionId is a float column (it holds NaN); node ids are integer strings
    relevant_docs[query_id] = [f'{int(m_id)}']

r_dataset = RetrievalDataset(queries=queries, corpus=corpus, relevant_docs=relevant_docs)

In [None]:
def display_results(name, eval_results):
    """
    Summarise retrieval evaluation results as a one-row DataFrame.

    Args:
        name: label stored in the `desc` column.
        eval_results: iterable of objects exposing `metric_vals_dict`
            (a mapping from metric name to score).

    Returns:
        A DataFrame with a `desc` column and one column per metric in
        `config.metrics`, each holding the mean score over `eval_results`.
    """

    metric_names = [m.metric_name for m in config.metrics]

    # Build the per-query score table once instead of once per metric
    scores = pd.DataFrame([eval_result.metric_vals_dict for eval_result in eval_results])

    return pd.DataFrame({ "desc": [name], **{k: [scores[k].mean()] for k in metric_names} })

In [None]:
retriever_evaluator = RetrieverEvaluator(
    retriever=hybrid_retriever, metrics=config.metrics
)

if config.run_eval and not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    with io.capture_output():
        eval_results = await retriever_evaluator.aevaluate_dataset(r_dataset, workers=4)

    display(display_results("hybrid eval", eval_results))

In [None]:
# Submission rows: [ "<QuestionId>_<AnswerId>", "<space-separated node ids>" ]
data = []

candidate_rows = test[['QuestionId', 'CorrectAnswer', 'AllText', 'AnswerId']].values

for question_id, correct_answer, query_text, answer_id in candidate_rows:
    # Rows for the correct answer are skipped
    if correct_answer.strip() == answer_id.strip():
        continue

    # Retriever output is captured instead of shown
    with io.capture_output():
        retrieved = hybrid_retriever.retrieve(query_text)

    # Only the first 25
    top_ids = [node.id_ for node in retrieved[:25]]

    data.append([f'{question_id}_{answer_id}', " ".join(top_ids)])

In [None]:
# Column names are taken from the sample submission
submission = pd.DataFrame(data=data, columns=s_submission.columns)

# Written to the working directory, without the index column
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Display the submission frame as the cell output
submission