# Fine-tuning Cross-Encoders

In [2]:
from typing import List
from llama_index.finetuning.cross_encoders.cross_encoder import (
    CrossEncoderFinetuneEngine,
    CrossEncoderFinetuningDatasetSample
)
import os
import pandas as pd

data_folder = "data"

version = 1

df_train = pd.read_csv(os.path.join(data_folder, f"train_{version}.csv"), index_col=0)
df_test = pd.read_csv(os.path.join(data_folder, f"test_{version}.csv"), index_col=0)
df_finetuning = pd.read_csv(os.path.join(data_folder, f"fine_tuning_{version}.csv"), index_col=0)

finetuning_dataset: List[CrossEncoderFinetuningDatasetSample] = []

for _, row in df_finetuning.iterrows():
    finetuning_dataset.append(
        CrossEncoderFinetuningDatasetSample(
            query=row["question"],
            context=row["context"],
            score=row["score"]
        )
    )

finetuning_engine = CrossEncoderFinetuneEngine(
    dataset=finetuning_dataset, epochs=2, batch_size=8, model_output_path="../models/fine_tuned_cross_encoder"
)

# Finetune the cross-encoder model
finetuning_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/300 [00:00<?, ?it/s]

Iteration:   0%|          | 0/300 [00:00<?, ?it/s]

# Evaluate Reranking

In [None]:
import nest_asyncio

nest_asyncio.apply()

In [None]:
# Load Reranking Dataset
import pandas as pd
import os
import ast

data_folder = "data"
version = 1

df_reranking = pd.read_csv(os.path.join(data_folder, f"reranking_test_{version}.csv"), index_col=0)
df_reranking["questions"] = df_reranking["questions"].apply(ast.literal_eval)
df_reranking["context"] = df_reranking["context"].apply(ast.literal_eval)

In [None]:
df_reranking.head()

In [None]:
import openai

os.environ["OPENAI_API_KEY"] = "sk-Ei89D8xbv2ve98FT6e629a55Ca1341518f1238BaB53dAf20"
os.environ["OPENAI_API_BASE"] = "https://ai-yyds.com/v1"
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]

In [None]:
# We evaluate by calculating hits for each (question, context) pair,
# we retrieve top-k documents with the question, and
# it’s a hit if the results contain the context
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
)
from llama_index import Document

service_context_reranker_eval = ServiceContext.from_defaults(chunk_size=256, llm="local", embed_model="local")
rerank_base = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3
)

rerank_finetuned = SentenceTransformerRerank(
    model="../models/cross_encoder", top_n=3
)

In [None]:
without_reranker_hits = 0
base_reranker_hits = 0
finetuned_reranker_hits = 0
total_number_of_context = 0
for index, row in df_reranking.iterrows():
    documents = [Document(text=row["paper"])]
    query_list = row["questions"]
    context_list = row["context"]

    assert len(query_list) == len(context_list)
    vector_index = VectorStoreIndex.from_documents(
        documents, service_context=service_context_reranker_eval
    )

    retriever_without_reranker = vector_index.as_query_engine(
        similarity_top_k=3, response_mode="no_text"
    )
    retriever_with_base_reranker = vector_index.as_query_engine(
        similarity_top_k=8,
        response_mode="no_text",
        node_postprocessors=[rerank_base],
    )
    retriever_with_finetuned_reranker = vector_index.as_query_engine(
        similarity_top_k=8,
        response_mode="no_text",
        node_postprocessors=[rerank_finetuned],
    )

    for index in range(0, len(query_list)):
        query = query_list[index]
        context = context_list[index]
        total_number_of_context += 1

        response_without_reranker = retriever_without_reranker.query(query)
        without_reranker_nodes = response_without_reranker.source_nodes

        for node in without_reranker_nodes:
            if context in node.node.text or node.node.text in context:
                without_reranker_hits += 1

        response_with_base_reranker = retriever_with_base_reranker.query(query)
        with_base_reranker_nodes = response_with_base_reranker.source_nodes

        for node in with_base_reranker_nodes:
            if context in node.node.text or node.node.text in context:
                base_reranker_hits += 1

        response_with_finetuned_reranker = (
            retriever_with_finetuned_reranker.query(query)
        )
        with_finetuned_reranker_nodes = (
            response_with_finetuned_reranker.source_nodes
        )

        for node in with_finetuned_reranker_nodes:
            if context in node.node.text or node.node.text in context:
                finetuned_reranker_hits += 1
        
        assert (
            len(with_finetuned_reranker_nodes)
            == len(with_base_reranker_nodes)
            == len(without_reranker_nodes)
        )

### Result
As we can see below we get more hits with finetuned_cross_encoder compared to other options.

In [None]:
without_reranker_scores = [without_reranker_hits]
base_reranker_scores = [base_reranker_hits]
finetuned_reranker_scores = [finetuned_reranker_hits]
reranker_eval_dict = {
    "Metric": "Hits",
    "BGE_Embeddings": without_reranker_scores,
    "Base_cross_encoder": base_reranker_scores,
    "Finetuned_cross_encoder": finetuned_reranker_hits,
    "Total Relevant Context": total_number_of_context,
}
df_reranker_eval_results = pd.DataFrame(reranker_eval_dict)
display(df_reranker_eval_results)

## RAG Evaluation

In [1]:
import pandas as pd
import os
import ast

data_folder = "data"

version = 1

df_test = pd.read_csv(os.path.join(data_folder, f"test_{version}.csv"), index_col=0)
df_test["questions"] = df_test["questions"].apply(ast.literal_eval)
df_test["answers"] = df_test["answers"].apply(ast.literal_eval)
print(f"Number of papers in the test sample:- {len(df_test)}")

Number of papers in the test sample:- 80


In [2]:
df_test.head()

Unnamed: 0,paper,questions,answers
0,Identifying Condition-Action Statements in Med...,[What supervised machine learning models do th...,"[Unacceptable, Unacceptable, 1470 sentences, U..."
1,Multilingual is not enough: BERT for Finnish\t...,[By how much did the new model outperform mult...,"[For POS, improvements for cased BERT are 1.2..."
2,Efficient Dynamic WFST Decoding for Personaliz...,"[What does the cache consist of?, What languag...","[Unacceptable, Unacceptable, A model that cont..."
3,"A system for the 2019 Sentiment, Emotion and C...",[Did they pre-train on existing sentiment corp...,"[Unacceptable, Unacceptable, 2, Unacceptable]"
4,Towards Automatic Bot Detection in Twitter for...,[Do the authors report results on only English...,"[Unacceptable, Unacceptable, An existing bot d..."


### Baseline Evaluation
Use BAAI/bge-small-en as the base model for RAG without reranker

#### Eval Method:-
1. Iterate over each row of the test dataset:-
    1. For the current row being iterated, create a vector index using the paper document provided in the paper column of the dataset
    2. Query the vector index with a top_k value of top 3 nodes without any reranker
    3. Compare the generated answers with the reference answers of the respective sample using Pairwise Comparison Evaluator and add the scores to a list
5. Repeat 1 untill all the rows have been iterated
6. Calculate avg scores over all samples/ rows


In [3]:
from qasper_data.data import AnswerType
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    Document,
)


import openai



os.environ["OPENAI_API_KEY"] = "sk-Ei89D8xbv2ve98FT6e629a55Ca1341518f1238BaB53dAf20"
os.environ["OPENAI_API_BASE"] = "https://ai-yyds.com/v1"
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate
from datasets import Dataset

from qasper_data.qasper_dataset import QasperDataset
from llama_index import ServiceContext

service_context = ServiceContext.from_defaults(chunk_size=256, llm="local", embed_model="local")

test_dataset = QasperDataset("test")

df_all_papers = pd.DataFrame()

df_test = df_test.head(10)

for index, row in df_test.iterrows():
    documents = [Document(text=row["paper"])]
    query_list = row["questions"]
    reference_answers_list = list(map(lambda x: x.split(","), row["answers"]))
    vector_index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    contexts = []
    title = row["paper"].split("\t")[0]
    paper = test_dataset.match_paper_by_title(title) 
    for qa in paper.qas:
        context = []
        for answer in qa.answers:
            if answer.answer_type == AnswerType.EXTRACTIVE:
                spans = ""
                for span in answer.extractive_spans:
                    spans += span + "\t"
                context.append(spans)
        contexts.append(context)        
    print(contexts)
    # Query the vector index with a top_k value of top 3 documents without any reranker
    query_engine = vector_index.as_query_engine(similarity_top_k=3)
    response_list = []
    for query in query_list:
        response = query_engine.query(query)
        response_list.append(response.response)
    # Compare the generated answers with the reference answers of the respective sample using Ragas Metrics
    # and add the scores to a list
    dataset = Dataset.from_dict(
        {
            "question": query_list,
            "answer": response_list,
            "contexts": contexts,
            "ground_truths": reference_answers_list
        }
    )
    ragas_result = evaluate(
        dataset,
        metrics=[
            answer_relevancy,
            faithfulness,
            context_recall,
            context_precision,
        ],
    )
    df_ragas = ragas_result.to_pandas()
    df_all_papers = pd.concat([df_all_papers, df_ragas], ignore_index=True)

df_all_papers.to_csv(os.path.join(data_folder, f"ragas_baseline_{version}.csv"))

from IPython.display import clear_output

clear_output()

    

Found cached dataset qasper (/Users/zeyuli/.cache/huggingface/datasets/allenai___qasper/qasper/0.3.0/2bfcd239e581ab83f9ab7b76a82e42c6bcf574a13246ae6cc5a6c357c35f96f9)


  0%|          | 0/3 [00:00<?, ?it/s]

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/zeyuli/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q

[['ZeroR, Naïve Bayes, J48, and random forest classifiers\t', 'ZeroR, Naïve Bayes, J48, and random forest \t'], [], [], ["Conditions that affect erythrocyte turnover and hemoglobin variants must be considered, particularly when the A1C result does not correlate with the patient's clinical situation\t", 'If patients have asthma, then beta-blockers, including eye drops, are contraindicated\t']]



llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      11.37 ms /    74 runs   (    0.15 ms per token,  6508.36 tokens per second)
llama_print_timings: prompt eval time =   23019.76 ms /   916 tokens (   25.13 ms per token,    39.79 tokens per second)
llama_print_timings:        eval time =    5242.38 ms /    73 runs   (   71.81 ms per token,    13.92 tokens per second)
llama_print_timings:       total time =   28463.00 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      30.24 ms /   129 runs   (    0.23 ms per token,  4265.87 tokens per second)
llama_print_timings: prompt eval time =    5058.27 ms /    19 tokens (  266.22 ms per token,     3.76 tokens per second)
llama_print_timings:        eval time =    8790.31 ms /   128 runs   (   68.67 ms per token,    14.56 tokens per second)
llama_print_timings:       total time =   14383.03 ms
Llama.generate: prefix-

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:10<00:00, 10.17s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:31<00:00, 31.99s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:05<00:00,  5.21s/it]


evaluating with [context_precision]


100%|██████████| 1/1 [00:02<00:00,  2.90s/it]


[['absolute improvements for FinBERT cased ranging between 0.4 and 1.7% points\tLAS results are 2.3–3.6% points above the previous state of the art\tabsolute advantage for FinBERT models ranging from 3% points for 1K examples to just over 1% point for 100K examples\t'], ['ELMo \tULMFit \tBERT\t', 'che2018towards\tlim2018sex\tFiNER-tagger BIBREF32\tgungor2018\tHIT-SCIR BIBREF22\tBIBREF33\t'], ['Yle corpus\tSTT corpus\tSuomi24 corpus (version 2017H2)\tluotolahti2015towards\tCommon Crawl\tFinnish Wikipedia\t', 'news, online discussion, and an internet crawl\t']]


Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      11.93 ms /   106 runs   (    0.11 ms per token,  8885.16 tokens per second)
llama_print_timings: prompt eval time =   12292.37 ms /   653 tokens (   18.82 ms per token,    53.12 tokens per second)
llama_print_timings:        eval time =    6967.75 ms /   105 runs   (   66.36 ms per token,    15.07 tokens per second)
llama_print_timings:       total time =   19452.79 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      28.13 ms /   140 runs   (    0.20 ms per token,  4976.89 tokens per second)
llama_print_timings: prompt eval time =    9363.19 ms /   650 tokens (   14.40 ms per token,    69.42 tokens per second)
llama_print_timings:        eval time =    9633.65 ms /   139 runs   (   69.31 ms per token,    14.43 tokens per second)
llama_print_timings:       total time =   19

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:06<00:00,  6.30s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:20<00:00, 20.15s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:09<00:00,  9.63s/it]


evaluating with [context_precision]


100%|██████████| 1/1 [00:03<00:00,  3.41s/it]


[['static public cache stores the most frequent states\tlifetime of a private cache actually can last for the entire dialog section for a specific user\tsubsequent utterances faster as more states are composed and stored\t'], [], [' contains the expected user-specific entities\t']]


Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      19.40 ms /   125 runs   (    0.16 ms per token,  6443.96 tokens per second)
llama_print_timings: prompt eval time =   13058.30 ms /   734 tokens (   17.79 ms per token,    56.21 tokens per second)
llama_print_timings:        eval time =    8644.12 ms /   124 runs   (   69.71 ms per token,    14.35 tokens per second)
llama_print_timings:       total time =   22044.83 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =       2.67 ms /    21 runs   (    0.13 ms per token,  7865.17 tokens per second)
llama_print_timings: prompt eval time =    9442.02 ms /   809 tokens (   11.67 ms per token,    85.68 tokens per second)
llama_print_timings:        eval time =    1372.72 ms /    20 runs   (   68.64 ms per token,    14.57 tokens per second)
llama_print_timings:       total time =   10

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:07<00:00,  7.57s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:18<00:00, 18.19s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:03<00:00,  3.61s/it]


evaluating with [context_precision]


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


[[], ['unigrams and bigrams\tword2vec\tmanually constructed lexica\tsentiment embeddings\t'], ['2\t'], []]


Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =       7.98 ms /    55 runs   (    0.15 ms per token,  6893.96 tokens per second)
llama_print_timings: prompt eval time =   15326.45 ms /   778 tokens (   19.70 ms per token,    50.76 tokens per second)
llama_print_timings:        eval time =    3762.49 ms /    54 runs   (   69.68 ms per token,    14.35 tokens per second)
llama_print_timings:       total time =   19259.22 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      19.91 ms /   114 runs   (    0.17 ms per token,  5724.90 tokens per second)
llama_print_timings: prompt eval time =    7176.48 ms /   279 tokens (   25.72 ms per token,    38.88 tokens per second)
llama_print_timings:        eval time =   12903.61 ms /   113 runs   (  114.19 ms per token,     8.76 tokens per second)
llama_print_timings:       total time =   20

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:24<00:00, 24.64s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:21<00:00, 21.71s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:07<00:00,  7.52s/it]


evaluating with [context_precision]


100%|██████████| 1/1 [00:01<00:00,  1.32s/it]


[[], ['413 $(4\\%)$ users were annotated as "bot," 7849 $(75.35\\%)$ as "non-bot," and $20.69$ $(19.9\\%)$ as "unavailable"\t', 'Tweet Diversity\tURL score\tMean Daily Posts\tTopics\tMean Post Length\tProfile Picture\t', 'a sample of $10,417$ users from a database containing more than 400 million publicly available tweets posted by more than $100,000$ users who have announced their pregnancy on Twitter\tTwo professional annotators manually categorized the $10,417$ users as "bot," "non-bot," or "unavailable," based on their publicly available Twitter sites\tUsers were annotated broadly as "bot" if, in contrast to users annotated as "non-bot," they do not appear to be posting personal information\t Users were annotated as "unavailable" if their Twitter sites could not be viewed at the time of annotation\t'], ['Using a machine learning algorithm on top of an existing bot detection system, and a set of simple derived features, we were able to significantly improve bot detection performance

Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      16.55 ms /    98 runs   (    0.17 ms per token,  5919.66 tokens per second)
llama_print_timings: prompt eval time =   15005.23 ms /   848 tokens (   17.69 ms per token,    56.51 tokens per second)
llama_print_timings:        eval time =    6667.09 ms /    97 runs   (   68.73 ms per token,    14.55 tokens per second)
llama_print_timings:       total time =   22045.37 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      32.88 ms /   256 runs   (    0.13 ms per token,  7786.84 tokens per second)
llama_print_timings: prompt eval time =   18028.04 ms /   820 tokens (   21.99 ms per token,    45.48 tokens per second)
llama_print_timings:        eval time =   18065.98 ms /   255 runs   (   70.85 ms per token,    14.11 tokens per second)
llama_print_timings:       total time =   36

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:09<00:00,  9.86s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:40<00:00, 40.33s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:16<00:00, 16.57s/it]


evaluating with [context_precision]


100%|██████████| 1/1 [00:05<00:00,  5.24s/it]


[['N18-1126\tUDPipe\tD15-1272\tMorfette\t', 'N18-1126\tUDPipe system of K17-3009\tD15-1272\tMorfette\t'], []]


Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      12.71 ms /   104 runs   (    0.12 ms per token,  8185.11 tokens per second)
llama_print_timings: prompt eval time =   13205.66 ms /   794 tokens (   16.63 ms per token,    60.13 tokens per second)
llama_print_timings:        eval time =    6858.19 ms /   103 runs   (   66.58 ms per token,    15.02 tokens per second)
llama_print_timings:       total time =   20311.90 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =       2.81 ms /    23 runs   (    0.12 ms per token,  8193.80 tokens per second)
llama_print_timings: prompt eval time =   13296.77 ms /   740 tokens (   17.97 ms per token,    55.65 tokens per second)
llama_print_timings:        eval time =    1433.85 ms /    22 runs   (   65.17 ms per token,    15.34 tokens per second)
llama_print_timings:       total time =   14

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:04<00:00,  4.64s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:07<00:00,  7.85s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:07<00:00,  7.19s/it]


evaluating with [context_precision]


100%|██████████| 1/1 [00:01<00:00,  1.33s/it]


[[], ['stacked bilstms\t'], ['English Penn Treebank\tspmrl datasets\t', ' English Penn Treebank\tspmrl datasets\t']]


Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =       8.90 ms /    85 runs   (    0.10 ms per token,  9554.86 tokens per second)
llama_print_timings: prompt eval time =   11808.01 ms /   703 tokens (   16.80 ms per token,    59.54 tokens per second)
llama_print_timings:        eval time =    5524.84 ms /    84 runs   (   65.77 ms per token,    15.20 tokens per second)
llama_print_timings:       total time =   17555.98 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      13.29 ms /    94 runs   (    0.14 ms per token,  7071.39 tokens per second)
llama_print_timings: prompt eval time =   10355.43 ms /   744 tokens (   13.92 ms per token,    71.85 tokens per second)
llama_print_timings:        eval time =    6366.83 ms /    93 runs   (   68.46 ms per token,    14.61 tokens per second)
llama_print_timings:       total time =   16

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:06<00:00,  6.02s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:17<00:00, 17.51s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:04<00:00,  4.13s/it]


evaluating with [context_precision]


100%|██████████| 1/1 [00:01<00:00,  1.80s/it]


[[], ['Seq2seq\tGPT2-FT\tSpeaker\tECM\tSkeleton-to-Response (SR)\tRetrieval + Style Transfer (RST)\tRetrieval + Reranking (RRe)\t', 'Generative Approaches ::: Seq2seq\tGenerative Approaches ::: GPT2-FT:\tGenerative Approaches ::: Speaker:\tGenerative Approaches ::: ECM:\tRetrieval-Based Approaches ::: Skeleton-to-Response (SR)\tRetrieval-Based Approaches ::: Retrieval + Style Transfer (RST)\tRetrieval-Based Approaches ::: Retrieval + Style Transfer (RST)\tRetrieval-Based Approaches ::: Retrieval + Reranking (RRe)\t'], ['Chinese\tEnglish\t'], ['gender-specific (Chinese) dataset\temotion-specific (Chinese) dataset\tsentiment-specific (English) dataset\t', 'Gender-Specific Dialogue Dataset\tEmotion-Specific Dialogue Dataset\tSentiment-Specific Dialogue Dataset\t']]


Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      30.22 ms /   176 runs   (    0.17 ms per token,  5823.19 tokens per second)
llama_print_timings: prompt eval time =   14960.75 ms /   831 tokens (   18.00 ms per token,    55.55 tokens per second)
llama_print_timings:        eval time =   12476.74 ms /   175 runs   (   71.30 ms per token,    14.03 tokens per second)
llama_print_timings:       total time =   28126.10 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   17686.12 ms
llama_print_timings:      sample time =      42.71 ms /   256 runs   (    0.17 ms per token,  5994.19 tokens per second)
llama_print_timings: prompt eval time =   11423.41 ms /   793 tokens (   14.41 ms per token,    69.42 tokens per second)
llama_print_timings:        eval time =   18488.57 ms /   255 runs   (   72.50 ms per token,    13.79 tokens per second)
llama_print_timings:       total time =   30

KeyboardInterrupt: 

In [None]:
df_all_papers.head()

In [None]:
def mean_score(df):
    return df.mean(axis=0)

answer_relevancy_score = mean_score(df_all_papers["answer_relevancy"])
faithfulness_score = mean_score(df_all_papers["faithfulness"])
context_recall_score = mean_score(df_all_papers["context_recall"])
context_precision_score = mean_score(df_all_papers["context_precision"])

ragas_baseline_dict = {
    "Metric": ["answer_relevancy", "faithfulness", "context_recall", "context_precision"],
    "BGE_Embeddings": [answer_relevancy_score, faithfulness_score, context_recall_score, context_precision_score],
}
df_ragas_baseline_results = pd.DataFrame(ragas_baseline_dict)

In [None]:
df_ragas_baseline_results

### RAG Base Reranker Evaluation
`BAAI/bge-small-en` Embedding + `cross-encoder/ms-marco-MiniLM-L-12-v2` as reranker

#### Eval Method:-
1. Iterate over each row of the test dataset:-
    1. For the current row being iterated, create a vector index using the paper document provided in the paper column of the dataset
    2. Query the vector index with a top_k value of top 5 nodes.
    3. Use cross-encoder/ms-marco-MiniLM-L-12-v2 as a reranker as a NodePostprocessor to get top_k value of top 3 nodes out of the 8 nodes
    3. Compare the generated answers with the reference answers of the respective sample using Ragas Evaluator and add the scores to a list
5. Repeat 1 untill all the rows have been iterated
6. Calculate avg scores over all samples/ rows

In [None]:
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import (VectorStoreIndex, ServiceContext, Document)

reranker = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3)

import pandas as pd
import os
import openai
import nest_asyncio

nest_asyncio.apply()

os.environ["OPENAI_API_KEY"] = "sk-Ei89D8xbv2ve98FT6e629a55Ca1341518f1238BaB53dAf20"
os.environ["OPENAI_API_BASE"] = "https://ai-yyds.com/v1"

openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]

data_folder = "data"
version = 1

service_context = ServiceContext.from_defaults(chunk_size=256, llm="local", embed_model="local")

In [None]:
from tqdm.notebook import tqdm

df_test = pd.read_csv(os.path.join(data_folder, f"test_{version}.csv"), index_col=0)

import ast

df_test["questions"] = df_test["questions"].apply(ast.literal_eval)
df_test["answers"] = df_test["answers"].apply(ast.literal_eval)

from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

from ragas import evaluate

from datasets import Dataset


metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
]

df_ragas_base_reranker_result = pd.DataFrame()



for index, row in tqdm(df_test.iterrows()):
    documents = [Document(text=row["paper"])]
    query_list = row["questions"]
    reference_answers_list = row["answers"]
    response_list = []

    number_of_accepted_queries = 0
    
    # Create a vector index from the documents
    vector_index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    
    # Query the vector index with a top_k value of top 8 nodes with reranker
    # as cross-encoder/ms-marco-MiniLM-L-12-v2
    query_engine = vector_index.as_query_engine(
        similarity_top_k=3, node_postprocessors=[reranker]
    )
    assert len(query_list) == len(reference_answers_list)
    
    reference_answers_list = list(map(lambda x: x.split(","), row["answers"]))
    for query in query_list:
        response = query_engine.query(query)
        response_list.append(response)
    
    answers = []
    contexts = []
    for r in response_list:
        answers.append(r.response)
        contexts.append([c.node.get_content() for c in r.source_nodes])
        
    dataset = Dataset.from_dict(
        {
            "question": query_list,
            "answer": answers,
            "contexts": contexts,
            "ground_truths": reference_answers_list
        }
    )
    
    ragas_result = evaluate(
        dataset,
        metrics=metrics,
    )
    
    df_ragas = ragas_result.to_pandas()
    df_ragas_base_reranker_result = pd.concat([df_ragas_base_reranker_result, df_ragas], ignore_index=True)

df_ragas_base_reranker_result.to_csv(os.path.join(data_folder, f"ragas_base_reranker_{version}.csv"))


from IPython.display import clear_output

clear_output()



In [None]:
df_ragas_base_reranker_result.head()

In [None]:
def mean_score(df):
    return df.mean(axis=0)

answer_relevancy_score = mean_score(df_ragas_base_reranker_result["answer_relevancy"])
faithfulness_score = mean_score(df_ragas_base_reranker_result["faithfulness"])
context_recall_score = mean_score(df_ragas_base_reranker_result["context_recall"])
context_precision_score = mean_score(df_ragas_base_reranker_result["context_precision"])

ragas_base_reranker_dict = {
    "Metric": ["answer_relevancy", "faithfulness", "context_recall", "context_precision"],
    "Base_Cross_Encoder": [answer_relevancy_score, faithfulness_score, context_recall_score, context_precision_score],
}
df_ragas_base_reranker_results = pd.DataFrame(ragas_base_reranker_dict)

In [None]:
df_ragas_base_reranker_results

### Evaluate with Fine-Tuned re-ranker
`BAAI/bge-small-en` Embedding + `cross_encoder` fine-tuned on the fine-tuning dataset as reranker

#### Eval Method:-
1. Iterate over each row of the test dataset:-
    1. For the current row being iterated, create a vector index using the paper document provided in the paper column of the dataset
    2. Query the vector index with a top_k value of top 5 nodes.
    3. Use cross_encoder fine-tuned on the fine-tuning dataset as a reranker as a NodePostprocessor to get top_k value of top 3 nodes out of the 8 nodes
    3. Compare the generated answers with the reference answers of the respective sample using Ragas Evaluator and add the scores to a list

In [None]:
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import (VectorStoreIndex, ServiceContext, Document)

reranker = SentenceTransformerRerank(model="../models/cross_encoder", top_n=3)

import pandas as pd
import os
import openai
import nest_asyncio

nest_asyncio.apply()

os.environ["OPENAI_API_KEY"] = "sk-Ei89D8xbv2ve98FT6e629a55Ca1341518f1238BaB53dAf20"
os.environ["OPENAI_API_BASE"] = "https://ai-yyds.com/v1"

openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]

data_folder = "data"
version = 1

service_context = ServiceContext.from_defaults(chunk_size=256, llm="local", embed_model="local")

In [None]:
from tqdm.notebook import tqdm
from ragas.llama_index import evaluate

df_test = pd.read_csv(os.path.join(data_folder, f"test_{version}.csv"), index_col=0)

import ast

df_test["questions"] = df_test["questions"].apply(ast.literal_eval)
df_test["answers"] = df_test["answers"].apply(ast.literal_eval)

from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
]


from datasets import Dataset

df_ragas_finetuned_reranker_result = pd.DataFrame()

for index, row in tqdm(df_test.iterrows()):
    documents = [Document(text=row["paper"])]
    query_list = row["questions"]
    reference_answers_list = row["answers"]
    response_list = []

    number_of_accepted_queries = 0
    
    # Create a vector index from the documents
    vector_index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    
    # Query the vector index with a top_k value of top 8 nodes with reranker
    # as cross-encoder/ms-marco-MiniLM-L-12-v2
    query_engine = vector_index.as_query_engine(
        similarity_top_k=3, node_postprocessors=[reranker]
    )
    assert len(query_list) == len(reference_answers_list)
    
    reference_answers_list = list(map(lambda x: x.split(","), row["answers"]))
    
    result = evaluate(query_engine, metrics, query_list, reference_answers_list)
    
    df_ragas = result.to_pandas()
    df_ragas_finetuned_reranker_result = pd.concat([df_ragas_finetuned_reranker_result, df_ragas], ignore_index=True)

df_ragas_finetuned_reranker_result.to_csv(os.path.join(data_folder, f"ragas_finetuned_reranker_{version}.csv"))

from IPython.display import clear_output
clear_output()

In [None]:
df_ragas_finetuned_reranker_result.head()

In [None]:
def mean_score(df):
    return df.mean(axis=0)

answer_relevancy_score = mean_score(df_ragas_finetuned_reranker_result["answer_relevancy"])
faithfulness_score = mean_score(df_ragas_finetuned_reranker_result["faithfulness"])
context_recall_score = mean_score(df_ragas_finetuned_reranker_result["context_recall"])
context_precision_score = mean_score(df_ragas_finetuned_reranker_result["context_precision"])

ragas_finetuned_reranker_dict = {
    "Metric": ["answer_relevancy", "faithfulness", "context_recall", "context_precision"],
    "Finetuned_Cross_Encoder": [answer_relevancy_score, faithfulness_score, context_recall_score, context_precision_score],
}

df_ragas_finetuned_reranker_results = pd.DataFrame(ragas_finetuned_reranker_dict)

In [None]:
df_ragas_finetuned_reranker_results