# Evaluate Evidence F1

In [2]:
from llama_index.indices.postprocessor import SentenceTransformerRerank
from qasper_data.qasper_dataset import QasperDataset, PaperIndex
from qasper_data.qasper_evaluator import EvidenceQasperEvaluator
from llama_index import ServiceContext

# Sample 50 papers from the QASPER test split; the seed is passed to the
# dataset wrapper, presumably so the sample is reproducible (TODO confirm).
test_dataset = QasperDataset("test", seed=42)
sample_papers = test_dataset.random_sample(50)

# Local LLM and local embedding model, with papers split into 1024-token
# chunks and no overlap between neighbouring chunks.
service_context = ServiceContext.from_defaults(
    llm="local",
    embed_model="local",
    chunk_size=1024,
    chunk_overlap=0,
)

# Off-the-shelf cross-encoder reranker; keeps the 3 best nodes.
rerank_base = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3)

# Cross-encoder loaded from a local directory (presumably the fine-tuned
# checkpoint, going by the variable name); also keeps the 3 best nodes.
rerank_finetuned = SentenceTransformerRerank(model="../models/cross_encoder", top_n=3)

# Evaluator used by the evidence-F1 loop further down.
evaluator = EvidenceQasperEvaluator(service_context=service_context)


Found cached dataset qasper (/Users/zeyuli/.cache/huggingface/datasets/allenai___qasper/qasper/0.3.0/2bfcd239e581ab83f9ab7b76a82e42c6bcf574a13246ae6cc5a6c357c35f96f9)


  0%|          | 0/3 [00:00<?, ?it/s]

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/zeyuli/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q

In [2]:
import pandas as pd
from tqdm.notebook import tqdm
# Evaluate every sampled paper with three retrieval setups and gather the
# scores into `df_all_papers` (one column per setup).
#
# Per-paper frames are collected in a list and concatenated once after the
# loop: calling pd.concat inside the loop re-copies all earlier rows on each
# iteration and starts from an empty frame.
paper_frames = []

# NOTE(review): the outputs recorded in this notebook show identical scores for
# rerank_base and rerank_finetuned -- confirm that "../models/cross_encoder"
# really holds the fine-tuned weights and that the evaluator is sensitive to
# the reranker's ordering.
for paper in tqdm(sample_papers, "Evaluating papers"):
    vector_index = PaperIndex(paper, service_context)

    # Materialise the index once and share it between the three engines
    # instead of calling as_index() separately for each of them.
    paper_index = vector_index.as_index(show_progress=False)

    # Baseline: plain vector retrieval of the 3 most similar chunks.
    baseline_engine = paper_index.as_query_engine(
        response_mode="no_text", similarity_top_k=3
    )
    # Reranked setups: retrieve 8 candidates, then let the cross-encoder
    # (configured with top_n=3) pick the ones to keep.
    rerank_base_engine = paper_index.as_query_engine(
        node_postprocessors=[rerank_base], response_mode="no_text", similarity_top_k=8
    )
    rerank_finetuned_engine = paper_index.as_query_engine(
        node_postprocessors=[rerank_finetuned], response_mode="no_text", similarity_top_k=8
    )

    baseline_results = evaluator.evaluate(paper, baseline_engine)
    rerank_base_results = evaluator.evaluate(paper, rerank_base_engine)
    rerank_finetuned_results = evaluator.evaluate(paper, rerank_finetuned_engine)

    df_paper = pd.DataFrame()
    df_paper["baseline"] = baseline_results
    df_paper["rerank_base"] = rerank_base_results
    df_paper["rerank_finetuned"] = rerank_finetuned_results

    paper_frames.append(df_paper)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# papers were evaluated.
df_all_papers = (
    pd.concat(paper_frames, ignore_index=True) if paper_frames else pd.DataFrame()
)

Evaluating papers:   0%|          | 0/50 [00:00<?, ?it/s]

In [3]:
# Show the collected scores: one column per retrieval setup
# (presumably one row per evaluated question -- TODO confirm).
df_all_papers

Unnamed: 0,baseline,rerank_base,rerank_finetuned
0,0.103560,0.103560,0.103560
1,0.163743,0.163743,0.163743
2,0.065371,0.065371,0.065371
3,0.084257,0.089219,0.089219
4,0.047309,0.051282,0.051282
...,...,...,...
150,0.129278,0.155303,0.155303
151,0.070393,0.080537,0.080537
152,0.119617,0.140078,0.140078
153,0.064356,0.066210,0.066210


In [4]:
# Column-wise mean score of each retrieval setup over all rows.
df_all_papers.mean()

baseline            0.097787
rerank_base         0.106524
rerank_finetuned    0.106524
dtype: float64

## Evaluate With Chunk Size 64

In [1]:
from llama_index.indices.postprocessor import SentenceTransformerRerank
from qasper_data.qasper_dataset import QasperDataset, PaperIndex
from qasper_data.qasper_evaluator import EvidenceQasperEvaluator
from llama_index import ServiceContext

# Sample 50 papers from the QASPER test split; the seed is passed to the
# dataset wrapper, presumably so the sample is reproducible (TODO confirm).
test_dataset = QasperDataset("test", seed=42)
sample_papers = test_dataset.random_sample(50)

# Same local LLM / embedding setup as before, but with much smaller 64-token
# chunks and no overlap between neighbouring chunks.
service_context = ServiceContext.from_defaults(
    llm="local",
    embed_model="local",
    chunk_size=64,
    chunk_overlap=0,
)

# Off-the-shelf cross-encoder reranker; keeps the 3 best nodes.
rerank_base = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3)

# Cross-encoder loaded from a local directory (presumably the fine-tuned
# checkpoint, going by the variable name); also keeps the 3 best nodes.
rerank_finetuned = SentenceTransformerRerank(model="../models/cross_encoder", top_n=3)

# Evaluator used by the evidence-F1 loop further down.
evaluator = EvidenceQasperEvaluator(service_context=service_context)


Found cached dataset qasper (/Users/zeyuli/.cache/huggingface/datasets/allenai___qasper/qasper/0.3.0/2bfcd239e581ab83f9ab7b76a82e42c6bcf574a13246ae6cc5a6c357c35f96f9)


  0%|          | 0/3 [00:00<?, ?it/s]

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/zeyuli/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q

In [2]:
import pandas as pd
from tqdm.notebook import tqdm
# Evaluate every sampled paper with three retrieval setups and gather the
# scores into `df_all_papers` (one column per setup).
#
# Per-paper frames are collected in a list and concatenated once after the
# loop: calling pd.concat inside the loop re-copies all earlier rows on each
# iteration and starts from an empty frame.
paper_frames = []

# NOTE(review): the outputs recorded in this notebook show identical scores for
# rerank_base and rerank_finetuned -- confirm that "../models/cross_encoder"
# really holds the fine-tuned weights and that the evaluator is sensitive to
# the reranker's ordering.
for paper in tqdm(sample_papers, "Evaluating papers"):
    vector_index = PaperIndex(paper, service_context)

    # Materialise the index once and share it between the three engines
    # instead of calling as_index() separately for each of them.
    paper_index = vector_index.as_index(show_progress=False)

    # Baseline: plain vector retrieval of the 3 most similar chunks.
    baseline_engine = paper_index.as_query_engine(
        response_mode="no_text", similarity_top_k=3
    )
    # Reranked setups: retrieve 8 candidates, then let the cross-encoder
    # (configured with top_n=3) pick the ones to keep.
    rerank_base_engine = paper_index.as_query_engine(
        node_postprocessors=[rerank_base], response_mode="no_text", similarity_top_k=8
    )
    rerank_finetuned_engine = paper_index.as_query_engine(
        node_postprocessors=[rerank_finetuned], response_mode="no_text", similarity_top_k=8
    )

    baseline_results = evaluator.evaluate(paper, baseline_engine)
    rerank_base_results = evaluator.evaluate(paper, rerank_base_engine)
    rerank_finetuned_results = evaluator.evaluate(paper, rerank_finetuned_engine)

    df_paper = pd.DataFrame()
    df_paper["baseline"] = baseline_results
    df_paper["rerank_base"] = rerank_base_results
    df_paper["rerank_finetuned"] = rerank_finetuned_results

    paper_frames.append(df_paper)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# papers were evaluated.
df_all_papers = (
    pd.concat(paper_frames, ignore_index=True) if paper_frames else pd.DataFrame()
)

Evaluating papers:   0%|          | 0/50 [00:00<?, ?it/s]

In [3]:
# Show the collected scores for the chunk-size-64 run: one column per
# retrieval setup (presumably one row per evaluated question -- TODO confirm).
df_all_papers

Unnamed: 0,baseline,rerank_base,rerank_finetuned
0,0.134921,0.179775,0.179775
1,0.296774,0.296774,0.296774
2,0.066960,0.066960,0.066960
3,0.093458,0.093458,0.093458
4,0.046129,0.056146,0.056146
...,...,...,...
150,0.151436,0.209877,0.209877
151,0.074480,0.074480,0.074480
152,0.119617,0.140078,0.140078
153,0.070588,0.072515,0.072515


In [4]:
# Column-wise mean score of each retrieval setup over all rows.
df_all_papers.mean()

baseline            0.112713
rerank_base         0.123299
rerank_finetuned    0.123299
dtype: float64

## Evaluate Baseline with Ragas for answer

In [None]:

import openai
import os


# The API key must come from the environment, never from the notebook source:
# a key written into a cell ends up in version control and in every shared
# copy. Any key that was ever committed should be revoked and replaced.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before starting Jupyter."
    )

# Keep the previously used endpoint as the default but let the environment
# override it.
# NOTE(review): this default is not the official api.openai.com endpoint --
# confirm the proxy is trusted before sending paper text and keys through it.
os.environ.setdefault("OPENAI_API_BASE", "https://ai-yyds.com/v1")

openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]

# Attach to the same event loop: nest_asyncio patches asyncio so that an
# event loop can be used while another one is already running, as is the case
# inside a Jupyter kernel. Presumably needed by the ragas evaluation below --
# TODO confirm.
import nest_asyncio
nest_asyncio.apply()

import pandas as pd


from ragas.llama_index import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
from tqdm.notebook import tqdm


# Ragas metrics computed for every question.
metrics = [
    answer_relevancy,
    faithfulness,
    context_precision,
    context_recall
]

# Smaller sample than the evidence-F1 runs.
# NOTE(review): this rebinds `sample_papers`, so re-running an earlier
# evaluation cell afterwards would silently use these 10 papers.
sample_papers = test_dataset.random_sample(10)

# Collect one result frame per paper and concatenate once after the loop;
# calling pd.concat inside the loop re-copies all earlier rows every iteration.
answer_frames = []
for paper in tqdm(sample_papers):
    # Baseline engine: plain vector retrieval of the 3 most similar chunks.
    # To evaluate a reranker instead, build the engine with
    # node_postprocessors=[rerank_base] (or [rerank_finetuned]) and
    # similarity_top_k=8.
    #
    # NOTE(review): response_mode="no_text" only retrieves nodes and does not
    # synthesize an answer -- the `answer` column in the recorded output is
    # empty -- so answer_relevancy and faithfulness are presumably scored on
    # empty answers. Confirm this is intended, or switch to an
    # answer-producing response mode.
    baseline_engine = (
        PaperIndex(paper, service_context)
        .as_index(show_progress=False)
        .as_query_engine(response_mode="no_text", similarity_top_k=3)
    )

    queries = paper.get_questions()
    ground_truths = paper.get_answers()

    baseline_result = evaluate(baseline_engine, metrics, queries, ground_truths)
    df_answers = baseline_result.to_pandas()
    answer_frames.append(df_answers)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# papers were evaluated.
df_all_papers = (
    pd.concat(answer_frames, ignore_index=True) if answer_frames else pd.DataFrame()
)

In [11]:
# Preview the first rows of the Ragas results for the baseline engine.
df_all_papers.head()

Unnamed: 0,question,contexts,answer,ground_truths,answer_relevancy,faithfulness,context_precision,context_recall
0,How do they perform semi-supervised learning?,[So we can do semi-supervised learning by simp...,,"[On each step, a generative network is used to...",0.719635,1.0,1.0,0.25
1,What are the five evaluated tasks?,[We evaluate AC-BLSTM on sentence-level and do...,,"[Model is evaluated on six tasks: TREC, MR, SS...",0.734443,0.0,1.0,0.0
2,what boosting techniques were used?,[We investigate an important family of ensembl...,,"[Light Gradient Boosting Machine (LGBM), Light...",0.723527,1.0,1.0,1.0
3,did they experiment with other text embeddings?,"[More specifically, these techniques enhanced ...",,"[No, Yes, No]",0.702801,1.0,1.0,0.0
4,what is the size of this improved dataset?,[This score was improved to 0.9971 when using ...,,"[363,078 structured abstracts, 363,078, Unacce...",0.723802,0.0,0.0,0.0


In [12]:
# Mean of each Ragas metric over all evaluated questions (baseline engine).
df_all_papers[["answer_relevancy", "faithfulness", "context_recall", "context_precision"]].mean()

answer_relevancy     0.722019
faithfulness         0.554054
context_recall       0.295946
context_precision    0.657658
dtype: float64

## Evaluate Baseline Reranker with Ragas for answer

In [None]:

import openai
import os


# The API key must come from the environment, never from the notebook source:
# a key written into a cell ends up in version control and in every shared
# copy. Any key that was ever committed should be revoked and replaced.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before starting Jupyter."
    )

# Keep the previously used endpoint as the default but let the environment
# override it.
# NOTE(review): this default is not the official api.openai.com endpoint --
# confirm the proxy is trusted before sending paper text and keys through it.
os.environ.setdefault("OPENAI_API_BASE", "https://ai-yyds.com/v1")

openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]

# Attach to the same event loop: nest_asyncio patches asyncio so that an
# event loop can be used while another one is already running, as is the case
# inside a Jupyter kernel. Presumably needed by the ragas evaluation below --
# TODO confirm.
import nest_asyncio
nest_asyncio.apply()

import pandas as pd


from ragas.llama_index import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
from tqdm.notebook import tqdm


# Ragas metrics computed for every question.
metrics = [
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision
]

# Smaller sample than the evidence-F1 runs.
# NOTE(review): this rebinds `sample_papers`, and whether a second
# random_sample(10) call returns the same papers as an earlier one depends on
# how the dataset wrapper uses its seed -- verify before comparing runs.
sample_papers = test_dataset.random_sample(10)

# Collect one result frame per paper and concatenate once after the loop;
# calling pd.concat inside the loop re-copies all earlier rows every iteration.
answer_frames = []
for paper in tqdm(sample_papers):
    # Reranked engine: retrieve 8 candidates, then let the base cross-encoder
    # reranker select the nodes to keep. Use node_postprocessors=[rerank_finetuned]
    # to evaluate the fine-tuned reranker, or drop the postprocessor and use
    # similarity_top_k=3 for the plain baseline.
    #
    # NOTE(review): response_mode="no_text" only retrieves nodes and does not
    # synthesize an answer -- the `answer` column in the recorded output is
    # empty -- so answer_relevancy and faithfulness are presumably scored on
    # empty answers. Confirm this is intended, or switch to an
    # answer-producing response mode.
    rerank_base_engine = (
        PaperIndex(paper, service_context)
        .as_index(show_progress=False)
        .as_query_engine(
            node_postprocessors=[rerank_base],
            response_mode="no_text",
            similarity_top_k=8,
        )
    )

    queries = paper.get_questions()
    ground_truths = paper.get_answers()

    rerank_base_result = evaluate(rerank_base_engine, metrics, queries, ground_truths)
    df_answers = rerank_base_result.to_pandas()
    answer_frames.append(df_answers)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# papers were evaluated.
df_all_papers = (
    pd.concat(answer_frames, ignore_index=True) if answer_frames else pd.DataFrame()
)

In [14]:
# Preview the first rows of the Ragas results for the base-reranker engine.
df_all_papers.head()

Unnamed: 0,question,contexts,answer,ground_truths,answer_relevancy,faithfulness,context_recall,context_precision
0,How do they perform semi-supervised learning?,[So we can do semi-supervised learning by simp...,,"[On each step, a generative network is used to...",0.719791,1.0,0.75,1.0
1,What are the five evaluated tasks?,"[To the best of our knowledge, AC-BLSTM achiev...",,"[Model is evaluated on six tasks: TREC, MR, SS...",0.737302,0.0,0.0,1.0
2,what boosting techniques were used?,[We investigate an important family of ensembl...,,"[Light Gradient Boosting Machine (LGBM), Light...",0.723326,1.0,0.333333,1.0
3,did they experiment with other text embeddings?,"[More specifically, these techniques enhanced ...",,"[No, Yes, No]",0.704788,0.0,0.25,0.833333
4,what is the size of this improved dataset?,"[In this study, we introduce PICONET, a multi-...",,"[363,078 structured abstracts, 363,078, Unacce...",0.723762,0.0,0.0,0.0


In [15]:
# Mean of each Ragas metric over all evaluated questions (base-reranker engine).
df_all_papers[["answer_relevancy", "faithfulness", "context_recall", "context_precision"]].mean()

answer_relevancy     0.722405
faithfulness         0.513514
context_recall       0.289640
context_precision    0.684685
dtype: float64