# Evaluate the responses generated by the finetuned models

In [4]:
import pandas as pd 
from langchain_openai import OpenAIEmbeddings
from pydantic import BaseModel, SecretStr

import os 
from typing import Any, AsyncGenerator, Dict, List, TypeVar, Union, cast, AsyncIterator

from dotenv import find_dotenv, load_dotenv

load_dotenv()

True

### Types of models / data:

* model_1. Model that has been finetuned using questions with a source
* model_2. Model that has been finetuned using questions without a source

* data_1. Test set with sources in question
* data_2. Test set without sources in question 

* df11 = model_1 + data_1 
* df12 = model_1 + data_2
* df21 = model_2 + data_1
* df22 = model_2 + data_2

In [5]:
# The r values of the evaluated runs (presumably the LoRA rank — TODO confirm).
# merge_dfs tags the i-th file pair with r_list[i], so the order here must match the pairs.
r_list = [16, 32, 64, 128]


def _evaluation_csv_pairs(filename_tail):
    """Return one (2024NOV16 CSV, 2024NOV14 CSV) path pair per value in r_list.

    The two files of a pair differ only in their date stamp and in the
    ``no_sources_in_question`` / ``sources_in_question`` segment that precedes
    ``_test_output``; ``filename_tail`` is the segment that follows
    ``model_finetuned_`` and is shared by both files.
    """
    return [
        (
            f'evaluations/2024NOV16_llama_3_1_8b_no_sources_in_question_test_output_model_finetuned_{filename_tail}_r_{rank}.csv',
            f'evaluations/2024NOV14_llama_3_1_8b_sources_in_question_test_output_model_finetuned_{filename_tail}_r_{rank}.csv',
        )
        for rank in r_list
    ]


# Test Data set Sources in Question
sources_in_question_test_set = _evaluation_csv_pairs('sources_in_question')

# Test Data Set No Sources in Question
no_sources_in_question_test_set = _evaluation_csv_pairs('no_sources_in_question')

In [23]:
def merge_dfs(df_pairs, r_values=None):
    """Load each pair of evaluation CSVs and join them into one frame per pair.

    For every ``(file1, file2)`` pair both CSVs are read, rows without a usable
    reference answer are dropped, the ``generated_answer`` column of each file
    is renamed after the model it is attributed to, and the two frames are
    inner-joined on ``context``, ``question`` and ``true_answer``.

    Args:
        df_pairs: Iterable of ``(file1, file2)`` CSV paths. The answers in
            ``file1`` are labelled ``finetuned_model_no_sources_in_question``
            and those in ``file2`` ``finetuned_model_sources_in_question``.
        r_values: Optional sequence holding the value written to the ``r``
            column for each pair, matched by position. Defaults to the
            module-level ``r_list``.

    Returns:
        A list with one merged DataFrame per pair, each with a fresh 0..n-1
        index and an ``r`` column.

    Raises:
        IndexError: If there are fewer r values than pairs.
    """
    no_answer_marker = 'NO ANSWER FOUND'
    suffix1 = 'finetuned_model_no_sources_in_question'
    suffix2 = 'finetuned_model_sources_in_question'
    ranks = r_list if r_values is None else r_values

    def _load_answered(path, model_suffix):
        # Read one CSV, keep only rows that have a reference answer, and tag the answer column.
        frame = pd.read_csv(path)
        true_answer = frame['true_answer']
        # A missing true_answer (NaN) cannot be scored, so it is dropped together with the
        # marker rows; previously the NaN made the unary `~` on the mask raise TypeError.
        # regex=False matches the marker as literal text.
        has_answer = true_answer.notna() & ~true_answer.astype(str).str.contains(no_answer_marker, regex=False)
        return frame[has_answer].rename(columns={"generated_answer": f"generated_answer_{model_suffix}"})

    merged_dfs = []
    for idx, (file1, file2) in enumerate(df_pairs):
        df1 = _load_answered(file1, suffix1)
        df2 = _load_answered(file2, suffix2)

        # The suffixes only come into play if the files share further non-key columns.
        merged_df = pd.merge(df1, df2, on=["context", "question", "true_answer"], suffixes=(f"_{suffix1}", f"_{suffix2}"))
        merged_df['r'] = ranks[idx]
        merged_dfs.append(merged_df.reset_index(drop=True))
    return merged_dfs


In [24]:
# Read and join the CSV pairs of each test set; merge_dfs returns one merged frame per pair.
merged_sources_in_question_test_set = merge_dfs(sources_in_question_test_set)
merged_no_sources_in_question_test_set = merge_dfs(no_sources_in_question_test_set)

In [27]:
# Both test sets should yield the same number of merged frames (merge_dfs returns one per
# file pair). The bare `len(...), len(...)` tuple that used to open this cell was not the
# cell's last expression, so it was never displayed; the check is now explicit.
assert len(merged_no_sources_in_question_test_set) == len(merged_sources_in_question_test_set)
for df in merged_sources_in_question_test_set:
    print(df.shape)
for df in merged_no_sources_in_question_test_set:
    print(df.shape)

(161, 6)
(161, 6)
(161, 6)
(161, 6)
(180, 6)
(180, 6)
(180, 6)
(180, 6)


In [None]:
# df11 = pd.read_csv('evaluations/2024NOV14_llama_3_1_8b_sources_in_question_test_output_model_finetuned_sources_in_question.csv')
# df12 = pd.read_csv('evaluations/2024NOV14_llama_3_1_8b_sources_in_question_test_output_model_finetuned_no_sources_in_question.csv')

# df21 = pd.read_csv('evaluations/2024NOV16_llama_3_1_8b_no_sources_in_question_test_output_model_finetuned_sources_in_question.csv')
# df22 = pd.read_csv('evaluations/2024NOV16_llama_3_1_8b_no_sources_in_question_test_output_model_finetuned_no_sources_in_question.csv')

# df11 = df11[~df11['true_answer'].str.contains('NO ANSWER FOUND')]
# df12 = df12[~df12['true_answer'].str.contains('NO ANSWER FOUND')]
# df21 = df21[~df21['true_answer'].str.contains('NO ANSWER FOUND')]
# df22 = df22[~df22['true_answer'].str.contains('NO ANSWER FOUND')]

* Try BERTScore
* Get cosine similarity between True Answer vs Generated Answer


### Get embedding of generated answer

In [28]:
# Embedding client served through an OpenAI-compatible endpoint on localhost
# (presumably LM Studio, going by the default API key value — TODO confirm).
embedding_model = "text-embedding-nomic-embed-text-v1.5@f32"
embedding_func = OpenAIEmbeddings(
                                    model=embedding_model,
                                    base_url="http://localhost:1234/v1",
                                    # NOTE(review): typing.cast only informs the type checker; the value passed
                                    # here is still the plain str returned by os.getenv.
                                    api_key=cast(SecretStr, os.getenv("LLM_CLIENT_API_KEY", "lm_studio")),
                                    check_embedding_ctx_length=False,  # https://github.com/langchain-ai/langchain/issues/21318
                                )

In [29]:
merged_no_sources_in_question_test_set[0].head(1)

Unnamed: 0,context,question,true_answer,generated_answer_finetuned_model_no_sources_in_question,generated_answer_finetuned_model_sources_in_question,r
0,What is the Role of Small Models in the LLM Er...,How does the transfer learning technique of we...,The transfer learning technique of weak-to-str...,Large language models can improve health-care ...,The transfer learning technique of weak-to-str...,16


In [30]:
merged_sources_in_question_test_set[0].head(1)

Unnamed: 0,context,question,true_answer,generated_answer_finetuned_model_no_sources_in_question,generated_answer_finetuned_model_sources_in_question,r
0,What is the Role of Small Models in the LLM Er...,Why do computation-constrained environments fa...,"According to the text, computation-constrained...","According to the paper ""What is the Role of Sm...",Model-based evaluation approaches like BERTSCO...,16


In [31]:
import asyncio 
# async def get_embeddings_for_answer(df, generated_answer_column, true_answer_column):
#     async def get_embedding(text):
#         return await embedding_func.aembed_query(text)
    
#     df['true_answer_embedding'] = await asyncio.gather(*[get_embedding(text) for text in df[generated_answer_column]])
#     df['generated_answer_embedding'] = await asyncio.gather(*[get_embedding(text) for text in df[generated_answer_column]])

#     return df

async def get_embeddings_for_answer(df, text_columns=None, embedder=None):
    """Embed text columns of ``df`` and store the vectors in ``<column>_embedding`` columns.

    Columns are processed one after another; within a column, all rows are
    embedded concurrently through ``asyncio.gather``.

    Args:
        df: DataFrame holding the text columns. It is modified in place.
        text_columns: Names of the columns to embed. Defaults to ``true_answer``
            and the two ``generated_answer_finetuned_model_*`` columns.
        embedder: Object exposing an async ``aembed_query(text)`` method.
            Defaults to the module-level ``embedding_func``.

    Returns:
        The same ``df`` object, with one new ``<column>_embedding`` column per
        embedded column.
    """
    if text_columns is None:
        text_columns = (
            'true_answer',
            'generated_answer_finetuned_model_no_sources_in_question',
            'generated_answer_finetuned_model_sources_in_question',
        )
    if embedder is None:
        embedder = embedding_func

    for column in text_columns:
        # gather returns its results in the order the awaitables were passed, so they line up with the rows.
        df[f'{column}_embedding'] = await asyncio.gather(*(embedder.aembed_query(text) for text in df[column]))

    return df

In [32]:
# Embed the answer columns of every merged frame. get_embeddings_for_answer writes the
# embedding columns into the frame it is given and returns that same frame, so the
# frames stored in the lists are updated even though `df` is only a loop variable.
for df in merged_no_sources_in_question_test_set:
    df = await get_embeddings_for_answer(df)
for df in merged_sources_in_question_test_set:
    df = await get_embeddings_for_answer(df)


In [5]:
# df11 = await get_embeddings_for_answer(df11)
# df12 = await get_embeddings_for_answer(df12)
# df21 = await get_embeddings_for_answer(df21)
# df22 = await get_embeddings_for_answer(df22)

### Now generate cosine similarity between true answer and generated answer

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_cosine_similarity(df, col1, col2, resulting_col):
    """Store the row-wise cosine similarity of two embedding columns in ``resulting_col``.

    Each row's vector in ``col1`` is compared with the vector in ``col2`` of
    the same row. Only these n pairs are computed; the previous version built
    the full n x n similarity matrix and kept just its diagonal. Values may
    differ from that version in the last floating-point digits.

    Args:
        df: DataFrame whose ``col1`` and ``col2`` cells each hold a sequence
            of numbers, all of the same length. It is modified in place.
        col1: Name of the first embedding column.
        col2: Name of the second embedding column.
        resulting_col: Name of the column that receives the similarities.

    Returns:
        The same ``df`` object with ``resulting_col`` added or overwritten.
    """
    if len(df) == 0:
        # Nothing to compare; still create the column so callers can rely on it.
        df[resulting_col] = np.array([], dtype=float)
        return df

    # Convert the embeddings to 2-D numpy arrays (rows x embedding dimension)
    embeddings1 = np.asarray(df[col1].tolist(), dtype=float)
    embeddings2 = np.asarray(df[col2].tolist(), dtype=float)

    # Dot product of each row with its counterpart
    dot_products = np.einsum('ij,ij->i', embeddings1, embeddings2)
    norm_products = np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
    # A zero vector has no direction: divide by 1 so the row scores 0.0 instead of NaN.
    norm_products[norm_products == 0.0] = 1.0

    df[resulting_col] = dot_products / norm_products

    return df

# # Calculate cosine similarity for df11
# df11 = calculate_cosine_similarity(df11, 'true_answer_embedding', 'generated_answer_embedding')

# # Calculate cosine similarity for df12
# df12 = calculate_cosine_similarity(df12, 'true_answer_embedding', 'generated_answer_embedding')

# # Calculate cosine similarity for df21
# df21 = calculate_cosine_similarity(df21, 'true_answer_embedding', 'generated_answer_embedding')

# # Calculate cosine similarity for df22
# df22 = calculate_cosine_similarity(df22, 'true_answer_embedding', 'generated_answer_embedding')
# print("Cosine Similarity Description for df11:")
# print(df11['cosine_similarity'].describe())

# print("\nCosine Similarity Description for df12:")
# print(df12['cosine_similarity'].describe())

# print("\nCosine Similarity Description for df21:")
# print(df21['cosine_similarity'].describe())

# print("\nCosine Similarity Description for df22:")
# print(df22['cosine_similarity'].describe())

In [36]:
merged_no_sources_in_question_test_set[0].head(1)

Unnamed: 0,context,question,true_answer,generated_answer_finetuned_model_no_sources_in_question,generated_answer_finetuned_model_sources_in_question,r,true_answer_embedding,generated_answer_finetuned_model_no_sources_in_question_embedding,generated_answer_finetuned_model_sources_in_question_embedding
0,What is the Role of Small Models in the LLM Er...,How does the transfer learning technique of we...,The transfer learning technique of weak-to-str...,Large language models can improve health-care ...,The transfer learning technique of weak-to-str...,16,"[-0.0012398749822750688, 0.04067772626876831, ...","[-0.01908096857368946, 0.09143900871276855, -0...","[0.012546073645353317, 0.005591242108494043, -..."


In [38]:
# Compare every merged frame's reference-answer embedding with the embedding of each
# finetuned model's answer. calculate_cosine_similarity writes the new column into the
# frame it receives, so the frames held in both lists are updated.
for df in merged_no_sources_in_question_test_set + merged_sources_in_question_test_set:
    for model_tag in ('finetuned_model_no_sources_in_question', 'finetuned_model_sources_in_question'):
        df = calculate_cosine_similarity(
            df,
            'true_answer_embedding',
            f'generated_answer_{model_tag}_embedding',
            f'cosine_similarity_{model_tag}',
        )

In [40]:
# Stack the per-r frames of each test set into one frame. ignore_index=True gives every
# stacked row its own label; without it each merged frame contributes its own 0..n-1
# index (merge_dfs resets it), so the labels would repeat once per r value.
concat_no_sources_in_question_test_set = pd.concat(merged_no_sources_in_question_test_set, ignore_index=True)
concat_sources_in_question_test_set = pd.concat(merged_sources_in_question_test_set, ignore_index=True)

In [41]:
concat_no_sources_in_question_test_set.shape, concat_sources_in_question_test_set.shape

((720, 11), (644, 11))

In [43]:
concat_no_sources_in_question_test_set.groupby(['r'])[['cosine_similarity_finetuned_model_no_sources_in_question', 'cosine_similarity_finetuned_model_sources_in_question']].describe()

Unnamed: 0_level_0,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
r,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
16,180.0,0.766992,0.126484,0.405244,0.676157,0.773191,0.872571,0.972202,180.0,0.698324,0.118764,0.387161,0.623971,0.693299,0.776967,0.969082
32,180.0,0.711992,0.111065,0.414118,0.645691,0.705369,0.800605,0.940339,180.0,0.770957,0.123415,0.390667,0.677258,0.798602,0.880405,0.959573
64,180.0,0.70768,0.122773,0.380262,0.627745,0.702928,0.790911,0.971977,180.0,0.701597,0.113621,0.40667,0.626663,0.685806,0.766781,0.952121
128,180.0,0.719992,0.121062,0.391365,0.638582,0.72144,0.80398,0.970777,180.0,0.69494,0.112707,0.390009,0.626798,0.69005,0.772151,0.966145


In [44]:
concat_sources_in_question_test_set.groupby(['r'])[['cosine_similarity_finetuned_model_no_sources_in_question', 'cosine_similarity_finetuned_model_sources_in_question']].describe()

Unnamed: 0_level_0,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,cosine_similarity_finetuned_model_sources_in_question
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
r,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
16,161.0,0.812035,0.112848,0.477632,0.73097,0.828531,0.911637,0.967758,161.0,0.766865,0.116356,0.449759,0.683271,0.779324,0.857104,0.965559
32,161.0,0.763273,0.120425,0.460123,0.663333,0.765378,0.865303,0.974845,161.0,0.820086,0.116039,0.462697,0.745799,0.833053,0.92147,0.978988
64,161.0,0.768057,0.112973,0.460343,0.685435,0.77188,0.8506,0.981624,161.0,0.764328,0.113376,0.49818,0.6876,0.767511,0.84694,0.978001
128,161.0,0.774672,0.116225,0.518617,0.693755,0.761267,0.877907,0.972599,161.0,0.76623,0.116481,0.504274,0.679415,0.769064,0.85833,0.983045


In [58]:
# Write the per-row results of both test sets to CSV, without the index column.
# NOTE(review): this cell was executed as In [58], i.e. after the BERTScore cells further
# down. Run top to bottom, the files written here will not yet contain the bertscore_* columns.
concat_sources_in_question_test_set.to_csv('evaluations/sources_in_question_test_set_results.csv', index=False)
concat_no_sources_in_question_test_set.to_csv('evaluations/no_sources_in_question_test_set_results.csv', index=False)

https://chatgpt.com/c/674509ef-9e80-8002-8e80-992c0ae46ee9

# Compute BERTScore

In [46]:
from bert_score import score
import torch
# Use the GPU when PyTorch reports one, otherwise the CPU. calculate_bertscore takes this
# value as the default of its `device` parameter, so it must be defined before that function.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(device)

  or not issubclass(enum_class, Flag)


cuda


In [55]:
def calculate_bertscore(df, reference_col, prediction_col, suffix, lang='en', model_type='roberta-large', device=device, batch_size=16):
    """Add BERTScore precision, recall and F1 columns for one prediction column.

    The whole column is scored in a single ``score`` call; ``batch_size`` is
    forwarded to that call.

    Args:
        df: DataFrame holding the texts. It is modified in place.
        reference_col: Name of the column with the reference texts.
        prediction_col: Name of the column with the texts to evaluate.
        suffix: Text appended to the ``bertscore_P``, ``bertscore_R`` and
            ``bertscore_F1`` column names.
        lang, model_type, device, batch_size: Forwarded unchanged to ``score``.

    Returns:
        The same ``df`` object with the three score columns added or overwritten.
    """
    candidates = df[prediction_col].tolist()
    references = df[reference_col].tolist()
    precision, recall, f1 = score(candidates, references, lang=lang, model_type=model_type, device=device, batch_size=batch_size)

    # One plain-list column per metric, in P, R, F1 order.
    for metric, values in (('P', precision), ('R', recall), ('F1', f1)):
        df['bertscore_' + metric + suffix] = values.tolist()
    return df

# Calculate BERTScore for df11
# df11 = calculate_bertscore(df11, 'true_answer', 'generated_answer')

# # Calculate BERTScore for df12
# df12 = calculate_bertscore(df12, 'true_answer', 'generated_answer')

# # Calculate BERTScore for df21
# df21 = calculate_bertscore(df21, 'true_answer', 'generated_answer')

# # Calculate BERTScore for df22
# df22 = calculate_bertscore(df22, 'true_answer', 'generated_answer')

In [56]:
# Score both finetuned models' answers against the reference answers in each test set.
# calculate_bertscore writes the bertscore_* columns into the frame it is handed (and
# returns that same frame), so the two concat_* frames are updated directly.
for _scored_frame in (concat_sources_in_question_test_set, concat_no_sources_in_question_test_set):
    for _model_suffix in ('_finetuned_model_sources_in_question', '_finetuned_model_no_sources_in_question'):
        calculate_bertscore(_scored_frame, 'true_answer', 'generated_answer' + _model_suffix, _model_suffix)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [57]:
concat_sources_in_question_test_set.head(1)

Unnamed: 0,context,question,true_answer,generated_answer_finetuned_model_no_sources_in_question,generated_answer_finetuned_model_sources_in_question,r,true_answer_embedding,generated_answer_finetuned_model_no_sources_in_question_embedding,generated_answer_finetuned_model_sources_in_question_embedding,cosine_similarity_finetuned_model_no_sources_in_question,cosine_similarity_finetuned_model_sources_in_question,bertscore_P_finetuned_model_sources_in_question,bertscore_R_finetuned_model_sources_in_question,bertscore_F1_finetuned_model_sources_in_question,bertscore_P_finetuned_model_no_sources_in_question,bertscore_R_finetuned_model_no_sources_in_question,bertscore_F1_finetuned_model_no_sources_in_question
0,What is the Role of Small Models in the LLM Er...,Why do computation-constrained environments fa...,"According to the text, computation-constrained...","According to the paper ""What is the Role of Sm...",Model-based evaluation approaches like BERTSCO...,16,"[0.017459962517023087, 0.12867099046707153, -0...","[-0.022837018594145775, 0.08393042534589767, -...","[-0.0029367581009864807, 0.06684571504592896, ...",0.783405,0.791908,0.871762,0.885544,0.878599,0.854539,0.865411,0.859941


In [64]:
def print_bertscore_descriptions_grouped_by_r(df, name):
    """Print ``describe()`` tables of the BERTScore columns for every value of ``r``.

    For each group of rows sharing an ``r`` value, two tables are printed:
    first the ``*_finetuned_model_sources_in_question`` columns, then the
    ``*_finetuned_model_no_sources_in_question`` columns. Each table has one
    row per metric (P, R, F1) in left-aligned, 10-character-wide cells.

    Args:
        df: DataFrame with an ``r`` column and the six suffixed
            ``bertscore_P`` / ``bertscore_R`` / ``bertscore_F1`` columns.
        name: Label used in each table's title line.

    Returns:
        None. Everything is written to standard output.
    """
    stat_keys = ('mean', 'std', 'min', '25%', '50%', '75%', 'max')
    header = ' '.join(f'{title:<10}' for title in ('Metric', 'Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max'))
    variants = (
        ('Sources in Question', '_finetuned_model_sources_in_question'),
        ('No Sources in Question', '_finetuned_model_no_sources_in_question'),
    )

    for r_value, group in df.groupby('r'):
        # Summarise all six columns before printing anything for this group.
        summaries = {
            column_suffix: [(metric, group[f'bertscore_{metric}{column_suffix}'].describe()) for metric in ('P', 'R', 'F1')]
            for _, column_suffix in variants
        }

        for label, column_suffix in variants:
            print(f"BERTScore Description for {name} (r={r_value}, Finetuned model {label}):")
            print(header)
            for metric, summary in summaries[column_suffix]:
                cells = [f"{metric:<10}", f"{summary['count']:<10.2f}"]
                cells.extend(f"{summary[key]:<10.4f}" for key in stat_keys)
                print(' '.join(cells))
            print("\n")


In [66]:

print_bertscore_descriptions_grouped_by_r(concat_sources_in_question_test_set, "Sources in Question Test Set")
print_bertscore_descriptions_grouped_by_r(concat_no_sources_in_question_test_set, "No Sources in Question Test Set")

BERTScore Description for Sources in Question Test Set (r=16, Finetuned model Sources in Question):
Metric     Count      Mean       Std        Min        25%        50%        75%        Max       
P          161.00     0.8394     0.0263     0.7766     0.8221     0.8409     0.8566     0.8981    
R          161.00     0.8383     0.0269     0.7686     0.8203     0.8358     0.8568     0.9080    
F1         161.00     0.8387     0.0248     0.7911     0.8206     0.8383     0.8564     0.8995    


BERTScore Description for Sources in Question Test Set (r=16, Finetuned model No Sources in Question):
Metric     Count      Mean       Std        Min        25%        50%        75%        Max       
P          161.00     0.8427     0.0318     0.6351     0.8250     0.8459     0.8627     0.9173    
R          161.00     0.8444     0.0273     0.7718     0.8287     0.8465     0.8614     0.9112    
F1         161.00     0.8434     0.0272     0.6981     0.8260     0.8463     0.8611     0.9142    


B

In [49]:
def print_bertscore_descriptions(df, name, suffix=''):
    """Print a ``describe()`` table of one set of BERTScore columns.

    One row is printed per metric (P, R, F1) in left-aligned,
    10-character-wide cells, followed by two blank lines.

    Args:
        df: DataFrame holding ``bertscore_P<suffix>``, ``bertscore_R<suffix>``
            and ``bertscore_F1<suffix>`` columns.
        name: Label used in the title line.
        suffix: Column-name suffix, i.e. the ``suffix`` that was passed to
            ``calculate_bertscore`` when the columns were created. The default
            ``''`` reads the un-suffixed column names.

    Returns:
        None. Everything is written to standard output.

    Raises:
        KeyError: If one of the three columns is missing.
    """
    stat_keys = ('mean', 'std', 'min', '25%', '50%', '75%', 'max')
    # Summarise all three columns first so a missing column fails before anything is printed.
    summaries = [(metric, df[f'bertscore_{metric}{suffix}'].describe()) for metric in ('P', 'R', 'F1')]

    print(f"BERTScore Description for {name}:")
    print(' '.join(f'{title:<10}' for title in ('Metric', 'Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max')))
    for metric, summary in summaries:
        cells = [f"{metric:<10}", f"{summary['count']:<10.2f}"]
        cells.extend(f"{summary[key]:<10.4f}" for key in stat_keys)
        print(' '.join(cells))
    print("\n")

# print_bertscore_descriptions(df11, "df11")
# print_bertscore_descriptions(df12, "df12")
# print_bertscore_descriptions(df21, "df21")
# print_bertscore_descriptions(df22, "df22")