In [None]:
!pip install sentence-transformers rouge transformers evaluate bert_score

In [2]:
import pandas as pd
import os
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
from sentence_transformers import CrossEncoder
from evaluate import load

In [3]:
# Lift pandas' display limits so every column and the full text of each cell
# are shown when a frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [None]:
# Fetch the 'wordnet' resource through NLTK's downloader.
# NOTE(review): presumably required by meteor_score (imported above); no visible
# cell calls meteor_score, so confirm whether this download is still needed.
nltk.download('wordnet')

In [5]:
# Define root_path and data_path here (data_path must be set before the loading cells below run)

In [None]:
# Vector
#
# Load the per-batch vector-retrieval result CSVs from `data_path` and stack them
# into `vector_df`. `data_path` must already be defined by an earlier cell.
#
# NOTE(review): `file_name` is a fixed placeholder that never uses the loop index
# `i`, so every iteration resolves to the same path and the same file would be
# loaded repeatedly. Restore the real per-index naming pattern before trusting
# the row counts.

# Accumulates one DataFrame per file that was found on disk.
dfs = []

for i in range(1, 11):  # files are expected to be numbered 1..10
    file_name = 'file_name'
    file_path = os.path.join(data_path, file_name)

    # Missing files are reported and skipped rather than aborting the whole load.
    if not os.path.exists(file_path):
        print(f'File not found: {file_name}')
        continue

    df = pd.read_csv(file_path)
    dfs.append(df)

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what is wrong instead.
if not dfs:
    raise FileNotFoundError(f'No vector result files were found under {data_path!r}')

# One combined frame with a fresh 0..n-1 index.
vector_df = pd.concat(dfs, ignore_index=True)

print(vector_df.shape)

In [7]:
# Display the column labels of the combined vector results.
vector_df.columns

Index(['id', 'question', 'teacher_answer', 'response_vector_no_guidance',
       'context_vector_no_guidance', 'response_vector_low_guidance',
       'context_vector_low_guidance', 'response_vector_high_guidance',
       'context_vector_high_guidance'],
      dtype='object')

In [None]:
# Count the vector rows that are fully populated once the reference answer is
# ignored and the literal text 'None' is treated as a missing value.
vector_df_without_column = vector_df.drop('teacher_answer', axis=1)
vector_df_without_column_replaced = vector_df_without_column.replace(to_replace='None', value=pd.NA)
vector_df_without_column_replaced.dropna().shape[0]

In [None]:
# Graph
#
# Load the per-batch graph-retrieval result CSVs from `data_path` and stack them
# into `graph_df`. `data_path` must already be defined by an earlier cell.
# Batches are numbered 1..5; batch 4 is stored as three sub-files (4_1 .. 4_3).
#
# NOTE(review): the file names are fixed placeholders that use neither `i` nor
# `j`, so every lookup resolves to the same path. Restore the real naming
# patterns (one for the standard files, one for the 4_x sub-files) before
# trusting the row counts.

# Accumulates one DataFrame per file that was found on disk.
dfs = []

for i in range(1, 6):  # Files are numbered from 1 to 5
    if i != 4:
        # Standard case: a single file for this batch.
        batch_file_names = ['file_name']
    else:
        # Special case: sub-files 4_1 to 4_3.
        batch_file_names = ['file_name' for j in range(1, 4)]

    for file_name in batch_file_names:
        file_path = os.path.join(data_path, file_name)

        # Missing files are reported and skipped rather than aborting the whole load.
        if not os.path.exists(file_path):
            print(f'File not found: {file_name}')
            continue

        df = pd.read_csv(file_path)
        dfs.append(df)

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what is wrong instead.
if not dfs:
    raise FileNotFoundError(f'No graph result files were found under {data_path!r}')

# One combined frame with a fresh 0..n-1 index.
graph_df = pd.concat(dfs, ignore_index=True)

print(graph_df.shape)

In [10]:
# Display the column labels of the combined graph results.
graph_df.columns

Index(['id', 'question', 'teacher_answer', 'response_graph_no_guidance',
       'context_graph_no_guidance', 'response_graph_low_guidance',
       'context_graph_low_guidance', 'response_graph_high_guidance',
       'context_graph_high_guidance'],
      dtype='object')

In [None]:
# Count the graph rows that are fully populated once the reference answer is
# ignored and the literal text 'None' is treated as a missing value.
graph_df_without_column = graph_df.drop('teacher_answer', axis=1)
graph_df_without_column_replaced = graph_df_without_column.replace(to_replace='None', value=pd.NA)
graph_df_without_column_replaced.dropna().shape[0]

In [None]:
# Tree
#
# Load the per-batch tree-retrieval result CSVs from `data_path` and stack them
# into `tree_df`. `data_path` must already be defined by an earlier cell.
#
# NOTE(review): `file_name` is a fixed placeholder that never uses the loop index
# `i`, so every iteration resolves to the same path and the same file would be
# loaded repeatedly. Restore the real per-index naming pattern before trusting
# the row counts.

# Accumulates one DataFrame per file that was found on disk.
dfs = []

for i in range(1, 11):  # files are expected to be numbered 1..10
    file_name = 'file_name'
    file_path = os.path.join(data_path, file_name)

    # Missing files are reported and skipped rather than aborting the whole load.
    if not os.path.exists(file_path):
        print(f'File not found: {file_name}')
        continue

    df = pd.read_csv(file_path)
    dfs.append(df)

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what is wrong instead.
if not dfs:
    raise FileNotFoundError(f'No tree result files were found under {data_path!r}')

# One combined frame with a fresh 0..n-1 index.
tree_df = pd.concat(dfs, ignore_index=True)

print(tree_df.shape)

In [13]:
# Display the column labels of the combined tree results.
tree_df.columns

Index(['id', 'question', 'teacher_answer', 'response_tree_no_guidance',
       'context_tree_no_guidance', 'response_tree_low_guidance',
       'context_tree_low_guidance', 'response_tree_high_guidance',
       'context_tree_high_guidance'],
      dtype='object')

In [None]:
# Count the tree rows that are fully populated once the reference answer is
# ignored and the literal text 'None' is treated as a missing value.
tree_df_without_column = tree_df.drop('teacher_answer', axis=1)
tree_df_without_column_replaced = tree_df_without_column.replace(to_replace='None', value=pd.NA)
tree_df_without_column_replaced.dropna().shape[0]

In [15]:
def sample_nan_or_none_row(df, random_state=None):
    """
    Randomly sample one row that contains a missing value or the string "None".

    The input DataFrame is only read; its values are never altered.

    Parameters:
    - df: pandas.DataFrame to sample from.
    - random_state: optional seed (or any other value accepted by
      ``DataFrame.sample``) that makes the draw reproducible. The default,
      None, keeps the draw non-deterministic.

    Returns:
    - A one-row DataFrame (keeping the row's original index label) in which at
      least one cell is NaN/NA or equal to the string "None", or None when no
      row in ``df`` qualifies.
    """
    # Cell-wise flags computed on the whole frame at once; a comparison creates
    # a new boolean frame, so `df` itself is left untouched.
    flagged_cells = df.isna() | (df == "None")

    # Keep only rows that have at least one flagged cell.
    candidates = df[flagged_cells.any(axis=1)]

    if candidates.empty:
        return None  # callers must handle the "nothing to sample" case

    return candidates.sample(n=1, random_state=random_state)

In [None]:
# Combine the three result sets into one wide frame keyed by 'id'.
# vector_df is the left side of both joins, so its rows (and its copies of
# 'question' / 'teacher_answer') are the ones that survive; ids that are
# missing from graph_df or tree_df get NaN in the corresponding columns.

# Step 1: attach the graph columns to the vector frame.
temp_df = vector_df.merge(
    graph_df.drop(columns=['question', 'teacher_answer']),
    how='left',
    on='id',
)

# Step 2: attach the tree columns to the intermediate result.
merged_df_all = temp_df.merge(
    tree_df.drop(columns=['question', 'teacher_answer']),
    how='left',
    on='id',
)
merged_df_all.shape

In [17]:
# Display the column labels of the merged vector + graph + tree frame.
merged_df_all.columns

Index(['id', 'question', 'teacher_answer', 'response_vector_no_guidance',
       'context_vector_no_guidance', 'response_vector_low_guidance',
       'context_vector_low_guidance', 'response_vector_high_guidance',
       'context_vector_high_guidance', 'response_graph_no_guidance',
       'context_graph_no_guidance', 'response_graph_low_guidance',
       'context_graph_low_guidance', 'response_graph_high_guidance',
       'context_graph_high_guidance', 'response_tree_no_guidance',
       'context_tree_no_guidance', 'response_tree_low_guidance',
       'context_tree_low_guidance', 'response_tree_high_guidance',
       'context_tree_high_guidance'],
      dtype='object')

In [18]:
# Cast every column except the join key 'id' to str, in place.
# NOTE(review): on pandas versions where astype(str) stringifies missing values,
# NaN becomes the literal text 'nan' (and pd.NA becomes '<NA>'), so pd.isna()
# no longer detects those cells afterwards — confirm downstream checks account for it.
for col in (label for label in merged_df_all.columns if label != 'id'):
    merged_df_all[col] = merged_df_all[col].astype(str)

In [19]:
# Load the "bertscore" metric via evaluate.load; used below through bert_score.compute(...).
bert_score = load("bertscore")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
# Instantiate the cross-encoder used below via model.predict([[reference, response]]).
model = CrossEncoder('vectara/hallucination_evaluation_model')

In [None]:
# Score every generated response against two targets:
#   * "reference": the row's teacher_answer
#   * "context":   the context retrieved for that same response
# using BLEU (nltk sentence_bleu), BERTScore F1 (bert_score) and the
# cross-encoder score (model).
#
# Result columns are named
#   <metric>_<retriever>_<guidance>_guidance_<reference|context>
# and are generated in the same order as the hand-written list they replace.
#
# Requires from earlier cells: merged_df_all, bert_score, model, sentence_bleu.

retrievers = ('vector', 'graph', 'tree')
guidance_levels = ('no', 'low', 'high')
score_targets = ('reference', 'context')
metric_names = ('bleu_score', 'bert_score_f', 'cross_encoder_score')

metric_columns = [
    f'{metric_name}_{retriever}_{guidance}_guidance_{target}'
    for retriever in retrievers
    for target in score_targets
    for guidance in guidance_levels
    for metric_name in metric_names
]

# Text values that stand for "no value". merged_df_all's text columns were cast
# with astype(str), which can turn NaN/pd.NA into the literal strings 'nan' /
# '<NA>', so pd.isna() alone never fires on them. 'None' is the missing marker
# that the completeness-check cells of this notebook already treat as absent.
_MISSING_TOKENS = frozenset({'', 'nan', 'None', '<NA>'})


def _is_missing(text):
    """Return True when `text` is NaN/NA or one of the textual missing markers."""
    if pd.isna(text):
        return True
    return str(text).strip() in _MISSING_TOKENS


def _score_pair(reference_text, response_text):
    """
    Score one response against one reference text.

    Returns a (bleu, bertscore_f1, cross_encoder_score) tuple, or three NaNs
    when either text is missing (nothing meaningful can be scored then).
    """
    if _is_missing(reference_text) or _is_missing(response_text):
        return (np.nan, np.nan, np.nan)

    reference_text = str(reference_text)
    response_text = str(response_text)

    # BLEU on whitespace tokens, single reference.
    bleu = sentence_bleu([reference_text.split()], response_text.split())

    # BERTScore F1 for the single (prediction, reference) pair.
    bert_result = bert_score.compute(
        predictions=[response_text], references=[reference_text], lang="en"
    )
    bert_f1 = bert_result['f1'][0]

    # Cross-encoder input order is (reference/premise, response).
    cross_encoder = model.predict([[reference_text, response_text]])[0]

    return (bleu, bert_f1, cross_encoder)


metrics_results = {metric: [] for metric in metric_columns}

for i, row in merged_df_all.iterrows():
    if i % 10 == 0:
        print(f'i: {i}')

    teacher_answer = row['teacher_answer']

    for retriever in retrievers:
        for guidance in guidance_levels:
            variant = f'{retriever}_{guidance}_guidance'
            response = row[f'response_{variant}']
            targets = {
                'reference': teacher_answer,
                'context': row[f'context_{variant}'],
            }

            # Every column receives exactly one value per row (a score or NaN),
            # so all result lists stay aligned with merged_df_all's rows.
            for target, target_text in targets.items():
                scores = _score_pair(target_text, response)
                for metric_name, score in zip(metric_names, scores):
                    metrics_results[f'{metric_name}_{variant}_{target}'].append(score)

# Now, create a new DataFrame from metrics_results
calc_df = merged_df_all.copy()
for metric, results in metrics_results.items():
    calc_df[metric] = results

# Save calc_df