### Changes in version 12.2:
- The final evaluation prompt has been modified and improved.
- The code has been split into two separate files for ease of use by the user.
- The final outputs that will be displayed to the user have been adjusted.


In [1]:
import json
import pandas as pd
from custom_metric_evaluator import TestCaseParams, Eval_metric, CustomLLMEvaluator 

## Load test cases and metrics info from JSON file

In [2]:

def load_data(file_path):
    """Read test cases and metric definitions from a JSON file.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    skipped when present.

    Args:
        file_path: Path of the JSON file. Its top-level object may provide
            ``"Actual_Responses"`` and ``"Context"`` (each falls back to an
            empty list when absent) and must provide ``"metrics"``.

    Returns:
        A ``(TestCases_df, metrics_df)`` tuple. The first DataFrame has the
        columns ``query``, ``context``, ``expected_output`` and
        ``actual_output``; the second is ``data["metrics"]`` flattened with
        ``pandas.json_normalize``.

    Raises:
        KeyError: If the JSON object has no ``"metrics"`` key.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as source:
        payload = json.load(source)

    # The file supplies only the context and the actual responses; 'query'
    # and 'expected_output' are filled with empty strings.
    test_case_columns = {
        'query': "",
        'context': payload.get("Context", []),
        'expected_output': "",
        'actual_output': payload.get("Actual_Responses", []),
    }
    test_cases = pd.DataFrame(test_case_columns)

    # Nested metric fields become dotted column names.
    metrics = pd.json_normalize(payload['metrics'])

    return test_cases, metrics

## Custom LLM Evaluator

In [3]:
def evaluate_test_cases(TestCases_df, metrics_df):
    """Evaluate the test cases against every metric and return one wide table.

    For each row of ``metrics_df`` an ``Eval_metric`` is built and passed,
    together with ``TestCases_df``, to ``CustomLLMEvaluator.custom_Eval``.
    The per-metric results are concatenated and pivoted so that each test
    case occupies one row and each metric contributes its own set of columns.

    Args:
        TestCases_df: Test cases, handed unchanged to ``custom_Eval``.
        metrics_df: One row per metric. The columns ``Metric_Name``,
            ``Criteria``, ``Evaluation_Steps``,
            ``Correct_Eval_steps_example``, ``Incorrect_Eval_steps_example``
            and ``Uses_Context`` are read from each row.

    Returns:
        A DataFrame with the columns ``query``, ``context``,
        ``expected_output`` and ``actual_output`` followed by one
        ``"<metric_name>_<field>"`` column per metric and result field
        (``scores``, ``final_score``, ``reasoning``, ``evaluation_result``,
        ``improved_steps_response``).

    Raises:
        ValueError: If ``metrics_df`` has no rows, so there is nothing to
            evaluate.
    """
    custom_llm_evaluator = CustomLLMEvaluator()

    per_metric_results = []
    for _, row in metrics_df.iterrows():
        # The actual output is always evaluated; the context is added only
        # for metrics flagged with 'Uses_Context'.
        evaluation_params = [TestCaseParams.ACTUAL_OUTPUT]
        if row["Uses_Context"]:
            evaluation_params.append(TestCaseParams.CONTEXT)

        metric = Eval_metric(
            metric_name=row['Metric_Name'],
            criteria=row['Criteria'],
            eval_steps=row['Evaluation_Steps'],
            eval_steps_correct_example=row['Correct_Eval_steps_example'],
            eval_steps_incorrect_example=row['Incorrect_Eval_steps_example'],
            uses_context=row['Uses_Context'],
            evaluation_params=evaluation_params
        )

        per_metric_results.append(
            custom_llm_evaluator.custom_Eval(TestCases=TestCases_df, metric=metric)
        )

    # pd.concat([]) would raise an opaque "No objects to concatenate";
    # fail with the same exception type but a message naming the cause.
    if not per_metric_results:
        raise ValueError("metrics_df is empty: there are no metrics to evaluate")

    # NOTE(review): custom_Eval is expected to return a DataFrame holding the
    # base columns, 'metric_name' and the result fields below -- confirm
    # against custom_metric_evaluator.
    all_results = pd.concat(per_metric_results, ignore_index=True)

    # Columns identifying a test case; they become the pivot index.
    base_columns = ["query", "context", "expected_output", "actual_output"]

    # Per-metric result fields, listed in the order they appear in the output.
    result_fields = ['scores', 'final_score', 'reasoning', 'evaluation_result', 'improved_steps_response']

    # One row per test case, one (field, metric_name) column per metric.
    metrics_result_df = all_results.pivot_table(
        index=base_columns,
        columns='metric_name',
        values=result_fields,
        aggfunc='first',
    )

    # Selecting by the top column level enforces the field order above.
    metrics_result_df = metrics_result_df[result_fields]

    # Flatten the (field, metric_name) MultiIndex into "<metric_name>_<field>".
    metrics_result_df.columns = [
        f"{metric_name}_{field}"
        for field, metric_name in metrics_result_df.columns
    ]

    return metrics_result_df.reset_index()


In [4]:
# JSON file that provides the test cases and the metric definitions.
file_path = 'RFP_correctClarificationGeneration.json'
# Alternative input file, kept here for quick switching:
#file_path = 'GEval_Context_InvalidDataGen_me_test.json'
# load_data returns the test-case table and the normalized metrics table.
TestCases_df , metrics_df = load_data(file_path)

In [None]:
# Display the test-case table returned by load_data.
TestCases_df

In [None]:
# Display the metrics table returned by load_data.
metrics_df

In [None]:
# Evaluate the test cases against every metric, then display the result table.
result = evaluate_test_cases(TestCases_df , metrics_df)
result

In [8]:
# Save the evaluation results to an Excel workbook in the working directory.
result.to_excel("custom_Eval_result_v2.xlsx")