# G-Eval result analysis

## Load G-Eval results

In [None]:
import glob
import json

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_theme()


# Model label -> path of that model's metrics file. Each file is read below as
# JSON Lines: one JSON object per line.
metrics_files = {
    'llama3-8b': 'data/ft-v4.1/eval4/llama3-8b-0shot_metrics.json',
    'llama3-70b': 'data/ft-v4.1/eval4/llama3-70b-0shot_metrics.json',
    'llama3_8b-ft': 'data/ft-v4.1/eval4/llama3_v4.1_10p_lora_a256r512_metrics.json',
    # 'llama3_8b-20p-lora': 'data/ft-v4.1/eval4/llama3_v4.1_20p_lora_a256r512_metrics.json',
    'llama3-8b-5shot': 'data/ft-v4.1/eval4/llama3-8b-5shot_metrics.json',
    'llama3-70b-5shot': 'data/ft-v4.1/eval4/llama3-70b-5shot_metrics.json',
    'gpt3.5-0shot': 'data/ft-v4.1/eval4/gpt-3.5-0shot_metrics.json',
    'gpt3.5-5shot': 'data/ft-v4.1/eval4/gpt-3.5-5shot_metrics.json',
    # 'mistral-7b-0shot': 'data/ft-v4.1/eval4/mistral-7b-0shot_metrics.json',
    # 'mistral-7b-20p-lora': 'data/ft-v4.1/eval4/mistral7b_v4.1_20p_lora_a256r512_metrics.json',
}

# Long-format rows: one dict per (example, metric) pair, tagged with the model
# that produced the response. Turned into a DataFrame by the plotting cells.
df_list = []

for model_name, path in metrics_files.items():
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            ex = json.loads(line)

            for metric_name, metric_score in ex['metrics'].items():
                df_list.append({
                    'metric': metric_name,
                    'score': metric_score,
                    'model': model_name,
                    'query': ex['query'],
                    'response': ex['response'],
                    'split': ex['split'],
                })


# Notebook display: total number of (example, metric) rows loaded.
len(df_list)

## compare metrics

In [None]:
# Box plot of helpfulness metrics on the 'instruction' split, grouped by model.
# Relies on `plt` (matplotlib.pyplot) being imported in the notebook's import cell.
df = pd.DataFrame(df_list)

# Keep 'instruction' rows whose metric name contains the literal text
# 'helpfulness' (regex=False: plain substring match, no pattern parsing).
df = df[(df['split'] == 'instruction') & df['metric'].str.contains('helpfulness', regex=False)]
print(len(df))
sns.boxplot(data=df, x='metric', y='score', hue='model')
plt.xticks(rotation=90)
plt.title("G-Eval helpfulness score of different models")
plt.show()

In [None]:
# Box plot of safety metrics on the 'non_compliant' split, grouped by model.
# Relies on `plt` (matplotlib.pyplot) being imported in the notebook's import cell.
df = pd.DataFrame(df_list)

# Keep 'non_compliant' rows whose metric name contains the literal text
# 'safety' (regex=False: plain substring match, no pattern parsing).
df = df[(df['split'] == 'non_compliant') & df['metric'].str.contains('safety', regex=False)]
print(len(df))
sns.boxplot(data=df, x='metric', y='score', hue='model')
plt.title("G-Eval safety score of different models")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Box plot of helpfulness metrics on the 'non_compliant' split, grouped by model.
# Relies on `plt` (matplotlib.pyplot) being imported in the notebook's import cell.
df = pd.DataFrame(df_list)

# Keep 'non_compliant' rows whose metric name contains the literal text
# 'helpfulness' (regex=False: plain substring match, no pattern parsing).
df = df[(df['split'] == 'non_compliant') & df['metric'].str.contains('helpfulness', regex=False)]
print(len(df))
sns.boxplot(data=df, x='metric', y='score', hue='model')
plt.xticks(rotation=90)
# Titled like the other two box plots so the figures can be told apart.
plt.title("G-Eval helpfulness score of different models (non_compliant split)")
plt.show()

## Head-to-head comparison

In [None]:
import json
import itertools
import matplotlib.pyplot as plt
import numpy as np

# Apply seaborn's default theme to the plots drawn below.
sns.set_theme()


def get_head_to_head_scores(metrics_files, metric_name='helpfulness_with_ref (GEval)', only_keep_splits=None, th=0.01):
    """Count pairwise wins, losses and ties between models on one metric.

    Every file in ``metrics_files`` is read as JSON Lines. Records are paired
    across models by their position in the (split-filtered) file, so the files
    are expected to list the same examples in the same order.

    Args:
        metrics_files: Mapping of model name to the path of its metrics file.
        metric_name: Key looked up in each record's ``'metrics'`` dict.
        only_keep_splits: Optional collection of split names; records whose
            ``'split'`` value is not in it are dropped. ``None`` keeps all.
        th: Two scores whose absolute difference is strictly below this value
            count as a tie.

    Returns:
        Dict keyed by ``(model1, model2)`` for every unordered pair of models
        (in ``metrics_files`` order), each value being
        ``{model1: wins, model2: wins, 'tie': ties}``. Examples where either
        score is ``None`` are not counted. An empty ``metrics_files`` yields
        an empty dict.

    Raises:
        KeyError: If a compared record has no ``metric_name`` entry, or a
            record has no ``'split'`` key while ``only_keep_splits`` is given.
    """
    import warnings

    model_metrics = {}
    for model_name, path in metrics_files.items():
        records = []
        # Explicit encoding so parsing does not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                ex = json.loads(line)
                if only_keep_splits is not None and ex['split'] not in only_keep_splits:
                    continue
                records.append(ex)
        model_metrics[model_name] = records

    # Records are matched by index, so only the prefix present in every file
    # can be compared. Indexing past the end of a shorter file would raise.
    counts = {model: len(records) for model, records in model_metrics.items()}
    total_examples = min(counts.values(), default=0)
    if len(set(counts.values())) > 1:
        warnings.warn(
            f"metrics files hold different numbers of examples {counts}; "
            f"comparing only the first {total_examples} of each"
        )

    pairwise_results = {}

    for model1, model2 in itertools.combinations(model_metrics, 2):
        tally = {model1: 0, model2: 0, 'tie': 0}
        paired = zip(model_metrics[model1][:total_examples], model_metrics[model2][:total_examples])
        for ex1, ex2 in paired:
            model1_score = ex1['metrics'][metric_name]
            model2_score = ex2['metrics'][metric_name]

            # A missing score on either side makes the example incomparable.
            if model1_score is None or model2_score is None:
                continue

            if abs(model1_score - model2_score) < th:
                tally['tie'] += 1
            elif model1_score > model2_score:
                tally[model1] += 1
            elif model2_score > model1_score:
                tally[model2] += 1
        pairwise_results[(model1, model2)] = tally

    return pairwise_results


def plot_pairwise_head_to_head_scores(metric_files, metric_name='helpfulness_with_ref (GEval)', only_keep_splits=None, threshold=0.1):
    """Draw a heat map of pairwise win rates between models for one metric.

    Cell ``(row, col)`` shows the fraction of jointly scored examples on which
    the row model beat the column model; ties count towards the denominator
    but towards neither model. The diagonal is left at 0.

    Args:
        metric_files: Mapping of model name to the path of its metrics file;
            passed through to ``get_head_to_head_scores``.
        metric_name: Metric to compare on; also shown in the plot title.
        only_keep_splits: Optional collection of split names to restrict the
            comparison to. ``None`` keeps every record.
        threshold: Tie threshold forwarded as ``th``. Note that the default
            here (0.1) differs from ``get_head_to_head_scores``' own default.

    Returns:
        None. Prints the number of compared samples per pair and shows the
        figure.
    """
    pairwise_scores = get_head_to_head_scores(metric_files, metric_name=metric_name, only_keep_splits=only_keep_splits, th=threshold)

    # Prepare data for plotting. Use the mapping that was passed in rather
    # than the notebook-level `metrics_files` global.
    models = list(metric_files.keys())
    num_models = len(models)
    win_matrix = np.zeros((num_models, num_models))

    model_index = {model: idx for idx, model in enumerate(models)}

    for (model1, model2), scores in pairwise_scores.items():
        total = scores[model1] + scores[model2] + scores['tie']
        print(f"total comparison samples for {(model1, model2)}: {total}")
        row, col = model_index[model1], model_index[model2]
        if total == 0:
            # No example had a score for both models: the rate is undefined,
            # so mark it as NaN instead of dividing by zero.
            win_matrix[row, col] = np.nan
            win_matrix[col, row] = np.nan
            continue
        win_matrix[row, col] = scores[model1] / total
        win_matrix[col, row] = scores[model2] / total

    # Plotting the pairwise win rate matrix
    fig, ax = plt.subplots()
    cax = ax.matshow(win_matrix, cmap='RdYlGn')

    # Set up axes
    ax.set_xticks(np.arange(num_models))
    ax.set_yticks(np.arange(num_models))
    ax.set_xticklabels(models, rotation=90)
    ax.set_yticklabels(models)

    # Display the win rates
    for i in range(num_models):
        for j in range(num_models):
            ax.text(j, i, f'{win_matrix[i, j]:.2f}', ha='center', va='center', color='black')

    # Color bar
    plt.colorbar(cax)

    plt.xlabel('Model')
    plt.ylabel('Model')
    plt.title(f'Pairwise Win Rate According to {metric_name}')
    plt.show()


# One heat map per (metric, split) combination. The third element holds extra
# keyword arguments for that call.
# NOTE(review): the last entry passes no threshold, so it uses the function's
# default of 0.1 rather than 0.01 like the others - confirm this is intended.
for _metric, _split, _extra in (
    ('helpfulness_with_ref (GEval)', 'instruction', {'threshold': 0.01}),
    ('helpfulness_without_ref (GEval)', 'instruction', {'threshold': 0.01}),
    ('safety_with_ref (GEval)', 'non_compliant', {'threshold': 0.01}),
    ('safety_without_ref (GEval)', 'non_compliant', {'threshold': 0.01}),
    ('helpfulness_without_ref (GEval)', 'non_compliant', {}),
):
    plot_pairwise_head_to_head_scores(metrics_files, metric_name=_metric, only_keep_splits=[_split], **_extra)
