# Evaluation 

This notebook contains the code for evaluating the scoring performance and the consistency, for the automatic evaluations of the helpfulness of the feedback, and for comparing these automatic evaluations with the manual annotation study.

#### Imports and General Variables

In [None]:
import json
import multiprocessing
import os
import random

import krippendorff
import numpy as np
import pandas as pd
from datasets import load_from_disk
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from numpy import mean, std
from scipy.stats import kendalltau
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm

from analysis import calculate_similarity_for_cluster_part, create_embedding, compare_texts
from essay_meta_data import essay_set_descriptions
from evaluation_helpers import qwk
from helpfulness_example_dataset import HELPFUL, NOT_HELPFUL, SEMI_HELPFUL
from llm_functions import collect_llm_output
from llm_functions import load_model

# Essay data indexed by essay id; essay 10534 is removed because it is not annotated.
df = pd.read_excel("../data/training_set_rel3.xlsx", index_col="essay_id").drop(10534)

# Make sure the directory for the result tables exists before any cell writes to it.
os.makedirs("../data/results/", exist_ok=True)

## Scoring Performance Evaluation 

For the scoring performance evaluation, the QWK scores for the different prompts have to be evaluated. One experiment can be done via: 
```bash 
python main.py --logging_data_path="./data/qwk/template1_1/one_shot_prompt.json" --prompt=one_shot_prompt --dataset_split=dev --model_size="7b" --model="mistral" --prompt-template=1 --instruction-variant=1 --setting=one-shot --full-rubric
```
This command will generate the QWK scores for the one-shot prompt using the basic template and the original formulation variant. The variable `experiments` then has to be filled with all the experiments that have been conducted.

The result of the following cell is a pandas dataframe that contains the QWK scores for each essay set and the average QWK score over all essay sets. The dataframe also contains the number of incorrect predictions for each essay set.
Using this dataframe, the results can be visualized and analyzed.

In [None]:
experiments = [
    {
        "data_path": "./data/qwk/template1_1/one_shot_prompt.json",
        "prompt": "one_shot_prompt",
        "template": 1,
        "variant": 1,
        "model": "mistral",
    }
]

# NOTE(review): this cell reads and writes below "./data" while the import cell uses "../data";
# confirm which working directory the notebook is meant to run from.
os.makedirs("./data/results/", exist_ok=True)

df_data = []
for i, experiment in enumerate(tqdm(experiments)):
    with open(experiment["data_path"], "r") as f:
        results = json.load(f)
    experiments[i]["results"] = results

    # Build the column name once from stable fields. Formatting the whole experiment dict
    # would yield a different name as soon as the dict is extended below.
    score_column = f"experiment{i}_{experiment['prompt']}_predicted_score"
    df[score_column] = 0

    # Parse the predicted score of every essay; -1 marks an output that could not be scored.
    predicted_ids = []
    for fold_entries in results.values():
        for entry in fold_entries:
            essay_id = entry["id"]
            output = entry["parsed_output"]
            essay_set = int(df.at[essay_id, "essay_set"])
            meta_data = essay_set_descriptions[essay_set - 1]
            try:
                score = meta_data['full_score_fn'](output, output)
            except Exception:
                score = -1  # mark the score as incorrect to filter it out later

            df.at[essay_id, score_column] = score
            predicted_ids.append(essay_id)

    # Count predictions outside the valid score range and mark them with -1.
    # Only essays that were actually predicted are considered, so the default value 0 of
    # unpredicted essays is not mistaken for an out-of-range prediction.
    was_predicted = df.index.isin(predicted_ids)
    wrong_predictions = 0
    for essay_set in range(1, 9):
        score_range = essay_set_descriptions[essay_set - 1]['score_ranges'][0]
        set_scores = df.loc[was_predicted & (df["essay_set"] == essay_set), score_column]
        out_of_range = set_scores[(set_scores < score_range[0]) | (set_scores > score_range[1])]
        df.loc[out_of_range.index, score_column] = -1
        wrong_predictions += len(out_of_range)

    experiments[i]["wrong_predictions"] = wrong_predictions

    # Calculate the QWK score per fold; the fold name ends with the zero-based essay set index.
    qwks = [[] for _ in range(8)]
    for fold, fold_entries in results.items():
        essay_set_index = int(fold.split("_")[-1])
        fold_ids = [entry["id"] for entry in fold_entries]
        set_df = df[df["essay_set"] == essay_set_index + 1]

        # keep only the essays of this fold whose prediction is valid (not marked with -1)
        valid_df = set_df[set_df[score_column] != -1]
        clean_true_scores = valid_df["domain1_score"].filter(items=fold_ids)
        clean_predicted_scores = valid_df[score_column].filter(items=fold_ids)
        qwks[essay_set_index].append(qwk(clean_predicted_scores, clean_true_scores))

    # Average over all folds; an essay set without any fold yields NaN instead of a ZeroDivisionError.
    qwks = [sum(fold_scores) / len(fold_scores) if fold_scores else float("nan") for fold_scores in qwks]
    average_qwk = sum(qwks) / len(qwks)

    # Collect the results (one entry per essay set) for the result table.
    for essay_set in range(1, 9):
        new_df_entry = {f"Essay Set {essay_set}": qwks[essay_set - 1], 'Average': average_qwk,
                        'Incorrect Predictions': wrong_predictions, 'template': experiment['template'],
                        'variant': experiment['variant'], 'prompt': experiment['prompt']}
        df_data.append(new_df_entry)

# create the dataframe and save it to a csv file
df_data = pd.DataFrame(df_data)
df_data.to_csv("./data/results/qwk_table.csv", index=False)

## Consistency Evaluation

The experiments for the consistency evaluation can be conducted via:
```bash
python main.py --logging_data_path="./data/consistency/template${template}_${variant}/${prompt}.json" --prompt=one_shot_prompt --dataset_split=dev --model_size="7b" --model="mistral" --prompt-template=1 --instruction-variant=1 --setting=one-shot --consistency
```

This command will generate the model outputs for the consistency evaluation for the one-shot prompt using the basic template and the original formulation variant. The variable `experiments` then has to be filled with all the experiments that have been conducted.

The result of the following cell is a pandas dataframe that contains the MSE, MAE and cosine similarity for each essay set and the average over all essay sets. Using this dataframe, the results can be visualized and analyzed.

In [None]:
experiments = [
    {
        'data_path': "./data/consistency/template1_1/one_shot_prompt.json",
        'prompt': "one_shot_prompt",
        'template': 1,
        'variant': 1,
    },
]

# one result row per experiment; kept separate from the JSON loaded for each experiment
data = []
# width of the valid score range of every essay set, used to normalize the errors
score_ranges = [essay_set_descriptions[i]['score_ranges'][0][1] - essay_set_descriptions[i]['score_ranges'][0][0] for i
                in range(8)]
embedding_model = create_embedding()

for experiment in tqdm(experiments):
    with open(experiment["data_path"], "r") as f:
        experiment_data = json.load(f)

    # group the essays by the essay_id that they were translated from using the backtranslation method
    groups = {}
    essay_to_essay_set = {}
    for key, entries in experiment_data.items():
        for essay in entries:
            groups.setdefault(essay['essay_id'], []).append(essay)
            essay_to_essay_set.setdefault(essay['essay_id'], int(key))

    # score every group; a group is dropped as a whole if one of its outputs cannot be scored
    score_sets = [[] for _ in range(8)]
    skipped_groups = 0
    for essay_id, group in groups.items():
        essay_set = essay_to_essay_set[essay_id]
        scoring_function = essay_set_descriptions[essay_set - 1]['full_score_fn']
        try:
            scores = [scoring_function(essay['parsed_output'], essay['parsed_output']) for essay in group]
        except Exception:
            skipped_groups += 1
            continue
        score_sets[essay_set - 1].append(scores)
    if skipped_groups:
        print(f"{experiment['prompt']}: skipped {skipped_groups} essay groups with unparsable outputs")

    # calculate the errors for each score set with the majority vote as the predicted score
    y = [[] for _ in range(8)]
    y_hat = [[] for _ in range(8)]
    for set_index, score_set in enumerate(score_sets):
        for scores in score_set:
            y[set_index].extend(scores)
            y_hat[set_index].extend([max(set(scores), key=scores.count)] * len(scores))

    # The "MSE" value is the root of the mean squared error divided by the score range.
    # Essay sets without any scored group yield NaN instead of crashing the metric functions.
    mse_normalized = [
        mean_squared_error(y[k], y_hat[k]) ** 0.5 / score_ranges[k] if y[k] else float("nan")
        for k in range(8)
    ]
    mae_normalized = [
        mean_absolute_error(y[k], y_hat[k]) / score_ranges[k] if y[k] else float("nan")
        for k in range(8)
    ]

    average_mse_normalized = np.nanmean(mse_normalized)
    average_mae_normalized = np.nanmean(mae_normalized)

    # calculate the cosine similarity
    # embed all feedback texts first for a faster later comparison
    clusters = {}
    for key, entries in experiment_data.items():
        embeddings = embedding_model.embed_documents([entry['output'] for entry in entries])
        for entry, embedding in zip(entries, embeddings):
            clusters.setdefault(entry['essay_id'], []).append(embedding)

    # compare all combinations of feedback texts within each cluster
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        cluster_similarities = list(
            tqdm(pool.imap(calculate_similarity_for_cluster_part, clusters.items()), total=len(clusters)))

    average = np.mean([entry['average_cosine_similarity'] for entry in cluster_similarities])

    data.append({
        'template': experiment['template'],
        'variant': experiment['variant'],
        'prompt': experiment['prompt'],
        'average_mse_normalized': str(round(average_mse_normalized, 3)),
        'average_mae_normalized': str(round(average_mae_normalized, 3)),
        'average_cosine_similarity': str(round(average, 3)),
        **{f"MSE ES {k + 1}": str(round(mse_normalized[k], 3)) for k in range(8)},
        **{f"MAE ES {k + 1}": str(round(mae_normalized[k], 3)) for k in range(8)},
    })

df = pd.DataFrame(data)

# save the df as a csv
df.to_csv("../data/results/interconsistency.csv", index=False)

#### Reference Points for Cosine Similarity 

We calculated two reference points for the cosine similarity to make the values more comparable. 

In [None]:
data_path = ''  # add correct path to the helpfulness results file (.json)
number_of_samples = 30

with open(data_path, "r") as f:
    data = json.load(f)

# Draw random feedback entries: first pick a fold, then an entry of that fold.
samples = []
for _ in range(number_of_samples):
    fold = random.choice(list(data.keys()))
    samples.append(random.choice(data[fold]))

template = """
{model_prefix}
Here is a text that needs to be reformulated. Don't change its meaning, but try to express it differently. 
'''
{feedback}
'''
{model_suffix}
"""

prompt_template = PromptTemplate.from_template(template)


class Config:
    """Minimal configuration object handed to `load_model`."""

    def __init__(self):
        self.model, self.temperature, self.max_length = 'mistral', 0, 4096


config = Config()

llm, prompt_template = load_model(config, prompt_template)

# Let the model reformulate every sampled feedback text.
chain = prompt_template | llm
for sample in samples:
    sample['reformulated_output'] = chain.invoke({'feedback': sample['output']})


embeddings_model = create_embedding()

# Reference point 1: similarity between a feedback text and its reformulation.
similarities_reformulated = [
    compare_texts(sample['output'], sample['reformulated_output'], embeddings_model)
    for sample in samples
]

# Reference point 2: similarity between every pair of distinct sampled feedback texts.
similarities_original = [
    compare_texts(first['output'], second['output'], embeddings_model)
    for position, first in enumerate(samples)
    for second in samples[position + 1:]
]

mean_reformulated = np.mean([sim['cosine_similarity'] for sim in similarities_reformulated])
mean_random = np.mean([sim['cosine_similarity'] for sim in similarities_original])
print(f"Reference point reformulated feedback: {mean_reformulated}")
print(f"Reference point random feedback: {mean_random}")

## Helpfulness Evaluation

### Helpfulness Score Evaluation

The experiments for the helpfulness score evaluation can be conducted via:
```bash
python helpfulness_evaluation.py --logging_data_path="./data/helpfulness/template1_1_mistral/one_shot_prompt.json" --input_data="./data/qwk/template1_1/one_shot_prompt.json" --prompt=one_shot_prompt --model_size="7b" --model='mistral'
```
This command will generate the helpfulness scores for the one-shot prompt using the basic template and the original formulation variant. The variable `experiments` then has to be filled with all the experiments that have been conducted.

The result of the following cell is a pandas dataframe that contains the helpfulness scores. Using this dataframe, the results can be visualized and analyzed.

In [None]:
experiments = [
    {
        'data_path': "./data/helpfulness/template1_1_mistral/one_shot_prompt.json",
        'prompt': "one_shot_prompt",
        'template': 1,
        'variant': 1,
        'model': 'mistral'
    },
]

# one result row per experiment; kept separate from the entries loaded for each experiment
data = []
for experiment in tqdm(experiments):
    with open(experiment["data_path"], "r") as f:
        entries = json.load(f)

    # Collect the usable helpfulness scores: skip entries without parsed output, with a
    # missing / nested / non-numeric score, or with a score outside the range 0..10.
    helpfulness_scores = []
    for entry in entries:
        parsed_output = entry['parsed_output']
        raw_score = parsed_output.get('score') if isinstance(parsed_output, dict) else None
        if raw_score is None or isinstance(raw_score, (dict, list)):
            continue
        try:
            score = int(raw_score)
        except (TypeError, ValueError):
            continue
        if 0 <= score <= 10:
            helpfulness_scores.append(score)

    mu = round(mean(helpfulness_scores), 2)
    sigma = round(std(helpfulness_scores), 2)

    data.append({
        "template": experiment['template'],
        "variant": experiment['variant'],
        "prompt": experiment['prompt'],
        "model": experiment['model'],
        'Help Mean': mu,
        'Help Std': sigma,
    })

# best experiments first; sort_values returns a new frame, so the result has to be kept
df = pd.DataFrame(data).sort_values('Help Mean', ascending=False)

# save the df to a csv
os.makedirs("./data/results/", exist_ok=True)
df.to_csv("./data/results/helpfulness.csv", index=False)

#### Verification of the Approach

The approach was verified using the helpfulness verification dataset (`helpfulness_example_dataset.py`). The following cell contains the code for the verification of the approach.
The helpfulness scores must be generated beforehand by running the following command once for each model:
```bash
python helpfulness_evaluation.py --logging_data_path="./data/helpfulness/reference_mistral.json" --model_size="7b" --model=mistral --reference_dataset
```

In [None]:
with open("./data/helpfulness/reference_mistral.json", "r") as mistral_file:
    mistral_data = json.load(mistral_file)

with open("./data/helpfulness/reference_llama.json", "r") as llama_file:
    llama_data = json.load(llama_file)

# position of every reference label in the grading lists below
mapping = {
    NOT_HELPFUL: 0,
    SEMI_HELPFUL: 1,
    HELPFUL: 2
}
llama_gradings = [[], [], []]
mistral_gradings = [[], [], []]

# NOTE(review): `samples` must hold the labelled reference entries ('feedback' and 'label' keys);
# the `samples` list built in the cosine reference cell has a different structure - confirm its origin.
for llama, mistral, reference in zip(llama_data, mistral_data, samples):
    assert llama['feedback'] == mistral['feedback'] == reference['feedback']
    bucket = mapping[reference['label']]
    llama_gradings[bucket].append(llama['parsed_output']['score'])
    mistral_gradings[bucket].append(mistral['parsed_output']['score'])
    print(llama['parsed_output']['score'], "\t", mistral['parsed_output']['score'], "\t", reference['label'])

# Flatten the gradings into one record per (label, model, score).
labels = ['not-helpful', 'semi-helpful', 'helpful']
data = []
for label, llama_scores, mistral_scores in zip(labels, llama_gradings, mistral_gradings):
    data.extend({'label': label, 'score': value, 'model': 'llama'} for value in llama_scores)
    data.extend({'label': label, 'score': value, 'model': 'mistral'} for value in mistral_scores)
df = pd.DataFrame(data)

# Mean and standard deviation of the scores for every label/model combination.
df = df.groupby(['label', 'model']).agg(['mean', 'std']).reset_index()
print(df)

### Pairwise Comparison Evaluation

The experiment for the pairwise comparison evaluation can be conducted via:
```bash
python elo_ranking.py --logging_data_path="./data/elo/matches" --model_size="7b" --model="mistral"
```
This command will use the dataset in `./data/datasets/elo` to make all pairwise comparisons between the feedback texts in the dataset. 

The result of the following cell is a pandas dataframe that contains the wins and losses for each prompt that is contained in the dataset. Using this dataframe, the results can be visualized and analyzed.

In [None]:
dataset = load_from_disk("../data/datasets/elo_ranking")

with open("./data/elo/matches.json", "r") as f:
    results = json.load(f)

# experiment name of every dataset entry, looked up once instead of on every match
experiment_of = [entry['experiment'] for entry in dataset]
all_experiments = set(experiment_of)
# fixed order that is used for the rows as well as for the "wins_against_*" columns
experiment_order = sorted(all_experiments)

wins = {exp: 0 for exp in all_experiments}
losses = {exp: 0 for exp in all_experiments}
wins_against = {
    exp: {exp2: 0 for exp2 in all_experiments}
    for exp in all_experiments
}

for match in tqdm(results):
    # skip if the LLM did not produce a parsed output
    if match['parsed_output'] is None:
        continue
    preferred = match['parsed_output']['preferred_feedback']
    if preferred == 1:
        winner, loser = experiment_of[match['index1']], experiment_of[match['index2']]
    elif preferred == 2:
        winner, loser = experiment_of[match['index2']], experiment_of[match['index1']]
    else:
        continue  # neither feedback was chosen
    wins[winner] += 1
    losses[loser] += 1
    wins_against[winner][loser] += 1

# create the dataframe: one row per experiment, one "wins_against_<opponent>" column per opponent
data = {
    "experiment": [],
    "wins": [],
    "losses": [],
}

data.update({
    f"wins_against_{exp}": []
    for exp in experiment_order
})

for exp in experiment_order:
    data['experiment'].append(exp)
    data['wins'].append(wins[exp])
    data['losses'].append(losses[exp])
    for opponent in experiment_order:
        # the value belongs to the row of `exp` and the column of the opponent
        data[f"wins_against_{opponent}"].append(wins_against[exp][opponent])

df = pd.DataFrame(data)

# save the dataframe
os.makedirs("./data/results/", exist_ok=True)
df.to_csv("./data/results/elo_ranking.csv", index=False)

#### Verification of the Approach

The approach was verified using a reference dataset (`helpfulness_example_dataset.py`). The following cell contains the code for the verification of the approach.

In [None]:
template = """
{model_prefix}
# Task 
You are given now given two feedbacks and should decide which one is better. If the first feedback is better, type 1. If the second feedback is better, type 2. You have to decide for one feedback based on the following criteria:
A good feedback is a feedback that indicates what are the errors, why the errors are errors and makes concrete suggestions on how to fix them. 

#### Feedback 1
```{feedback1}```

#### Feedback 2
```{feedback2}```

#### Which feedback is better?
Type 1 if the first feedback is better, type 2 if the second feedback is better.

{format_instructions}

{model_suffix}
"""

prompt_template = PromptTemplate(template=template, input_variables=["model_prefix", "model_suffix", "feedback1",
                                                                     "feedback2", "format_instructions"])


class Config:
    """Minimal configuration object handed to `load_model`."""

    def __init__(self):
        self.model, self.max_length, self.temperature = 'mistral', 4096, 0


config = Config()
llm, prompt_template = load_model(config, prompt_template)
output_parser = StructuredOutputParser.from_response_schemas(response_schemas=[
    ResponseSchema(name="preferred_feedback", description="the preferred feedback (1 for feedback 1, 2 for feedback 2",
                   type="int")
])

# Every unordered pair of reference feedback texts is compared exactly once.
all_pairs = [
    {"feedback1": samples[first]['feedback'], "feedback2": samples[second]['feedback'], "i": first, "j": second}
    for first in range(len(samples))
    for second in range(first + 1, len(samples))
]
results = collect_llm_output(llm, all_pairs, output_parser, prompt_template)

# Tally wins, losses and head-to-head wins for every helpfulness category.
categories = ("helpful", "semi-helpful", "not-helpful")
wins = dict.fromkeys(categories, 0)
losses = dict.fromkeys(categories, 0)
wins_against = {category: dict.fromkeys(categories, 0) for category in categories}

for result in results:
    parsed = result['parsed_output']
    # skip if the LLM did not produce a parsed output or did not pick one of the two texts
    if parsed is None:
        continue
    preferred = parsed['preferred_feedback']
    if preferred not in (1, 2):
        continue
    winner, loser = (result['i'], result['j']) if preferred == 1 else (result['j'], result['i'])

    winner_label = samples[winner]['label']
    wins[winner_label] += 1
    loser_label = samples[loser]['label']
    losses[loser_label] += 1
    wins_against[winner_label][loser_label] += 1

# One row per category, one "wins_against_*" column per opposing category.
data = {
    "label": list(wins),
    "wins": [wins[label] for label in wins],
    "losses": [losses[label] for label in wins],
    "wins_against_helpful": [wins_against[label]["helpful"] for label in wins],
    "wins_against_semi_helpful": [wins_against[label]["semi-helpful"] for label in wins],
    "wins_against_not_helpful": [wins_against[label]["not-helpful"] for label in wins],
}

df = pd.DataFrame(data)

print(df)

### Manual Annotation Study

For the MAS, four things were done: selecting the feedback texts to include, calculating the IAA, calculating the agreement with the automatic evaluations, and calculating the averages for each prompt. 

#### Select Feedback Texts

In [None]:
df = pd.read_excel('./data/training_set_rel3.xlsx', index_col='essay_id')

dataset = load_from_disk('./data/datasets/consistency')

# Ids of all essays in the dataset that belong to essay set 4.
essay_ids_from_essay_set_4 = {entry['essay_id'] for entry in dataset if entry['essay_set'] == 4}

# Sort the essay ids into one bucket per grade (bucket index = grade).
buckets = [[], [], [], []]
for essay_id in essay_ids_from_essay_set_4:
    buckets[int(df.loc[essay_id]['domain1_score'])].append(essay_id)

random.seed(42)

# Draw two essays at random from each bucket for a total of 8 essays.
selected_essays = [essay_id for bucket in buckets for essay_id in random.sample(bucket, 2)]

print(selected_essays)

#### Preparing the data 

First, the data has to be prepared to get it into a usable format. We used a pandas dataframe here.

In [None]:
df_ref = pd.read_csv('../data/mas_results.csv')

# per annotator token: the (prompt, essay) combinations that this annotator has graded
graded_combinations = {token: set() for token in df_ref['token']}
annotators = list(graded_combinations.keys())

# only the columns whose name contains "SQ" are read as answers
answer_columns = [column for column in df_ref.columns if "SQ" in column]

values = []
for _, row in df_ref.iterrows():
    token = row['token']
    for column in answer_columns:
        answer = row[column]
        # unanswered questions are not stored as strings and are skipped
        if not isinstance(answer, str):
            continue

        # prompt, essay number and question number are encoded in the column name
        question = int(column[-2])
        prompt = column.split("0")[0]
        essay = int(column[-9:-7])

        # the first character of the answer is the numeric grade; it is stored in a column
        # named after the annotator token
        values.append({
            "prompt": prompt,
            "essay": essay,
            "question": question,
            token: int(answer[0])
        })
        graded_combinations[token].add((prompt, essay))

df = pd.DataFrame(values)

# Rank the feedback texts for each annotator (limited to the texts that this annotator has graded):
# question 5 gives the overall score, the sum of the other questions breaks ties.
annotator_rankings_dict = {}
for annotator in annotators:
    scores_by_feedback = {}
    for prompt, essay in graded_combinations[annotator]:
        feedback_rows = df[(df['prompt'] == prompt) & (df['essay'] == essay)]
        is_overall_question = feedback_rows['question'] == 5
        overall_score = feedback_rows[is_overall_question][annotator].dropna().iloc[0]
        tiebreaker_score = feedback_rows[~is_overall_question][annotator].dropna().sum()
        scores_by_feedback[(prompt, essay)] = (overall_score, tiebreaker_score)

    annotator_rankings_dict[annotator] = sorted(scores_by_feedback.items(), key=lambda item: item[1],
                                                reverse=True)
annotator_rankings = sorted(annotator_rankings_dict.items(), key=lambda item: item[0])

#### Calculate IAA

We used Krippendorff's alpha to calculate the IAA. The following cell contains the code for the calculation of the IAA.

In [None]:
# calculate the krippendorff alpha for all annotators that start with the same letter
annotator_groups = [['Affen', 'Alexa', 'Andre'], ['Bohle', 'Bombe', 'Bulli'], ['Cande', 'Cesar', 'Cloth'],
                    ['Demut', 'Dobby', 'Doria']]

group_agreement = {}
for group in annotator_groups:
    # overall score per (prompt, essay) feedback text for every annotator of the group
    scores_by_annotator = [
        {feedback: scores[0] for feedback, scores in annotator_rankings_dict[annotator]}
        for annotator in group
    ]

    # The annotator rankings are sorted by each annotator's own scores, so their positions do
    # not refer to the same feedback text. Align the ratings by feedback text instead:
    # rows = annotators, columns = feedback texts, NaN = not graded by this annotator.
    feedback_texts = sorted(set().union(*scores_by_annotator))
    matrix = np.array(
        [[scores.get(feedback, np.nan) for feedback in feedback_texts] for scores in scores_by_annotator],
        dtype=float,
    )

    alpha = krippendorff.alpha(reliability_data=matrix, level_of_measurement='ordinal')
    group_agreement[group[0][0]] = alpha

    print(group, round(alpha, 2))

#### Calculate Agreement with Automatic Evaluations

We used Kendall's tau to calculate the agreement with the automatic evaluations. The following cell contains the code for the calculation of the agreement with the automatic evaluations.

In [None]:
dataset_df = pd.read_excel("../data/training_set_rel3.xlsx", index_col='essay_id')

# essay ids of the annotated feedback texts, in the order used in the annotation study
essay_ids = {
    'dcot': [9098, 9833, 9486, 9512, 9399, 9047, 10292, 8995],
    'fo': [9098, 9486, 9833, 9512, 9399, 8995, 9047, 10292],
    'ff': [9098, 9486, 9833, 9512, 9399, 8995, 10292, 9047]
}

essays = {prompt: dataset_df.loc[ids].essay.tolist() for prompt, ids in essay_ids.items()}


def load_helpfulness_gradings(paths):
    """Read the helpfulness score of every selected essay from one result file per prompt.

    :param paths: mapping from prompt key ('dcot', 'fo', 'ff') to the path of the results file (.json)
    :return: mapping from prompt key to the list of scores, ordered like `essay_ids[prompt]`
    :raises ValueError: if a selected essay has no score in its results file
    """
    gradings = {prompt: [None] * len(essays[prompt]) for prompt in paths}
    for prompt, path in paths.items():
        with open(path, "r") as f:
            results = json.load(f)
        for entry in results:
            if entry['essay'] in essays[prompt]:
                position = essays[prompt].index(entry['essay'])
                gradings[prompt][position] = int(entry['parsed_output']['score'])

    missing = [(prompt, position) for prompt, scores in gradings.items()
               for position, score in enumerate(scores) if score is None]
    if missing:
        raise ValueError(f"No helpfulness score found for: {missing}")
    return gradings


def dense_rank_descending(scores):
    """Return the dense rank of every score (1 = highest score, equal scores share a rank)."""
    rank_of_score = {score: rank for rank, score in enumerate(sorted(set(scores), reverse=True), start=1)}
    return [rank_of_score[score] for score in scores]


def rank_gradings(gradings):
    """Return (rank, (prompt, essay index)) tuples for all gradings, sorted by (prompt, essay index)."""
    keys = sorted((prompt, position) for prompt, scores in gradings.items() for position in range(len(scores)))
    ranks = dense_rank_descending([gradings[prompt][position] for prompt, position in keys])
    return list(zip(ranks, keys))


# get the helpfulness scores for the mistral model
pathes = {
    'dcot': '',  # add correct path to the helpfulness results file for the mistral model (.json)
    'fo': '',  # add correct path to the helpfulness results file for the mistral model (.json)
    'ff': '',  # add correct path to the helpfulness results file for the mistral model (.json)
}
mistral_gradings = load_helpfulness_gradings(pathes)

# get the helpfulness scores for the llama model
pathes = {
    'dcot': '',  # add correct path to the helpfulness results file for the llama model (.json)
    'fo': '',  # add correct path to the helpfulness results file for the llama model (.json)
    'ff': '',  # add correct path to the helpfulness results file for the llama model (.json)
}
llama_gradings = load_helpfulness_gradings(pathes)

# rank all feedback texts by their mistral and llama gradings
mistral_ranking_values = rank_gradings(mistral_gradings)
llama_ranking_values = rank_gradings(llama_gradings)

# make a combined ranking out of all the human annotations by summing up the overall scores
feedbacks = {}
for annotator_ranking in annotator_rankings_dict.values():
    for feedback, score in annotator_ranking:
        feedbacks[feedback] = feedbacks.get(feedback, 0) + score[0]
combined_human_ranking = sorted(feedbacks.items(), key=lambda x: x[1], reverse=True)

# NOTE(review): the human ranks are positional, so feedback texts with the same summed score
# still get different ranks in an arbitrary order - confirm that this is intended.
combined_human_ranking_values = sorted(
    ((rank, (prompt.lower(), essay)) for rank, ((prompt, essay), _) in enumerate(combined_human_ranking, start=1)),
    key=lambda x: x[1],
)

# all rankings are sorted by (prompt, essay), so the ranks at equal positions belong together
print(
    f"Kendalls Tau Mistral and Human: {kendalltau([i[0] for i in mistral_ranking_values], [i[0] for i in combined_human_ranking_values]).correlation}")
print(
    f"Kendalls Tau Llama and Human: {kendalltau([i[0] for i in llama_ranking_values], [i[0] for i in combined_human_ranking_values]).correlation}")

# --- agreement between the pairwise comparisons and the human ranking ---

# The pairwise matches index into the Elo dataset, so load it explicitly instead of relying on
# whatever `dataset` currently refers to.
elo_dataset = load_from_disk("../data/datasets/elo_ranking")

# substring of the experiment name that identifies each prompt
experiment_markers = {'dcot': 'dcot', 'fo': 'feedback_only', 'ff': 'ff'}
selected_essay_ids = set(essay_ids['dcot'])

# (essay_id, dataset index) of every selected feedback text, grouped by prompt
items = {prompt: [] for prompt in experiment_markers}
for index in range(len(elo_dataset)):
    entry = elo_dataset[index]
    if entry['essay_id'] not in selected_essay_ids:
        continue
    for prompt, marker in experiment_markers.items():
        if marker in entry['experiment']:
            items[prompt].append((entry['essay_id'], index))

indices = {index for prompt_items in items.values() for _, index in prompt_items}

with open("./data/elo/matches.json", "r") as f:
    matches = json.load(f)

# get the number of wins and losses and wins against for each selected feedback text;
# start at 0 so that texts without any win (or loss) are still ranked
wins = {index: 0 for index in indices}
losses = {index: 0 for index in indices}
wins_against = {index: {} for index in indices}

for match in tqdm(matches):
    if match['parsed_output'] is None:
        continue
    if match['index1'] not in indices or match['index2'] not in indices:
        continue
    preferred = match['parsed_output']['preferred_feedback']
    if preferred == 1:
        winner, loser = match['index1'], match['index2']
    elif preferred == 2:
        winner, loser = match['index2'], match['index1']
    else:
        continue
    wins[winner] += 1
    losses[loser] += 1
    wins_against[winner][loser] = wins_against[winner].get(loser, 0) + 1

# collect the wins and losses keyed by (prompt, position of the essay in essay_ids[prompt])
ranking_values = []
for prompt, prompt_items in items.items():
    for essay_id, index in prompt_items:
        ranking_values.append(((prompt, essay_ids[prompt].index(essay_id)), (wins[index], losses[index])))
# same (prompt, essay) order as the human ranking values
ranking_values.sort(key=lambda item: item[0])

# rank the feedback texts by their number of wins
win_ranks = dense_rank_descending([win_count for _, (win_count, _) in ranking_values])
elo_ranking_values = [(rank, key) for rank, (key, _) in zip(win_ranks, ranking_values)]

print(kendalltau([x[0] for x in combined_human_ranking_values], [x[0] for x in elo_ranking_values]))

#### Calculate Averages for Each Prompt

The following cell contains the code for the calculation of the average response for each prompt.

In [None]:
# Accumulate the scores per prompt; list position = question number - 1.
prompt_scores = {name: [0] * 5 for name in ('dCoT', 'FO', 'FF')}

for _, answer in df.iterrows():
    # after dropping the NaN cells, the value at position 3 is taken as the score
    # (presumably prompt, essay and question come first - TODO confirm against the dataframe)
    prompt_scores[answer['prompt']][answer['question'] - 1] += answer.dropna().iloc[3]

# Turn the sums into averages; 24 is the number of answers per prompt and question
# (presumably 8 essays x 3 annotators - TODO confirm).
prompt_scores = {name: [round(total / 24, 2) for total in totals] for name, totals in prompt_scores.items()}

print(prompt_scores)