#### Score Correlation Analysis
###### This notebook analyses how strongly the scores produced by the different evaluation methods correlate with one another

In [None]:
import json

# Load the two JSON files that the rest of the notebook compares.
# The encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise decode with the platform's locale encoding, which differs between
# machines (e.g. cp1252 on Windows).
with open("gpt4-model-1-response.json", "r", encoding="utf-8") as file:
    gpt4_model_1 = json.load(file)

with open("llama3-8b-model-1-response.json", "r", encoding="utf-8") as file:
    llama3_8b_model_1 = json.load(file)

#### GPT-4 vs Llama 3 8B

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Generic aliases for the two score series being compared; the correlation
# cell further down reads these same names.
list1, list2 = gpt4_model_1, llama3_8b_model_1

plt.scatter(list1, list2)

# Overlay a degree-1 least-squares fit of list2 against list1, evaluated at the
# distinct (sorted) values of list1.
x_values = np.unique(list1)
trend_line = np.poly1d(np.polyfit(list1, list2, 1))
plt.plot(x_values, trend_line(x_values))

plt.xlabel("GPT-4")
plt.ylabel("Llama3 8B")
plt.show()

#### Correlation Coefficients

In [None]:
import pandas as pd
from scipy.stats import spearmanr, kendalltau

# Pearson: off-diagonal entry of the 2x2 correlation matrix of the two series.
pearson_correlation = np.corrcoef(list1, list2)[0, 1]

# The scipy helpers return a (statistic, p-value) pair; only the statistic
# (element 0) is kept.
spearman_correlation = spearmanr(list1, list2)[0]
kendall_tau_correlation = kendalltau(list1, list2)[0]

# One-row summary table, one column per coefficient.
coefficients = {
    "Pearson": pearson_correlation,
    "Spearman": spearman_correlation,
    "Kendall Tau": kendall_tau_correlation,
}
correlation_table = pd.DataFrame([coefficients], index=['Results'])

correlation_table

###### For comparison, below are the correlation coefficients reported in the Prometheus 2 paper by Kim et al. (2024) (https://arxiv.org/abs/2405.01535); they are all in the same ballpark as the ones computed for Llama 3 above.
###### Note that Prometheus 2 is a model specifically fine-tuned for LLM rating and evaluation.