In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def eval_sts(labels, y_pred):
    """Score predicted similarity values against gold similarity labels.

    Args:
        labels: Gold scores, handed to scipy/numpy unchanged.
        y_pred: Predicted scores, paired element-wise with ``labels``.

    Returns:
        dict with keys ``"Pearson"`` and ``"Spearman"`` (correlation
        coefficients rounded to 3 decimals) and ``"MSE"`` (mean squared
        error rounded to 2 decimals).
    """
    # Both scipy calls yield (coefficient, p-value); only the coefficient is used.
    pearson_r = pearsonr(labels, y_pred)[0]
    spearman_rho = spearmanr(labels, y_pred)[0]

    residuals = np.subtract(labels, y_pred)
    mean_sq_err = np.square(residuals).mean()

    return {
        "Pearson": round(pearson_r, 3),
        "Spearman": round(spearman_rho, 3),
        "MSE": round(mean_sq_err, 2),
    }


def eval_nli(y_true, y_pred, average="macro"):
    """Compute classification metrics for predicted labels.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels.
        average: Forwarded as the ``average`` argument of
            ``precision_recall_fscore_support``; defaults to ``"macro"``.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"F1"``, each rounded to 3 decimals.
    """
    # The fourth returned element (support) is not reported.
    prec, rec, f_score, _ = precision_recall_fscore_support(
        y_true, y_pred, average=average
    )
    acc = accuracy_score(y_true, y_pred)

    raw_scores = {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "F1": f_score,
    }
    return {name: round(value, 3) for name, value in raw_scores.items()}

### Test and validation sets

In [23]:
# Read the test-set and validation-set score tables (in that order).
df_t, df_v = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/Test_set_scores.csv",
        "./data/Validation_set_Scores.csv",
    )
)

In [27]:
# Show the validation frame's column names (human-rating and model-score columns).
df_v.columns

Index(['similarity_empathy_human_AGG', 'similarity_event_human_AGG',
       'similarity_emotion_human_AGG', 'similarity_moral_human_AGG',
       'Unnamed: 4', 'Unnamed: 5', 'SBERT_score', 'BART_score'],
      dtype='object')

In [26]:
# Preview the first four rows of the validation frame.
df_v.head(4)

Unnamed: 0,similarity_empathy_human_AGG,similarity_event_human_AGG,similarity_emotion_human_AGG,similarity_moral_human_AGG,Unnamed: 4,Unnamed: 5,SBERT_score,BART_score
0,2.5,1.0,2.0,2.5,,,1.941957,0.334691
1,1.5,2.0,2.0,1.5,,,2.408138,0.706342
2,2.0,2.0,2.0,2.0,,,0.902103,0.996153
3,4.0,2.0,2.5,2.5,,,2.032713,0.915401


In [28]:
# Report the Pearson correlation between every model score column and every
# human-rated similarity dimension, separately for each data split.
HUMAN_COLUMNS = (
    'similarity_empathy_human_AGG',
    'similarity_event_human_AGG',
    'similarity_emotion_human_AGG',
    'similarity_moral_human_AGG',
)

# The two CSV files name their model-score columns differently.
MODEL_COLUMNS = {
    "validation set": ['SBERT_score', 'BART_score'],
    "test set": ['SBERT', 'BART'],
}

for k, df in (("validation set", df_v), ("test set", df_t)):
    print(k)
    models = MODEL_COLUMNS[k]
    for model in models:
        for human in HUMAN_COLUMNS:
            gold = df[human].to_list()
            pred = df[model].to_list()
            metrics = eval_sts(labels=gold, y_pred=pred)
            # Third token from the end of the column name is the dimension,
            # e.g. "empathy" in "similarity_empathy_human_AGG".
            dimension = human.split("_")[-3]
            print(model, dimension, metrics["Pearson"])
        print("\n")
    

validation set
SBERT_score empathy 0.239
SBERT_score event 0.378
SBERT_score emotion 0.296
SBERT_score moral 0.263


BART_score empathy 0.35
BART_score event 0.385
BART_score emotion 0.25
BART_score moral 0.335


test set
SBERT empathy 0.346
SBERT event 0.439
SBERT emotion 0.381
SBERT moral 0.365


BART empathy 0.333
BART event 0.417
BART emotion 0.332
BART moral 0.381


