In [1]:
import vllm
import torch
import transformers

model_name="meta-llama/Meta-Llama-3-8B-Instruct"
model = vllm.LLM(
    model=model_name,
    tokenizer=model_name,
    tensor_parallel_size=2,
    dtype="bfloat16" if torch.cuda.is_bf16_supported() else "float16"
)

In [3]:
from transformers import AutoTokenizer
# model_name="meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# print(tokenizer.chat_template)
# set generation parameters
sampling_params = vllm.SamplingParams(max_tokens=512)
print(sampling_params)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=[], stop_token_ids=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=512, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None)


In [4]:
import os
import pandas as pd

system_prompt = "You are a helpful assistant, can conduct great emotional analysis"
user_prompt = """Rate the extent to which you agree with the statement: the narrators of the two stories would empathize with each other. We define empathy as feeling, understanding, and relating to what another person is experiencing. Note that it is possible to have empathy even without sharing the exact same experience or circumstance. Importantly, for two stories to be empathetically similar, both narrators should be able to empathize with each other (if narrator A’s story was shared in response to narrator B’s story, narrator B would empathize with narrator A and vice versa). Give your answer on a scale from 1-4 (1-not at all, 2-not so much, 3-very much, 4-extremely)

Narrative A: {story_a}

Narrative B: {story_b}

Rate in a scale of 1-4.

Answer:
"""

data_dir = "./empathic-stories-main/data/"
dataset_name_to_files = {
    "pair-train": "PAIRS (train).csv",
    "pair-dev": "PAIRS (dev).csv",
    "pair-test": "PAIRS (test).csv",
    "story-train": "STORIES (train).csv",
    "story-dev": "STORIES (dev).csv",
    "story-test": "STORIES (test).csv",
}

for split in ["train", "dev", "test"]:
    name = f"pair-{split}"
    df = pd.read_csv(os.path.join(data_dir, dataset_name_to_files[name]))
    print(len(df))
# print(df.columns)

samples = []
for k, v in df.iterrows():
    story_a = v['story_A_summary'].strip()
    story_b = v['story_B_summary'].strip()
    samples.append([story_a, story_b])

model_inputs = [
    tokenizer.apply_chat_template(
        [
            {"role":"system", "content": system_prompt.strip()}, 
            {"role":"user", "content": user_prompt.format(story_a=sample[0], story_b=sample[1]).strip()}
        ], 
        tokenize=False)
    for sample in samples
]

1500
100
400


In [5]:
print(model_inputs[29])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant, can conduct great emotional analysis<|eot_id|><|start_header_id|>user<|end_header_id|>

Rate the extent to which you agree with the statement: the narrators of the two stories would empathize with each other. We define empathy as feeling, understanding, and relating to what another person is experiencing. Note that it is possible to have empathy even without sharing the exact same experience or circumstance. Importantly, for two stories to be empathetically similar, both narrators should be able to empathize with each other (if narrator A’s story was shared in response to narrator B’s story, narrator B would empathize with narrator A and vice versa). Give your answer on a scale from 1-4 (1-not at all, 2-not so much, 3-very much, 4-extremely)

Narrative A: The author has a friend that they value and trust. They recently told their friend about their autism, and their friend reacted positively. The 

In [6]:
responses = []
for i in range(20):
    batch = model_inputs[i*20: (i+1)*20]
    result = model.generate(batch, sampling_params)
    r = [ x.outputs[0].text.replace("<|start_header_id|>assistant<|end_header_id|>", "").strip() for x in result]
    responses += r
    pd.DataFrame(responses).to_json("pair_test_original_prompt_0501.json")

Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.67it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:04<00:00,  4.01it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:06<00:00,  3.22it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.64it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.43it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.64it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.38it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.58it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:06<00:00,  3.23it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.76it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.68it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:06<00:00,  3.01it/s]
Processed prompts: 100%|████

In [7]:
responses[0]

'Based on the narratives provided, I would rate the extent to which the narrators of the two stories would empathize with each other as a 3, "very much".\n\nWhile the narrators may not share the exact same experiences (unwanted attention in Narrative A vs. body shaming and societal pressure in Narrative B), there are common underlying themes and emotions that could lead to empathetic understanding. Both narrators express feelings of discomfort, frustration, and resentment towards external forces that affect their personal lives.\n\nNarrative A\'s experience of unwanted attention and indifference towards consent could resonate with Narrative B\'s feelings of body shaming and societal pressure. The theme of being objectified, scrutinized, and controlled might be relatable to both narrators. Additionally, both narratives highlight the importance of autonomy, respect, and personal agency, which could facilitate empathetic understanding.\n\nWhile they may not share the same specific experie

### Parse results

In [9]:
parse_prompt = """This is a rater's analysis, please output the score this rater give (1-not at all, 2-not so much, 3-very much, 4-extremely). 

Analysis: {response}

ONLY OUTPUT THE SCORE."""

model_inputs = [
tokenizer.apply_chat_template(
    [
        {"role":"system", "content": system_prompt.strip()}, 
        {"role":"user", "content": parse_prompt.format(response=response).strip()}
    ], 
    tokenize=False)
for response in responses
]

In [11]:
print(model_inputs[9])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant, can conduct great emotional analysis<|eot_id|><|start_header_id|>user<|end_header_id|>

This is a rater's analysis, please output the score this rater give (1-not at all, 2-not so much, 3-very much, 4-extremely). 

Analysis: Based on the two narratives, I would rate the extent to which the narrators would empathize with each other as a 3 - very much.

Both narrators have experienced feelings of distress and discomfort as a result of the actions of others. The lesbian woman felt threatened and unsupported by her new neighbors, while the basketball narrator felt hated and belittled by the girl. Both narratives convey a sense of vulnerability and exposure to negative emotions.

While the specific circumstances of the two stories differ, the emotional resonance is similar. Both narrators have been on the receiving end of hurtful behavior, and both are seeking validation and understanding. As a result,

In [None]:
import re
scores = []
for i in range(20):
    batch = model_inputs[i*20: (i+1)*20]
    result = model.generate(batch, sampling_params)
    r = [ x.outputs[0].text.replace("<|start_header_id|>assistant<|end_header_id|>", "").strip() for x in result]
    scores += r
    pd.DataFrame(scores).to_json("Score_pair_test_original_prompt_0501.json")

scores = [re.findall(r'[1-4]', score) for score in scores]
scores = [int(score[0]) for score in scores]

### Evaluate Pearson

In [26]:
from src.eval import eval_sts, eval_nli
for h in ['similarity_empathy_human_AGG', 'similarity_event_human_AGG', 
          'similarity_emotion_human_AGG', 'similarity_moral_human_AGG']:
    gold = [int(i) for i in df[h].to_list()]
    m1 = eval_sts(gold, scores)
    m2 = eval_nli(gold, scores)
    print(h.split("_")[1], m1)

empathy {'Pearson': 0.231, 'Spearman': 0.22, 'MSE': 0.8}
event {'Pearson': 0.25, 'Spearman': 0.245, 'MSE': 1.58}
emotion {'Pearson': 0.165, 'Spearman': 0.176, 'MSE': 1.01}
moral {'Pearson': 0.193, 'Spearman': 0.203, 'MSE': 1.06}


In [27]:
from src.eval import eval_sts, eval_nli
for h in ['similarity_empathy_human_AGG', 'similarity_event_human_AGG', 
          'similarity_emotion_human_AGG', 'similarity_moral_human_AGG']:
    gold = [int(i) for i in df[h].to_list()]
    m1 = eval_sts(gold, scores)
    m2 = eval_nli(gold, scores)
    print(h.split("_")[1], m2)

empathy {'accuracy': 0.47, 'precision': 0.342, 'recall': 0.288, 'F1': 0.253}
event {'accuracy': 0.265, 'precision': 0.422, 'recall': 0.326, 'F1': 0.179}
emotion {'accuracy': 0.435, 'precision': 0.35, 'recall': 0.294, 'F1': 0.248}
moral {'accuracy': 0.408, 'precision': 0.437, 'recall': 0.293, 'F1': 0.239}


## LLaMA 3 using its original prompt

### Load data

In [1]:
import os 
import pandas as pd

data_dir = "./empathic-stories-main/data/"
dataset_name_to_files = {
    "pair-train": "PAIRS (train).csv",
    "pair-dev": "PAIRS (dev).csv",
    "pair-test": "PAIRS (test).csv",
    "story-train": "STORIES (train).csv",
    "story-dev": "STORIES (dev).csv",
    "story-test": "STORIES (test).csv",
}

data = {}
for split in ["train", "dev", "test"]:
    name = f"pair-{split}"
    df = pd.read_csv(os.path.join(data_dir, dataset_name_to_files[name]))
    data[split] = df
    # print(len(df))

empathic_similarity_prompt = """Rate the extent to which you agree with the statement "the narrators of the two stories would empathize with each other." We define empathy as feeling, understanding, and relating to what another person is experiencing. Note that it is possible to have empathy even without sharing the exact same experience or circumstance. Importantly, for two stories to be empathetically similar, both narrators should be able to empathize with each other (if narrator A’s story was shared in response to narrator B’s story, narrator B would empathize with narrator A and vice versa). Give your answer on a scale from 1-4 (1-not at all, 2-not so much, 3-very much, 4-extremely)

Narrative A: {story_a}

Narrative B: {story_b}

Rate in a scale of 1-4.
Answer: 
""".strip()

brief_empathic_similarity_prompt = """Rate how similar two narratives below are, in a scale from 1-4 (1-not at all, 2-not so much, 3-very much, 4-extremely)

Narrative A: {story_a}

Narrative B: {story_b}

Rate in a scale of 1-4.
Answer: """.strip()

system_prompt = "You are a helpful assistant, can conduct great emotional analysis"

split_to_test = "dev"
df = data[split_to_test]
user_inputs = []
for k, v in df.iterrows():
    story_a = v['story_A_summary'].strip()
    story_b = v['story_B_summary'].strip()
    user_inputs.append(empathic_similarity_prompt.format(story_a=story_a, story_b=story_b))

# print(user_inputs[0])

### Load model and generate

In [2]:
from src.llm import LLaMA3, gpt_easy
from src.eval import eval_sts, eval_nli
model_name="meta-llama/Meta-Llama-3-8B-Instruct"
model = LLaMA3(model_name=model_name)

responses = []
batch_size = 20
max_output_length = 512
save_path = f"pair_{split_to_test}_original_prompt_0501.json"
for i in range(int(len(user_inputs)/batch_size)):
    samples = user_inputs[i*batch_size: (i+1)*batch_size]
    r = model.generate(samples, max_output_length=max_output_length, system_prompt = system_prompt)
    responses += r
    pd.DataFrame(responses).to_json(save_path)

  from .autonotebook import tqdm as notebook_tqdm
2024-05-01 16:57:16,032	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-05-01 16:57:18,581	INFO worker.py:1749 -- Started a local Ray instance.


INFO 05-01 16:57:19 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-01 16:57:21 utils.py:608] Found nccl from library /home/yuxiawang/.config/vllm/nccl/cu12/libnccl.so.2.18.1
[36m(RayWorkerWrapper pid=46420)[0m INFO 05-01 16:57:21 utils.py:608] Found nccl from library /home/yuxiawang/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-01 16:57:21 selector.py:28] Using FlashAttention backend.
[36m(RayWorkerWrapper pid=46420)[0m INFO 05-01 16:57:21 selector.py:28] Using FlashAttention backend.
INFO 05-01 16:57:21 pynccl_utils.py:43] vLLM is using nccl==2.18.1
[36m(RayWorkerWrapper pid=46420)[0m INFO 05-01 16:57:21 pynccl_utils.py:43] vLLM is using nccl==2.18.1
INFO 05-01 16:57:22 utils.py:129] reading GPU P2P access cache from /home/yuxiawang/.config/vllm/gpu_p2p_access_cache_for_0,1.json
[36m(RayWorkerWrapper pid=46420)[0m INFO 05-01 16:57:22 utils.py:129] reading GPU P2P access cache from /home/yuxiawang/.config/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 05-01 16:57:22 weight_utils.py:193] Using model weights format ['*.safetensors']
[36



[36m(RayWorkerWrapper pid=46420)[0m INFO 05-01 16:57:30 model_runner.py:1057] Graph capturing finished in 3 secs.
INFO 05-01 16:57:30 model_runner.py:1057] Graph capturing finished in 3 secs.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.61it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:06<00:00,  3.25it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.45it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.54it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:05<00:00,  3.79it/s]


In [3]:
responses[0]

"I'd rate the extent to which the narrators of the two stories would empathize with each other a 3 - very much.\n\nBoth narratives deal with unexpected events and unexpected kindness. In Narrative A, the group faces a breakdown and unexpected help from a musician and their team, while in Narrative B, the birthday party takes an unexpected turn with the presence of long-lost friends and a heartwarming performance by the children. Both stories feature moments of gratitude, joy, and a sense of community.\n\nThe narrators of the two stories could likely understand and relate to each other's experiences, even if they seem vastly different at first glance. They might both appreciate the power of unexpected events and the kindness of strangers (in Narrative A) or the surprise and delight of reuniting with old friends (in Narrative B). They could empathize with the emotional highs and lows, as well as the sense of gratitude and appreciation that comes with these experiences."

### Parse and Eval

In [5]:
import re
from collections import Counter

parse_prompt = """This is a rater's analysis, please output the score this rater give (1-not at all, 2-not so much, 3-very much, 4-extremely). 

Analysis: {response}

ONLY OUTPUT THE SCORE."""
user_inputs = [parse_prompt.format(response=response) for response in responses]

scores = []
for i in range(int(len(user_inputs)/batch_size)):
    samples = user_inputs[i*batch_size: (i+1)*batch_size]
    r = model.generate(samples, max_output_length=max_output_length, system_prompt = system_prompt)
    scores += r
    pd.DataFrame(scores).to_json("Score_"+save_path)

scores = [re.findall(r'[1-4]', score) for score in scores]
scores = [int(score[0]) for score in scores]
pd.DataFrame(scores).to_json("ParedScore_"+save_path)
print(Counter(scores))

results = {}
for h in ['similarity_empathy_human_AGG', 'similarity_event_human_AGG', 
          'similarity_emotion_human_AGG', 'similarity_moral_human_AGG']:
    gold_f = [float(i) for i in df[h].to_list()]
    gold_c = [int(i) for i in df[h].to_list()]
    m = eval_sts(gold_f, scores) 
    m.update(eval_nli(gold_c, scores))
    # m1 = eval_sts(gold_f, scores)
    # m2 = eval_nli(gold_c, scores)
    # print(h.split("_")[1], m1, m2)
    results[h.split("_")[1]] = m
print(pd.DataFrame.from_dict(results).T.to_latex(float_format="%.3f"))

Processed prompts: 100%|████████████████████████| 20/20 [00:00<00:00, 23.70it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:00<00:00, 22.57it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:00<00:00, 23.22it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:00<00:00, 24.51it/s]
Processed prompts: 100%|████████████████████████| 20/20 [00:00<00:00, 23.43it/s]

Counter({3: 68, 2: 26, 4: 6})
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.291 & 0.305 & 0.680 & 0.490 & 0.281 & 0.319 & 0.275 \\
event & 0.331 & 0.324 & 1.330 & 0.220 & 0.166 & 0.394 & 0.203 \\
emotion & 0.249 & 0.250 & 0.910 & 0.330 & 0.237 & 0.312 & 0.230 \\
moral & 0.218 & 0.216 & 1.000 & 0.320 & 0.213 & 0.304 & 0.200 \\
\bottomrule
\end{tabular}




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Results and Analysis

In [1]:
import re
import os
import pandas as pd
from collections import Counter
from src.eval import eval_sts, eval_nli

def bin_label(label):
    if label <= 2.5:
        return 0
    else:
        return 1

def bin_labels(labels):
    return [bin_label(label) for label in labels]

#### score distribution

In [2]:
# read json in the way: scores = pd.read_json("Score_pair_dev_original_prompt_0501.json")
# the data is not presented in the order it was saved, the index is not sequentially lines
# Next time, remember to check the data index, whether it is 0,1,2,3,4 ...

In [5]:
split_to_test = "test"
scores = pd.read_json(f"./results/LLaMA3_8B_zs/Score_pair_{split_to_test}_original_prompt_0501.json", 
                      typ="series")[0].values()
scores = [re.findall(r'[1-4]', score) for score in scores]
scores = [int(score[0]) for score in scores]
print(Counter(scores))


data_dir = "./empathic-stories-main/data/"
dataset_name = f"pair-{split_to_test}"
dataset_name_to_files = {
    "pair-train": "PAIRS (train).csv",
    "pair-dev": "PAIRS (dev).csv",
    "pair-test": "PAIRS (test).csv",
    "story-train": "STORIES (train).csv",
    "story-dev": "STORIES (dev).csv",
    "story-test": "STORIES (test).csv",
}

results = {}
# print(os.path.join(data_dir, dataset_name_to_files[dataset_name]))
df = pd.read_csv(os.path.join(data_dir, dataset_name_to_files[dataset_name]))
for h in ['similarity_empathy_human_AGG', 'similarity_event_human_AGG', 
          'similarity_emotion_human_AGG', 'similarity_moral_human_AGG']:
    gold_f = [float(i) for i in df[h].to_list()]
    gold_c = [int(i) if i-int(i)<0.5 else int(i)+1 for i in df[h].to_list()]
    m = eval_sts(gold_f, scores) 
    # m.update(eval_nli(gold_c, scores))
    m.update(eval_nli(bin_labels(gold_f), bin_labels(scores)))
    results[h.split("_")[1]] = m
print(pd.DataFrame.from_dict(results).T.to_latex(float_format="%.3f"))

Counter({3: 325, 2: 55, 4: 17, 1: 3})
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.229 & 0.214 & 0.640 & 0.573 & 0.628 & 0.564 & 0.506 \\
event & 0.244 & 0.238 & 1.130 & 0.367 & 0.616 & 0.580 & 0.363 \\
emotion & 0.197 & 0.200 & 0.760 & 0.532 & 0.626 & 0.563 & 0.482 \\
moral & 0.210 & 0.212 & 0.820 & 0.497 & 0.642 & 0.573 & 0.463 \\
\bottomrule
\end{tabular}



In [16]:
# LLaMA3 zs summary
# dev
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.291 & 0.305 & 0.680 & 0.600 & 0.705 & 0.667 & 0.594 \\
event & 0.331 & 0.324 & 1.330 & 0.410 & 0.589 & 0.621 & 0.405 \\
emotion & 0.249 & 0.250 & 0.910 & 0.470 & 0.592 & 0.586 & 0.470 \\
moral & 0.218 & 0.216 & 1.000 & 0.450 & 0.616 & 0.634 & 0.449 \\
\bottomrule
\end{tabular}


# test
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.229 & 0.214 & 0.640 & 0.573 & 0.628 & 0.564 & 0.506 \\
event & 0.244 & 0.238 & 1.130 & 0.367 & 0.616 & 0.580 & 0.363 \\
emotion & 0.197 & 0.200 & 0.760 & 0.532 & 0.626 & 0.563 & 0.482 \\
moral & 0.210 & 0.212 & 0.820 & 0.497 & 0.642 & 0.573 & 0.463 \\
\bottomrule
\end{tabular}

#### No issue with parse score

In [2]:
import re
import pandas as pd
from collections import Counter
scores = pd.read_json("Score_pair_dev_original_prompt_0501.json")[0].to_list()
scores = [re.findall(r'[1-4]', score) for score in scores]
scores = [int(score[0]) for score in scores]
print(Counter(scores))

purescores = pd.read_json("ParedScore_pair_dev_original_prompt_0501.json")[0].to_list()

c = 0
for i, j in zip(purescores, scores):
    if i != j:
        print("False", i, j)
        c += 1
print(c)

Counter({3: 68, 2: 26, 4: 6})
0


#### Debug Tokenizer issues

In [None]:
from transformers import AutoTokenizer
model_name="meta-llama/Meta-Llama-3-8B-Instruct" # I used a wrong model name without -Instruct
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_inputs = [
    tokenizer.apply_chat_template(
        [
            {"role":"system", "content": system_prompt.strip()}, 
            {"role":"user", "content": sample.strip()}
        ], 
        tokenize=False)
    for sample in samples
]

print(model_inputs[100])

#### ChatGPT trial

In [6]:
a = """Rate the extent to which you agree with the statement "the narrators of the two stories would empathize with each other." We define empathy as feeling, understanding, and relating to what another person is experiencing. Note that it is possible to have empathy even without sharing the exact same experience or circumstance. Importantly, for two stories to be empathetically similar, both narrators should be able to empathize with each other (if narrator A’s story was shared in response to narrator B’s story, narrator B would empathize with narrator A and vice versa). Give your answer on a scale from 1-4 (1-not at all, 2-not so much, 3-very much, 4-extremely)

Narrative A: The author experienced unwanted attention from a young age, including being kissed without consent. When she punched the guy who kissed her at 12, she was told she overreacted, causing her to become indifferent to unwanted touching. In college, she learned about consent and discovered that positive experiences are possible with trustworthy people.

Narrative B: The author expresses her frustration with societal expectations of female body standards. She resents the pressure to compare her body to others and the trend of certain body types being in style. She also discusses her personal experiences with her family monitoring her body and feeling better treated after losing weight.

Rate in a scale of 1-4.
Answer: """

In [5]:
gpt_easy(a)

'Based on the narratives provided, I would rate the extent to which the narrators of the two stories would empathize with each other as a 3-very much. Both narrators have shared experiences of feeling uncomfortable and pressured due to societal expectations and unwanted attention in different forms. While their specific experiences may vary, the underlying emotions of frustration, resentment, and the struggle against societal norms are present in both stories, allowing for a strong potential for empathy between the narrators.'

### 0503 - GPT3.5 Two-stage

In [19]:
prompt = {
    "sys_prompt": "You are a helpful assistant, can conduct great emotion and event analysis", 
    "prompt_template": "[Story A]\n{story_a}\n\n[Story B]\n{story_b}\n\n[Analysis]\n{response}\n[End Analysis]\n\n{prompt2}", 
    "defaults": {
        "prompt1": "Compose an analysis comparing two stories from the perspectives of depiction of events, exploration of emotions, the underlying moral lessons and the empathy. Consider how the events unfolded in each story, the emotions evoked by these events, the underlying moral messages conveyed, and the level of empathy the stories elicit from the reader towards the characters involved. Craft your analysis to highlight both the similarities and differences between the stories in each of these four critical aspects.\n\n[Story A]\n{story_a}\n\n[Story B]\n{story_b}\n",
        "prompt2": """Rate the similarity of two stories across the dimensions of event portrayal, emotional resonance, moral significance and empathy. Note that rate in a scale of 1 to 4, with 1 indicating minimal alignment and 4 suggesting a high degree of correlation (1-not at all, 2-not so much, 3-very much, 4-extremely). Provide continuous scores based on your comprehensive analysis above and the two stories. 
        You should only respond in format as described below. DO NOT RETURN ANYTHING ELSE. START YOUR RESPONSE WITH '{{'.
        [response format]: 
{{
    "event": "score 1-4 indicating how similar in events",
    "emotion": "score 1-4 indicating how similar in emotional resonance between two stories",
    "moral": "score 1-4 indicating how similar in moral significance",
    "empathy": "score 1-4 indicating the similarity of the level of empathy the stories elicit from the reader towards the characters involved",
}}""",
    }, 
    "description": "two-stage empathic similarity rating."
}

In [20]:
import os
import re
import pandas as pd
from argparse import Namespace
from src.llm import LLaMA3, gpt_easy
from src.eval import eval_sts, eval_nli


dataset_name_to_files = {
    "pair-train": "PAIRS (train).csv",
    "pair-dev": "PAIRS (dev).csv",
    "pair-test": "PAIRS (test).csv",
    "story-train": "STORIES (train).csv",
    "story-dev": "STORIES (dev).csv",
    "story-test": "STORIES (test).csv",
}

split_to_test = "dev"
data_dir = "./empathic-stories-main/data/"
model_name="gpt-3.5-turbo-0125"
save_path = f"./results/pair_{split_to_test}_twostage_prompt-{model_name}.json"
    
dataset_name = f"pair-{split_to_test}"
df = pd.read_csv(os.path.join(data_dir, dataset_name_to_files[dataset_name]))

Compose an analysis comparing two stories from the perspectives of depiction of events, exploration of emotions, the underlying moral lessons and the empathy. Consider how the events unfolded in each story, the emotions evoked by these events, the underlying moral messages conveyed, and the level of empathy the stories elicit from the reader towards the characters involved. Craft your analysis to highlight both the similarities and differences between the stories in each of these four critical aspects.

[Story A]
The author and their friends road tripped from Munich to Croatia for a music festival. After driving for five hours, their car experienced a flat tire with no spare and no phone service in the middle of nowhere. They walked four miles to a gas station but found no help, and after waiting for four hours, a musician and their security team picked them up and gave them backstage passes to the festival.

[Story B]
The author hosted a Hawaiian themed birthday party for their boyfri

#### Stage 1

In [11]:
user_inputs = []
for k, v in df.iterrows():
    story_a = v['story_A_summary'].strip()
    story_b = v['story_B_summary'].strip()
    user_inputs.append(prompt["defaults"]["prompt1"].format(story_a=story_a, story_b=story_b))
print(user_inputs[0])

responses = []
for i, user_input in enumerate(user_inputs):
    r = gpt_easy(user_input, model=model_name, system_role=prompt["sys_prompt"])
    responses.append({"split": split_to_test, "id": i, "analysis": r})
    pd.DataFrame(responses).to_json(save_path, lines=True, orient="records")

#### Stage 2

In [21]:
user_inputs = []
for k, v in df.iterrows():
    story_a = v['story_A_summary'].strip()
    story_b = v['story_B_summary'].strip()
    user_inputs.append(prompt["prompt_template"].format(story_a=story_a, story_b=story_b, 
                                                        response=responses[k]['analysis'], 
                                                        prompt2=prompt["defaults"]["prompt2"]))
print(user_inputs[0])

[Story A]
The author and their friends road tripped from Munich to Croatia for a music festival. After driving for five hours, their car experienced a flat tire with no spare and no phone service in the middle of nowhere. They walked four miles to a gas station but found no help, and after waiting for four hours, a musician and their security team picked them up and gave them backstage passes to the festival.

[Story B]
The author hosted a Hawaiian themed birthday party for their boyfriend in a rented hall. Friends who hadn't been seen since moving to a new city attended and there were unexpected gifts and a dance performed by the couple's young children. The party continued at their house, with some guests spending the night and the next morning filled with laughter and memories.

[Analysis]
When comparing Story A and Story B, we can delve into various aspects such as the depiction of events, exploration of emotions, underlying moral lessons, and empathy evoked.

In terms of the depic

In [23]:
for i, user_input in enumerate(user_inputs):
    r = gpt_easy(user_input, model=model_name, system_role=prompt["sys_prompt"])
    try:
        # parse and save into dict
        rate = eval(r.strip()[1:-1])
    except:
        print(i)
        rate = r
    responses[i].update({"ratings": rate})
    pd.DataFrame(responses).to_json("final_"+save_path, lines=True, orient="records")

### Evaluate

In [42]:
scores = pd.read_json("final_"+save_path, lines=True)

In [43]:
scores[:3]

Unnamed: 0,split,id,analysis,ratings
0,dev,0,"When comparing Story A and Story B, we can del...","{'event': 2, 'emotion': 3, 'moral': 3, 'empath..."
1,dev,1,"In comparing Story A and Story B, several key ...","{'event': 2, 'emotion': 3, 'moral': 3, 'empath..."
2,dev,2,"In comparing Story A and Story B, we can obser...","{'event': 2, 'emotion': 3, 'moral': 3, 'empath..."


In [44]:
ratings = []
for i, row in scores.iterrows():
    r = row["ratings"]
    ratings.append(r)

In [55]:
predictions = pd.DataFrame(ratings)

In [56]:
predictions

Unnamed: 0,event,emotion,moral,empathy
0,2,3,3,3
1,2,3,3,3
2,2,3,3,3
3,2,3,3,3
4,2,3,3,4
...,...,...,...,...
95,3,3,3,4
96,3,3,3,3
97,3,2,3,3
98,2,3,3,4


In [52]:
df = pd.read_csv(os.path.join(data_dir, dataset_name_to_files[dataset_name]))
def bin_label(label):
    if label <= 2.5:
        return 0
    else:
        return 1

def bin_labels(labels):
    return [bin_label(label) for label in labels]

In [63]:
results = {}
for h in ['similarity_empathy_human_AGG', 'similarity_event_human_AGG', 
            'similarity_emotion_human_AGG', 'similarity_moral_human_AGG']:
    gold_f = [float(i) for i in df[h].to_list()]
    gold_c = [int(i) if i-int(i)<0.5 else int(i)+1 for i in df[h].to_list()]
    
    for aspect in ['event', 'emotion', 'moral', 'empathy'][:1]:
        scores = [float(i) for i in predictions[aspect].to_list()]
    
        m = eval_sts(gold_f, scores) 
        # m.update(eval_nli(gold_c, scores))
        m.update(eval_nli(bin_labels(gold_f), bin_labels(scores)))
        results[h.split("_")[1]+f"-{aspect}"] = m
print(pd.DataFrame.from_dict(results).T.to_latex(float_format="%.3f"))

\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy-event & 0.314 & 0.327 & 0.610 & 0.720 & 0.716 & 0.667 & 0.673 \\
event-event & 0.315 & 0.313 & 0.650 & 0.770 & 0.635 & 0.674 & 0.647 \\
emotion-event & 0.345 & 0.364 & 0.630 & 0.730 & 0.666 & 0.647 & 0.653 \\
moral-event & 0.286 & 0.288 & 0.610 & 0.750 & 0.636 & 0.649 & 0.642 \\
\bottomrule
\end{tabular}



### GPT-4o zs

In [None]:
import os
import re
import pandas as pd
from argparse import Namespace
from src.llm import LLaMA3, gpt_easy
from src.eval import eval_sts, eval_nli

def bin_label(label):
    if label <= 2.5:
        return 0
    else:
        return 1

def bin_labels(labels):
    return [bin_label(label) for label in labels]

optimized_similarity_system_prompt = """Rate the extent to which you agree with the statement "the narrators of the two stories would empathize with each other." We define empathy as feeling, understanding, and relating to what another person is experiencing. Note that it is possible to have empathy even without sharing the exact same experience or circumstance. Importantly, for two stories to be empathetically similar, both narrators should be able to empathize with each other (if narrator A’s story was shared in response to narrator B’s story, narrator B would empathize with narrator A and vice versa). Give your answer on a scale from 1-4 (1-not at all, 2-not so much, 3-very much, 4-extremely), with 0.5 increments in each level between 1-4 are allowed. Please only return the score without any explanation.""".strip()

optimized_similarity_user_prompt = """
### Narrative A:
{story_a}

### Narrative B:
{story_b}

### Similarity Score:
"""

dataset_name_to_files = {
    "pair-train": "PAIRS (train).csv",
    "pair-dev": "PAIRS (dev).csv",
    "pair-test": "PAIRS (test).csv",
    "story-train": "STORIES (train).csv",
    "story-dev": "STORIES (dev).csv",
    "story-test": "STORIES (test).csv",
}

def load_es_data(split_to_test, data_dir = "./empathic-stories-main/data/"):
    dataset_name = f"pair-{split_to_test}"
    df = pd.read_csv(os.path.join(data_dir, dataset_name_to_files[dataset_name]))
    return df

def get_save_path(split_to_test, story_type, reorder_story, model_name):
    if reorder_story:
        suffix = "reverse"
    else:
        suffix = ""
    save_path = f"./results/openai_eval/pair_{split_to_test}_{model_name}_{story_type}_zs_{suffix}.json"
    return save_path

def gpt_rate(split_to_test, story_type, reorder_story, 
             model_name = "gpt-4o", 
             data_dir = "./empathic-stories-main/data/"):
    
    df = load_es_data(split_to_test, data_dir)
    save_path = get_save_path(split_to_test, story_type, reorder_story, model_name)
    print(save_path)
    
    # construst user prompt
    responses = []
    for i, row in df.iterrows():
        if story_type == "summary":
            story_a = row['story_A_summary'].strip()
            story_b = row['story_B_summary'].strip()   
        else:
            story_a = row['story_A'].strip()
            story_b = row['story_B'].strip()
        if reorder_story:
            user_input = optimized_similarity_user_prompt.format(story_a=story_b, story_b=story_a)
        else:
            user_input = optimized_similarity_user_prompt.format(story_a=story_a, story_b=story_b)
            # print(user_input)
        
        # run model to get ratings
        r = gpt_easy(user_input, model=model_name, system_role=optimized_similarity_system_prompt)
        responses.append({"split": split_to_test, "id": i, "score": r})
        pd.DataFrame(responses).to_json(save_path, lines=True, orient="records")
    
    # eval
    responses = pd.DataFrame(responses)
    eval_es(responses, df)
    return responses

def eval_es(responses, df):
    # prediction
    scores = [float(score) for score in responses_ab["score"].to_list()]
    results = {}
    for h in ['similarity_empathy_human_AGG', 'similarity_event_human_AGG', 
                'similarity_emotion_human_AGG', 'similarity_moral_human_AGG']:
        gold_f = [float(i) for i in df[h].to_list()]
        m = eval_sts(gold_f, scores) 
        m.update(eval_nli(bin_labels(gold_f), bin_labels(scores)))
        results[h.split("_")[1]] = m
    print(pd.DataFrame.from_dict(results).T.to_latex(float_format="%.3f"))

s
def overall_eval(df, save_path):
    # overall evaluation, save_path is reverse version savepath
    responses_ba = pd.read_json(save_path, lines=True) # story order by b, a
    responses_ab = pd.read_json(save_path.replace("_reverse", ""), lines=True) # story order by a, b

    for score_type in ["mean", "standard", "reverse"]:
        print(score_type)
        if score_type == "standard":
            scores = [float(score) for score in responses_ab["score"].to_list()]
        elif score_type == "reverse":
            scores = [float(score) for score in responses_ba["score"].to_list()]
        else:
            scores = [(float(s1)+float(s2))/2 for s1, s2 in zip(responses_ab["score"].to_list(), responses_ba["score"].to_list())]   
        
        results = {}
        for h in ['similarity_empathy_human_AGG', 'similarity_event_human_AGG', 
                    'similarity_emotion_human_AGG', 'similarity_moral_human_AGG']:
            gold_f = [float(i) for i in df[h].to_list()]
            
            m = eval_sts(gold_f, scores) 
            # m.update(eval_nli(gold_c, scores))
            m.update(eval_nli(bin_labels(gold_f), bin_labels(scores)))
            results[h.split("_")[1]] = m
        print(pd.DataFrame.from_dict(results).T.to_latex(float_format="%.3f"))

# load data
data_dir = "./empathic-stories-main/data/"
model_name="gpt-4o"
# split_to_test = "dev"
# story_type = "full"
# reorder_story = True

# get ratings
for story_type in ["summary", "full"]:
    for split_to_test in ['dev', 'test']:
        for reorder_story in [False, True]:
            responses = gpt_rate(split_to_test, story_type, reorder_story, 
                                 model_name = "gpt-4o", data_dir = "./empathic-stories-main/data/")

# run the final overall evaluation
for story_type in ["summary", "full"]:
    for split_to_test in ['dev', 'test']:
        df = load_es_data(split_to_test, data_dir)
        save_path = get_save_path(split_to_test, story_type, reorder_story=True, model_name=model_name)
        print(save_path)
        overall_eval(df, save_path)

In [None]:
# reverse story A and B position
# GPT-4o zs summary
# dev
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.405 & 0.404 & 0.730 & 0.720 & 0.703 & 0.688 & 0.692 \\
event & 0.408 & 0.415 & 0.780 & 0.730 & 0.628 & 0.697 & 0.635 \\
emotion & 0.469 & 0.472 & 0.680 & 0.750 & 0.700 & 0.712 & 0.705 \\
moral & 0.353 & 0.347 & 0.770 & 0.710 & 0.621 & 0.659 & 0.628 \\
\bottomrule
\end{tabular}

# test
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.368 & 0.376 & 0.720 & 0.650 & 0.658 & 0.652 & 0.648 \\
event & 0.437 & 0.438 & 0.650 & 0.695 & 0.651 & 0.704 & 0.650 \\
emotion & 0.399 & 0.398 & 0.680 & 0.645 & 0.642 & 0.638 & 0.638 \\
moral & 0.422 & 0.438 & 0.640 & 0.700 & 0.687 & 0.689 & 0.688 \\
\bottomrule
\end{tabular}


# GPT-4o zs full
# dev
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.347 & 0.354 & 0.720 & 0.740 & 0.725 & 0.729 & 0.727 \\
event & 0.335 & 0.336 & 1.000 & 0.690 & 0.629 & 0.720 & 0.620 \\
emotion & 0.407 & 0.395 & 0.730 & 0.730 & 0.696 & 0.728 & 0.701 \\
moral & 0.251 & 0.220 & 0.900 & 0.690 & 0.638 & 0.699 & 0.634 \\
\bottomrule
\end{tabular}

# test
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.298 & 0.307 & 0.760 & 0.610 & 0.611 & 0.611 & 0.610 \\
event & 0.365 & 0.367 & 0.800 & 0.635 & 0.623 & 0.672 & 0.601 \\
emotion & 0.319 & 0.327 & 0.760 & 0.615 & 0.613 & 0.614 & 0.613 \\
moral & 0.354 & 0.367 & 0.720 & 0.665 & 0.661 & 0.667 & 0.660 \\
\bottomrule
\end{tabular}

In [None]:
# GPT-4o zs full
# dev
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.292 & 0.307 & 0.720 & 0.700 & 0.683 & 0.687 & 0.685 \\
event & 0.371 & 0.359 & 0.890 & 0.690 & 0.629 & 0.720 & 0.620 \\
emotion & 0.314 & 0.330 & 0.780 & 0.690 & 0.654 & 0.680 & 0.657 \\
moral & 0.244 & 0.247 & 0.840 & 0.710 & 0.658 & 0.729 & 0.658 \\
\bottomrule
\end{tabular}

# test
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.335 & 0.359 & 0.720 & 0.660 & 0.660 & 0.660 & 0.660 \\
event & 0.385 & 0.404 & 0.810 & 0.615 & 0.627 & 0.678 & 0.590 \\
emotion & 0.332 & 0.345 & 0.740 & 0.630 & 0.632 & 0.633 & 0.630 \\
moral & 0.362 & 0.397 & 0.720 & 0.680 & 0.685 & 0.693 & 0.678 \\
\bottomrule
\end{tabular}


# GPT-4o zs summary
# dev
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.486 & 0.486 & 0.700 & 0.750 & 0.739 & 0.717 & 0.723 \\
event & 0.468 & 0.420 & 0.710 & 0.740 & 0.634 & 0.703 & 0.644 \\
emotion & 0.489 & 0.477 & 0.690 & 0.780 & 0.734 & 0.743 & 0.738 \\
moral & 0.429 & 0.416 & 0.710 & 0.740 & 0.652 & 0.696 & 0.662 \\
\bottomrule
\end{tabular}

# test
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.443 & 0.432 & 0.670 & 0.652 & 0.664 & 0.655 & 0.649 \\
event & 0.466 & 0.459 & 0.630 & 0.703 & 0.649 & 0.698 & 0.652 \\
emotion & 0.457 & 0.436 & 0.630 & 0.632 & 0.630 & 0.624 & 0.623 \\
moral & 0.452 & 0.448 & 0.630 & 0.677 & 0.663 & 0.661 & 0.661 \\
\bottomrule
\end{tabular}

In [None]:
python ./src/llm_eval_openllm.py \
    --output_dir results/llm_eval_openllm_small \
    --dataset_dir empathic-stories-main/data \
    --model_name gpt-4o \
    --split dev \
    --prompt_type optimized \
    --story_type summary

### OpenAI embedding cosine

In [4]:
import os
import pandas as pd

dataset_name_to_files = {
    "pair-train": "PAIRS (train).csv",
    "pair-dev": "PAIRS (dev).csv",
    "pair-test": "PAIRS (test).csv",
    "story-train": "STORIES (train).csv",
    "story-dev": "STORIES (dev).csv",
    "story-test": "STORIES (test).csv",
}

split_to_test = "test"
data_dir = "./empathic-stories-main/data/"    
dataset_name = f"pair-{split_to_test}"
df = pd.read_csv(os.path.join(data_dir, dataset_name_to_files[dataset_name]))

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def cal_cosine_sim(a, b):
    assert(len(a) == len(b))
    # Reshape embeddings to 2D arrays (required by cosine_similarity)
    embedding1 = np.array(a).reshape(1, -1)
    embedding2 = np.array(b).reshape(1, -1)
    
    # Step 5: Compute cosine similarity
    similarity = cosine_similarity(embedding1, embedding2)
    return similarity[0][0]
    # print("Cosine similarity:", similarity[0][0])

from openai import OpenAI
client = OpenAI(api_key="Your_OpenAI_Key")

def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

def get_pair_embedding(story_a, story_b, model="text-embedding-3-small"):
    story_a = story_a.replace("\n", " ").strip()
    story_b = story_b.replace("\n", " ").strip()
    rs = client.embeddings.create(input = [story_a, story_b], model=model)
    a = rs.data[0].embedding
    b = rs.data[1].embedding

    return a, b, cal_cosine_sim(a, b)

embeddings = []
model="text-embedding-3-large"
is_story_summary = False
for i, row in df.iterrows():
    if is_story_summary:
        story_a = row['story_A_summary']
        story_b = row['story_B_summary']
        suffix = "summary"
    else:
        story_a = row['story_A']
        story_b = row['story_B']
        suffix = "fullstory"
    ae, be, sim = get_pair_embedding(story_a, story_b, model=model)
    embeddings.append({
        "story_a_embedding": ae,
        "story_b_embedding": be,
        "cosine_sim": round(sim, 3)
    })
    pd.DataFrame(embeddings).to_csv(f'./results/{dataset_name}_{model}_embeddings_{suffix}.csv', index=False)
emds = pd.DataFrame(embeddings)

#### Evaluation

In [6]:
import os
import pandas as pd

dataset_name_to_files = {
    "pair-train": "PAIRS (train).csv",
    "pair-dev": "PAIRS (dev).csv",
    "pair-test": "PAIRS (test).csv",
    "story-train": "STORIES (train).csv",
    "story-dev": "STORIES (dev).csv",
    "story-test": "STORIES (test).csv",
}

split_to_test = "test"
data_dir = "./empathic-stories-main/data/"    
dataset_name = f"pair-{split_to_test}"
df = pd.read_csv(os.path.join(data_dir, dataset_name_to_files[dataset_name]))

emds = pd.read_csv(f'./results/pair-{split_to_test}_text-embedding-3-large_embeddings_fullstory.csv')
preds = emds['cosine_sim'].to_list()

from src.eval import eval_sts, eval_nli
def bin_label(label):
    if label <= 2.5:
        return 0
    else:
        return 1

def bin_labels(labels):
    return [bin_label(label) for label in labels]

results = {}
for h in ['similarity_empathy_human_AGG', 'similarity_event_human_AGG', 
            'similarity_emotion_human_AGG', 'similarity_moral_human_AGG']:
    gold_f = [float(i) for i in df[h].to_list()]
    gold_c = [int(i) if i-int(i)<0.5 else int(i)+1 for i in df[h].to_list()]
    scores = [i*4 for i in preds]
    m = eval_sts(gold_f, scores) 
    # m.update(eval_nli(gold_c, scores))
    m.update(eval_nli(bin_labels(gold_f), bin_labels(scores)))
    results[h.split("_")[1]] = m
print(pd.DataFrame.from_dict(results).T.to_latex(float_format="%.3f"))

\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.362 & 0.363 & 1.440 & 0.507 & 0.624 & 0.519 & 0.384 \\
event & 0.488 & 0.469 & 0.590 & 0.782 & 0.737 & 0.551 & 0.538 \\
emotion & 0.393 & 0.386 & 1.260 & 0.568 & 0.685 & 0.529 & 0.421 \\
moral & 0.395 & 0.403 & 1.140 & 0.618 & 0.651 & 0.524 & 0.440 \\
\bottomrule
\end{tabular}



In [None]:
# full story
# dev
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.303 & 0.309 & 1.400 & 0.620 & 0.561 & 0.505 & 0.406 \\
event & 0.385 & 0.409 & 0.680 & 0.810 & 0.413 & 0.488 & 0.448 \\
emotion & 0.345 & 0.361 & 1.260 & 0.710 & 0.607 & 0.510 & 0.446 \\
moral & 0.337 & 0.325 & 1.050 & 0.790 & 0.648 & 0.517 & 0.484 \\
\bottomrule
\end{tabular}

# test
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.362 & 0.363 & 1.440 & 0.507 & 0.624 & 0.519 & 0.384 \\
event & 0.488 & 0.469 & 0.590 & 0.782 & 0.737 & 0.551 & 0.538 \\
emotion & 0.393 & 0.386 & 1.260 & 0.568 & 0.685 & 0.529 & 0.421 \\
moral & 0.395 & 0.403 & 1.140 & 0.618 & 0.651 & 0.524 & 0.440 \\
\bottomrule
\end{tabular}

# story summary
# dev
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.335 & 0.315 & 1.280 & 0.630 & 0.813 & 0.513 & 0.411 \\
event & 0.437 & 0.411 & 0.600 & 0.820 & 0.414 & 0.494 & 0.451 \\
emotion & 0.394 & 0.350 & 1.130 & 0.720 & 0.859 & 0.517 & 0.451 \\
moral & 0.359 & 0.309 & 0.960 & 0.800 & 0.899 & 0.524 & 0.489 \\
\bottomrule
\end{tabular}

# test
\begin{tabular}{lrrrrrrr}
\toprule
 & Pearson & Spearman & MSE & accuracy & precision & recall & F1 \\
\midrule
empathy & 0.336 & 0.329 & 1.510 & 0.505 & 0.633 & 0.517 & 0.376 \\
event & 0.485 & 0.465 & 0.620 & 0.780 & 0.738 & 0.542 & 0.522 \\
emotion & 0.392 & 0.388 & 1.310 & 0.550 & 0.582 & 0.510 & 0.392 \\
moral & 0.366 & 0.356 & 1.210 & 0.620 & 0.692 & 0.525 & 0.437 \\
\bottomrule
\end{tabular}