In [1]:
import pandas as pd
import torch
import numpy as np
import scipy
from sklearn import metrics
import matplotlib.pyplot as plt
import random
from sklearn.metrics import classification_report
from transformers import GenerationConfig
import torch

In [2]:
from transformers.utils import logging
logging.get_logger("transformers").setLevel(logging.ERROR)

In [3]:
data_path = './empathic-stories/data/'

In [4]:
train_df = pd.read_csv(f'{data_path}/PAIRS (train).csv')
dev_df = pd.read_csv(f'{data_path}/PAIRS (dev).csv')
test_df = pd.read_csv(f'{data_path}/PAIRS (test).csv')

In [5]:

label_columns = "similarity_empathy_human_AGG	similarity_event_human_AGG	similarity_emotion_human_AGG	similarity_moral_human_AGG".split()[::-1]
text_preprocess = lambda x: x.strip().replace("\n", " ")

In [6]:
import trl

In [7]:
story_in_use = 'summary'
label_in_use = 'empathy'

In [8]:

text_columns = {
    "summary": ["story_A_summary", "story_B_summary"],
    "full": ["story_A", "story_B"],
}.get(story_in_use)
label_column = {
    "empathy": ["similarity_empathy_human_AGG"],
    "event": ["similarity_event_human_AGG"],
    "emotion": ["similarity_emotion_human_AGG"],
    "moral": ["similarity_moral_human_AGG"],
    "all": label_columns,
}.get(label_in_use)

In [9]:
score_conversion_funcs = {
    "none": lambda x:x,
    "original_paper": lambda x: x / 4,
    "01_continue": lambda x:(x - 1) / 3,
}
score_recover_funcs = {
    "none": lambda x:x,
    "original_paper": lambda x: x * 4,
    "01_continue": lambda x:(x * 3) + 1,
}

In [10]:
# Choose score conversion method
score_conversion_in_use = "none"

In [11]:
score_conversion_func = score_conversion_funcs.get(score_conversion_in_use)
score_recover_func = score_recover_funcs.get(score_conversion_in_use)

In [12]:

def create_data(df, text_pps, score_conversion_funcs, analysis_df):
    required_columns = text_columns + label_column
    score_names = [f"score_{i}" for i in range(len(label_column))]
    df = df[required_columns].rename(
        columns={
            k: v
            for k, v in zip(
                required_columns, ["sentence1", "sentence2"] + score_names)
        }
    )
    for i in [1, 2]:
        df[f"sentence{i}"] = df[f"sentence{i}"].apply(text_pps)
    for i in range(len(label_column)):
        df[f"score_{i}"] = score_conversion_funcs(df[f"score_{i}"])
    return pd.concat([df, analysis_df],axis=1)[["sentence1", "sentence2", "analysis"] + score_names]


In [13]:

process_analysis = lambda d: '\n\n'.join(d.split('\n\n')[np.argmax(list(map(lambda x: "Thematic Similarities" in x,d.split('\n\n')))):])
analysis_path = './data/'
train_analysis = pd.read_json(f"{analysis_path}/train_explanations_{story_in_use}.json").rename(columns={0: "analysis"})
train_analysis['analysis'] = train_analysis.apply(lambda x:process_analysis(x['analysis']),axis=1)
dev_analysis = pd.read_json(f"{analysis_path}/dev_explanations_{story_in_use}.json").rename(columns={0: "analysis"})
dev_analysis['analysis'] = dev_analysis.apply(lambda x:process_analysis(x['analysis']),axis=1)
test_analysis = pd.read_json(f"{analysis_path}/test_explanations_{story_in_use}.json").rename(columns={0: "analysis"})
test_analysis['analysis'] = test_analysis.apply(lambda x:process_analysis(x['analysis']),axis=1)


In [14]:

train_df = create_data(train_df, text_preprocess, score_conversion_func, train_analysis)
dev_df = create_data(dev_df, text_preprocess, score_conversion_func, dev_analysis)
test_df = create_data(test_df, text_preprocess, score_conversion_func, test_analysis)


In [15]:
import datasets
from datasets import Dataset

In [16]:
optimized_similarity_system_prompt = """
We define empathy as feeling, understanding, and relating to what another person is experiencing.
Note that it is possible to have empathy even without sharing the exact same experience or circumstance.
Your task is to measure the empathic similarity of the given two stories.
Importantly, for two stories to be empathetically similar, both narrators should be able to empathize with each other (if narrator A’s story was shared in response to narrator B’s story, narrator B would empathize with narrator A and vice versa).
Give your answer on a scale from 1-4 (1-not at all, 2-not so much, 3-very much, 4-extremely), with 0.5 increments in each level between 1-4 are allowed.
You should first analyze the two stories, and then, return the score in a JSON object.""".strip()

optimized_similarity_user_prompt = """
## Narrative A:
{story_a}

## Narrative B:
{story_b}
"""

optimized_similarity_response_prompt = """
## Analysis
{analysis}

## Similarity Score:
```
{{"score": {score}}}
```
"""


In [17]:
from transformers import AutoTokenizer

In [18]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", padding_side='left')

In [19]:
tokenizer.padding_side

'left'

In [20]:

def promptify(x1, x2, analysis, labels):
    user_input = optimized_similarity_user_prompt.format(
        story_a=x1, story_b=x2
    ).strip()
    label = optimized_similarity_response_prompt.format(
        analysis=analysis, score=score_recover_func(labels[0])
    ).strip()
    model_input = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": optimized_similarity_system_prompt},
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": label},
        ],
        tokenize=False,
    )
    return {"text": model_input}

def create_dataset(df, concat_reverse=False, shuffle=False):
    score_columns = [f"score_{i}" for i in range(len(label_column))]
    dataset = (
        Dataset.from_pandas(df)
        .map(lambda x: promptify(x["sentence1"], x["sentence2"], x['analysis'], [x[s] for s in score_columns]))
        .remove_columns(["sentence1", "sentence2", "analysis"] + score_columns)
    )
    if concat_reverse:
        _ = (
            Dataset.from_pandas(df)
            .map(lambda x: promptify(x["sentence2"], x["sentence1"], x['analysis'], [x[s] for s in score_columns]))
            .remove_columns(["sentence1", "sentence2", "analysis"] + score_columns)
        )
        dataset = datasets.concatenate_datasets([dataset, _])
    if shuffle:
        dataset = dataset.shuffle()
    return dataset

In [21]:
train_dataset = create_dataset(train_df, concat_reverse=True, shuffle=True)
dev_dataset = create_dataset(dev_df, concat_reverse=False, shuffle=False)
test_dataset = create_dataset(test_df, concat_reverse=False, shuffle=False)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [22]:
from transformers import AutoModelForCausalLM
from peft import AutoPeftModelForCausalLM

In [23]:
model = AutoPeftModelForCausalLM.from_pretrained('./sft_llama3-emp_2gpus-summary-cot-3/')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [24]:
model = model.cuda(0).to(torch.bfloat16).eval()
model = model.base_model.merge_and_unload()

In [25]:
generation_config = GenerationConfig.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')
tokenizer.pad_token_id = tokenizer.added_tokens_encoder[
    "<|reserved_special_token_0|>"
]
tokenizer.padding_side = "left"

generation_config.max_length = 2048

generation_config.temperature = 0.0
generation_config.pad_token_id = tokenizer.pad_token_id
generation_config.do_sample = False
model.generation_config.pad_token_id=tokenizer.pad_token_id

In [26]:
def predict_batch(samples):
    model_inputs, labels = list(zip(*[ sample.split('<|start_header_id|>assistant<|end_header_id|>') for sample in samples['text']]))
    model_inputs = [ x+'<|start_header_id|>assistant<|end_header_id|>' for x in model_inputs]
    labels = [ label.strip() for label in labels]
    model_inputs = tokenizer(model_inputs, add_special_tokens=False, return_tensors='pt', padding=True)
    
    response = model.generate(**{ k:v.cuda(0) for k,v in model_inputs.items()}, generation_config=generation_config, pad_token_id=tokenizer.eos_token_id)
    predictions = [ tokenizer.decode(result[result!=tokenizer.pad_token_id]) for result in response]
    # predictions = [ tokenizer.decode(response[i][model_inputs.input_ids[i].masked_select(model_inputs.input_ids[i]!=tokenizer.pad_token_id).shape[0]:], skip_special_tokens=True) for i in range(len(labels))]
    # predicted_scores = [ multi_scores.split() for multi_scores in predicted_scores]
    # labels = [ multi_labels.split() for multi_labels in labels]
    return predictions, labels

In [27]:
# predictions, labels = predict_batch(dev_dataset[:3])

In [28]:
import re

In [29]:
all_equal = lambda x: all([ x[0] == z for z in x])

In [30]:
def parse(text):
    text = text.split('<|start_header_id|>assistant<|end_header_id|>')[-1]
    if len(text.split('```')) == 3:
        predicted_score = list(eval(text.split('```')[1]).values())[0]
        return predicted_score
    else:
        possible_values = re.findall('(1.0|1.5|2.0|2.5|3.0|3.5|4.0)', text)
        if len(possible_values)>0:
            all_values = [ eval(x) for x in possible_values]
            if all_equal(all_values):
                return all_values[0]
            else:
                return all_values[-1]
        else:
            print(text)
            return 1.0
        

In [31]:
model.generation_config.padding_side='left'

In [32]:
import tqdm

In [33]:
def make_prediction(dataset, bsz=20):
    predictions = []
    labels = []
    for i in tqdm.tqdm(range(len(dataset) // bsz + 1)):
        batch = dataset[i*bsz:(i+1)*bsz]
        if len(batch['text']) == 0:
            break
        predicted_score, label = predict_batch(batch)
    # label = [ parse(x) for x in label]
    # predicted_score = [ parse(x) for x in predicted_score]
        predictions.extend(predicted_score)
        labels.extend(label)
    # bin_labels = [ [int(float(x)>2.5) for x in multi_labels] for multi_labels in labels]
    # bin_prediction = [ [int(float(x)>2.5) for x in multi_predictions ] for multi_predictions in predictions]
    # float_predictions = [ [float(x) for x in multi_predictions ] for multi_predictions in predictions]
    # float_labels = [ [float(x) for x in multi_labels ] for multi_labels in labels]
    return predictions, labels

In [35]:
def evaluate(predictions, labels):
    bin_labels = [ int(float(x)>2.5) for x in labels]
    bin_prediction = [ int(float(x)>2.5) for x in predictions]
    float_predictions = predictions
    float_labels = labels
    cls_dfs = pd.DataFrame(classification_report(bin_labels, bin_prediction, output_dict=True)).T
    spearmanr = scipy.stats.spearmanr(np.array(float_labels),np.array(float_predictions)).statistic
    pearsonr = scipy.stats.pearsonr(np.array(float_labels),np.array(float_predictions)).statistic
    mse = np.square(np.subtract(np.array(float_labels),np.array(float_predictions))).mean()
    sp_df = pd.DataFrame({'S': [spearmanr], 'P': [pearsonr], 'M': [mse]})
    return cls_dfs, sp_df, float_predictions, float_labels, bin_prediction, bin_labels

In [36]:
dev_predictions, dev_labels = make_prediction(dev_dataset)

 83%|█████████████████████████████████████▌       | 5/6 [03:35<00:43, 43.13s/it]


In [37]:
dev_predictions = [ parse(x) for x in dev_predictions]

In [38]:
dev_labels = [ parse(x) for x in dev_labels]

In [41]:
dev_cls_df, dev_sp_df, predicted_dev_scores, dev_score, dev_bin_predictions, dev_bin_labels=evaluate(dev_predictions, dev_labels)

In [42]:
dev_cls_df.to_csv('./results/dev_cls.csv',float_format=lambda x:f'{x:.3f}')

In [43]:
dev_cls_df

Unnamed: 0,precision,recall,f1-score,support
0,0.829787,0.629032,0.715596,62.0
1,0.566038,0.789474,0.659341,38.0
accuracy,0.69,0.69,0.69,0.69
macro avg,0.697912,0.709253,0.687468,100.0
weighted avg,0.729562,0.69,0.694219,100.0


In [44]:
dev_sp_df.rename(columns={'S':'Spearman','P':"Pearson", 'M':'MSE'}).to_csv('./results/dev_reg.csv',float_format=lambda x:f'{x:.3f}')

In [45]:
print(dev_cls_df.to_latex(float_format=lambda x:f'{x:.3f}'))

\begin{tabular}{lrrrr}
\toprule
 & precision & recall & f1-score & support \\
\midrule
0 & 0.830 & 0.629 & 0.716 & 62.000 \\
1 & 0.566 & 0.789 & 0.659 & 38.000 \\
accuracy & 0.690 & 0.690 & 0.690 & 0.690 \\
macro avg & 0.698 & 0.709 & 0.687 & 100.000 \\
weighted avg & 0.730 & 0.690 & 0.694 & 100.000 \\
\bottomrule
\end{tabular}



In [46]:
dev_sp_df

Unnamed: 0,S,P,M
0,0.341992,0.32472,0.6175


In [47]:
print(dev_sp_df.rename(columns={'S':'Spearman','P':"Pearson", 'M':'MSE'}).to_latex(float_format=lambda x:f'{x:.3f}'))

\begin{tabular}{lrrr}
\toprule
 & Spearman & Pearson & MSE \\
\midrule
0 & 0.342 & 0.325 & 0.618 \\
\bottomrule
\end{tabular}



In [48]:
test_predictions, test_labels = make_prediction(test_dataset)

 95%|████████████████████████████████████████▉  | 20/21 [15:42<00:47, 47.14s/it]


In [57]:
test_predictions = [ parse(x) for x in test_predictions]

In [50]:
test_labels = [ parse(x) for x in test_labels]

In [58]:
test_cls_df, test_sp_df, predicted_test_scores, test_score, test_bin_predictions, test_bin_labels=evaluate(test_predictions, test_labels)

In [59]:
test_cls_df

Unnamed: 0,precision,recall,f1-score,support
0,0.640845,0.466667,0.540059,195.0
1,0.596899,0.75122,0.665227,205.0
accuracy,0.6125,0.6125,0.6125,0.6125
macro avg,0.618872,0.608943,0.602643,400.0
weighted avg,0.618323,0.6125,0.604208,400.0


In [53]:
test_cls_df.to_csv('./results/test_cls.csv',float_format=lambda x:f'{x:.3f}')

In [54]:
print(test_cls_df.to_latex(float_format=lambda x:f'{x:.3f}'))

\begin{tabular}{lrrrr}
\toprule
 & precision & recall & f1-score & support \\
\midrule
0 & 0.641 & 0.467 & 0.540 & 195.000 \\
1 & 0.597 & 0.751 & 0.665 & 205.000 \\
accuracy & 0.613 & 0.613 & 0.613 & 0.613 \\
macro avg & 0.619 & 0.609 & 0.603 & 400.000 \\
weighted avg & 0.618 & 0.613 & 0.604 & 400.000 \\
\bottomrule
\end{tabular}



In [60]:
test_sp_df

Unnamed: 0,S,P,M
0,0.289461,0.298746,0.651875


In [55]:
test_sp_df.rename(columns={'S':'Spearman','P':"Pearson"}).to_csv('./results/test_reg.csv',float_format=lambda x:f'{x:.3f}')

In [56]:
print(test_sp_df.rename(columns={'S':'Spearman','P':"Pearson"}).to_latex(float_format=lambda x:f'{x:.3f}'))

\begin{tabular}{lrr}
\toprule
 & Spearman & Pearson \\
\midrule
0 & 0.289 & 0.299 \\
\bottomrule
\end{tabular}



In [82]:
# fig,axs = plt.subplots(2,2)
# axs[0,0].hist(dev_score)
# axs[0,1].hist(test_score)
# axs[1,0].hist(predicted_dev_scores)
# axs[1,1].hist(predicted_test_scores)