In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import torch
import numpy as np
import scipy
from sklearn import metrics
import matplotlib.pyplot as plt
import random
from sklearn.metrics import classification_report
from transformers import GenerationConfig
import torch

In [3]:
from transformers.utils import logging
logging.get_logger("transformers").setLevel(logging.ERROR)

In [4]:
data_path = './empathic-stories/data/'

In [5]:
train_df = pd.read_csv(f'{data_path}/PAIRS (train).csv')
dev_df = pd.read_csv(f'{data_path}/PAIRS (dev).csv')
test_df = pd.read_csv(f'{data_path}/PAIRS (test).csv')

In [6]:

label_columns = "similarity_empathy_human_AGG	similarity_event_human_AGG	similarity_emotion_human_AGG	similarity_moral_human_AGG".split()[::-1]
text_preprocess = lambda x: x.strip().replace("\n", " ")

In [7]:
import trl

In [8]:
from trl.commands.cli_utils import SftScriptArguments

In [9]:
story_in_use = 'summary'
label_in_use = 'empathy'

In [10]:

text_columns = {
    "summary": ["story_A_summary", "story_B_summary"],
    "full": ["story_A", "story_B"],
}.get(story_in_use)
label_column = {
    "empathy": ["similarity_empathy_human_AGG"],
    "event": ["similarity_event_human_AGG"],
    "emotion": ["similarity_emotion_human_AGG"],
    "moral": ["similarity_moral_human_AGG"],
    "all": label_columns,
}.get(label_in_use)

In [11]:
score_conversion_funcs = {
    "none": lambda x:x,
    "original_paper": lambda x: x / 4,
    "01_continue": lambda x:(x - 1) / 3,
}
score_recover_funcs = {
    "none": lambda x:x,
    "original_paper": lambda x: x * 4,
    "01_continue": lambda x:(x * 3) + 1,
}

In [12]:
# Choose score conversion method
score_conversion_in_use = "none"

In [13]:
score_conversion_func = score_conversion_funcs.get(score_conversion_in_use)
score_recover_func = score_recover_funcs.get(score_conversion_in_use)

In [14]:

def create_data(df, text_pps, score_conversion_funcs):
    required_columns = text_columns + label_column
    score_names = [f"score_{i}" for i in range(len(label_column))]
    df = df[required_columns].rename(
        columns={
            k: v
            for k, v in zip(
                required_columns, ["sentence1", "sentence2"] + score_names
            )
        }
    )
    for i in [1, 2]:
        df[f"sentence{i}"] = df[f"sentence{i}"].apply(text_pps)
    for i in range(len(label_column)):
        df[f"score_{i}"] = score_conversion_funcs(df[f"score_{i}"])
    return df

In [15]:
train_df = create_data(train_df, text_preprocess, score_conversion_func)
dev_df = create_data(dev_df, text_preprocess, score_conversion_func)
test_df = create_data(test_df, text_preprocess, score_conversion_func)

In [16]:
import datasets
from datasets import Dataset

In [17]:
optimized_similarity_system_prompt = """Rate the extent to which you agree with the statement "the narrators of the two stories would empathize with each other." We define empathy as feeling, understanding, and relating to what another person is experiencing. Note that it is possible to have empathy even without sharing the exact same experience or circumstance. Importantly, for two stories to be empathetically similar, both narrators should be able to empathize with each other (if narrator A’s story was shared in response to narrator B’s story, narrator B would empathize with narrator A and vice versa). Give your answer on a scale from 1-4 (1-not at all, 2-not so much, 3-very much, 4-extremely), with 0.5 increments in each level between 1-4 are allowed. Please only return the score without any explanation.""".strip()

In [18]:

optimized_similarity_user_prompt = """
### Narrative A:
{story_a}

### Narrative B:
{story_b}

### Similarity Score:
"""

In [19]:
from transformers import AutoTokenizer

In [20]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

In [21]:
# tokenizer.padding_side

In [22]:

def promptify(x1, x2, labels):
    user_input = optimized_similarity_user_prompt.format(
        story_a=x1, story_b=x2
    ).strip()
    label = ' '.join([str(score_conversion_func(l)) for l in labels])
    model_input = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": optimized_similarity_system_prompt},
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": label},
        ],
        tokenize=False,
    )
    return {"text": model_input}

def create_dataset(df, concat_reverse=False, shuffle=False):
    score_columns = [f"score_{i}" for i in range(len(label_column))]
    dataset = (
        Dataset.from_pandas(df)
        .map(lambda x: promptify(x["sentence1"], x["sentence2"], [x[s] for s in score_columns]))
        .remove_columns(["sentence1", "sentence2"] + score_columns)
    )
    if concat_reverse:
        _ = (
            Dataset.from_pandas(df)
            .map(lambda x: promptify(x["sentence2"], x["sentence1"], [x[s] for s in score_columns]))
            .remove_columns(["sentence1", "sentence2"] + score_columns)
        )
        dataset = datasets.concatenate_datasets([dataset, _])
    if shuffle:
        dataset = dataset.shuffle()
    return dataset


In [23]:
train_dataset = create_dataset(train_df, concat_reverse=True, shuffle=True)
dev_dataset = create_dataset(dev_df, concat_reverse=False, shuffle=False)
test_dataset = create_dataset(test_df, concat_reverse=False, shuffle=False)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [24]:
from transformers import AutoModelForCausalLM
from peft import AutoPeftModelForCausalLM

In [25]:
model = AutoPeftModelForCausalLM.from_pretrained('./sft_llama3-emp_2gpus-summary-3/')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [27]:
DEVICE='cuda:0'

In [28]:
model = model.to(DEVICE).to(torch.bfloat16).eval()
model = model.base_model.merge_and_unload()


In [29]:
generation_config = GenerationConfig.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')
tokenizer.pad_token_id = tokenizer.added_tokens_encoder[
    "<|reserved_special_token_0|>"
]
tokenizer.padding_side = "left"

generation_config.max_length = 2048

generation_config.temperature = 0.0
generation_config.pad_token_id = tokenizer.pad_token_id
generation_config.do_sample = False
model.generation_config.pad_token_id=tokenizer.pad_token_id

In [30]:
def test_predict_batch(samples):
    model_inputs, labels = zip(*[ sample.split('<|start_header_id|>assistant<|end_header_id|>') for sample in samples['text']])
    model_inputs = [ x+'<|start_header_id|>assistant<|end_header_id|>' for x in model_inputs]
    # model_input, label = sample['text'].split('<|start_header_id|>assistant<|end_header_id|>')
    labels = [ label.strip().split('<')[0] for label in labels]
    model_inputs = tokenizer(model_inputs, add_special_tokens=False, return_tensors='pt', padding=True)
    
    response = model.generate(**{ k:v.to(DEVICE) for k,v in model_inputs.items()}, generation_config=generation_config, pad_token_id=tokenizer.eos_token_id)
    return model_inputs, response

In [31]:
model_inputs, response = test_predict_batch(dev_dataset[:1])



In [33]:
def predict_batch(samples):
    model_inputs, labels = zip(*[ sample.split('<|start_header_id|>assistant<|end_header_id|>') for sample in samples['text']])
    # model_input, label = sample['text'].split('<|start_header_id|>assistant<|end_header_id|>')
    labels = [ label.strip().split('<')[0] for label in labels]
    model_inputs = tokenizer(model_inputs, add_special_tokens=False, return_tensors='pt', padding=True)
    
    response = model.generate(**{ k:v.to(DEVICE) for k,v in model_inputs.items()}, generation_config=generation_config, pad_token_id=tokenizer.eos_token_id)
    predicted_scores = [ tokenizer.decode(response[i][model_inputs.input_ids[i].masked_select(model_inputs.input_ids[i]!=tokenizer.pad_token_id).shape[0]:]).split('\n')[-1].split('<')[0] for i in range(len(labels))]
    predicted_scores = [ multi_scores.split() for multi_scores in predicted_scores]
    labels = [ multi_labels.split() for multi_labels in labels]
    return predicted_scores, labels

In [34]:
model.generation_config.padding_side='left'

In [35]:
import tqdm

In [36]:
def make_prediction(dataset, bsz=20):
    predictions = []
    labels = []
    for i in tqdm.tqdm(range(len(dataset) // bsz + 1)):
        batch = dataset[i*bsz:(i+1)*bsz]
        if len(batch['text']) == 0:
            break
        predicted_score, label = predict_batch(batch)
        predictions.extend(predicted_score)
        labels.extend(label)
    bin_labels = [ [int(float(x)>2.5) for x in multi_labels] for multi_labels in labels]
    bin_prediction = [ [int(float(x)>2.5) for x in multi_predictions ] for multi_predictions in predictions]
    float_predictions = [ [float(x) for x in multi_predictions ] for multi_predictions in predictions]
    float_labels = [ [float(x) for x in multi_labels ] for multi_labels in labels]
    return float_predictions, float_labels, bin_prediction, bin_labels

In [37]:
# float_predictions, float_labels, bin_prediction, bin_labels = make_prediction(dev_dataset)

In [39]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [45]:
def evaluate(dataset):
    float_predictions, float_labels, bin_prediction, bin_labels = make_prediction(dataset)
    cls_dfs = [ pd.DataFrame(classification_report(bin_label, bin_prediction, output_dict=True)).T for bin_label, bin_prediction in zip(zip(*bin_prediction), zip(*bin_labels))]
    cls_metrics = [ precision_recall_fscore_support(bin_label, bin_prediction, average="macro") for bin_label, bin_prediction in zip(zip(*bin_prediction), zip(*bin_labels))]
    precision, recall, F1, support = cls_metrics[0]
    acc_metrics = [ accuracy_score(bin_label, bin_prediction) for bin_label, bin_prediction in zip(zip(*bin_prediction), zip(*bin_labels))]
    accuracy = acc_metrics[0]
    spearmanrs = [scipy.stats.spearmanr(np.array(float_label),np.array(float_prediction)).statistic for float_label, float_prediction in zip(zip(*float_predictions), zip(*float_labels))]
    pearsonrs = [ scipy.stats.pearsonr(np.array(float_label),np.array(float_prediction)).statistic  for float_label, float_prediction in zip(zip(*float_predictions), zip(*float_labels))]
    mses = [  np.square(np.subtract(np.array(float_label),np.array(float_prediction))).mean()  for float_label, float_prediction in zip(zip(*float_predictions), zip(*float_labels))]
    sp_df = pd.DataFrame({'S': spearmanrs, 'P': pearsonrs, 'M':mses})
    metrics = {
        "accuracy": round(accuracy, 3),
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "F1": round(F1, 3),
    }
    return cls_dfs, sp_df, float_predictions, float_labels, bin_prediction, bin_labels, metrics

In [46]:
dev_cls_df, dev_sp_df, predicted_dev_scores, dev_score, dev_bin_predictions, dev_bin_labels, dev_metrics=evaluate(dev_dataset)

 83%|█████████████████████████████████████▌       | 5/6 [00:08<00:01,  1.63s/it]


In [38]:
dev_cls_df[0].to_csv('./results/dev_cls.csv',float_format=lambda x:f'{x:.3f}')

In [47]:
dev_cls_df[0]

Unnamed: 0,precision,recall,f1-score,support
0,0.887097,0.797101,0.839695,69.0
1,0.631579,0.774194,0.695652,31.0
accuracy,0.79,0.79,0.79,0.79
macro avg,0.759338,0.785647,0.767673,100.0
weighted avg,0.807886,0.79,0.795041,100.0


In [48]:
dev_metrics

{'accuracy': 0.79, 'precision': 0.759, 'recall': 0.786, 'F1': 0.768}

In [50]:
print(dev_cls_df[0].to_latex(float_format=lambda x:f'{x:.3f}'))

\begin{tabular}{lrrrr}
\toprule
 & precision & recall & f1-score & support \\
\midrule
0 & 0.887 & 0.797 & 0.840 & 69.000 \\
1 & 0.632 & 0.774 & 0.696 & 31.000 \\
accuracy & 0.790 & 0.790 & 0.790 & 0.790 \\
macro avg & 0.759 & 0.786 & 0.768 & 100.000 \\
weighted avg & 0.808 & 0.790 & 0.795 & 100.000 \\
\bottomrule
\end{tabular}



In [40]:
dev_sp_df

Unnamed: 0,S,P
0,0.469965,0.475901


In [41]:
dev_sp_df.rename(columns={'S':'Spearman','P':"Pearson"}).to_csv('./results/dev_reg.csv',float_format=lambda x:f'{x:.3f}')

In [51]:
print(dev_sp_df.rename(columns={'S':'Spearman','P':"Pearson", "M": "MSE"}).to_latex(float_format=lambda x:f'{x:.3f}'))

\begin{tabular}{lrrr}
\toprule
 & Spearman & Pearson & MSE \\
\midrule
0 & 0.470 & 0.476 & 0.497 \\
\bottomrule
\end{tabular}



In [52]:
test_cls_df, test_sp_df, predicted_test_scores, test_score, test_bin_predictions, test_bin_labels, test_metrics=evaluate(test_dataset)

 95%|████████████████████████████████████████▉  | 20/21 [00:33<00:01,  1.66s/it]


In [53]:
test_cls_df[0]

Unnamed: 0,precision,recall,f1-score,support
0,0.661538,0.578475,0.617225,223.0
1,0.541463,0.627119,0.581152,177.0
accuracy,0.6,0.6,0.6,0.6
macro avg,0.601501,0.602797,0.599188,400.0
weighted avg,0.608405,0.6,0.601263,400.0


In [45]:
test_cls_df[0].to_csv('./results/test_cls.csv',float_format=lambda x:f'{x:.3f}')

In [54]:
print(test_cls_df[0].to_latex(float_format=lambda x:f'{x:.3f}'))

\begin{tabular}{lrrrr}
\toprule
 & precision & recall & f1-score & support \\
\midrule
0 & 0.662 & 0.578 & 0.617 & 223.000 \\
1 & 0.541 & 0.627 & 0.581 & 177.000 \\
accuracy & 0.600 & 0.600 & 0.600 & 0.600 \\
macro avg & 0.602 & 0.603 & 0.599 & 400.000 \\
weighted avg & 0.608 & 0.600 & 0.601 & 400.000 \\
\bottomrule
\end{tabular}



In [47]:
test_sp_df.rename(columns={'S':'Spearman','P':"Pearson"})

Unnamed: 0,Spearman,Pearson
0,0.30706,0.320428


In [48]:
test_sp_df.rename(columns={'S':'Spearman','P':"Pearson"}).to_csv('./results/test_reg.csv',float_format=lambda x:f'{x:.3f}')

In [55]:
print(test_sp_df.rename(columns={'S':'Spearman','P':"Pearson", "M": "MSE"}).to_latex(float_format=lambda x:f'{x:.3f}'))

\begin{tabular}{lrrr}
\toprule
 & Spearman & Pearson & MSE \\
\midrule
0 & 0.307 & 0.320 & 0.643 \\
\bottomrule
\end{tabular}

