In [72]:
from datasets import load_dataset
from metrics import exact_match, calculate_positional_divergence
from scipy.stats import pearsonr


# News

In [75]:
# Load the annotated news examples from a local JSON-lines file and take the
# 'train' split of the returned dataset dictionary.
news_data = load_dataset(
    'json',
    data_files="news_annotations.jsonl",
)['train']

def filter_nan(datapoint):
    """Return True when no field of ``datapoint`` is ``None``.

    Despite the name, only ``None`` values are detected; float NaN values are
    not treated as missing.

    Args:
        datapoint: Mapping of field name to value for a single example.

    Returns:
        bool: False if any value is ``None``, True otherwise (an empty
        mapping yields True).
    """
    # Use an identity test rather than `== None`: equality dispatches to the
    # value's own __eq__, so objects that override it can be misclassified
    # (or, for element-wise comparisons such as NumPy arrays, raise when the
    # result is used as a condition).
    return all(value is not None for value in datapoint.values())

# Keep only the examples that pass filter_nan, i.e. those with no None field.
# Note that this rebinds `news_data`, so the unfiltered dataset is no longer
# reachable under that name after this line runs.
news_data = news_data.filter(function=filter_nan)


Found cached dataset json (/home/yinhong/.cache/huggingface/datasets/json/default-c6058db0efc681fd/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/yinhong/.cache/huggingface/datasets/json/default-c6058db0efc681fd/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-6c02e425b4d3aac1.arrow


In [102]:
 
# Stack the two prediction columns (v1, then v2) into one list and repeat the
# reference column so both halves are compared against the same references.
pred_discourse = [*news_data['discourse_v1'], *news_data['discourse_v2']]
ref_discourse = news_data['discourse'] * 2
# Element 1 of each score entry is lined up with the v1 half, element 2 with
# the v2 half.
# NOTE(review): these "human" scores are read from the `gpt_scores` column —
# confirm that the variable name reflects what the column actually holds.
human_scores = [entry[idx] for idx in (1, 2) for entry in news_data['gpt_scores']]


def evaluate_corr(pred_discourse, ref_discourse, num_class=8, num_bins_default=5, scores=None):
    """Correlate per-example discourse metrics with a list of scores.

    For every (prediction, reference) pair this computes ``exact_match`` and
    the normalized ``calculate_positional_divergence`` value, then prints the
    Pearson correlation coefficient and p-value of each metric against
    ``scores``.

    Args:
        pred_discourse: Predictions, one entry per example.
        ref_discourse: References, aligned one-to-one with ``pred_discourse``.
        num_class: Forwarded to ``calculate_positional_divergence``.
        num_bins_default: Forwarded to ``calculate_positional_divergence``.
        scores: One score per example. When omitted, the notebook-level
            ``human_scores`` variable is used, which is what this function
            previously always read implicitly.

    Returns:
        tuple: ``(acc_list, pos_div_list)``, the per-example exact-match and
        positional-divergence values, so callers can reuse them without
        depending on leftover kernel state.

    Raises:
        ValueError: If ``pred_discourse`` and ``ref_discourse`` differ in
            length (``zip`` would otherwise silently drop the extra items).
    """
    if scores is None:
        # Backward-compatible fallback to the notebook global.
        scores = human_scores

    preds = list(pred_discourse)
    refs = list(ref_discourse)
    if len(preds) != len(refs):
        raise ValueError(
            f"pred_discourse and ref_discourse must have the same length, "
            f"got {len(preds)} and {len(refs)}"
        )

    acc_list = []
    pos_div_list = []
    for pred, ref in zip(preds, refs):
        acc_list.append(exact_match(preds=pred, refs=ref))
        # The divergence metric takes batches, so each pair is wrapped in a
        # single-element list to obtain a per-example value.
        pos_div_list.append(
            calculate_positional_divergence(
                predictions=[pred],
                references=[ref],
                num_class=num_class,
                num_bins_default=num_bins_default,
                return_normalized_score=True,
            )
        )

    correlation_coefficient, p_value = pearsonr(scores, acc_list)
    print('Corr(Human, Exact match)=', correlation_coefficient, p_value)
    correlation_coefficient, p_value = pearsonr(scores, pos_div_list)
    print('Corr(Human, Pos. Div.)=', correlation_coefficient, p_value)

    return acc_list, pos_div_list


Corr(Human, Exact match)= 0.22739878199237906 0.0008523712423231125
Corr(Human, Pos. Div.)= 0.20758824753077437 0.0023829481926786584


In [87]:
# Sanity check: the score list and the reference list should line up one-to-one.
len(human_scores), len(ref_discourse)

(212, 212)

In [88]:
# Pearson correlation between the scores and the per-example exact-match values.
# NOTE(review): in the visible notebook `acc_list` is only assigned inside
# `evaluate_corr`; this cell presumably relies on a kernel-level variable left
# over from an earlier version — confirm it survives Restart & Run All.
correlation_coefficient, p_value = pearsonr(human_scores, acc_list)
correlation_coefficient, p_value

(0.22739878199237906, 0.0008523712423231125)

In [101]:
# Pearson correlation between the scores and the per-example positional
# divergence values.
# NOTE(review): in the visible notebook `pos_div_list` is only assigned inside
# `evaluate_corr`; this cell presumably relies on a kernel-level variable left
# over from an earlier version — confirm it survives Restart & Run All.
correlation_coefficient, p_value = pearsonr(human_scores, pos_div_list)
correlation_coefficient, p_value

(0.20758824753077437, 0.0023829481926786584)