# analysis

## check annotation
After requesting re-annotation, there should be no errors.

In [1]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, f1_score
from sklearn.feature_selection import r_regression
import numpy as np
import scipy.stats as stats
from typing import List
import json
from util import read_excelfile, detect_annotation_errors

In [2]:
# Load the task-2 annotation sheets of the three participants.
# read_excelfile is imported from the project-local util module.
df_p1 = read_excelfile('data/participant1_task2.xlsx')
df_p2 = read_excelfile('data/participant2_task2.xlsx')
df_p3 = read_excelfile('data/participant3_task2.xlsx')

In [3]:
# Every task-2 sheet must be free of annotation errors before it is analysed.
# detect_annotation_errors returns a mapping whose values are collections of
# problems; the check passes only when all of them are empty.
for df in (df_p1, df_p2, df_p3):
    errors = detect_annotation_errors(df)
    has_errors = any(len(found) != 0 for found in errors.values())
    assert not has_errors, f'There are annotation errors: {errors}'
print('INFO: there are no annotation errors')

INFO: there are no annotation errors


## Score and T-test

### load data
- order: human_eval_order.json
- df_p1_r1: round-1 data of participant 1
- df_p2_r1: round-1 data of participant 2
- df_p3_r1: round-1 data of participant 3

In [4]:
# Presentation order of the systems per summary (keys 'user' and 'agent' are
# read later by read_data). The context manager closes the file handle, which
# the bare json.load(open(...)) form left open.
with open('data/human_eval_order.json', encoding='utf-8') as order_file:
    order = json.load(order_file)

In [5]:
# Load the task-1 sheets (called "round 1" in the notes above), one per
# participant. Each of them is combined with the task-2 sheets in the
# Case 1-3 cells below.
df_p1_r1 = read_excelfile('data/participant1_task1.xlsx')
df_p2_r1 = read_excelfile('data/participant2_task1.xlsx')
df_p3_r1 = read_excelfile('data/participant3_task1.xlsx')

Any potential errors in round-1 annotations should have been found in notebook 1.

### functions

In [6]:
def extract_criterion(criterion, data) -> List[float]:
    """Collect the scores found in one criterion column.

    Cells equal to the criterion name itself are skipped (presumably repeated
    header rows inside the sheet). A string cell is read as a comma-separated
    list of ratings -- ASCII or full-width commas, spaces ignored -- and
    contributes the mean of those ratings. Any other cell is taken as a
    single rating and appended unchanged.

    Args:
        criterion: 'Informativeness', 'Non-redundancy' or 'Fluency'.
        data: object indexable by criterion name (e.g. a DataFrame).

    Returns:
        The scores in row order.

    Raises:
        AssertionError: unknown criterion, or a rating outside {0, 1, 2}.
    """
    valid_ratings = [0, 1, 2]
    assert criterion in ['Informativeness', 'Non-redundancy', 'Fluency']

    def _score(cell):
        # Non-string cells already hold one rating.
        if not isinstance(cell, str):
            assert cell in valid_ratings
            return cell
        # String cells hold several ratings; normalise separators first.
        tokens = cell.replace(' ', '').replace('，', ',').split(',')
        ratings = [int(tok) for tok in tokens]
        assert all(r in valid_ratings for r in ratings)
        return sum(ratings) / len(ratings)

    return [_score(cell) for cell in list(data[criterion]) if cell != criterion]

def split_list(my_list, n):
    """Yield consecutive slices of ``my_list`` holding at most ``n`` items.

    The last slice is shorter when ``len(my_list)`` is not a multiple of n.
    """
    starts = range(0, len(my_list), n)
    yield from (my_list[start:start + n] for start in starts)
        
def c_order(order, data):
    """Map four positional scores onto the model that produced each of them.

    Args:
        order: two pairs, ``order[0]`` for PGN and ``order[1]`` for BERT.
            Each pair is ``[0, 1]`` (multi shown first) or ``[1, 0]`` (both
            shown first). Any sequence type is accepted (list, tuple, ...).
        data: four scores; the first two belong to PGN, the last two to BERT.

    Returns:
        Dict with keys 'PGN-multi', 'PGN-both', 'BERT-multi', 'BERT-both'.

    Raises:
        AssertionError: ``order`` is not 2x2 or ``data`` does not hold 4 items.
        ValueError: a pair is neither ``[0, 1]`` nor ``[1, 0]``.
    """
    # first batch
    # [1,0] = [PGN-both, PGN-multi]
    # [0,1] = [PGN-multi, PGN-both]
    # second batch
    # [1,0] = [BERT-both, BERT-multi]
    # [0,1] = [BERT-multi, BERT-both]
    # ie
    # [[1, 0], [0, 1]] = [PGN-both, PGN-multi, BERT-multi, BERT-both]
    # [[0, 1], [1, 0]] = [PGN-multi, PGN-both, BERT-both, BERT-multi]
    # [[1, 0], [1, 0]] = [PGN-both, PGN-multi, BERT-both, BERT-multi]
    # [[0, 1], [0, 1]] = [PGN-multi, PGN-both, BERT-multi, BERT-both]

    assert len(order) == len(order[0]) == len(order[1]) == 2
    assert len(data) == 4

    def _multi_both(flags, first, second, architecture):
        # Return (multi, both) for one architecture. Normalising to a list
        # lets tuples compare equal to the expected patterns; anything else
        # is reported explicitly instead of leaving a local unbound.
        flags = list(flags)
        if flags == [0, 1]:
            return first, second
        if flags == [1, 0]:
            return second, first
        raise ValueError(
            f"Invalid {architecture} order {flags!r}: expected [0, 1] or [1, 0]"
        )

    # order[0] is PGN
    pgn_multi, pgn_both = _multi_both(order[0], data[0], data[1], 'PGN')
    # order[1] is BERT
    bert_multi, bert_both = _multi_both(order[1], data[2], data[3], 'BERT')
    return {'PGN-multi': pgn_multi, 'PGN-both': pgn_both,
            'BERT-multi': bert_multi, 'BERT-both': bert_both}

In [7]:
def read_data(data: pd.DataFrame, order):
    """Build one row per summary with a score column per criterion/model/role.

    For each criterion the column of ``data`` is cut into groups of eight
    scores (2 architectures x 2 models x 2 roles). Each group is split into
    user and agent halves, and c_order maps every half to model names using
    the matching entry of ``order``.

    Args:
        data: annotation sheet(s) holding the three criterion columns.
        order: mapping with 'user' and 'agent' lists, one order entry per
            summary.

    Returns:
        DataFrame with columns '<criterion>_<model>_<role>' (all user columns
        first, then all agent columns), followed by 'Overall_<model>_<role>'
        columns holding the mean of the three criteria.

    Raises:
        AssertionError: the data does not form exactly 100 complete groups of
            eight, or the order lists have a different length.
    """
    criteria = ['Informativeness', 'Non-redundancy', 'Fluency']
    roles = ('user', 'agent')
    frames_by_role = {role: [] for role in roles}

    for criterion in criteria:
        # There are 2 architectures (PGN, BERT), 2 models (multi, both) and
        # 2 roles (agent, user): 8 values per summary.
        per_summary = list(split_list(extract_criterion(criterion, data), 8))
        assert len(per_summary) == 100
        assert all(len(group) == 8 for group in per_summary)
        assert len(per_summary) == len(order['user']) == len(order['agent'])

        rows_by_role = {role: [] for role in roles}
        for group, user_order, agent_order in zip(per_summary, order['user'], order['agent']):
            # User comes before agent (see documentation on Google Drive)
            user_scores, agent_scores = group[0:4], group[4:8]
            assert len(user_scores) == len(agent_scores) == 4
            rows_by_role['user'].append(c_order(user_order, user_scores))
            rows_by_role['agent'].append(c_order(agent_order, agent_scores))

        for role in roles:
            frame = pd.DataFrame(rows_by_role[role])
            frame.columns = [f"{criterion}_{m}_{role}" for m in frame.columns]
            frames_by_role[role].append(frame)

    combined_df = pd.concat(
        [pd.concat(frames_by_role['user'], axis=1),
         pd.concat(frames_by_role['agent'], axis=1)],
        axis=1,
    )

    # Overall score: mean of the three criteria for each model and role.
    overall_columns = []
    for model in ['PGN-multi', 'PGN-both', 'BERT-multi', 'BERT-both']:
        for role in roles:
            source_cols = [f"{c}_{model}_{role}" for c in criteria]
            overall = combined_df[source_cols].mean(axis=1)
            overall_columns.append(overall.rename(f"Overall_{model}_{role}"))
    return pd.concat([combined_df] + overall_columns, axis=1)

In [8]:
def ttest(df) -> dict:
    """Run paired two-sided t-tests of the 'both' model against 'multi'.

    One test is run per criterion, architecture and role, comparing the
    columns '<criterion>_<arch>-both_<role>' and
    '<criterion>_<arch>-multi_<role>' of ``df``.

    Returns:
        Nested dict ``result[criterion][architecture][role]`` holding the
        object returned by ``scipy.stats.ttest_rel``.
    """
    def _paired(criterion, model, role):
        return stats.ttest_rel(df[f"{criterion}_{model}-both_{role}"],
                               df[f"{criterion}_{model}-multi_{role}"],
                               alternative='two-sided')

    return {
        criterion: {
            model: {role: _paired(criterion, model, role) for role in ('user', 'agent')}
            for model in ('PGN', 'BERT')
        }
        for criterion in ('Informativeness', 'Non-redundancy', 'Fluency', 'Overall')
    }

In [9]:
def format_number(value: float, p_value: float) -> str:
    """Format ``value`` with two decimals, adding '*' when |p_value| < 0.05."""
    marker = '*' if abs(p_value) < 0.05 else ''
    return f"{value:.2f}" + marker

def report_results(data_df: pd.DataFrame) -> None:
    """Print a tab-separated score table, one row per model.

    Each cell shows 'user/agent' values: the column mean divided by two,
    formatted by format_number with the p-value of the both-vs-multi t-test
    for that criterion, architecture and role. Both rows of an architecture
    therefore receive the same significance marker.
    """
    metrics = ('Informativeness', 'Non-redundancy', 'Fluency', 'Overall')
    roles = ('user', 'agent')
    mean_scores = (data_df.mean(axis=0) / 2).to_dict()
    tests = ttest(data_df)

    print('CSDS\t\tInfo\t\tNon-Red\t\tFlu\t\tOverall')
    for model in ('PGN-multi', 'PGN-both', 'BERT-multi', 'BERT-both'):
        # The t-tests are keyed by architecture only ('PGN' / 'BERT').
        architecture = model.split('-')[0]
        row = [model]
        for metric in metrics:
            per_role = []
            for role in roles:
                score = mean_scores[f'{metric}_{model}_{role}']
                p_value = tests[metric][architecture][role].pvalue
                per_role.append(format_number(score, p_value))
            row.append('/'.join(per_role))
        print('\t'.join(row))

## Case1
Data for 1-10 were annotated by Participant1

In [10]:
# Case 1: participant 1's task-1 sheet followed by the three task-2 sheets,
# parsed into per-summary scores and printed as a table.
all_data_r1p1 = pd.concat([df_p1_r1, df_p1, df_p2, df_p3])
df1 = read_data(all_data_r1p1, order)
report_results(df1)

CSDS		Info		Non-Red		Flu		Overall
PGN-multi	0.63/0.59*	0.58/0.55	0.69/0.70	0.63/0.61*
PGN-both	0.62/0.64*	0.61/0.59	0.68/0.74	0.64/0.65*
BERT-multi	0.55/0.45	0.69*/0.61	0.82/0.80	0.69*/0.62
BERT-both	0.56/0.47	0.62*/0.58	0.78/0.80	0.65*/0.62


## Case2
Data for 1-10 were annotated by Participant2.

In [11]:
# Case 2: participant 2's task-1 sheet followed by the three task-2 sheets,
# parsed into per-summary scores and printed as a table.
all_data_r1p2 = pd.concat([df_p2_r1, df_p1, df_p2, df_p3])
df2 = read_data(all_data_r1p2, order)
report_results(df2)

CSDS		Info		Non-Red		Flu		Overall
PGN-multi	0.62/0.58	0.57/0.56	0.68/0.69	0.62/0.61*
PGN-both	0.61/0.62	0.60/0.58	0.67/0.71	0.63/0.64*
BERT-multi	0.55/0.45	0.70*/0.60	0.82/0.78	0.69*/0.61
BERT-both	0.55/0.47	0.62*/0.57	0.78/0.78	0.65*/0.61


## Case3
Data for 1-10 were annotated by Participant3.

In [12]:
# Case 3: participant 3's task-1 sheet followed by the three task-2 sheets,
# parsed into per-summary scores and printed as a table.
all_data_r1p3 = pd.concat([df_p3_r1, df_p1, df_p2, df_p3])
df3 = read_data(all_data_r1p3, order)
report_results(df3)

CSDS		Info		Non-Red		Flu		Overall
PGN-multi	0.64/0.60*	0.59/0.58	0.69/0.72	0.64/0.63*
PGN-both	0.63/0.65*	0.62/0.60	0.68/0.75	0.64/0.67*
BERT-multi	0.57/0.46	0.72*/0.62	0.83/0.81	0.71*/0.63
BERT-both	0.57/0.49	0.63*/0.59	0.79/0.80	0.67*/0.63


## Case4
Data for 1-10 are the average of all participants' annotations.

In [13]:
# Case 4: element-wise mean of the three case frames. The frames differ only
# in which task-1 sheet they start from, since the task-2 sheets are shared.
df4 = (df1 + df2 + df3)/3

report_results(df4)

CSDS		Info		Non-Red		Flu		Overall
PGN-multi	0.63/0.59*	0.58/0.56	0.69/0.70	0.63/0.62*
PGN-both	0.62/0.64*	0.61/0.59	0.68/0.73	0.64/0.65*
BERT-multi	0.56/0.45	0.71*/0.61	0.82/0.80	0.70*/0.62
BERT-both	0.56/0.48	0.62*/0.58	0.78/0.79	0.66*/0.62


# Create Latex tables for the paper

In [14]:
def format_number_latex(value: float, p_value: float, other_value: float) -> str:
    """Format one table value for LaTeX.

    The value is shown with two decimals. It is set in bold when, after
    rounding both numbers to two decimals, it is at least as large as
    ``other_value``; a bold value also gets a '*' when |p_value| < 0.05.
    """
    shown = f'{value:.2f}'
    # Compare the rounded numbers so the highlighting matches what is printed.
    wins_or_ties = float(shown) >= float(f'{other_value:.2f}')
    if not wins_or_ties:
        return shown
    star = '*' if abs(p_value) < 0.05 else ''
    return r'\textbf{' + shown + star + r'}'

def get_latex_table(data_df: pd.DataFrame, case: int) -> str:
    r"""Render the tabular rows of one case as LaTeX source.

    The output starts with a ``\multicolumn`` header naming the case, then
    lists the multi and both rows of PGN and of BERT, with an ``\hline``
    after each architecture. Each cell holds 'user/agent' values formatted
    by format_number_latex, which compares a model with its counterpart
    (multi vs both) of the same architecture.

    Args:
        data_df: per-summary scores as returned by read_data.
        case: case number shown in the header row.

    Returns:
        The rows joined with newlines.
    """
    results = (data_df.mean(axis=0) / 2).to_dict()
    p_values = ttest(data_df)
    # Raw strings throughout: '\h' is not a valid escape sequence, so a plain
    # '\hline' literal triggers a SyntaxWarning on recent Python versions.
    lines = [r'\multicolumn{5}{c}{Case ' + str(case) + r'} \\', r'\hline']
    options = ('multi', 'both')
    for architecture in ('PGN', 'BERT'):
        for j, option in enumerate(options):
            # The counterpart model of the same architecture.
            other_option = options[(j + 1) % 2]
            cells = []
            for metric in ('Informativeness', 'Non-redundancy', 'Fluency', 'Overall'):
                role_numbers = []
                for role in ('user', 'agent'):
                    this_value = results[f'{metric}_{architecture}-{option}_{role}']
                    other_value = results[f'{metric}_{architecture}-{other_option}_{role}']
                    p_value = p_values[metric][architecture][role].pvalue
                    role_numbers.append(format_number_latex(this_value, p_value, other_value))
                cells.append('/'.join(role_numbers))
            lines.append(' & '.join([f'{architecture}-{option}'] + cells) + r' \\')
        lines.append(r'\hline')
    return '\n'.join(lines)

In [15]:
# Assemble the complete LaTeX results table: preamble, one body section per
# case, and the closing lines with the caption.
headers = ('CSDS', 'Info', 'Non-Red', 'Flu', 'Overall')
preamble = [
    r'\begin{table*}',
    r'\centering',
    r'\begin{tabular}{lllll}',
    ' & '.join([r'\textbf{' + el + '}' for el in headers]) + r' \\',
    # Raw string: '\h' is not a valid escape sequence in a normal literal.
    r'\hline',
]
body = [get_latex_table(v, k) for k, v in {1: df1, 2: df2, 3: df3, 4: df4}.items()]
closing = [
    r'\end{tabular}',
    r'\caption{\label{tab:results} '
    + r"Results of the present human evaluation under the four ``cases'' (see Section~\ref{sec:experimental_setup}).}",
    r'\end{table*}',
]
full_table = '\n'.join(preamble + body + closing)
print(full_table)

\begin{table*}
\centering
\begin{tabular}{lllll}
\textbf{CSDS} & \textbf{Info} & \textbf{Non-Red} & \textbf{Flu} & \textbf{Overall} \\
\hline
\multicolumn{5}{c}{Case 1} \\
\hline
PGN-multi & \textbf{0.63}/0.59 & 0.58/0.55 & \textbf{0.69}/0.70 & 0.63/0.61 \\
PGN-both & 0.62/\textbf{0.64*} & \textbf{0.61}/\textbf{0.59} & 0.68/\textbf{0.74} & \textbf{0.64}/\textbf{0.65*} \\
\hline
BERT-multi & 0.55/0.45 & \textbf{0.69*}/\textbf{0.61} & \textbf{0.82}/\textbf{0.80} & \textbf{0.69*}/\textbf{0.62} \\
BERT-both & \textbf{0.56}/\textbf{0.47} & 0.62/0.58 & 0.78/\textbf{0.80} & 0.65/\textbf{0.62} \\
\hline
\multicolumn{5}{c}{Case 2} \\
\hline
PGN-multi & \textbf{0.62}/0.58 & 0.57/0.56 & \textbf{0.68}/0.69 & 0.62/0.61 \\
PGN-both & 0.61/\textbf{0.62} & \textbf{0.60}/\textbf{0.58} & 0.67/\textbf{0.71} & \textbf{0.63}/\textbf{0.64*} \\
\hline
BERT-multi & \textbf{0.55}/0.45 & \textbf{0.70*}/\textbf{0.60} & \textbf{0.82}/\textbf{0.78} & \textbf{0.69*}/\textbf{0.61} \\
BERT-both & \textbf{0.55}/\textb

# Reproducibility scores

## Pearson correlation coefficient

In [16]:
def concatenate_values(data_df: pd.DataFrame) -> list:
    """Flatten the halved column means of ``data_df`` into one list.

    The values are ordered by model (PGN-multi, PGN-both, BERT-multi,
    BERT-both), then metric (Informativeness, Non-redundancy, Fluency,
    Overall), then role (user, agent), giving 32 numbers.

    The t-tests the earlier version ran here were never used, so they are
    no longer computed.

    Raises:
        KeyError: a '<metric>_<model>_<role>' column is missing.
    """
    results = (data_df.mean(axis=0) / 2).to_dict()
    return [
        results[f'{metric}_{model}_{role}']
        for model in ('PGN-multi', 'PGN-both', 'BERT-multi', 'BERT-both')
        for metric in ('Informativeness', 'Non-redundancy', 'Fluency', 'Overall')
        for role in ('user', 'agent')
    ]

In [17]:
# One list of values per case, then transposed so that each row holds the
# four case values of one model/metric/role combination.
per_case_values = [concatenate_values(frame) for frame in (df1, df2, df3, df4)]
case_values = [list(row) for row in zip(*per_case_values)]

In [18]:
# Scores of the original experiment, in the same order as concatenate_values:
# one row of eight per model, holding (user, agent) pairs for
# Informativeness, Non-redundancy, Fluency and Overall.
orig_values = [0.69, 0.65, 0.54, 0.55, 0.70, 0.79, 0.64, 0.66,  # PGN-multi
0.66, 0.69, 0.58, 0.59, 0.73, 0.81, 0.66, 0.70,  # PGN-both
0.58, 0.56        , 0.66, 0.61 , 0.84, 0.87  , 0.69, 0.68,  # BERT-multi
0.62, 0.60, 0.62, 0.60       , 0.85, 0.87       , 0.70, 0.69]  # BERT-both

In [19]:
# Pearson correlation of each case (one column of case_values) with the
# original scores; r_regression returns one coefficient per column.
rhos = r_regression(case_values, orig_values)

In [20]:
# Build the LaTeX table of Pearson coefficients, one row per case.
# All LaTeX fragments are raw strings: '\h' is not a valid escape sequence,
# so a plain '\hline' literal triggers a SyntaxWarning on recent Pythons.
pearson_table = [
    r'\begin{table*}',
    r'\centering',
    r'\begin{tabular}{cc}',
    r'Case & Pearson $\rho$ \\',
    r'\hline',
]
for i, rho in enumerate(rhos):
    pearson_table.append(f'{i+1} & {rho:.2f}' + r' \\')
pearson_table += [
    r'\end{tabular}',
    r'\caption{\label{tab:pearson} '
    + r"Pearson correlation coefficient between the results of the original experiment and our results.}",
    r'\end{table*}',
]

In [21]:
# NOTE: this rebinds pearson_table from a list of lines to a single string.
# Re-running this cell without re-running the previous one would join the
# individual characters of that string instead.
pearson_table = '\n'.join(pearson_table)
print(pearson_table)

\begin{table*}
\centering
\begin{tabular}{cc}
Case & Pearson $\rho$ \\
\hline
1 & 0.90 \\
2 & 0.89 \\
3 & 0.90 \\
4 & 0.90 \\
\end{tabular}
\caption{\label{tab:pearson} Pearson correlation coefficient between the results of the original experiment and our results.}
\end{table*}


## Fraction of matching both/multi pairs

In [22]:
# Re-shape the flat orig_values list into
# arch_orig[architecture][option][metric][role]. The values are consumed in
# list order: architecture, then option, then metric, then role.
_orig_iter = iter(orig_values)
arch_orig = {
    arch: {
        option: {
            metric: {role: next(_orig_iter) for role in ('user', 'agent')}
            for metric in ('Informativeness', 'Non-redundancy', 'Fluency', 'Overall')
        }
        for option in ('multi', 'both')
    }
    for arch in ('PGN', 'BERT')
}

In [23]:
def matches(v1: float, v2: float, matching_function) -> int:
    """Return 1 when ``matching_function(v1, v2)`` is truthy, otherwise 0."""
    return 1 if matching_function(v1, v2) else 0

def get_matching_list(data_df: pd.DataFrame) -> list:
    """Check whether each multi-vs-both comparison agrees with the original.

    For every architecture, metric and role, the halved column means of the
    multi and both models are rounded to two decimals and compared. The entry
    is 1 when the original scores in ``arch_orig`` show the same relation
    (greater, equal or smaller) and 0 otherwise.

    Returns:
        List of 0/1 flags ordered by architecture (PGN, BERT), metric, role.
    """
    results = (data_df.mean(axis=0) / 2).to_dict()
    scores = []
    for architecture in ('PGN', 'BERT'):
        reference = arch_orig[architecture]
        for metric in ('Informativeness', 'Non-redundancy', 'Fluency', 'Overall'):
            for role in ('user', 'agent'):
                # Compare what would be printed, i.e. two-decimal values.
                ours = {
                    option: float(f"{results[f'{metric}_{architecture}-{option}_{role}']:.2f}")
                    for option in ('multi', 'both')
                }
                ref_multi = reference['multi'][metric][role]
                ref_both = reference['both'][metric][role]
                if ours['multi'] > ours['both']:
                    agrees = ref_multi > ref_both
                elif ours['multi'] == ours['both']:
                    agrees = ref_multi == ref_both
                else:
                    agrees = ref_multi < ref_both
                scores.append(1 if agrees else 0)
    return scores

In [24]:
def get_matching_accuracy(data_df: pd.DataFrame) -> float:
    """Return the fraction of comparisons flagged 1 by get_matching_list."""
    agreement_flags = get_matching_list(data_df)
    n_agreeing = sum(agreement_flags)
    return n_agreeing / len(agreement_flags)

In [25]:
# Case 1: fraction of multi-vs-both comparisons that agree with the original.
get_matching_accuracy(df1)

0.75

In [26]:
# Case 2: fraction of multi-vs-both comparisons that agree with the original.
get_matching_accuracy(df2)

0.6875

In [27]:
# Case 3: fraction of multi-vs-both comparisons that agree with the original.
get_matching_accuracy(df3)

0.5625

In [28]:
# Case 4: fraction of multi-vs-both comparisons that agree with the original.
get_matching_accuracy(df4)

0.625

## F1 score of statistical significance

In [29]:
# Reference significance flags, passed to f1_score as the ground truth.
# 32 entries, compared position by position with the output of
# get_statistical_significance (architecture, option, metric, role).
# NOTE(review): presumably the significance markers reported in the original
# paper -- confirm against the source table.
y_true = [int(el) for el in "0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0".split()]

In [30]:
def is_statistically_significant(value: float, p_value: float, other_value: float) -> int:
    """Return 1 when ``value`` would be starred in the results table, else 0.

    This mirrors the rule of format_number_latex: both numbers are rounded
    to two decimals, and the flag is set only when the rounded ``value`` is
    at least the rounded ``other_value`` and |p_value| < 0.05.

    The return annotation is ``int`` (it used to say ``str`` although the
    function has always returned 0 or 1).
    """
    wins_or_ties = float(f'{value:.2f}') >= float(f'{other_value:.2f}')
    if wins_or_ties and abs(p_value) < 0.05:
        return 1
    return 0
        
def get_statistical_significance(data_df: pd.DataFrame) -> list:
    """Return the 0/1 significance flag of every table cell value.

    The flags are ordered by architecture (PGN, BERT), option (multi, both),
    metric and role (user, agent). Each value is compared with the other
    option of the same architecture via is_statistically_significant.
    """
    mean_scores = (data_df.mean(axis=0) / 2).to_dict()
    tests = ttest(data_df)
    flags = []
    for architecture in ('PGN', 'BERT'):
        for option, rival in (('multi', 'both'), ('both', 'multi')):
            for metric in ('Informativeness', 'Non-redundancy', 'Fluency', 'Overall'):
                for role in ('user', 'agent'):
                    this_value = mean_scores[f'{metric}_{architecture}-{option}_{role}']
                    other_value = mean_scores[f'{metric}_{architecture}-{rival}_{role}']
                    p_value = tests[metric][architecture][role].pvalue
                    flags.append(is_statistically_significant(this_value, p_value, other_value))
    return flags

In [31]:
def get_f1_score(data_df: pd.DataFrame) -> float:
    """Return the F1 score of the significance flags of ``data_df``.

    The flags from get_statistical_significance are the predictions; the
    module-level ``y_true`` list is the reference.
    """
    predicted_flags = get_statistical_significance(data_df)
    return f1_score(y_true, predicted_flags)

In [32]:
# F1 score of the significance flags for cases 1-4, shown with two decimals.
[f'{get_f1_score(df):.2f}' for df in (df1, df2, df3, df4)]

['0.25', '0.29', '0.25', '0.25']