In [None]:
# @formatter:off
import random

import numpy as np
%load_ext autoreload
%autoreload 2
# @formatter:on
import os

# Switch to the parent directory so the relative paths used further down
# ('data/processed/*.jsonl', 'chatgpt0-2621.jsonl', the output file) resolve from there.
# NOTE(review): not idempotent - every re-run of this cell moves one more level up,
# so restart the kernel before running the notebook again.
os.chdir('../')
print(os.getcwd())

In [None]:
## adjust this accordingly
# Collect the processed annotation files through the shell.
# The order of `files` fixes the order of `data`, which is later paired positionally
# with `chatgpt_answers` - keep it stable.
# NOTE(review): presumably `files` does not contain usable paths when the glob matches
# nothing; check the printed list before continuing.
#@formatter:off
files = !ls data/processed/*.jsonl
#@formatter:on
print(files)

In [None]:
from handystuff.loaders import load_jsonl

# Flatten the records of all input files, in file order, into a single list.
data = [record for path in files for record in load_jsonl(path)]

In [None]:
# Total number of loaded records across all input files.
len(data)

In [None]:
# ChatGPT responses; they are zipped with `data` below, so presumably there is one
# response per record, in the same order - TODO confirm.
chatgpt_answers = load_jsonl('chatgpt0-2621.jsonl')

In [None]:
# The responses are paired with the records positionally (zip) further down, and zip
# silently truncates to the shorter input - so fail loudly and show both sizes.
assert len(data) == len(chatgpt_answers), (
    f"{len(data)} records but {len(chatgpt_answers)} ChatGPT answers"
)

In [None]:
# Fields aggregated by taking the most frequent answer; agreement on them is measured
# by how many annotators gave the same answer.
binary = ['contra-article_1', 'contra-self_1', 'convince_1', 'new_1', 'verdict_1', 'simplified_verdict_1']
# Fields aggregated by averaging the annotators' numeric answers.
ordinal = ['rate_1']

In [None]:
# Collapses the raw verdict labels onto three classes: 'false', 'true' and 'half'.
# -1 is passed through unchanged (the aggregation code filters -1 answers out).
# The duplicated 'false' entry of the original literal was removed; Python keeps only
# one value per key anyway, so the resulting dict is the same.
ordinal_scale_mapping = {
    'false': 'false',
    'true': 'true',
    'satire': 'false',
    'unk': 'half',
    'half': 'half',
    'hardly': 'half',
    'almost': 'half',
    -1: -1,
    False: 'false',
    True: 'true',
    'inbetween': 'half',
}

In [None]:
# Same collapse as `ordinal_scale_mapping`, except that the middle class is labelled
# 'inbetween' instead of 'half'. -1 is passed through unchanged.
# NOTE(review): within this notebook the mapping is only referenced from a
# commented-out line - confirm whether it is still needed.
simplified_ordinal_scale_mapping = {
    'false': 'false',
    'true': 'true',
    'satire': 'false',
    'unk': 'inbetween',
    'half': 'inbetween',
    'hardly': 'inbetween',
    'almost': 'inbetween',
    -1: -1,
    False: 'false',
    True: 'true',
    'inbetween': 'inbetween',
}

In [None]:
# Distinct raw verdict labels across all answers; each must be a key of the scale mapping.
print({answer['verdict_1'] for record in data for answer in record['answers']})

In [None]:
from typing import Any, List


def how_many_equal(values: List[Any]):
    """Return the size of the largest group of mutually equal values.

    Equality is checked with ``==``, so unhashable values (e.g. lists) are fine.

    Args:
        values: The answers to compare; may be empty.

    Returns:
        The number of elements in the largest group of equal values, or 0 when
        no two values are equal (including the empty and single-element case).
    """
    # max() over an empty sequence raises ValueError; no values means no agreement.
    if not values:
        return 0
    # Counting every element equal to the candidate (the candidate included) is the
    # same as counting the *other* equal elements and adding one.
    max_agreements = max(sum(other == candidate for other in values) for candidate in values)
    return max_agreements if max_agreements > 1 else 0

In [None]:
import json

# Parse every ChatGPT response and attach it to its record as one more annotator.
# Only the text up to (and including) the first '}' is parsed as JSON; responses
# that cannot be parsed are printed, counted and skipped.
failed = 0
for chatgpt_answer, d in zip(chatgpt_answers, data):
    try:
        answer = json.loads(chatgpt_answer[:chatgpt_answer.index('}') + 1])
    except (ValueError, TypeError, AttributeError):
        # ValueError: no '}' found or invalid JSON (JSONDecodeError is a ValueError).
        # TypeError/AttributeError: the response is not a string.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        failed += 1
        print(chatgpt_answer)
        continue
    # Human answers use '<field>_1' keys; rename so both line up.
    answer = {f"{k}_1": v for k, v in answer.items()}
    answer['worker'] = 'CHATGPT'
    # Drop an answer left over from a previous run of this cell so that re-running
    # does not add ChatGPT as an annotator twice.
    d['answers'][:] = [a for a in d['answers'] if a.get('worker') != 'CHATGPT']
    d['answers'].append(answer)
print(failed)

In [None]:
# Store the collapsed verdict next to the raw one on every single answer.
# NOTE(review): this uses `ordinal_scale_mapping` (middle class 'half'), not
# `simplified_ordinal_scale_mapping` (middle class 'inbetween') - confirm intended.
for ans in (a for d in data for a in d['answers']):
    ans['simplified_verdict_1'] = ordinal_scale_mapping[ans['verdict_1']]

In [None]:
# Spot-check the first record after the ChatGPT answer and simplified verdicts were added.
data[0]

In [None]:
from tqdm import tqdm


def calculate_data_agreement(data, key_suffix=''):
    """Aggregate the per-annotator answers of every item, in place.

    For each answer field of an item:
      * fields listed in ``ordinal`` store the mean answer in ``d[<field><key_suffix>]``;
      * fields listed in ``binary`` store the most frequent answer (or
        ``'NO_AGREEMENT'``) in ``d[<field><key_suffix>]`` and the share of answers in
        the largest group of equal answers in
        ``d['agreements<key_suffix>'][<field><key_suffix>]``;
      * any other field (e.g. ``'worker'``) is ignored.
    Answers equal to -1 are dropped first; if none remain, ``d[<field><key_suffix>]``
    is set to ``None`` and no agreement entry is written (same convention as
    ``calculate_data_agreement_reliability``).

    Args:
        data: List of item dicts, each with an ``'answers'`` list of answer dicts.
        key_suffix: Appended to every key written to the item.
    """
    agreements_key = f'agreements{key_suffix}'
    for d in tqdm(data):
        d[agreements_key] = dict()
        # Transpose the answers field by field. This pairs fields by position, so it
        # relies on all answer dicts listing their keys in the same order.
        for answers in zip(*[a.items() for a in d['answers']]):
            keys, values = zip(*answers)
            original_k = keys[0]
            # Decide on the *unsuffixed* name; the suffixed one is never in these lists.
            if original_k not in ordinal and original_k not in binary:
                continue
            k = f"{original_k}{key_suffix}"
            values = [v for v in values if v != -1]
            if not values:
                # Nothing to average or count - avoid dividing by zero.
                d[k] = None
                continue
            if original_k in ordinal:
                d[k] = sum(values) / len(values)
                continue
            if original_k == 'verdict_1':
                agreement = how_many_equal([ordinal_scale_mapping[v] for v in values])
            else:
                agreement = how_many_equal(values)
            # Write to the suffixed agreements dict created above, not to 'agreements'.
            d[agreements_key][k] = agreement / len(values)
            # NOTE(review): `agreement` is a count (0 or >= 2), so `> 0.5` means "at
            # least two equal answers", not "more than half" - confirm intended.
            d[k] = max(values, key=values.count) if agreement > 0.5 else 'NO_AGREEMENT'


calculate_data_agreement(data)

In [None]:
# Spot-check an arbitrary record after aggregation.
data[1025]

In [None]:
# Fields to report on, in print order.
report_keys = ('contra-article_1', 'contra-self_1', 'verdict_1', 'convince_1', 'new_1', 'simplified_verdict_1')

# Share of items on which all (valid) answers coincide.
for k in report_keys:
    n_perfect = sum(1 for d in data if d['agreements'][k] == 1.0)
    percent_perfect_agreement = n_perfect / len(data)
    print(f"{k} perfect agreement: {percent_perfect_agreement:.2f}")
print('----' * 20)
# Share of items on which more than half of the (valid) answers coincide.
for k in report_keys:
    n_some = sum(1 for d in data if d['agreements'][k] > 0.5)
    percent_at_least_some_agreement = n_some / len(data)
    print(f"{k} some agreement: {percent_at_least_some_agreement:.2f}")

In [None]:
from collections import defaultdict

keys = binary + ordinal


def get_agreement_per_author(data):
    """Score, per worker, how often an answer is matched by at least one other worker.

    For every item, worker and field in ``keys`` a boolean score is recorded: True
    when any other worker's (non -1) answer to that field on the same item equals
    this worker's answer. Answers of -1 are not scored. For ``'verdict_1'`` both
    sides are passed through ``ordinal_scale_mapping`` before comparing.

    Returns:
        A ``(per_key, overall, binary_only)`` tuple:
        ``per_key[field][worker]`` - scores for one field;
        ``overall[worker]`` - scores over all fields;
        ``binary_only[worker]`` - scores over the ``binary`` fields except the two
        verdict fields.
    """
    per_key = {k: defaultdict(list) for k in keys}
    overall = defaultdict(list)
    binary_only = defaultdict(list)
    verdict_fields = ('verdict_1', 'simplified_verdict_1')
    for d in tqdm(data):
        for own in d['answers']:
            worker = own['worker']
            for k in keys:
                answer = own[k]
                other_answers = [
                    peer[k] for peer in d['answers']
                    if peer['worker'] != worker and peer[k] != -1
                ]
                if k == 'verdict_1':
                    answer = ordinal_scale_mapping[answer]
                    other_answers = [ordinal_scale_mapping[other] for other in other_answers]
                if answer == -1:
                    continue
                score = any(answer == other for other in other_answers)
                per_key[k][worker].append(score)
                overall[worker].append(score)
                if k in binary and k not in verdict_fields:
                    binary_only[worker].append(score)
    return per_key, overall, binary_only


per_key, overall, binary_only = get_agreement_per_author(data)

In [None]:
# Fraction of each worker's scored answers that matched at least one other worker:
# over all fields, and over the non-verdict binary fields only.
overall_percentage = {name: sum(scores) / len(scores) for name, scores in overall.items()}
binary_percentage = {name: sum(scores) / len(scores) for name, scores in binary_only.items()}

In [None]:
# Same fraction as above, broken down by field: by_key_percentage[field][worker].
by_key_percentage = {
    field: {name: sum(scores) / len(scores) for name, scores in per_worker.items()}
    for field, per_worker in per_key.items()
}

In [None]:
# Raw dump: field name on one line, its per-worker rates on the next.
for field, rates in by_key_percentage.items():
    print(field, rates, sep='\n')

In [None]:
# Per-worker agreement rate over all fields; passed below as the reliability score.
print(overall_percentage)

In [None]:
def _print_ranked(rates):
    """Print one ``worker: rate`` line per worker, highest rate first, two decimals."""
    for name, rate in sorted(rates.items(), key=lambda item: item[1], reverse=True):
        print(f'{name}: {rate:.2f}')


# Worker ranking per field, then over all fields, then over the binary fields only.
for field, rates in by_key_percentage.items():
    print(field)
    _print_ranked(rates)
print('overall')
_print_ranked(overall_percentage)
print('binary only')
_print_ranked(binary_percentage)

In [None]:
# Spot-check the individual answers (including the 'worker' field) of the first record.
data[0]['answers']

In [None]:
def calculate_data_agreement_reliability(data, reliability_dict, key_suffix='', min_reliability=0.5, by_key=False):
    """Aggregate answers per item in place, using only sufficiently reliable workers.

    An answer is kept when it is not -1 and its worker's reliability is at least
    ``min_reliability``. Every key written to an item carries ``key_suffix``; when
    that is empty it defaults to ``'@<min_reliability>'`` plus ``'_bk'`` if ``by_key``.

    Per answer field (the ``'worker'`` field is skipped):
      * no answer kept -> ``d[<field><suffix>] = None``;
      * field in ``ordinal`` -> mean of the kept answers;
      * field in ``binary`` -> most frequent kept answer or ``'NO_AGREEMENT'``, plus
        the share of the largest group of equal answers in
        ``d['agreements<suffix>'][<field><suffix>]``.

    Args:
        data: List of item dicts, each with an ``'answers'`` list of answer dicts.
        reliability_dict: ``{worker: score}``, or ``{field: {worker: score}}`` when
            ``by_key`` is true.
        key_suffix: Explicit suffix; overrides the generated one.
        min_reliability: Minimum score a worker needs for an answer to count.
        by_key: Look the score up per field instead of per worker only.
    """
    if not key_suffix:
        key_suffix = f'@{min_reliability}{"_bk" if by_key else ""}'
    agreements_key = f'agreements{key_suffix}'

    for d in tqdm(data):
        d[agreements_key] = dict()
        # Transpose the answers field by field (pairs fields by position).
        columns = list(zip(*[a.items() for a in d['answers']]))
        workers = [a['worker'] for a in d['answers']]
        for column in columns:
            keys, raw_values = zip(*column)
            original_k = keys[0]
            if original_k == 'worker':
                continue
            k = f"{original_k}{key_suffix}"

            def is_reliable(w):
                # Looked up lazily, i.e. only for answers that are not -1.
                scores = reliability_dict[original_k] if by_key else reliability_dict
                return scores[w] >= min_reliability

            values = [v for v, w in zip(raw_values, workers) if v != -1 and is_reliable(w)]
            if not values:
                d[k] = None
                continue
            if original_k in ordinal:
                d[k] = sum(values) / len(values)
            elif original_k in binary:
                compared = [ordinal_scale_mapping[v] for v in values] if original_k == 'verdict_1' else values
                agreement = how_many_equal(compared)
                d[agreements_key][k] = agreement / len(values)
                # NOTE(review): `agreement` is a count, so `> 0.5` means "at least two
                # equal answers" rather than "more than half" - confirm intended.
                d[k] = max(values, key=values.count) if agreement > 0.5 else 'NO_AGREEMENT'


calculate_data_agreement_reliability(data, overall_percentage, min_reliability=0.5, by_key=False)

In [None]:
# Threshold 0.5 with per-field reliability scores; results are stored under the '@0.5_bk' suffix.
calculate_data_agreement_reliability(data, by_key_percentage, min_reliability=0.5, by_key=True)

In [None]:
# Threshold 0.75 with overall reliability scores; results are stored under the '@0.75' suffix.
calculate_data_agreement_reliability(data, overall_percentage, min_reliability=0.75)

In [None]:
# Threshold 0.75 with per-field reliability scores; results are stored under the '@0.75_bk' suffix.
calculate_data_agreement_reliability(data, by_key_percentage, min_reliability=0.75, by_key=True)

In [None]:
# Threshold 0.69 with overall reliability scores; results are stored under the '@0.69' suffix.
calculate_data_agreement_reliability(data, overall_percentage, min_reliability=0.69, by_key=False)

In [None]:
# Threshold 0.69 with per-field reliability scores; results are stored under the '@0.69_bk' suffix.
calculate_data_agreement_reliability(data, by_key_percentage, min_reliability=0.69, by_key=True)

In [None]:
# List the keys now present on a record (original fields plus the suffixed aggregates).
data[0].keys()

In [None]:
# Derive a boolean "plausible" flag from every aggregated verdict: True unless the
# aggregate is 'NO_AGREEMENT'.
# NOTE(review): a verdict of None (no answer kept) therefore also counts as plausible.
for p in ('simplified_', ''):
    for k in ('0.5', '0.5_bk', '0.69', '0.69_bk', '0.75', '0.75_bk'):
        verdict_key = f'{p}verdict_1@{k}'
        plausible_key = f'{p}plausible@{k}'
        for d in data:
            d[plausible_key] = d[verdict_key] != 'NO_AGREEMENT'

In [None]:
# Copy ChatGPT's own answers onto each record as '<field>_chatgpt'
# (None when the record has no ChatGPT answer, e.g. the response failed to parse).
chatgpt_fields = ('contra-article_1', 'contra-self_1', 'rate_1', 'verdict_1', 'convince_1', 'new_1', 'simplified_verdict_1')
for d in data:
    chatgpt_answer = next((a for a in d['answers'] if a['worker'] == 'CHATGPT'), None)
    for k in chatgpt_fields:
        if chatgpt_answer:
            d[f"{k}_chatgpt"] = chatgpt_answer[k]
        else:
            d[f"{k}_chatgpt"] = None

In [None]:
def calculate_data_agreement_reliability_excluding(data, reliability_dict, key_suffix='', min_reliability=0.5, by_key=False, excluding=None):
    """Aggregate answers per item in place, ignoring unreliable and excluded workers.

    An answer is kept when it is not -1, its worker is not in ``excluding`` and the
    worker's reliability is at least ``min_reliability``. Every key written to an
    item carries ``key_suffix``; when that is empty it defaults to
    ``'@<min_reliability>'`` + ``'_bk'`` (if ``by_key``) + ``'excluding'`` (if any
    worker is excluded).

    Per answer field (the ``'worker'`` field is skipped):
      * no answer kept -> ``d[<field><suffix>] = None``;
      * field in ``ordinal`` -> mean of the kept answers;
      * field in ``binary`` -> most frequent kept answer or ``'NO_AGREEMENT'``, plus
        the share of the largest group of equal answers in
        ``d['agreements<suffix>'][<field><suffix>]``.

    Args:
        data: List of item dicts, each with an ``'answers'`` list of answer dicts.
        reliability_dict: ``{worker: score}``, or ``{field: {worker: score}}`` when
            ``by_key`` is true.
        key_suffix: Explicit suffix; overrides the generated one.
        min_reliability: Minimum score a worker needs for an answer to count.
        by_key: Look the score up per field instead of per worker only.
        excluding: Collection of worker names whose answers are always ignored.
    """
    excluding = excluding or set()
    # The original appended the literal text "None" when nothing was excluded
    # (f-string of `None`); an empty exclusion set must not change the suffix.
    key_suffix = key_suffix or f'@{min_reliability}{"_bk" if by_key else ""}{"excluding" if excluding else ""}'
    agreements_key = f'agreements{key_suffix}'
    for d in tqdm(data):
        d[agreements_key] = dict()
        # Transpose the answers field by field (pairs fields by position) and drop
        # the 'worker' column, which is not an answer.
        all_answers = [item for item in zip(*[a.items() for a in d['answers']]) if item[0][0] != 'worker']
        workers = [a['worker'] for a in d['answers']]
        for answers in all_answers:
            keys, values = zip(*answers)
            original_k = keys[0]
            k = f"{keys[0]}{key_suffix}"
            # The exclusion test runs before the reliability lookup, so an excluded
            # worker does not need an entry in `reliability_dict`.
            if by_key:
                values = [v for v, w in zip(values, workers) if v != -1 and w not in excluding and reliability_dict[original_k][w] >= min_reliability]
            else:
                values = [v for v, w in zip(values, workers) if v != -1 and w not in excluding and reliability_dict[w] >= min_reliability]
            if values:
                if original_k in ordinal:
                    d[k] = sum(values) / len(values)
                elif original_k in binary:
                    if original_k == 'verdict_1':
                        agreement = how_many_equal([ordinal_scale_mapping[v] for v in values])
                    else:
                        agreement = how_many_equal(values)
                    d[agreements_key][k] = agreement / len(values)
                    # NOTE(review): `agreement` is a count, so `> 0.5` means "at least
                    # two equal answers" rather than "more than half" - confirm intended.
                    d[k] = max(values, key=values.count) if agreement > 0.5 else 'NO_AGREEMENT'
            else:
                d[k] = None


calculate_data_agreement_reliability_excluding(data, by_key_percentage, min_reliability=0.75, by_key=True, excluding={"CHATGPT"})
calculate_data_agreement_reliability_excluding(data, overall_percentage, min_reliability=0.75, by_key=False, excluding={"CHATGPT"})

In [None]:
def _label_counts(full_key):
    """Count how often each value of ``full_key`` occurs in ``data`` (absent key -> None)."""
    counts = defaultdict(int)
    for d in data:
        counts[d[full_key] if full_key in d else None] += 1
    return dict(counts)


# Label distribution of every aggregate. Several loop combinations resolve to the
# same key (e.g. all of them when `ts` is None), so some lines are printed repeatedly.
for k in ('contra-article_1', 'contra-self_1', 'rate_1', 'verdict_1', 'convince_1', 'new_1', 'plausible', 'simplified_verdict_1', 'simplified_plausible'):
    for ts in [None, 0.5, 0.69, 0.75]:
        for bk in ['', '_bk']:
            for ex in ['', 'excluding']:
                if ts:
                    full_key = f"{k}@{ts}{bk}{ex if ts == 0.75 else ''}"
                else:
                    full_key = k
                print(full_key, _label_counts(full_key))
# Label distribution of ChatGPT's own answers.
for k in ('contra-article_1', 'contra-self_1', 'rate_1', 'verdict_1', 'convince_1', 'new_1', 'simplified_verdict_1'):
    full_key = f"{k}_chatgpt"
    print(full_key, _label_counts(full_key))

In [None]:
# Spot-check an arbitrary record with all aggregates attached.
data[1234]

In [None]:
from handystuff.loaders import write_jsonl

In [None]:
def we_need(k):
    """Tell whether record key ``k`` belongs in the exported data.

    Kept are the four base fields and every key that starts with one of the
    annotation prefixes (which covers all suffixed aggregates).
    """
    base_fields = ('claim', 'verdict', 'sources', 'text')
    prefixes = ('contra', 'verdict', 'convince', 'new', 'simplified_verdict', 'rate', 'plausible', 'simplified_plausible')
    if k in base_fields:
        return True
    # str.startswith accepts a tuple and is true if any prefix matches.
    return k.startswith(prefixes)
def parse(v, k):
    """Convert one record value to its exported form.

    * list  -> its items joined with newlines (items must be strings);
    * float -> unchanged;
    * None under a key starting with 'rate' -> 0;
    * anything else -> ``str(v)`` (so ints, bools and other None values become text).
    """
    if isinstance(v, list):
        return '\n'.join(v)
    if isinstance(v, float):
        return v
    is_missing_rate = k.startswith('rate') and v is None
    return 0 if is_missing_rate else str(v)

In [None]:
# Export view of the records: only the wanted keys, every value normalised by `parse`.
parsed_data = [
    {key: parse(value, key) for key, value in record.items() if we_need(key)}
    for record in data
]

In [None]:
# Persist the export view (relative to the working directory set at the top).
# NOTE(review): written with write_jsonl but named '.json' - confirm that consumers
# expect JSON Lines.
write_jsonl(parsed_data, 'data-main-no-answers.json')

In [None]:
# Distinct aggregated ratings at threshold 0.69 with per-field reliability.
{record['rate_1@0.69_bk'] for record in parsed_data}

In [None]:
# Mean aggregated rating at threshold 0.75 after rescaling with (x - 1) / 4
# (presumably mapping a 1-5 scale onto 0-1 - TODO confirm).
normalised_rates = [(record['rate_1@0.75'] - 1) / 4 for record in parsed_data]
sum(normalised_rates) / len(normalised_rates)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Error of a constant predictor against the rescaled aggregated ratings.
# NOTE(review): the constant is hard-coded; presumably it is the mean computed in the
# previous cell - confirm, and recompute it if the data changes.
CONSTANT_PREDICTION = 0.7022128958412811
constant_predictions = [CONSTANT_PREDICTION] * len(parsed_data)
rescaled_rates = [(record['rate_1@0.75'] - 1) / 4 for record in parsed_data]
print(mean_squared_error(constant_predictions, rescaled_rates))
print(mean_absolute_error(constant_predictions, rescaled_rates))