In [123]:
import json
import sys
from collections import Counter
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
from functools import reduce
from pprint import pprint 
from trueskill import Rating, rate, quality_1vs1, rate_1vs1, quality


In [124]:
def iter_lines(file):
    with open(file, 'r', encoding='utf8') as f:
        for line in f:
            yield json.loads(line.strip())

def load_jsonl_as_df(jsonl_file, annotater=None):
    df = pd.DataFrame([line for line in iter_lines(jsonl_file)])
    df['annotator'] = annotater
    df.set_index(['id', 'annotator'], inplace=True)
    df.drop(columns=['src_text', 'ref_text', 'hyp_a_text', 'hyp_b_text', 'src_text_title', 'src_text_body', '_input_hash', '_task_hash', '_session_id', '_view_id'], inplace=True)
    return df


In [125]:
annotations = {
    'fa': 'data/hospo_respo_data/annotated_v1/pref_test_a.jsonl',
    'as': 'data/hospo_respo_data/annotated_v1/pref_test_b.jsonl',
    'tk': 'data/hospo_respo_data/annotated_v1/pref_test_c.jsonl',
}

dfs = [load_jsonl_as_df(v, k) for k, v in annotations.items()]
dfs[0]
print(type(list(df['hyp_a_id'].unique())))

<class 'list'>


In [126]:
models = np.unique([pd.unique(df[['hyp_a_id', 'hyp_b_id']].values.ravel('K')) for df in dfs])
models = {model_id: Rating() for model_id in models}
print(models)

{'/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt': trueskill.Rating(mu=25.000, sigma=8.333), '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_freq_distro/inference/bs5.txt': trueskill.Rating(mu=25.000, sigma=8.333), '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt': trueskill.Rating(mu=25.000, sigma=8.333), '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_tgt_ppl/inference/bs5.txt': trueskill.Rating(mu=25.000, sigma=8.333)}


In [136]:

def update_model_rarings(df, models, draw_threshold=9):
    for i, row in df.iterrows():
        a = row.hyp_a_id
        b = row.hyp_b_id
        score = int(row.score)
        
        if row.answer != 'reject':
        
            if score < -draw_threshold: # a wins
                (models[a],), (models[b],) = rate([(models[a],), (models[b],)], ranks=[0, 1])
            
            elif score > draw_threshold: # b wins
                (models[a],), (models[b],) = rate([(models[a],), (models[b],)], ranks=[1, 0])
            else:
                (models[a],), (models[b],) = rate([(models[a],), (models[b],)], ranks=[0, 0])
                    
    return models

for df in dfs:
    pprint(models)
    models = update_model_rarings(df, models)

{'/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt': trueskill.Rating(mu=20.329, sigma=1.220),
 '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_freq_distro/inference/bs5.txt': trueskill.Rating(mu=19.187, sigma=1.445),
 '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt': trueskill.Rating(mu=23.538, sigma=1.151),
 '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_tgt_ppl/inference/bs5.txt': trueskill.Rating(mu=38.746, sigma=3.456)}
{'/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt': trueskill.Rating(mu=20.188, sigma=1.212),
 '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_freq_distro/inference/bs5.txt': trueskill.Rating(mu=19.029, sigma=1.430),
 '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt': trueskill.Rating(mu=23.749, sigma=1.138),
 '/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_tgt_ppl/inference/bs5.txt': trueskill.Rating(mu=38.866, sigma=3.390)}
{'/srv/scratch6/kew/bart/hospo_respo/e

In [137]:
for model, trueskill in models.items():
    print(model, trueskill.exposure)

/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt 16.897370266298665
/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_freq_distro/inference/bs5.txt 15.182387598774302
/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt 20.296607408188343
/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_tgt_ppl/inference/bs5.txt 28.943241962177993


In [128]:
# # for i in range(len(dfs)-1):
# # df = pd.merge(dfs[0], dfs[1])
# # df = pd.concat(dfs, axis=1)

# df = reduce(lambda left, right: pd.merge(left, right, how='outer', on=['id', 'annotator', 'hyp_a_id', 'hyp_b_id', 'score', 'answer'],), dfs).sort_index()


# # df = pd.concat(dfs, join='outer', axis=1, copy=False)
# df

Unnamed: 0_level_0,Unnamed: 1_level_0,hyp_a_id,hyp_b_id,score,answer,time_loaded,time_updated,winner
id,annotator,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7577,as,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_freq_distro/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt,0,accept,,,
7577,fa,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_tgt_ppl/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt,-70,accept,,,
7577,tk,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt,22,accept,2021-07-28 12:01:28,2021-07-28 12:02:10,B
9792,as,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt,-50,accept,,,
9792,fa,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt,-78,accept,,,
9792,tk,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_freq_distro/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt,21,accept,2021-07-28 12:02:10,2021-07-28 12:02:32,B
9974,as,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_freq_distro/inference/bs5.txt,-70,accept,,,
9974,fa,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_tgt_ppl/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_freq_distro/inference/bs5.txt,-25,accept,,,
9974,tk,/srv/scratch6/kew/bart/hospo_respo/en/500k/baseline/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt,62,accept,2021-07-28 12:02:47,2021-07-28 12:02:54,B
15764,as,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_gen_sent/inference/bs5.txt,/srv/scratch6/kew/bart/hospo_respo/en/500k/filt_tgt_ppl/inference/bs5.txt,100,accept,,,
