In [None]:
! pip install -q lexical-diversity statsmodels pymorphy2[fast]

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa
from lexical_diversity import lex_div as ld
from nltk import word_tokenize
from nltk import download as nltk_download
from sklearn.metrics import accuracy_score, f1_score
import pymorphy2

from itertools import chain
from typing import List, Set, MutableMapping, Optional, Sequence, Tuple, Union
from enum import Flag, auto
from string import punctuation

# word_tokenize needs the Punkt sentence models. Newer NLTK releases load them
# from the 'punkt_tab' resource while older ones use 'punkt', so fetch both.
nltk_download('punkt', quiet=True)
nltk_download('punkt_tab', quiet=True)

In [None]:
# Shared pymorphy2 analyzer: created once here and reused by the functions
# defined later in the notebook (they read it as the global `morph`).
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Load the two dataset splits and the raw annotator assignments.
dev = pd.read_csv('dev.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')
ans = pd.read_csv('assignments.tsv', sep='\t')

# Keep only rows without a golden label, then drop every row that is flagged
# in either of the two error columns.
ans = ans.loc[ans['GOLDEN:class'].isna()]
ans = ans.loc[lambda frame: ~frame['OUTPUT:q_1_error']]
ans = ans.loc[lambda frame: ~frame['OUTPUT:q_2_error']]

# Original column name -> short name used in the rest of the notebook.
_ANSWER_COLUMNS = {
    'INPUT:question_1_id': 'left_id',
    'INPUT:question_2_id': 'right_id',
    'OUTPUT:class': 'class',
    'ASSIGNMENT:worker_id': 'worker_id'
}
ans = ans[list(_ANSWER_COLUMNS)].drop_duplicates()
ans = ans.rename(columns=_ANSWER_COLUMNS)


def _pair_key(frame: pd.DataFrame) -> pd.Series:
    """Build the '<left_id>_<right_id>' string key for every row of ``frame``."""
    return frame['left_id'].map(str) + '_' + frame['right_id'].map(str)


dev['key'] = _pair_key(dev)
test['key'] = _pair_key(test)
ans['key'] = _pair_key(ans)

# Split the answers by the dataset part their question pair belongs to.
dev_ans = ans[ans['key'].isin(dev['key'])]
test_ans = ans[ans['key'].isin(test['key'])]

Estimating human baseline metrics

In [None]:
# Control rows are the assignments that carry a golden label. The annotators'
# answers on those rows are scored against the golden labels.
ctrl_ans = pd.read_csv('assignments.tsv', sep='\t')
ctrl_ans = ctrl_ans[~ctrl_ans['GOLDEN:class'].isna()]
human_preds = ctrl_ans['OUTPUT:class'].to_numpy()
labels = ctrl_ans['GOLDEN:class'].to_numpy()
print('Human baseline')
print('\tAccuracy: {:.4f}'.format(accuracy_score(labels, human_preds)))
print('\tF1: {:.4f}'.format(f1_score(labels, human_preds)))

Human baseline
	Accuracy: 0.8434
	F1: 0.8660


Computing inter-rater reliability

In [None]:
def fleiss_kappa_table(df: pd.DataFrame) -> pd.DataFrame:
    """Count the votes for each class among the rows of ``df``.

    Args:
        df: frame with a ``class`` column; rows equal to 0 are counted as
            negative votes and rows equal to 1 as positive votes. Any other
            value is not counted.

    Returns:
        A one-row frame with columns ``pos_cnt``, ``neg_cnt`` and
        ``num_raters`` (the sum of the two counts).
    """
    # np.int64 instead of np.long: the np.long alias does not exist in
    # NumPy 1.24-1.26 and is platform-dependent where it does exist.
    neg_cnt = (df['class'] == 0).astype(np.int64).sum()
    pos_cnt = (df['class'] == 1).astype(np.int64).sum()
    return pd.DataFrame({
        'pos_cnt': [pos_cnt],
        'neg_cnt': [neg_cnt],
        'num_raters': [neg_cnt + pos_cnt]
    })

def fleiss_kappa_agg(df: pd.DataFrame) -> float:
    """Compute Fleiss' kappa from per-item vote counts.

    Args:
        df: frame with ``neg_cnt`` and ``pos_cnt`` columns, one row per item.

    Returns:
        The value returned by ``fleiss_kappa`` for the (items x 2) count table
        whose columns are ordered (negative, positive).
    """
    # np.int64 instead of np.long: the np.long alias does not exist in
    # NumPy 1.24-1.26 and is platform-dependent where it does exist.
    table = df[['neg_cnt', 'pos_cnt']].to_numpy(dtype=np.int64, copy=True)
    return fleiss_kappa(table)

def set_fleiss_kappa(df: pd.DataFrame, final_agg: bool = True) -> Union[float, pd.DataFrame]:
    """Estimate inter-rater agreement for a set of annotated items.

    Votes are counted per ``key``; items are then grouped by how many votes
    they received and Fleiss' kappa is computed separately inside each group
    (presumably because ``fleiss_kappa`` expects the same number of raters for
    every item -- confirm against the statsmodels documentation).

    Args:
        df: frame with ``key`` and ``class`` columns, one row per vote.
        final_agg: if True, combine the per-group kappas into one number,
            weighting each group by its share of items. If False, return the
            per-group table instead.

    Returns:
        A float when ``final_agg`` is True; otherwise a frame with columns
        ``num_raters``, ``fleiss_kappa`` and ``num_entities``.
    """
    table = df.groupby('key').apply(fleiss_kappa_table).reset_index()
    grouped = table.groupby('num_raters')

    # apply() yields an unnamed Series, which reset_index() exposes as column 0.
    fleiss_agg = grouped.apply(fleiss_kappa_agg).reset_index()
    fleiss_agg = fleiss_agg.rename(columns={0: 'fleiss_kappa'})

    cnt = grouped['key'].agg('count').reset_index()
    cnt = cnt.rename(columns={'key': 'num_entities'})

    aggregated = pd.merge(fleiss_agg, cnt, on='num_raters')
    if not final_agg:
        return aggregated

    # Weight every group's kappa by the fraction of items that fall into it.
    w = aggregated['num_entities'].to_numpy(dtype=np.int64)
    w = w / w.sum()
    coefs = aggregated['fleiss_kappa'].to_numpy()
    return float((w * coefs).sum())

In [None]:
# Report the aggregated agreement score for each split, test split first.
test_kappa = set_fleiss_kappa(test_ans)
print('Test set inter-rater reliability: {:.4f}'.format(test_kappa))
dev_kappa = set_fleiss_kappa(dev_ans)
print('Dev set inter-rater reliability: {:.4f}'.format(dev_kappa))

Test set inter-rater reliability: 0.7744
Dev set inter-rater reliability: 0.6314


Morphological stats

In [None]:
def get_corpus_lemmas(df: pd.DataFrame) -> List[str]:
    """Return the lemmas of all non-punctuation tokens in the corpus.

    Rows are visited in order; within a row the ``left_text`` sentence comes
    before the ``right_text`` sentence. Each sentence is tokenized with the
    Russian NLTK tokenizer, every token is replaced by the normal form of its
    first pymorphy2 parse, and tokens whose tag contains 'PNCT' are skipped.
    """
    lemmas: List[str] = []
    for left_sentence, right_sentence in zip(df['left_text'], df['right_text']):
        for sentence in (left_sentence, right_sentence):
            for token in word_tokenize(sentence, language='russian'):
                best_parse = morph.parse(token)[0]
                if 'PNCT' in best_parse.tag:
                    continue
                lemmas.append(best_parse.normal_form)
    return lemmas

In [None]:
# Characters of string.punctuation (ASCII punctuation only). analyze() uses
# this set to keep punctuation tokens out of the sentence length.
# NOTE(review): non-ASCII marks such as « » or — are not part of this set.
punct_set = set(punctuation)

def bag_of_tags(sent: str, tokens: Optional[Sequence[str]] = None) -> Set[str]:
    """Collect the grammemes of the first pymorphy2 parse of every token.

    Args:
        sent: sentence to analyze; only used when ``tokens`` is None, in which
            case it is tokenized with the Russian NLTK tokenizer.
        tokens: optional pre-computed tokens of ``sent``.

    Returns:
        The set of individual grammemes (e.g. 'NUMR', 'nomn') seen in the
        sentence.
    """
    if tokens is None:
        tokens = word_tokenize(sent, language='russian')
    grammemes: Set[str] = set()
    for tok in tokens:
        # A tag prints with both commas and a space as separators (e.g.
        # 'VERB,perf,intr plur,past,indc' or 'NUMR nomn'). Splitting the
        # string on commas alone leaves pairs such as 'NUMR nomn' fused, so
        # membership tests like 'NUMR' in tags fail. tag.grammemes already
        # holds the individual grammemes.
        grammemes.update(morph.parse(tok)[0].tag.grammemes)
    return grammemes

class Prop(Flag):
    """Bit flags describing a sentence; members can be combined with ``|``."""

    ENG = 1 << 0           # set by analyze() when the 'LATN' grammeme occurs
    NUMBR = 1 << 1         # set for 'NUMB', 'intg', 'real' or 'ROMN' grammemes
    NUM_WORD = 1 << 2      # set when the 'NUMR' grammeme occurs
    GRND = 1 << 3          # set when the 'GRND' grammeme occurs
    NEGATION = 1 << 4      # set when at least one negation token is found
    DBL_NEGATION = 1 << 5  # set when exactly two negation tokens are found
    COMP_SENT = 1 << 6     # never set by the code visible in this notebook

def analyze(sent: str) -> Tuple[int, Prop]:
    """Measure a sentence and detect which ``Prop`` flags apply to it.

    The sentence is tokenized with the Russian NLTK tokenizer and every token
    is parsed once with pymorphy2 (first parse only).

    Args:
        sent: sentence to analyze.

    Returns:
        A ``(sent_len, properties)`` tuple: the number of word tokens and the
        combined ``Prop`` flags. ``Prop.COMP_SENT`` is never set here.
    """
    tokens = word_tokenize(sent, language='russian')
    # Parse every token once; the parses feed the word count, the grammeme set
    # and the negation scan below.
    parses = [morph.parse(tok)[0] for tok in tokens]

    # A token is a word unless it is pure punctuation. Rejecting every token
    # that merely contains a punctuation character would also drop hyphenated
    # words ('кто-то'), abbreviations ('т.е.') and decimals ('3.5'). The PNCT
    # check additionally covers non-ASCII punctuation and mirrors the filter
    # used by get_corpus_lemmas.
    sent_len = sum(
        1
        for token, parse in zip(tokens, parses)
        if not all(ch in punct_set for ch in token)
        and 'PNCT' not in parse.tag.grammemes
    )

    # Individual grammemes of all tokens. tag.grammemes is used instead of
    # splitting str(tag) on commas, which leaves space-joined pairs such as
    # 'NUMR nomn' fused and hides the NUMR grammeme.
    tags = set()
    for parse in parses:
        tags.update(parse.tag.grammemes)

    properties = Prop(0)  # start with no flag set
    if 'LATN' in tags:
        properties |= Prop.ENG
    if tags & {'NUMB', 'intg', 'real', 'ROMN'}:
        properties |= Prop.NUMBR
    if 'NUMR' in tags:
        properties |= Prop.NUM_WORD
    if 'GRND' in tags:
        properties |= Prop.GRND

    # Count negation tokens: the particles 'не'/'ни', adverbs and pronouns
    # whose lemma starts with 'ни', and predicatives whose lemma starts with 'не'.
    neg_cnt = 0
    for parse in parses:
        pos = parse.tag.POS
        lemma = parse.normal_form
        if (
            (pos == 'PRCL' and lemma in ('не', 'ни'))
            or (pos in ('ADVB', 'NPRO') and lemma.startswith('ни'))
            or (pos == 'PRED' and lemma.startswith('не'))
        ):
            neg_cnt += 1

    if neg_cnt > 0:
        properties |= Prop.NEGATION
    # NOTE(review): only exactly two negations count as double negation, so
    # three or more set NEGATION alone -- confirm this is intended.
    if neg_cnt == 2:
        properties |= Prop.DBL_NEGATION
    return sent_len, properties

def corpus_stat(df: pd.DataFrame) -> MutableMapping[str, float]:
    """Compute summary statistics for a corpus of sentence pairs.

    Args:
        df: frame with ``left_text``, ``right_text`` and ``class`` columns.

    Returns:
        A dict with one entry per ``Prop`` member (fraction of pairs where the
        flag is set on at least one of the two sentences) plus:
        ``mean_len`` (mean sentence length reported by ``analyze``),
        ``mean_diff`` (mean absolute length difference within a pair),
        ``pos_frac`` (mean of the ``class`` column) and ``diversity``
        (``ld.mtld`` over the corpus lemmas).

    Raises:
        ZeroDivisionError: if ``df`` has no rows.
    """
    len_s = 0
    len_diff_s = 0
    cnt = 0

    stats = {item.name: 0 for item in Prop}

    for left, right in zip(df['left_text'], df['right_text']):
        left_len, left_props = analyze(left)
        right_len, right_props = analyze(right)
        len_s += left_len + right_len
        len_diff_s += abs(left_len - right_len)
        # A pair has a property when either of its sentences has it.
        props = left_props | right_props
        for item in Prop:
            if item in props:
                stats[item.name] += 1
        cnt += 1

    stats = {key: val / cnt for key, val in stats.items()}
    stats['mean_len'] = len_s / (2 * cnt)  # two sentences per pair
    stats['mean_diff'] = len_diff_s / cnt
    # Builtin float instead of np.float: that alias was removed in NumPy 1.24.
    stats['pos_frac'] = df['class'].astype(float).mean()
    lemmas = get_corpus_lemmas(df)
    stats['diversity'] = ld.mtld(lemmas)

    return stats

In [None]:
# Corpus statistics for the test split.
corpus_stat(test)

{'COMP_SENT': 0.0,
 'DBL_NEGATION': 0.005,
 'ENG': 0.0,
 'GRND': 0.03166666666666667,
 'NEGATION': 0.085,
 'NUMBR': 0.115,
 'NUM_WORD': 0.0,
 'diversity': 25.10119964835842,
 'mean_diff': 1.335,
 'mean_len': 8.034166666666666,
 'pos_frac': 0.5}

In [None]:
# Corpus statistics for the dev split.
corpus_stat(dev)

{'COMP_SENT': 0.0,
 'DBL_NEGATION': 0.010704727921498661,
 'ENG': 0.0,
 'GRND': 0.031222123104371096,
 'NEGATION': 0.10526315789473684,
 'NUMBR': 0.14540588760035683,
 'NUM_WORD': 0.0008920606601248885,
 'diversity': 31.89700841316214,
 'mean_diff': 1.4424620874219447,
 'mean_len': 8.085191793041927,
 'pos_frac': 0.23996431757359502}