In [10]:

import argparse
from collections import defaultdict
import re
from typing import List, Pattern, Tuple

# Matches one "[F]...[/F]" annotation span. Group 1 is the whole tagged span,
# group 2 is the (non-greedy) text between the tags.
# Raw string: "\[" is not a valid escape in a normal string literal, so the
# non-raw spelling triggers an invalid-escape warning on modern Python.
FORMALITY_PHRASES = re.compile(r"(\[F\](.*?)\[/F\])")


def compute_score(
    hypotheses: str,
    annotated_formal_refs: str,
    annotated_informal_refs: str,
    tok_split: bool=True,
    verbose: bool=True
) -> Tuple[float, float]:
    """
    Compute formal and informal matched-accuracy scores.

    Every hypothesis line is labelled FORMAL, INFORMAL, NEUTRAL or OTHER
    according to which annotated reference phrases it contains. The two
    accuracies are the FORMAL and INFORMAL shares of the lines that were
    labelled FORMAL or INFORMAL (NEUTRAL and OTHER lines are excluded from
    the denominator).

    Args:
        hypotheses: file containing system detokenized output translations
        annotated_formal_refs: formal reference file with annotated grammatical formality
        annotated_informal_refs: informal reference file with annotated grammatical formality
        tok_split: Split hypotheses and references into tokens before phrase matching.
        verbose: Print one diagnostic line per sample and the summary counts.
            Defaults to True, which keeps the original output.

    Returns:
        (formal_acc, informal_acc), the formal and informal matched-accuracy scores respectively.
        Both are 0.0 when no hypothesis was labelled FORMAL or INFORMAL.

    Raises:
        RuntimeError: if the three files are empty or do not have the same number of lines.
    """
    # Keep the path arguments and the loaded lines under separate names.
    hyp_lines = _read_lines(hypotheses)
    annotated_references_formal = _read_lines(annotated_formal_refs)
    annotated_references_informal = _read_lines(annotated_informal_refs)

    if not (len(hyp_lines) == len(annotated_references_formal) == len(annotated_references_informal) > 0):
        raise RuntimeError("Empty or mismatched hypotheses and reference files.")

    scores = defaultdict(int)
    count_neutral = 0
    for hyp, ref_formal, ref_informal in zip(hyp_lines, annotated_references_formal, annotated_references_informal):
        formal_phrases_hyp, formal_phrases_ref = get_matching_phrases(hyp, ref_formal, tok_split, FORMALITY_PHRASES)
        informal_phrases_hyp, informal_phrases_ref = get_matching_phrases(hyp, ref_informal, tok_split, FORMALITY_PHRASES)

        label = predict_formality_label(formal_phrases_hyp, informal_phrases_hyp)
        if verbose:
            print(hyp, formal_phrases_ref, informal_phrases_ref, label, formal_phrases_hyp, informal_phrases_hyp)
        # A sample whose formal reference carries no annotated phrase is counted as neutral.
        if len(formal_phrases_ref) == 0:
            count_neutral += 1
        scores[f"ref_matched_count_{label}"] += 1

    # Only lines with an unambiguous FORMAL/INFORMAL label enter the denominator.
    n_matched = scores["ref_matched_count_INFORMAL"] + scores["ref_matched_count_FORMAL"]
    # Use 0.0 (not 0) on the empty path so the result matches the annotated return type.
    formal_acc = scores["ref_matched_count_FORMAL"] / n_matched if n_matched > 0 else 0.0
    informal_acc = scores["ref_matched_count_INFORMAL"] / n_matched if n_matched > 0 else 0.0
    if verbose:
        print(scores)
        print("n_matched:", n_matched)
        print("Total samples:", len(hyp_lines))
        print("True neutral:", count_neutral)
    return formal_acc, informal_acc


def get_matching_phrases(
    hyp: str,
    anno_ref: str,
    tok_split: bool=True,
    phrase_regex: Pattern=FORMALITY_PHRASES
) -> Tuple[List[str], List[Tuple[str, str]]]:
    """
    Get annotated phrases that match in the hypothesis.

    Args:
        hyp: system hypothesis
        anno_ref: reference translation with annotated grammatical register
        tok_split: whether phrases should be split (on single spaces) into tokens before
            phrase matching. When True, a phrase matches if every one of its tokens occurs
            somewhere in the hypothesis, regardless of order or adjacency. When False, the
            phrase must occur in the hypothesis as a substring.
        phrase_regex: regular expression for finding references. Each match must yield a
            (tagged span, inner phrase) pair, i.e. the pattern needs exactly two groups.

    Returns:
        (matched, annotated): ``matched`` is the list of annotated phrases occurring in the
        hypothesis; ``annotated`` is the list of all (tagged span, inner phrase) pairs found
        in the reference.
    """
    anno_ph = re.findall(phrase_regex, anno_ref)
    if not tok_split:
        anno_ph_hyp = [ph for _, ph in anno_ph if ph in hyp]
    else:
        # Tokenize the hypothesis once instead of once per annotated phrase.
        hyp_tokens = set(hyp.split(" "))
        anno_ph_hyp = [ph for _, ph in anno_ph if hyp_tokens.issuperset(ph.split(" "))]
    return anno_ph_hyp, anno_ph


def predict_formality_label(
    ph_formal_hyp: List[str],
    ph_informal_hyp: List[str]
) -> str:
    """
    Derive a formality label from the matched formal and informal phrases.

    Args:
        ph_formal_hyp: phrases in the hypothesis that match annotated formal reference phrases
        ph_informal_hyp: phrases in the hypothesis that match annotated informal reference phrases

    Returns:
        "FORMAL" if only formal phrases matched, "INFORMAL" if only informal phrases matched,
        "NEUTRAL" if nothing matched, and "OTHER" if both kinds matched.
    """
    has_formal = bool(ph_formal_hyp)
    has_informal = bool(ph_informal_hyp)

    # Both registers present: the hypothesis is ambiguous.
    if has_formal and has_informal:
        return "OTHER"
    if has_formal:
        return "FORMAL"
    if has_informal:
        return "INFORMAL"
    return "NEUTRAL"

def _read_lines(file: str) -> List[str]:
    """Read the UTF-8 text file at ``file`` and return its lines with surrounding whitespace stripped."""
    stripped_lines = []
    with open(file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines


In [11]:
# Evaluation settings consumed by the compute_score call in the next cell.
# Forwarded as `tok_split`: match phrases token-by-token rather than as substrings.
whitespace_delimited_tokens = True
# Name of the data split; interpolated into every path below.
domain="combined"
# File with the system output to score (one hypothesis per line).
hypotheses=f"../experiments/en-es/mBART/{domain}/out.dev"
# Alternative hypothesis file, currently disabled:
# hypotheses="../../unpaired_res/formal.topical-chat.es"
# Annotated formal / informal reference files for the same split.
formal_refs=f"../internal_split/en-es/dev.{domain}.formal.annotated.es"
informal_refs=f"../internal_split/en-es/dev.{domain}.informal.annotated.es"

In [12]:
# Score the configured hypothesis file against both annotated reference sets.
formal_acc, informal_acc = compute_score(
    hypotheses=hypotheses,
    annotated_formal_refs=formal_refs,
    annotated_informal_refs=informal_refs,
    tok_split=whitespace_delimited_tokens,
)

Well there's actually a hidden subculture dedicated to find hidden Mickey mouse images in all things Disney, you should definitely check out! [('[F]debería[/F]', 'debería')] [('[F]deberías[/F]', 'deberías')] NEUTRAL [] []
Me complace mucho que tu cana pueda ser donadora. [('[F]sus[/F]', 'sus')] [('[F]tus[/F]', 'tus')] NEUTRAL [] []
Es cierto, pero creo que los gatos son más inteligentes que la mayoría de los políticos. [('[F]Puede imaginarse[/F]', 'Puede imaginarse')] [('[F]Puedes imaginarte[/F]', 'Puedes imaginarte')] NEUTRAL [] []
No le garantizaría ganar el juego, pero lo haría mucho más fácil. [('[F]ganara[/F]', 'ganara'), ('[F]pase[/F]', 'pase')] [('[F]ganaras[/F]', 'ganaras'), ('[F]pases[/F]', 'pases')] NEUTRAL [] []
Sí, eso tiene sentido, ¿tenen oído hablar del buque de 10 millones de dólares que tiene? [('[F]Ha oído[/F]', 'Ha oído')] [('[F]Has oído[/F]', 'Has oído')] NEUTRAL [] []
Sí, y la mayoría de los lugares en los que viajas todavía puedes acceder a tu cuenta ya que está d