In [1]:
def read_file(fname, encoding="utf-8"):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        fname: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII text is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings so the result
        stays line-aligned with the file.
    """
    with open(fname, encoding=encoding) as f:
        return [line.strip() for line in f]

In [2]:
def get_data(tgt_lang, domain, split):
    """Load the English source and both translation variants for one data slice.

    Args:
        tgt_lang: Target language code used in the directory and file suffix.
        domain: Domain component of the file name.
        split: Split component of the file name.

    Returns:
        A ``(source, formal_translations, informal_translations)`` tuple, each
        element being the list of stripped lines returned by ``read_file``.
    """
    source_path = f"../internal_split/en-{tgt_lang}/{split}.{domain}.en"
    formal_path = f"../internal_split/en-{tgt_lang}/{split}.{domain}.formal.{tgt_lang}"
    informal_path = f"../internal_split/en-{tgt_lang}/{split}.{domain}.informal.{tgt_lang}"
    return read_file(source_path), read_file(formal_path), read_file(informal_path)

In [3]:
# Notebook-wide settings: language pair plus the domain/split that select
# which files the loader functions open.
src_lang, tgt_lang = "en", "hi"
domain, split = "combined", "train"

In [4]:
import numpy as np
def get_average_length(texts, tokenization):
    """Return the mean length of ``texts`` under the given tokenization.

    Args:
        texts: Iterable of strings.
        tokenization: ``"white_space"`` to count the pieces produced by
            splitting on a single space, or ``"char"`` to count characters.

    Returns:
        The result of ``np.average`` over the per-text lengths.

    Raises:
        ValueError: If ``tokenization`` is not one of the supported modes.
    """
    if tokenization == "white_space":
        # split(" ") splits on every single space, so an empty string still
        # counts as one token and consecutive spaces yield empty tokens.
        lengths = [len(text.split(" ")) for text in texts]
    elif tokenization == "char":
        lengths = [len(text) for text in texts]
    else:
        # Fail with a clear message rather than an unbound-variable error.
        raise ValueError(
            f"Unknown tokenization {tokenization!r}; expected 'white_space' or 'char'"
        )
    return np.average(lengths)

In [5]:
def is_similar(x, y):
    """Return 1 when ``x`` equals ``y`` and 0 otherwise."""
    if x == y:
        return 1
    return 0

def get_no_diff_counts(formal_translations, informal_translations):
    """Count the positions where the formal and informal translations are equal.

    The two sequences are paired by position with ``zip``, so any extra items
    in the longer one are ignored.

    Returns:
        The number of identical pairs, as returned by ``np.average`` applied
        to the summed match flags (i.e. the total, not a fraction).
    """
    match_flags = []
    for formal, informal in zip(formal_translations, informal_translations):
        match_flags.append(is_similar(formal, informal))
    identical_total = np.sum(match_flags)
    # Averaging a single number leaves its value unchanged.
    return np.average(identical_total)

In [6]:
# Print average lengths and the identical-pair count for each target language.
for tgt_lang in ["de", "es", "hi", "ja"]:
    source, formal_translations, informal_translations = get_data(tgt_lang, domain, split)

    # Japanese targets are measured in characters; the others are split on spaces.
    tokenization = "char" if tgt_lang == "ja" else "white_space"

    print("Language", tgt_lang)
    print("Source Length: ", get_average_length(source, "white_space"))
    print("Formal Length: ", get_average_length(formal_translations, tokenization))
    print("Informal Length: ", get_average_length(informal_translations, tokenization))
    print("No Difference", get_no_diff_counts(formal_translations, informal_translations))

Language de
Source Length:  20.036666666666665
Formal Length:  20.363333333333333
Informal Length:  20.263333333333332
No Difference 20.0
Language es
Source Length:  20.03
Formal Length:  19.563333333333333
Informal Length:  19.523333333333333
No Difference 44.0
Language hi
Source Length:  20.21
Formal Length:  23.72
Informal Length:  23.723333333333333
No Difference 60.0
Language ja
Source Length:  22.065555555555555
Formal Length:  55.032222222222224
Informal Length:  51.84888888888889
No Difference 16.0


In [9]:
import re
# Matches each "[F]...[/F]" span (non-greedy); group 1 is the whole tagged
# span and group 2 is the text between the tags. A raw string keeps the
# backslash escapes for the regex engine without invalid-escape warnings.
FORMALITY_PHRASES = re.compile(r"(\[F\](.*?)\[/F\])")

def get_annotated_data(tgt_lang, domain, split):
    """Load the annotated formal and informal translations for one data slice.

    Args:
        tgt_lang: Target language code used in the directory and file suffix.
        domain: Domain component of the file name.
        split: Split component of the file name.

    Returns:
        A ``(formal_annotated, informal_annotated)`` tuple of line lists as
        returned by ``read_file``.
    """
    formal_path = f"../internal_split/en-{tgt_lang}/{split}.{domain}.formal.annotated.{tgt_lang}"
    informal_path = f"../internal_split/en-{tgt_lang}/{split}.{domain}.informal.annotated.{tgt_lang}"
    return read_file(formal_path), read_file(informal_path)

def get_sentence_phrasal_counts(sent):
    """Return the number of ``FORMALITY_PHRASES`` matches found in ``sent``."""
    return len(FORMALITY_PHRASES.findall(sent))

def get_phrasal_annotations_counts(formal_annotated, informal_annotated):
    """Return the mean per-sentence annotation count for each variant.

    Args:
        formal_annotated: Iterable of annotated formal sentences.
        informal_annotated: Iterable of annotated informal sentences.

    Returns:
        A ``(formal_mean, informal_mean)`` tuple, each the ``np.average`` of
        ``get_sentence_phrasal_counts`` over the corresponding sentences.
    """
    formal_counts = [get_sentence_phrasal_counts(sentence) for sentence in formal_annotated]
    informal_counts = [get_sentence_phrasal_counts(sentence) for sentence in informal_annotated]
    return np.average(formal_counts), np.average(informal_counts)

In [10]:
# Print the average number of annotated phrases per sentence for each
# target language, as a (formal, informal) pair.
for tgt_lang in ["de", "es", "hi", "ja"]:
    formal_annotated, informal_annotated = get_annotated_data(tgt_lang, domain, split)
    print("Language", tgt_lang)
    print(
        "Counts: ",
        get_phrasal_annotations_counts(formal_annotated, informal_annotated),
    )

Language de
Counts:  (1.8866666666666667, 1.8866666666666667)
Language es
Counts:  (1.5433333333333332, 1.5433333333333332)
Language hi
Counts:  (1.53, 1.53)
Language ja
Counts:  (2.4544444444444444, 2.37)


In [None]:
# Load the lines of an experiment output file (path suggests the en-hi
# baseline on the dev split — confirm).
output = read_file(
    "../experiments/en-hi/baseline/combined/out.dev"
)

In [None]:
# Show a short preview instead of dumping the whole list into the output.
# NOTE(review): this variable holds whatever the statistics loop loaded last
# (its final language is "ja", using the configured split) — confirm that is
# the data meant to be inspected next to `output`.
formal_translations[:10]

In [None]:
# Show a short preview instead of dumping the whole list into the output.
output[:10]