In [4]:
import pandas as pd
import os

Download the data from:
- http://www.statmt.org/wmt17/results.html 

News translation task: **segment-level human data, submissions and scripts to reproduce the results published in the metrics task paper.**. 

In [5]:
SRC_PREFIX = 'wmt17-metrics-task-package/input/wmt17-metrics-task/wmt17-submitted-data/txt/sources/'
REF_PREFIX = 'wmt17-metrics-task-package/input/wmt17-metrics-task/wmt17-submitted-data/txt/references/'
SYS_PREFIX = 'wmt17-metrics-task-package/input/wmt17-metrics-task/wmt17-submitted-data/txt/system-outputs/newstest2017/'

In [6]:
sources = [line.strip() for line in open(SRC_PREFIX+'newstest2017-ende-src.en')]
references = [line.strip() for line in open(REF_PREFIX+'newstest2017-ende-ref.de')]

### EN-DE corpus:

In [7]:
sys_outputs_files = os.listdir(SYS_PREFIX+'en-de/')
systems = ['.'.join(submission_name.split('.')[1:-1]) for submission_name in sys_outputs_files]

In [8]:
systems

['online-F.0',
 'LIUM-NMT.4900',
 'LMU-nmt-reranked.4934',
 'online-G.0',
 'C-3MA.4959',
 'fbk-nmt-combination.4870',
 'RWTH-nmt-ensemble.4921',
 'PROMT-Rule-based.4735',
 'uedin-nmt.4722',
 'KIT.4950',
 'online-A.0',
 'xmu.4910',
 'SYSTRAN.4847',
 'online-B.0',
 'LMU-nmt-single.4893',
 'TALP-UPC.4834']

In [9]:
def get_system_outputs(lp):
    sys_outputs_files = os.listdir(SYS_PREFIX+f'{lp}/')
    systems = ['.'.join(submission_name.split('.')[1:-1]) for submission_name in sys_outputs_files]
    system_outputs = {}
    for system in sys_outputs_files:
        system_name = ".".join(system.split('.')[1:-1])
        system_outputs[system_name] = [
            line.strip() for line in open(SYS_PREFIX+f'{lp}/'+system)
        ]
    return system_outputs

In [10]:
en_de_data = pd.DataFrame({'src': sources, 'ref': references, **get_system_outputs('en-de')})

In [11]:
en_de_data.head()

Unnamed: 0,src,ref,online-F.0,LIUM-NMT.4900,LMU-nmt-reranked.4934,online-G.0,C-3MA.4959,fbk-nmt-combination.4870,RWTH-nmt-ensemble.4921,PROMT-Rule-based.4735,uedin-nmt.4722,KIT.4950,online-A.0,xmu.4910,SYSTRAN.4847,online-B.0,LMU-nmt-single.4893,TALP-UPC.4834
0,28-Year-Old Chef Found Dead at San Francisco Mall,28-jähriger Koch in San Francisco Mall tot auf...,28-jähriger Chef Found Dead bei San Francisco ...,28-jähriger Chef Found tot bei San Francisco Mall,28-jähriger Chef fand Tote bei San Francisco Mall,"Der 28-Jährige Küchenchef Gefunden, Tot in San...",28-Jähriger-Chef Faund Dead bei San Francisco ...,28-jähriger Chef Found Dead bei San Francisco ...,28-Jähriger Chef Fund Dead bei San Francisco Mall,Der 28-jährige Chef gefundene Tote am San Fran...,28-jähriger Chef Found in der San Francisco Ma...,28-jähriger Chef Found Dead in San Francisco Mall,﻿28 Jahre alt Chef gefunden tot in San Francis...,28-jähriger Chef Found Dead bei San Francisco ...,28-Jähriger Chef Found Dead bei San Francisco ...,28-jährige Chef gefunden tot in San Francisco ...,28-jähriger Chef fand Tote bei San Francisco Mall,28-jähriger Chef Found Dead am San Francisco M...
1,A 28-year-old chef who had recently moved to S...,"Ein 28-jähriger Koch, der vor kurzem nach San ...","Ein 28-jähriger Chef, der vor kurzem nach San ...","Ein 28-jähriger Koch, der vor kurzem nach San ...","Ein 28-jähriger Küchenchef, der kürzlich nach ...","Eine 28-jährige Koch, der hatte vor kurzem nac...","Ein 28 Jahre alter Chefkoch, der vor kurzem in...","Ein 28-jähriger Koch, der kürzlich nach San Fr...","Ein 28 -jähriger Koch, der kürzlich nach San F...","Ein 28-jähriger Chef, der sich kürzlich nach S...","Ein 28-jähriger Koch, der kürzlich nach San Fr...","Ein 28-jähriger Chefkoch, der vor kurzem nach ...","Ein 28 Jahre altes Chef, der vor kurzem nach S...","Ein 28-jähriger Koch, der kürzlich nach San Fr...","Eine 28-jährige Köchin, die kürzlich nach San ...","Ein 28-jähriger Chef, der vor kurzem nach San ...","Ein 28-jähriger Küchenchef, der kürzlich nach ...","Ein 28-jähriger Küchenchef, der vor kurzem nac..."
2,But the victim's brother says he can't think o...,"Der Bruder des Opfers sagte aus, dass er sich ...","Aber der Bruder des Opfers sagt, dass er nicht...","Aber der Bruder des Opfers sagt, er könne niem...","Doch der Bruder des Opfers sagt, er könne niem...","Aber der Bruder des Opfers sagt, er kann nicht...","Aber der Bruder des Opfers sagt, er könne nich...","Aber der Bruder des Opfers sagt, er könne niem...","Aber der Bruder des Opfers sagt, er könne niem...","Aber der Bruder des Opfers sagt, dass er an ni...","Doch der Bruder des Opfers sagt, er könne nich...","Aber der Bruder des Opfers sagt, er könne nich...","Aber der Bruder des Opfers sagt, er könne nich...","Doch der Bruder des Opfers sagt, er könne nich...","Aber der Bruder des Opfers sagt, er könne nich...","Aber der Bruder des Opfers sagt, er könne nich...","Doch der Bruder des Opfers sagt, er könne niem...","Aber der Bruder des Opfers sagt, er könne jema..."
3,The body found at the Westfield Mall Wednesday...,Der am Mittwoch morgen in der Westfield Mall g...,"Der Körper, der am Westfield-Mall Mittwoch Mor...","Der Körper, der am Mittwochmorgen im Westfield...",Der Körper am Westfeld Mall am Mittwochmorgen ...,Der Körper fand in der Westfield Mall Mittwoch...,Das im Westfield-Mall-Mittwochmorgen aufgestel...,Die Leiche am Westfield Mall am Mittwochmorgen...,Der am Westfield Mall Mittwochmorgen gefundene...,"Der Körper, der am Einkaufszentrum von Westfie...",Der am Mittwochmorgen an der Westfield Mall ge...,"Der Körper, der am Westfield Mall am Mittwochm...",Die Leiche fand am Mittwoch Vormittag der Mall...,Die am Mittwochmorgen am Westfield Mall gefund...,Der am WestField Mall Mittwoch Morgen festgeno...,"Der Körper, der am Westfield Mall am Mittwochm...","Der Körper, der am Mittwochmorgen im Westfield...","Der Körper, der im Westfield Mall Mittwoch Mor..."
4,The San Francisco Police Department said the d...,"Das San Francisco Police Department sagte, das...","San Francisco Police Department sagte, dass de...","Die Polizei von San Francisco sagte, der Tod s...",Das Polizeiministerium von San Francisco sagte...,"Das San Francisco Police Department sagte, der...","Die Polizeiabteilung San Francisco sagte, der ...","Die Polizeibehörde San Francisco erklärte, der...","Die Polizeiabteilung San Francisco sagte, der ...","Die San Francisco Polizeibehörde hat gesagt, d...","Die Polizeibehörde von San Francisco erklärte,...","Die Polizeiabteilung von San Francisco sagte, ...","Die San Francisco Police Department, sagte der...","Die Polizeidirektion San Francisco sagte, der ...","Die PolizeiDirektion von San Francisco sagte, ...","Die San Francisco Polizeiabteilung sagte, der ...","Die Polizei von San Francisco sagte, der Tod s...","Die Abteilung San Francisco Polizei sagte, der..."


## DA Scores:

In [15]:
DA_DATA = 'humaneval-seg-da/ad-seg-scores-{}.csv'

In [16]:
def segment_level_da(path, lp):
    with open(path, "r") as fp:
        fp.readline()
        lines = [l.strip() for l in fp.readlines()]
    data = []
    for line in lines:
        sys, sid, raw_score, z, n = line.split()
        data.append({"system": sys, "SID": sid, "raw_score": raw_score, "z_score": z, "annotators": n, "lp": lp})
    return pd.DataFrame(data)

In [17]:
en_de_da_data = segment_level_da(DA_DATA.format('en-de'), "en-de")

In [18]:
en_de_da_data.head()

Unnamed: 0,system,SID,raw_score,z_score,annotators,lp
0,RWTH-nmt-ensemble.4921,1467,76,0.225731637374983,1,en-de
1,RWTH-nmt-ensemble.4921,1913,23,-1.54724786336601,1,en-de
2,RWTH-nmt-ensemble.4921,2766,53,-0.677766865960105,1,en-de
3,RWTH-nmt-ensemble.4921,823,67,-0.127811255234399,1,en-de
4,RWTH-nmt-ensemble.4921,881,85,0.875052043519762,2,en-de


In [19]:
from tqdm.notebook import tqdm 

def merge_submissions_with_human_scores(submissions, human_scores):
    
    def system_in_submissions(system):
        for col in submissions.columns:
            if system in col:
                return True
            if col in system:
                return True
        return False
    
    lp, data, src, ref, hyp, raw_score, z_score, annotators = [], [], [], [], [], [], [], []
    for i, row in tqdm(human_scores.iterrows(), total=len(human_scores), desc=list(human_scores.lp)[0]):
        sub_row = submissions.iloc[int(row['SID'])-1] # segments have ids from 1 to 1997
        
        # Not all systems belong to the newstranslation shared task. Some belong
        # to other shared tasks such as unsupervied MT.
        
        if system_in_submissions(row['system']):
            system = row['system']
            hyp.append(sub_row[system])
            lp.append(row['lp'])
            data.append('newstest2019')
            src.append(sub_row['src'])
            ref.append(sub_row['ref'])
            raw_score.append(float(row['raw_score']))
            z_score.append(float(row['z_score']))
            annotators.append(int(row['annotators']))
    
    data = {
        "lp": lp,
        "src": src,
        "mt": hyp,
        "ref": ref,
        "score": z_score,
        "raw_score": raw_score,
        "annotators": annotators
    }
    return pd.DataFrame(data)


In [20]:
en_de_scores_merged = merge_submissions_with_human_scores(en_de_data, en_de_da_data)

HBox(children=(FloatProgress(value=0.0, description='en-de', max=7025.0, style=ProgressStyle(description_width…




In [21]:
en_de_scores_merged.head()

Unnamed: 0,lp,src,mt,ref,score,raw_score,annotators
0,en-de,"For older generations, singing together is see...",Für ältere Generationen wird Gesang gemeinsam ...,In den älteren Generationen hatte das gemeinsa...,0.225732,76.0,1
1,en-de,"The complaints are many, covering everything f...","Die Beschwerden sind viele, die alles von der ...","Es gibt zahlreiche Beschwerden, die alles von ...",-1.547248,23.0,1
2,en-de,He also announced that security services would...,"Er kündigte zudem an, dass die Sicherheitsdien...",Er kündigte auch eine weitere personelle Verst...,-0.677767,53.0,1
3,en-de,The Catalan team will face off against FC Sevi...,Das katalanische Team wird sich gegen FC Sevil...,Die Katalanen treten im Duell um den spanische...,-0.127811,67.0,1
4,en-de,"""The IAAF will perform an evaluation, dependin...","""Die IAAF wird eine Bewertung durchführen, abh...","""Abhängig von den Ergebnissen der Untersuchung...",0.875052,85.0,2


In [22]:
nd_en_de_scores_merged = en_de_scores_merged[~en_de_scores_merged.duplicated(['src', 'ref', 'mt', 'score'])]
len(en_de_scores_merged), len(nd_en_de_scores_merged)

(7025, 7025)

### All languages:

In [26]:
language_pairs = [
    'en-cs', 'en-de', 'en-fi', 'en-lv', 'en-tr', 'en-ru', 'en-zh',
    'cs-en', 'de-en', 'fi-en',          'tr-en', 'ru-en', 'zh-en'
]

In [28]:
for language_pair in language_pairs:
    try:
        os.makedirs(language_pair)
    except FileExistsError:
        pass
    src, trg = language_pair.split('-')
    lp, lp1 = src + '-' + trg, src + trg
    
    sources = [line.strip() for line in open(SRC_PREFIX+f'newstest2017-{lp1}-src.{src}')]
    references = [line.strip() for line in open(REF_PREFIX+f'newstest2017-{lp1}-ref.{trg}')]
    
    sys_outputs_files = os.listdir(SYS_PREFIX+f'{lp}/')
    systems = ['.'.join(submission_name.split('.')[1:-1]) for submission_name in sys_outputs_files]
    
    lp_data = pd.DataFrame({'src': sources, 'ref': references, **get_system_outputs(lp)})
    
    da_data = segment_level_da(DA_DATA.format(language_pair), language_pair)
    
    merged = merge_submissions_with_human_scores(lp_data, da_data)
    merged = merged[~merged.duplicated(['src', 'ref', 'mt', 'score'])]
    print ("LP {} - scores data size: {}".format(language_pair, len(merged)))
    merged.to_csv(f'{lp}/scores.csv', index=False)

HBox(children=(FloatProgress(value=0.0, description='en-cs', max=20532.0, style=ProgressStyle(description_widt…


LP en-cs - scores data size: 20532


HBox(children=(FloatProgress(value=0.0, description='en-de', max=7025.0, style=ProgressStyle(description_width…


LP en-de - scores data size: 7025


HBox(children=(FloatProgress(value=0.0, description='en-fi', max=6748.0, style=ProgressStyle(description_width…


LP en-fi - scores data size: 6748


HBox(children=(FloatProgress(value=0.0, description='en-lv', max=5810.0, style=ProgressStyle(description_width…


LP en-lv - scores data size: 5810


HBox(children=(FloatProgress(value=0.0, description='en-tr', max=2039.0, style=ProgressStyle(description_width…


LP en-tr - scores data size: 2039


HBox(children=(FloatProgress(value=0.0, description='en-ru', max=17861.0, style=ProgressStyle(description_widt…


LP en-ru - scores data size: 17358


HBox(children=(FloatProgress(value=0.0, description='en-zh', max=10267.0, style=ProgressStyle(description_widt…


LP en-zh - scores data size: 10221


HBox(children=(FloatProgress(value=0.0, description='cs-en', max=11685.0, style=ProgressStyle(description_widt…


LP cs-en - scores data size: 11585


HBox(children=(FloatProgress(value=0.0, description='de-en', max=24158.0, style=ProgressStyle(description_widt…


LP de-en - scores data size: 21704


HBox(children=(FloatProgress(value=0.0, description='fi-en', max=15275.0, style=ProgressStyle(description_widt…


LP fi-en - scores data size: 15159


HBox(children=(FloatProgress(value=0.0, description='tr-en', max=18103.0, style=ProgressStyle(description_widt…


LP tr-en - scores data size: 17335


HBox(children=(FloatProgress(value=0.0, description='ru-en', max=19116.0, style=ProgressStyle(description_widt…


LP ru-en - scores data size: 17980


HBox(children=(FloatProgress(value=0.0, description='zh-en', max=27374.0, style=ProgressStyle(description_widt…


LP zh-en - scores data size: 26419


## Relative Ranks:

In [29]:
rr_data = pd.read_csv('wmt17-metrics-task-package/manual-evaluation/RR-seglevel.csv', delimiter=' ')

In [30]:
en_de_rr_data = rr_data[rr_data['LP'] == 'en-de']
en_de_rr_data.head()

Unnamed: 0,LP,DATA,SID,BETTER,WORSE
32810,en-de,newstest2017,345,online-B.0,TALP-UPC.4834
32811,en-de,newstest2017,345,LMU-nmt-reranked.4934,TALP-UPC.4834
32812,en-de,newstest2017,345,RWTH-nmt-ensemble.4921,TALP-UPC.4834
32813,en-de,newstest2017,345,C-3MA.4959,TALP-UPC.4834
32814,en-de,newstest2017,345,fbk-nmt-combination.4870,TALP-UPC.4834


In [31]:
len(en_de_rr_data.BETTER.unique()), len(en_de_rr_data.BETTER.unique()), len(systems)

(16, 16, 16)

In [32]:
from tqdm.notebook import tqdm 

def merge_submissions_with_RR(submissions, rr_data):
    
    def system_in_submissions(system):
        for col in submissions.columns:
            if system in col:
                return True
            if col in system:
                return True
        return False
    
    lp, data, model1, model2, src, ref, hyp1, hyp2, bestmodel = [], [], [], [], [], [], [], [], []
    for i, row in tqdm(rr_data.iterrows(), total=len(rr_data), desc=list(rr_data.LP)[0]):
        sub_row = submissions.iloc[row['SID']-1] # segments have ids from 1 to 1997
        
        # Not all systems belong to the newstranslation shared task. Some belong
        # to other shared tasks such as unsupervied MT.
        if system_in_submissions(row['BETTER']) and system_in_submissions(row['WORSE']):
            better_sys = row['BETTER']
            worse_sys = row['WORSE']
            
            hyp1.append(sub_row[better_sys])
            hyp2.append(sub_row[worse_sys])
            lp.append(row['LP'])
            data.append('newstest2019')
            src.append(sub_row['src'])
            ref.append(sub_row['ref'])
            model1.append(better_sys)
            model2.append(worse_sys)
            bestmodel.append(better_sys)
    
    data = {
        "data": data,
        "lp": lp,
        "src": src,
        "ref": ref,
        "pos": hyp1,
        "neg": hyp2,
        "pos.model": model1,
        "neg.model": model2,
        "bestmodel": bestmodel
    }
    return pd.DataFrame(data)

In [33]:
merged = merge_submissions_with_RR(en_de_data, en_de_rr_data)

HBox(children=(FloatProgress(value=0.0, description='en-de', max=3227.0, style=ProgressStyle(description_width…




In [34]:
merged.head()

Unnamed: 0,data,lp,src,ref,pos,neg,pos.model,neg.model,bestmodel
0,newstest2019,en-de,"SPD group chairman, Christoph Bratmann, says: ...",SPD-Fraktionsvorsitzender Christoph Bratmann m...,"Der Vorsitzende der SPD-Fraktion, Christoph Br...","SPD Group Chairman, Christoph Bratmann, sagt: ...",online-B.0,TALP-UPC.4834,online-B.0
1,newstest2019,en-de,"SPD group chairman, Christoph Bratmann, says: ...",SPD-Fraktionsvorsitzender Christoph Bratmann m...,SPD-Fraktionsvorsitzender Christoph Bratmann s...,"SPD Group Chairman, Christoph Bratmann, sagt: ...",LMU-nmt-reranked.4934,TALP-UPC.4834,LMU-nmt-reranked.4934
2,newstest2019,en-de,"SPD group chairman, Christoph Bratmann, says: ...",SPD-Fraktionsvorsitzender Christoph Bratmann m...,SPD-Fraktionsvorsitzender Christoph Bratmann s...,"SPD Group Chairman, Christoph Bratmann, sagt: ...",RWTH-nmt-ensemble.4921,TALP-UPC.4834,RWTH-nmt-ensemble.4921
3,newstest2019,en-de,"SPD group chairman, Christoph Bratmann, says: ...",SPD-Fraktionsvorsitzender Christoph Bratmann m...,SPD-Fraktionsvorsitzender Christoph Bratmann s...,"SPD Group Chairman, Christoph Bratmann, sagt: ...",C-3MA.4959,TALP-UPC.4834,C-3MA.4959
4,newstest2019,en-de,"SPD group chairman, Christoph Bratmann, says: ...",SPD-Fraktionsvorsitzender Christoph Bratmann m...,"Fraktionsvorsitzender SPD, Christoph Bratmann,...","SPD Group Chairman, Christoph Bratmann, sagt: ...",fbk-nmt-combination.4870,TALP-UPC.4834,fbk-nmt-combination.4870


## All languages:

#### Note: Relative ranks where not collected for all LPs

In [46]:
language_pairs = ['en-cs', 'en-de', 'en-fi', 'en-lv', 'en-tr']

In [48]:
rr_data = pd.read_csv('wmt17-metrics-task-package/manual-evaluation/RR-seglevel.csv', delimiter=' ')

for language_pair in language_pairs:
    try:
        os.makedirs(language_pair)
    except FileExistsError:
        pass
    src, trg = language_pair.split('-')
    lp, lp1 = src + '-' + trg, src + trg
    
    sources = [line.strip() for line in open(SRC_PREFIX+f'newstest2017-{lp1}-src.{src}')]
    references = [line.strip() for line in open(REF_PREFIX+f'newstest2017-{lp1}-ref.{trg}')]
    
    sys_outputs_files = os.listdir(SYS_PREFIX+f'{lp}/')
    systems = ['.'.join(submission_name.split('.')[1:-1]) for submission_name in sys_outputs_files]
    
    lp_data = pd.DataFrame({'src': sources, 'ref': references, **get_system_outputs(lp)})
    
    lp_rr_data = rr_data[rr_data['LP'] == lp]
    merged = merge_submissions_with_RR(lp_data, lp_rr_data)
    # print ("LP {} - daRR size: {}".format(list(lp_rr_data.LP)[0], len(merged)))
    merged.to_csv(f'{lp}/relative-ranks.csv', index=False)

HBox(children=(FloatProgress(value=0.0, description='en-cs', max=32810.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='en-de', max=3227.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='en-fi', max=3270.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='en-lv', max=3456.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='en-tr', max=247.0, style=ProgressStyle(description_width=…




## Multilingual Corpus:

In [49]:
scores_language_pairs = [
    'en-cs', 'en-de', 'en-fi', 'en-lv', 'en-tr', 'en-ru', 'en-zh',
    'cs-en', 'de-en', 'fi-en',          'tr-en', 'ru-en', 'zh-en'
]
rr_language_pairs = ['en-cs', 'en-de', 'en-fi', 'en-lv', 'en-tr']

In [50]:
from sklearn.model_selection import train_test_split

all_scores_train, all_scores_test = [], []
all_rr_train, all_rr_test = [], []

all_rr, all_scores = [], []
for lp in tqdm(scores_language_pairs):
    
    if lp in rr_language_pairs:
        # Relative Ranks
        rr_data = pd.read_csv(f'{lp}/relative-ranks.csv')
        all_rr.append(rr_data)

        train_rr, test_rr = train_test_split(rr_data, test_size=0.2, random_state=42, shuffle=True)
        all_rr_train.append(train_rr)
        all_rr_test.append(test_rr)
    
    # DA scores
    scores_data = pd.read_csv(f'{lp}/scores.csv')
    all_scores.append(scores_data)
    
    train_scores, test_scores = train_test_split(scores_data, test_size=1000, random_state=42, shuffle=True)
    all_scores_train.append(train_scores)
    all_scores_test.append(test_scores)

# Relative-Ranks 
all_rr = pd.concat(all_rr, ignore_index=True)
rr_train = pd.concat(all_rr_train, ignore_index=True)
rr_test = pd.concat(all_rr_test, ignore_index=True)

# Scores 
all_scores = pd.concat(all_scores, ignore_index=True)
scores_train = pd.concat(all_scores_train, ignore_index=True)
scores_test = pd.concat(all_scores_test, ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [51]:
all_rr.to_csv('relative-ranks.csv', index=False)
rr_train.to_csv('train-relative-ranks.csv', index=False)
rr_test.to_csv('test-relative-ranks.csv', index=False)

In [52]:
all_scores.to_csv('scores.csv', index=False)
scores_train.to_csv('train_scores.csv', index=False)
scores_test.to_csv('test_scores.csv', index=False)

### end.