# Automated Evaluation Metric for Terminology Consistency in MT: Metric Comparison

_Author: Kirill Semenov, Charles University, 2022, kir\[dоt]semenov[аt]yandex[dоt]ru_

Here I compare the ranking (and the absolute numbers) of my metrics (different setups of the suggested metric) with the mainstream ones (BLEU, chrF, direct assessment)

In [1]:
import os
import nltk
import re
import subprocess
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import f1_score

In [2]:
import itertools
from scipy.stats import kendalltau, ttest_rel

In [3]:
first_f1_2021_df = pd.read_csv('statistics/2021_first_f1.csv', sep=';', index_col='Unnamed: 0')
first_own_2021_df = pd.read_csv('statistics/2021_first_own.csv', sep=';', index_col='Unnamed: 0')
frequent_f1_2021_df = pd.read_csv('statistics/2021_frequent_f1.csv', sep=';', index_col='Unnamed: 0')
frequent_own_2021_df = pd.read_csv('statistics/2021_frequent_own.csv', sep=';', index_col='Unnamed: 0')

In [4]:
first_f1_2022_df = pd.read_csv('statistics/2022_first_f1.csv', sep=';', index_col='Unnamed: 0')
first_own_2022_df = pd.read_csv('statistics/2022_first_own.csv', sep=';', index_col='Unnamed: 0')
frequent_f1_2022_df = pd.read_csv('statistics/2022_frequent_f1.csv', sep=';', index_col='Unnamed: 0')
frequent_own_2022_df = pd.read_csv('statistics/2022_frequent_own.csv', sep=';', index_col='Unnamed: 0')

In [5]:
def create_stats_df(first_f1, first_own, frequent_f1, frequent_own, rank=True):
    data = {'1st;F1': first_f1.mean(), '1st;Own': first_own.mean(), 
            'Freq;F1': frequent_f1.mean(), 'Freq;Own': frequent_own.mean()}
    df = pd.DataFrame(data)
    if rank:
        df['1st;F1 rank'] = df['1st;F1'].rank(ascending=False).astype(int)
        df['1st;Own rank'] = df['1st;Own'].rank(ascending=False).astype(int)
        df['Freq;F1 rank'] = df['Freq;F1'].rank(ascending=False).astype(int)
        df['Freq;Own rank'] = df['Freq;Own'].rank(ascending=False).astype(int)
    
    return df


In [6]:
df_2021 = create_stats_df(first_f1_2021_df, first_own_2021_df, frequent_f1_2021_df, frequent_own_2021_df)

In [7]:
df_2021_ = create_stats_df(first_f1_2021_df, first_own_2021_df, frequent_f1_2021_df, frequent_own_2021_df, rank=False)

### Results for 2021

In [9]:
df_2021.round(3).sort_index()

Unnamed: 0,1st;F1,1st;Own,Freq;F1,Freq;Own,1st;F1 rank,1st;Own rank,Freq;F1 rank,Freq;Own rank
CUNI-DocTransformer,0.897,0.804,0.915,0.835,3,4,4,4
CUNI-Transformer2018,0.857,0.776,0.895,0.827,8,7,8,7
Facebook-AI,0.907,0.838,0.93,0.871,1,1,1,1
Online-A,0.883,0.795,0.914,0.829,4,5,5,6
Online-B,0.88,0.792,0.925,0.852,6,6,2,2
Online-G,0.871,0.771,0.9,0.811,7,8,6,8
Online-W,0.881,0.807,0.898,0.831,5,3,7,5
Online-Y,0.9,0.813,0.921,0.84,2,2,3,3


### Results for 2022

In [11]:
df_2022 = create_stats_df(first_f1_2022_df, first_own_2022_df, frequent_f1_2022_df, frequent_own_2022_df)
df_2022_ = create_stats_df(first_f1_2022_df, first_own_2022_df, frequent_f1_2022_df, frequent_own_2022_df, rank=False)

In [13]:
df_2022.round(3).sort_index()

Unnamed: 0,1st;F1,1st;Own,Freq;F1,Freq;Own,1st;F1 rank,1st;Own rank,Freq;F1 rank,Freq;Own rank
ALMAnaCH-Inria,0.816,0.688,0.885,0.807,11,11,10,9
CUNI-DocTransformer,0.897,0.805,0.916,0.836,4,6,4,6
CUNI-Transformer,0.848,0.751,0.882,0.79,10,10,11,11
JDExploreAcademy,0.899,0.817,0.928,0.863,3,4,1,1
Lan-Bridge,0.902,0.826,0.918,0.846,2,2,3,2
Online-A,0.877,0.773,0.924,0.836,7,7,2,7
Online-B,0.902,0.831,0.912,0.842,1,1,5,4
Online-G,0.871,0.772,0.898,0.807,8,8,8,10
Online-W,0.889,0.816,0.903,0.838,6,5,7,5
Online-Y,0.86,0.767,0.892,0.809,9,9,9,8


## Comparison with other metrics

In [14]:
df_automatic_2021 = pd.read_csv('./statistics/automatic_scores_2021.tsv', sep='\t')
df_automatic_2021

Unnamed: 0,pair,system,id,is_constrained,metric,score
0,cs-en,Facebook-AI,885,False,bleu-all,43.457651
1,cs-en,Facebook-AI,885,False,chrf-all,63.617553
2,cs-en,Facebook-AI,885,False,bleu-B,26.365726
3,cs-en,Facebook-AI,885,False,chrf-B,54.943713
4,cs-en,Facebook-AI,885,False,bleu-A,31.095372
5,cs-en,Facebook-AI,885,False,chrf-A,59.917951
6,cs-en,CUNI-Transformer2018,583,True,bleu-all,36.68823
7,cs-en,CUNI-Transformer2018,583,True,chrf-all,58.387521
8,cs-en,CUNI-Transformer2018,583,True,bleu-B,21.691043
9,cs-en,CUNI-Transformer2018,583,True,chrf-B,50.414194


In [15]:
df_automatic_2022 = pd.read_csv('./statistics/automatic_scores_2022.tsv', sep='\t')
df_automatic_2022

Unnamed: 0,pair,system,id,is_constrained,metric,score
0,cs-en,SHOPLINE-PL,819,True,bleu-all,53.467258
1,cs-en,SHOPLINE-PL,819,True,chrf-all,70.116619
2,cs-en,SHOPLINE-PL,819,True,COMET-C,0.396165
3,cs-en,SHOPLINE-PL,819,True,bleu-C,24.632149
4,cs-en,SHOPLINE-PL,819,True,chrf-C,53.197034
...,...,...,...,...,...,...
83,cs-en,Online-B,917,False,bleu-C,25.491162
84,cs-en,Online-B,917,False,chrf-C,53.980732
85,cs-en,Online-B,917,False,COMET-B,0.718403
86,cs-en,Online-B,917,False,bleu-B,54.301453


In [16]:
def reformat_automatic_metrics(df, da=None, rank=None, rank_metrics=True):
    systems = list(df.system.unique())
    df_dict = {}
    for system in systems:
        subset = df[df['system'] == system]
        bleu_value = float(subset[subset['metric'] == 'bleu-all']['score'])
        chrf_value = float(subset[subset['metric'] == 'chrf-all']['score'])
        df_dict[system] = [bleu_value, chrf_value]
    
    df_final = pd.DataFrame.from_dict(df_dict, orient='index', columns=['BLEU', 'chrf'])
    if da is not None:
        df_final['DA'] = df_final.apply(lambda row: da[str(row.name)], axis=1)
    if rank is not None:
        df_final['rank'] = df_final.apply(lambda row: rank[str(row.name)], axis=1)
    if rank_metrics:
        metrics_list = [m for m in df_final.columns.to_list() if m != 'rank']
        #print(metrics_list)
        for metric in metrics_list:
            #print(metric)
            col_name = metric + '_rank'
            df_final[col_name] = df_final[metric].rank(ascending=False).astype(int)
    return df_final

In [17]:
da_2021 = {'Online-Y': 74.4, 'CUNI-Transformer2018': 71.5, 
           'Online-A': 78.4, 'Facebook-AI': 77.8, 
           'CUNI-DocTransformer': 72.0, 'Online-B': 74.0, 
           'Online-W': 74.5, 'Online-G': 67.2}

rank_2021 = {
    'Facebook-AI': 1, 'Online-A': 1,
    'CUNI-DocTransformer': 2, 'Online-B': 2,
    'CUNI-Transformer2018': 3, 'Online-W': 3,
    'Online-G': 4, 'Online-Y': 5
}

df_results_2021 = reformat_automatic_metrics(df_automatic_2021, da=da_2021, rank_metrics=True)

In [22]:
df_results_2021.sort_index()

Unnamed: 0,BLEU,chrf,DA,BLEU_rank,chrf_rank,DA_rank
CUNI-DocTransformer,41.842998,61.869312,72.0,3,3,6
CUNI-Transformer2018,36.68823,58.387521,71.5,7,7,7
Facebook-AI,43.457651,63.617553,77.8,1,1,2
Online-A,39.567165,60.360021,78.4,5,6,1
Online-B,43.112826,62.572802,74.0,2,2,5
Online-G,38.951147,60.756117,67.2,6,5,8
Online-W,41.330534,61.34923,74.5,4,4,3
Online-Y,34.286958,57.981122,74.4,8,8,4


In [20]:
df_results_2022 = reformat_automatic_metrics(df_automatic_2022, rank_metrics=True)

In [21]:
df_results_2022.sort_index()

Unnamed: 0,BLEU,chrf,BLEU_rank,chrf_rank
ALMAnaCH-Inria,36.837432,58.343952,11,11
CUNI-DocTransformer,57.875406,72.960318,6,6
CUNI-Transformer,57.592826,72.370276,7,7
JDExploreAcademy,60.898186,75.046036,2,2
Lan-Bridge,60.757824,74.629556,3,3
Online-A,59.415152,74.070347,5,5
Online-B,60.600093,74.541934,4,4
Online-G,54.280415,71.063694,8,8
Online-W,68.657282,79.696211,1,1
Online-Y,50.968785,68.763297,10,10


In [24]:
def compare_my_metrics(df, metric):
    if metric == kendalltau:
        rank_df = df.drop(['Freq;F1', 'Freq;Own', '1st;F1', '1st;Own'], axis=1)
        statistics_name = 'tau'
        suffix = -5
    elif metric == ttest_rel:
        rank_df = df.drop(['freq_f1_rank', 'freq_own_rank', 'first_f1_rank', 'first_own_rank'], axis=1)
        statistics_name = 'paired t-test'
        suffix = None
    corr_dict = {}
    for column_pair in itertools.combinations(rank_df.columns.to_list(), 2):
        col_1, col_2 = column_pair
        try:
            statistic = metric(rank_df[col_1], rank_df[col_2]).correlation
        except: 
            statistic = metric(rank_df[col_1], rank_df[col_2]).statistic
        pvalue = metric(rank_df[col_1], rank_df[col_2]).pvalue
        
        dict_key = col_1[:suffix] + ' VS ' + col_2[:suffix]
        corr_dict[dict_key] = statistic, pvalue
    
    corr_df = pd.DataFrame.from_dict(corr_dict, orient='index', columns = [statistics_name, 'p value'])
    return corr_df

def compare_my_metrics_with_mainstream(my_df, mainstream_df, metric):
    
    if metric == kendalltau:
        my_subset = my_df.drop(['Freq;F1', 'Freq;Own', '1st;F1', '1st;Own'], axis=1)
        mainstream_subset = mainstream_df.drop(['BLEU', 'chrf'], axis=1)
        try: 
            mainstream_subset = mainstream_subset.drop(['DA'], axis=1)
        except:
            pass
        statistics_name = 'kendall tau'
        suffix = -5
    elif metric == ttest_rel:
        my_subset = my_df.drop(['freq_f1_rank', 'freq_own_rank', 'first_f1_rank', 'first_own_rank'], axis=1)
        mainstream_subset = mainstream_df.drop(['bleu_rank', 'chrf_rank'], axis=1)
        try: 
            mainstream_subset = mainstream_subset.drop(['DA_rank'], axis=1)    
        except:
            pass
        statistics_name = 'paired t-test'
        suffix = None
    
    if metric == ttest_rel:
        mainstream_subset = mainstream_subset / 100
    corr_dict = {}
    for column_pair in itertools.product(my_subset.columns.to_list(), mainstream_subset.columns.to_list()):

        col_1, col_2 = column_pair
        try:
            statistic = metric(my_subset[col_1], mainstream_subset[col_2]).correlation
        except: 
            statistic = metric(my_subset[col_1], mainstream_subset[col_2]).statistic
        pvalue = metric(my_subset[col_1], mainstream_subset[col_2]).pvalue
        #print(col_1, col_2)
        #print(statistic, pvalue)
        
        dict_key = col_1[:suffix] + ' VS ' + col_2[:suffix]
        corr_dict[dict_key] = statistic, pvalue
    
    corr_df = pd.DataFrame.from_dict(corr_dict, orient='index', columns = [statistics_name, 'p value'])

    return corr_df

def compare_different_years(df_2021, df_2022, rank=False):
    overlap = list(set(df_2021.index.to_list()) & set(df_2022.index.to_list()))
    df_2021_subset = df_2021.loc[overlap]
    df_2022_subset = df_2022.loc[overlap]
    df_2021_subset = df_2021_subset.drop([c for c in df_2021_subset.columns.to_list() if c.endswith('rank')], axis=1)
    df_2022_subset = df_2022_subset.drop([c for c in df_2022_subset.columns.to_list() if c.endswith('rank')], axis=1) 
    
    difference_dict = {}
    
    #for column in df_2021_subset.columns.to_list():
    difference_df = df_2022_subset - df_2021_subset
    
    if rank:
        try:
            difference_df = difference_df.drop(['DA'], axis=1)
        except:
            pass
        for col in difference_df.columns.to_list():
            print(col)
            ranked_col = col + '_rank'
            difference_df[ranked_col] = difference_df[col].rank(ascending=False).astype(int)
            #if round_3:
            #    difference_df[col] = difference_df[col].astype('float').round(3)
    return difference_df

### statistical comparison of different setups of my metric, for the year 2021 and 2022:

In [26]:
compare_my_metrics(df_2021, metric=kendalltau).round(3)#.sort_index()

Unnamed: 0,tau,p value
1st;F1 VS 1st;Own,0.786,0.006
1st;F1 VS Freq;F1,0.643,0.031
1st;F1 VS Freq;Own,0.571,0.061
1st;Own VS Freq;F1,0.429,0.179
1st;Own VS Freq;Own,0.643,0.031
Freq;F1 VS Freq;Own,0.786,0.006


In [27]:
compare_my_metrics(df_2022, metric=kendalltau).round(3)#.sort_index()

Unnamed: 0,tau,p value
1st;F1 VS 1st;Own,0.891,0.0
1st;F1 VS Freq;F1,0.636,0.006
1st;F1 VS Freq;Own,0.673,0.003
1st;Own VS Freq;F1,0.527,0.026
1st;Own VS Freq;Own,0.709,0.002
Freq;F1 VS Freq;Own,0.6,0.01


### Comparing my metrics with the mainstream ones, years 2021 and 2022:

In [29]:
compare_my_metrics_with_mainstream(df_2021, df_results_2021, metric=kendalltau).round(3)

Unnamed: 0,kendall tau,p value
1st;F1 VS BLEU,0.357,0.275
1st;F1 VS chrf,0.286,0.399
1st;F1 VS DA,0.714,0.014
1st;Own VS BLEU,0.143,0.72
1st;Own VS chrf,0.071,0.905
1st;Own VS DA,0.5,0.109
Freq;F1 VS BLEU,0.143,0.72
Freq;F1 VS chrf,0.071,0.905
Freq;F1 VS DA,0.786,0.006
Freq;Own VS BLEU,-0.071,0.905


In [30]:
compare_my_metrics_with_mainstream(df_2022, df_results_2022, metric=kendalltau).round(3)

Unnamed: 0,kendall tau,p value
1st;F1 VS BLEU,-0.527,0.026
1st;F1 VS chrf,-0.527,0.026
1st;Own VS BLEU,-0.636,0.006
1st;Own VS chrf,-0.636,0.006
Freq;F1 VS BLEU,-0.527,0.026
Freq;F1 VS chrf,-0.527,0.026
Freq;Own VS BLEU,-0.636,0.006
Freq;Own VS chrf,-0.636,0.006


### Comparing the metrics of different years, my setups and the standard ones:

In [33]:
compare_different_years(df_2021, df_2022, rank=True).sort_index()

1st;F1
1st;Own
Freq;F1
Freq;Own


Unnamed: 0,1st;F1,1st;Own,Freq;F1,Freq;Own,1st;F1_rank,1st;Own_rank,Freq;F1_rank,Freq;Own_rank
CUNI-DocTransformer,0.000631,0.001263,0.000631,0.001263,3,3,3,3
Online-A,-0.006544,-0.021866,0.00988,0.00645,5,5,1,2
Online-B,0.022333,0.03951,-0.012604,-0.009689,1,1,5,5
Online-G,-0.000533,0.000576,-0.00117,-0.00488,4,4,4,4
Online-W,0.008081,0.008461,0.005111,0.006589,2,2,2,1
Online-Y,-0.039884,-0.046452,-0.028766,-0.030479,6,6,6,6


In [36]:
compare_different_years(df_results_2021, df_results_2022, rank=True)#/100

BLEU
chrf


Unnamed: 0,BLEU,chrf,BLEU_rank,chrf_rank
Online-G,15.329268,10.307577,6,6
CUNI-DocTransformer,16.032408,11.091005,5,4
Online-Y,16.681827,10.782175,4,5
Online-B,17.487267,11.969132,3,3
Online-W,27.326748,18.346982,1,1
Online-A,19.847987,13.710326,2,2
