# Automated Evaluation Metric for Terminology Consistency in MT: Howto

_Author: Kirill Semenov, Charles University, 2022, kir\[dоt]semenov[аt]yandex[dоt]ru_

This is an example of the step-by-step application of the automated evaluation metric for terminology consistency in MT.

In [1]:
import os
import nltk
import re
import subprocess
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import f1_score

## 1. Tokenization

With this module, you can:
1. Tokenize separate text (`tokenize_file` method)
2. Tokenize the whole folder (`tokenize_folder` method)

Currently, only Morphodita tokenizer is implemented. To run this tokenizer, you need to download the models to it from [here](https://ufal.mff.cuni.cz/morphodita#language_models) and specify the directory. 

In [2]:
from tokenization import Tokenizer

In [3]:
tokenizer_cs = Tokenizer('morphodita', path_to_model='../agreement-corpus/tokenize_new.py')
tokenizer_en = Tokenizer('morphodita', path_to_model='../agreement-corpus/tokenize_new.py')

In [4]:
tokenizer_en.tokenize_file('./input_en/ALMAnaCH-Inria.txt', 'tokenized_en', language='English')

a
./input_en/ALMAnaCH-Inria.txt
<class 'str'>
ALMAnaCH-Inria.txt
ALMAnaCH-Inria_tokenized.txt
tokenized_en\ALMAnaCH-Inria_tokenized.txt


'tokenization for file ALMAnaCH-Inria_tokenized.txt done successfully'

In [4]:
tokenizer_cs.tokenize_folder('./raw_data/inputs_cs', './tokenized_data/inputs_cs', language='Czech')

au1tc.txt
au2tc.txt
au3tc.txt
au4tc.txt
au5tc.txt
au6tc.txt
au7tc.txt
au8tc.txt
au9tc.txt
kn1tc.txt
kn2tc.txt
kn3tc.txt
kn4tc.txt
kn5tc.txt
kn6tc.txt
kn7tc.txt
kn8tc.txt
kn9tc.txt
na1tc.txt
na2tc.txt
na3tc.txt
na4tc.txt
na5tc.txt
na6tc.txt
na7tc.txt
pn1tc.txt
pn2tc.txt
pn3tc.txt
pn4tc.txt
pn5tc.txt
robot.txt
SLA.txt
sublease.txt


'tokenization for folder ./raw_data/inputs_cs done successfully'

In [6]:
outputs_2021_folders = os.listdir('./raw_data/outputs_en_2021/')
outputs_2022_folders = os.listdir('./raw_data/outputs_en_2022/')

In [7]:
for output_2021_folder in outputs_2021_folders:
    i_folder = './raw_data/outputs_en_2021/' + output_2021_folder
    o_folder = './tokenized_data/outputs_en_2021/' + output_2021_folder
    # create a folder
    os.mkdir(o_folder)
    # make tokenization
    tokenizer_en.tokenize_folder(i_folder, o_folder, language='english')

CUNI-DocTransformer.txt
CUNI-Transformer2018.txt
Facebook-AI.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
CUNI-DocTransformer.txt
CUNI-Transformer2018.txt
Facebook-AI.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
CUNI-DocTransformer.txt
CUNI-Transformer2018.txt
Facebook-AI.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
CUNI-DocTransformer.txt
CUNI-Transformer2018.txt
Facebook-AI.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
CUNI-DocTransformer.txt
CUNI-Transformer2018.txt
Facebook-AI.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
CUNI-DocTransformer.txt
CUNI-Transformer2018.txt
Facebook-AI.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
CUNI-DocTransformer.txt
CUNI-Transformer2018.txt
Facebook-AI.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
CUNI-DocTransformer.txt
CUNI-Transformer2018.txt
Facebook-AI.txt
Online-A.txt
Online-B.txt

In [8]:
for output_2022_folder in outputs_2022_folders:
    i_folder = './raw_data/outputs_en_2022/' + output_2022_folder
    o_folder = './tokenized_data/outputs_en_2022/' + output_2022_folder
    # create a folder
    os.mkdir(o_folder)
    # make tokenization
    tokenizer_en.tokenize_folder(i_folder, o_folder, language='english')

ALMAnaCH-Inria.txt
CUNI-DocTransformer.txt
CUNI-Transformer.txt
JDExploreAcademy.txt
Lan-Bridge.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
SHOPLINE-PL.txt
ALMAnaCH-Inria.txt
CUNI-DocTransformer.txt
CUNI-Transformer.txt
JDExploreAcademy.txt
Lan-Bridge.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
SHOPLINE-PL.txt
ALMAnaCH-Inria.txt
CUNI-DocTransformer.txt
CUNI-Transformer.txt
JDExploreAcademy.txt
Lan-Bridge.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
SHOPLINE-PL.txt
ALMAnaCH-Inria.txt
CUNI-DocTransformer.txt
CUNI-Transformer.txt
JDExploreAcademy.txt
Lan-Bridge.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
SHOPLINE-PL.txt
ALMAnaCH-Inria.txt
CUNI-DocTransformer.txt
CUNI-Transformer.txt
JDExploreAcademy.txt
Lan-Bridge.txt
Online-A.txt
Online-B.txt
Online-G.txt
Online-W.txt
Online-Y.txt
SHOPLINE-PL.txt
ALMAnaCH-Inria.txt
CUNI-DocTransformer.txt
CUNI-Transformer.txt
JDExploreAcademy.txt
Lan-Bridge

## 2. Alignment

Currently, `fast-align` algorithm is applied, and the code is adjusted only to this approach. You can consider using another automatic alignment approaches (such as `MGIZA++`).

**TIP:** the unsupervised aligners work better if applied to big collections of parallel sentences (>100K sentence pairs). So  consider:
1. merging your files into one (but remember to split them after!): this is what I did there and obtained at least 4K sentence pairs per system;
2. adding some bigger parallel corpus to your data.

In [2]:
from alignment import Aligner

In [4]:
fast_align_dir = fast_align_path = os.path.join('/mnt', 'c', 'Users', 'Kirill Semenov',
                               'multiling_nlp_for_me', 'pos_tagging', 'fast_align', 'build')

fast_align_dir = fast_align_path = os.path.join('/mnt/c/Users/Kirill Semenov/multiling_nlp_for_me/pos_tagging/fast_align/build')

In [5]:
i = './raw_data/inputs_cs/au1tc.txt'
o = './raw_data/outputs_en_2021/au1tc/CUNI-DocTransformer.txt'

In [3]:
aligner = Aligner()

In [7]:
os.listdir('./tokenized_data/outputs_en_2021/au1tc')

['CUNI-DocTransformer_tokenized.txt',
 'CUNI-Transformer2018_tokenized.txt',
 'Facebook-AI_tokenized.txt',
 'Online-A_tokenized.txt',
 'Online-B_tokenized.txt',
 'Online-G_tokenized.txt',
 'Online-W_tokenized.txt',
 'Online-Y_tokenized.txt']

In [4]:
import re
input_folder = './tokenized_data/inputs_cs/'
input_files = os.listdir(input_folder)
output_folder = './tokenized_data/outputs_en_2022/'
for i in input_files:
    i_strip = re.sub('_tokenized.txt', '', i)
    o_list = os.listdir(f'{output_folder}{i_strip}')
    for output_file in o_list:
        #o_strip = re.sub('_tokenized.txt', '', output_file)
        i_path = f'{input_folder}{i}'
        o_path = f'{output_folder}{i_strip}/{output_file}'
        aligner.create_bitext(i_path, o_path, './alignments/2022/bitexts/')

./tokenized_data/inputs_cs/au1tc_tokenized.txt
au1tc.txt
./tokenized_data/outputs_en_2022/au1tc/ALMAnaCH-Inria_tokenized.txt
ALMAnaCH-Inria.txt
./tokenized_data/inputs_cs/au1tc_tokenized.txt
au1tc.txt
./tokenized_data/outputs_en_2022/au1tc/CUNI-DocTransformer_tokenized.txt
CUNI-DocTransformer.txt
./tokenized_data/inputs_cs/au1tc_tokenized.txt
au1tc.txt
./tokenized_data/outputs_en_2022/au1tc/CUNI-Transformer_tokenized.txt
CUNI-Transformer.txt
./tokenized_data/inputs_cs/au1tc_tokenized.txt
au1tc.txt
./tokenized_data/outputs_en_2022/au1tc/JDExploreAcademy_tokenized.txt
JDExploreAcademy.txt
./tokenized_data/inputs_cs/au1tc_tokenized.txt
au1tc.txt
./tokenized_data/outputs_en_2022/au1tc/Lan-Bridge_tokenized.txt
Lan-Bridge.txt
./tokenized_data/inputs_cs/au1tc_tokenized.txt
au1tc.txt
./tokenized_data/outputs_en_2022/au1tc/Online-A_tokenized.txt
Online-A.txt
./tokenized_data/inputs_cs/au1tc_tokenized.txt
au1tc.txt
./tokenized_data/outputs_en_2022/au1tc/Online-B_tokenized.txt
Online-B.txt
./toke

CUNI-Transformer.txt
./tokenized_data/inputs_cs/au9tc_tokenized.txt
au9tc.txt
./tokenized_data/outputs_en_2022/au9tc/JDExploreAcademy_tokenized.txt
JDExploreAcademy.txt
./tokenized_data/inputs_cs/au9tc_tokenized.txt
au9tc.txt
./tokenized_data/outputs_en_2022/au9tc/Lan-Bridge_tokenized.txt
Lan-Bridge.txt
./tokenized_data/inputs_cs/au9tc_tokenized.txt
au9tc.txt
./tokenized_data/outputs_en_2022/au9tc/Online-A_tokenized.txt
Online-A.txt
./tokenized_data/inputs_cs/au9tc_tokenized.txt
au9tc.txt
./tokenized_data/outputs_en_2022/au9tc/Online-B_tokenized.txt
Online-B.txt
./tokenized_data/inputs_cs/au9tc_tokenized.txt
au9tc.txt
./tokenized_data/outputs_en_2022/au9tc/Online-G_tokenized.txt
Online-G.txt
./tokenized_data/inputs_cs/au9tc_tokenized.txt
au9tc.txt
./tokenized_data/outputs_en_2022/au9tc/Online-W_tokenized.txt
Online-W.txt
./tokenized_data/inputs_cs/au9tc_tokenized.txt
au9tc.txt
./tokenized_data/outputs_en_2022/au9tc/Online-Y_tokenized.txt
Online-Y.txt
./tokenized_data/inputs_cs/au9tc_to

./tokenized_data/inputs_cs/kn8tc_tokenized.txt
kn8tc.txt
./tokenized_data/outputs_en_2022/kn8tc/CUNI-DocTransformer_tokenized.txt
CUNI-DocTransformer.txt
./tokenized_data/inputs_cs/kn8tc_tokenized.txt
kn8tc.txt
./tokenized_data/outputs_en_2022/kn8tc/CUNI-Transformer_tokenized.txt
CUNI-Transformer.txt
./tokenized_data/inputs_cs/kn8tc_tokenized.txt
kn8tc.txt
./tokenized_data/outputs_en_2022/kn8tc/JDExploreAcademy_tokenized.txt
JDExploreAcademy.txt
./tokenized_data/inputs_cs/kn8tc_tokenized.txt
kn8tc.txt
./tokenized_data/outputs_en_2022/kn8tc/Lan-Bridge_tokenized.txt
Lan-Bridge.txt
./tokenized_data/inputs_cs/kn8tc_tokenized.txt
kn8tc.txt
./tokenized_data/outputs_en_2022/kn8tc/Online-A_tokenized.txt
Online-A.txt
./tokenized_data/inputs_cs/kn8tc_tokenized.txt
kn8tc.txt
./tokenized_data/outputs_en_2022/kn8tc/Online-B_tokenized.txt
Online-B.txt
./tokenized_data/inputs_cs/kn8tc_tokenized.txt
kn8tc.txt
./tokenized_data/outputs_en_2022/kn8tc/Online-G_tokenized.txt
Online-G.txt
./tokenized_data/i

./tokenized_data/inputs_cs/na6tc_tokenized.txt
na6tc.txt
./tokenized_data/outputs_en_2022/na6tc/JDExploreAcademy_tokenized.txt
JDExploreAcademy.txt
./tokenized_data/inputs_cs/na6tc_tokenized.txt
na6tc.txt
./tokenized_data/outputs_en_2022/na6tc/Lan-Bridge_tokenized.txt
Lan-Bridge.txt
./tokenized_data/inputs_cs/na6tc_tokenized.txt
na6tc.txt
./tokenized_data/outputs_en_2022/na6tc/Online-A_tokenized.txt
Online-A.txt
./tokenized_data/inputs_cs/na6tc_tokenized.txt
na6tc.txt
./tokenized_data/outputs_en_2022/na6tc/Online-B_tokenized.txt
Online-B.txt
./tokenized_data/inputs_cs/na6tc_tokenized.txt
na6tc.txt
./tokenized_data/outputs_en_2022/na6tc/Online-G_tokenized.txt
Online-G.txt
./tokenized_data/inputs_cs/na6tc_tokenized.txt
na6tc.txt
./tokenized_data/outputs_en_2022/na6tc/Online-W_tokenized.txt
Online-W.txt
./tokenized_data/inputs_cs/na6tc_tokenized.txt
na6tc.txt
./tokenized_data/outputs_en_2022/na6tc/Online-Y_tokenized.txt
Online-Y.txt
./tokenized_data/inputs_cs/na6tc_tokenized.txt
na6tc.txt

./tokenized_data/inputs_cs/robot_tokenized.txt
robot.txt
./tokenized_data/outputs_en_2022/robot/Online-B_tokenized.txt
Online-B.txt
./tokenized_data/inputs_cs/robot_tokenized.txt
robot.txt
./tokenized_data/outputs_en_2022/robot/Online-G_tokenized.txt
Online-G.txt
./tokenized_data/inputs_cs/robot_tokenized.txt
robot.txt
./tokenized_data/outputs_en_2022/robot/Online-W_tokenized.txt
Online-W.txt
./tokenized_data/inputs_cs/robot_tokenized.txt
robot.txt
./tokenized_data/outputs_en_2022/robot/Online-Y_tokenized.txt
Online-Y.txt
./tokenized_data/inputs_cs/robot_tokenized.txt
robot.txt
./tokenized_data/outputs_en_2022/robot/SHOPLINE-PL_tokenized.txt
SHOPLINE-PL.txt
./tokenized_data/inputs_cs/SLA_tokenized.txt
SLA.txt
./tokenized_data/outputs_en_2022/SLA/ALMAnaCH-Inria_tokenized.txt
ALMAnaCH-Inria.txt
./tokenized_data/inputs_cs/SLA_tokenized.txt
SLA.txt
./tokenized_data/outputs_en_2022/SLA/CUNI-DocTransformer_tokenized.txt
CUNI-DocTransformer.txt
./tokenized_data/inputs_cs/SLA_tokenized.txt
SLA

In [10]:
bitexts = os.listdir('./alignments/2021/bitexts/')

In [11]:
bitexts

['au1tc_CUNI-DocTransformer.s',
 'au1tc_CUNI-Transformer2018.s',
 'au1tc_Facebook-AI.s',
 'au1tc_Online-A.s',
 'au1tc_Online-B.s',
 'au1tc_Online-G.s',
 'au1tc_Online-W.s',
 'au1tc_Online-Y.s',
 'au2tc_CUNI-DocTransformer.s',
 'au2tc_CUNI-Transformer2018.s',
 'au2tc_Facebook-AI.s',
 'au2tc_Online-A.s',
 'au2tc_Online-B.s',
 'au2tc_Online-G.s',
 'au2tc_Online-W.s',
 'au2tc_Online-Y.s',
 'au3tc_CUNI-DocTransformer.s',
 'au3tc_CUNI-Transformer2018.s',
 'au3tc_Facebook-AI.s',
 'au3tc_Online-A.s',
 'au3tc_Online-B.s',
 'au3tc_Online-G.s',
 'au3tc_Online-W.s',
 'au3tc_Online-Y.s',
 'au4tc_CUNI-DocTransformer.s',
 'au4tc_CUNI-Transformer2018.s',
 'au4tc_Facebook-AI.s',
 'au4tc_Online-A.s',
 'au4tc_Online-B.s',
 'au4tc_Online-G.s',
 'au4tc_Online-W.s',
 'au4tc_Online-Y.s',
 'au5tc_CUNI-DocTransformer.s',
 'au5tc_CUNI-Transformer2018.s',
 'au5tc_Facebook-AI.s',
 'au5tc_Online-A.s',
 'au5tc_Online-B.s',
 'au5tc_Online-G.s',
 'au5tc_Online-W.s',
 'au5tc_Online-Y.s',
 'au6tc_CUNI-DocTransformer.s'

In [25]:
import pandas as pd 

bitexts_doc_list = list(set([bitext.split('_')[0] for bitext in bitexts]))
bitexts_alg_list = list(set([bitext.split('_')[1][:-2] for bitext in bitexts]))

#bitexts_split = pd.DataFrame(bitexts_split, columns=['doc', 'alg'])

In [29]:
for alg in bitexts_alg_list:
    for doc in bitexts_doc_list:
        filename = doc + '_' + alg + '.s'
        print(filename)

pn4tc_Online-W.s
au9tc_Online-W.s
pn1tc_Online-W.s
pn3tc_Online-W.s
robot_Online-W.s
na3tc_Online-W.s
kn5tc_Online-W.s
au7tc_Online-W.s
pn2tc_Online-W.s
na6tc_Online-W.s
au2tc_Online-W.s
na7tc_Online-W.s
au6tc_Online-W.s
kn7tc_Online-W.s
kn6tc_Online-W.s
SLA_Online-W.s
sublease_Online-W.s
kn4tc_Online-W.s
kn9tc_Online-W.s
kn2tc_Online-W.s
kn1tc_Online-W.s
au8tc_Online-W.s
au1tc_Online-W.s
kn8tc_Online-W.s
au3tc_Online-W.s
na4tc_Online-W.s
pn5tc_Online-W.s
kn3tc_Online-W.s
na1tc_Online-W.s
au4tc_Online-W.s
na5tc_Online-W.s
au5tc_Online-W.s
na2tc_Online-W.s
pn4tc_Facebook-AI.s
au9tc_Facebook-AI.s
pn1tc_Facebook-AI.s
pn3tc_Facebook-AI.s
robot_Facebook-AI.s
na3tc_Facebook-AI.s
kn5tc_Facebook-AI.s
au7tc_Facebook-AI.s
pn2tc_Facebook-AI.s
na6tc_Facebook-AI.s
au2tc_Facebook-AI.s
na7tc_Facebook-AI.s
au6tc_Facebook-AI.s
kn7tc_Facebook-AI.s
kn6tc_Facebook-AI.s
SLA_Facebook-AI.s
sublease_Facebook-AI.s
kn4tc_Facebook-AI.s
kn9tc_Facebook-AI.s
kn2tc_Facebook-AI.s
kn1tc_Facebook-AI.s
au8tc_Facebook-AI

In [28]:
bitexts

['au1tc_CUNI-DocTransformer.s',
 'au1tc_CUNI-Transformer2018.s',
 'au1tc_Facebook-AI.s',
 'au1tc_Online-A.s',
 'au1tc_Online-B.s',
 'au1tc_Online-G.s',
 'au1tc_Online-W.s',
 'au1tc_Online-Y.s',
 'au2tc_CUNI-DocTransformer.s',
 'au2tc_CUNI-Transformer2018.s',
 'au2tc_Facebook-AI.s',
 'au2tc_Online-A.s',
 'au2tc_Online-B.s',
 'au2tc_Online-G.s',
 'au2tc_Online-W.s',
 'au2tc_Online-Y.s',
 'au3tc_CUNI-DocTransformer.s',
 'au3tc_CUNI-Transformer2018.s',
 'au3tc_Facebook-AI.s',
 'au3tc_Online-A.s',
 'au3tc_Online-B.s',
 'au3tc_Online-G.s',
 'au3tc_Online-W.s',
 'au3tc_Online-Y.s',
 'au4tc_CUNI-DocTransformer.s',
 'au4tc_CUNI-Transformer2018.s',
 'au4tc_Facebook-AI.s',
 'au4tc_Online-A.s',
 'au4tc_Online-B.s',
 'au4tc_Online-G.s',
 'au4tc_Online-W.s',
 'au4tc_Online-Y.s',
 'au5tc_CUNI-DocTransformer.s',
 'au5tc_CUNI-Transformer2018.s',
 'au5tc_Facebook-AI.s',
 'au5tc_Online-A.s',
 'au5tc_Online-B.s',
 'au5tc_Online-G.s',
 'au5tc_Online-W.s',
 'au5tc_Online-Y.s',
 'au6tc_CUNI-DocTransformer.s'

In [26]:
bitexts_alg_list

['Online-W',
 'Facebook-AI',
 'Online-Y',
 'Online-G',
 'Online-B',
 'CUNI-Transformer2018',
 'CUNI-DocTransformer',
 'Online-A']

In [21]:
list(bitexts_split['doc'].unique())

['au1tc',
 'au2tc',
 'au3tc',
 'au4tc',
 'au5tc',
 'au6tc',
 'au7tc',
 'au8tc',
 'au9tc',
 'kn1tc',
 'kn2tc',
 'kn3tc',
 'kn4tc',
 'kn5tc',
 'kn6tc',
 'kn7tc',
 'kn8tc',
 'kn9tc',
 'na1tc',
 'na2tc',
 'na3tc',
 'na4tc',
 'na5tc',
 'na6tc',
 'na7tc',
 'pn1tc',
 'pn2tc',
 'pn3tc',
 'pn4tc',
 'pn5tc',
 'robot',
 'SLA',
 'sublease']

## 3. Pseudo-Reference Initialization

By this moment, you should have the components for creating the last part of the data: pseudo-reference terms, against which the real translations would be compared. To run that, use the `TermBasedMetricPreparation` class.

As a result, you will have `.csv` files with five columns:
 - source sentence
 - translated sentence (tgt_sentence)
 - source terms
 - translated "candidate" terms (the translations in each exact sentence)
 - pseudo-reference terms (the translations that we automatically chose as "correct" ones)
 

Run the whole cycle of preparation with `pipeline` method.

In [1]:
import os
import nltk
import re
import subprocess

In [3]:
src1, tgt1, alg1 = 'au1tc_tokenized.txt', 'CUNI-Transformer2018_tokenized.txt', 'au1tc_CUNI-Transformer2018.i'
src2, tgt2, alg2 = 'SLA_tokenized.txt', 'Facebook-AI_tokenized.txt', 'SLA_Facebook-AI.i'

In [2]:
from metric_preparation import TermBasedMetricPreparator

In [4]:
tbm = TermBasedMetricPreparator(pseudoref_strategy='first')

In [18]:
tort_folder = './tort/2022'
input_folder = './tokenized_data/inputs_cs'
output_folder = './tokenized_data/outputs_en_2022'
alignment_folder = './alignments/2022/split_alignments'
for alg in os.listdir(alignment_folder):
    a_path = alignment_folder + '/' + alg
    #print(alg)
    i_fname, o_fname = alg[:-2].split('_')
    i_path = input_folder + '/' + i_fname + '_tokenized.txt'
    o_path = output_folder + '/' + i_fname + '/' + o_fname + '_tokenized.txt'
    try:
        tbm_first = TermBasedMetricPreparator(pseudoref_strategy='first')
        tbm_first.pipeline(i_path, o_path, a_path, tort_folder)
        tbm_frequent = TermBasedMetricPreparator(pseudoref_strategy='frequent')
        tbm_frequent.pipeline(i_path, o_path, a_path, tort_folder)
    except:
        print(alg)

In [5]:
df = tbm.pipeline(src2, tgt2, alg2, 'alignment_sandbox')#.shape

In [11]:
df

Unnamed: 0,src_sentence,tgt_sentence_str,src_terms_str,tgt_terms_str,gt_terms_str
0,Smlouva o poskytování služeb podpory provozu a...,contract for the provision of services to supp...,Smlouva,contract,contract
1,"J & T BANKA , a . s . , sídlem Pobřežní 297 / ...","j & amp ; t banka , a . s . , with its registe...",,,
2,"zapsaná v obchodním rejstříku , který vede Měs...",registered in the commercial register maintain...,,,
3,( dále jen „ Objednatel “ ),"( hereinafter referred to as the "" client "" )",Objednatel,client,client
4,a,and,,,
...,...,...,...,...,...
855,"J & T BANKA , a . s .","j & amp ; t bank , a . s .",,,
856,Jméno : [ ● ],name : [ ● ],,,
857,Jméno : [ ● ],name : [ ● ],,,
858,Funkce : [ ● ],function : [ ● ],,,


In [6]:
tbm = TermBasedMetric(gt_strategy='frequent')

In [8]:
df = tbm.pipeline(src2, tgt2, alg2, 'alignment_sandbox')#.shape

In [6]:
df#.to_csv('df.csv')

Unnamed: 0,src_sentence,tgt_sentence_str,src_terms_str,tgt_terms_str,gt_terms_str
0,Smlouva o poskytování služeb podpory provozu a...,contract for the provision of services to supp...,Smlouva,contract,<NONE>
1,"J & T BANKA , a . s . , sídlem Pobřežní 297 / ...","j & amp ; t banka , a . s . , with its registe...",,,
2,"zapsaná v obchodním rejstříku , který vede Měs...",registered in the commercial register maintain...,,,
3,( dále jen „ Objednatel “ ),"( hereinafter referred to as the "" client "" )",Objednatel,client,client
4,a,and,,,
...,...,...,...,...,...
855,"J & T BANKA , a . s .","j & amp ; t bank , a . s .",,,
856,Jméno : [ ● ],name : [ ● ],,,
857,Jméno : [ ● ],name : [ ● ],,,
858,Funkce : [ ● ],function : [ ● ],,,


In [8]:
import pandas as pd 
d = pd.read_csv('Facebook-AI_tokenized.csv', delimiter='\t')

In [9]:
d

Unnamed: 0.1,Unnamed: 0,src_sentence,tgt_sentence_str,src_terms_str,tgt_terms_str,gt_terms_str
0,0,Smlouva o poskytování služeb podpory provozu a...,contract for the provision of services to supp...,Smlouva,contract,<NONE>
1,1,"J & T BANKA , a . s . , sídlem Pobřežní 297 / ...","j & amp ; t banka , a . s . , with its registe...",,,
2,2,"zapsaná v obchodním rejstříku , který vede Měs...",registered in the commercial register maintain...,,,
3,3,( dále jen „ Objednatel “ ),"( hereinafter referred to as the "" client "" )",Objednatel,client,client
4,4,a,and,,,
...,...,...,...,...,...,...
855,855,"J & T BANKA , a . s .","j & amp ; t bank , a . s .",,,
856,856,Jméno : [ ● ],name : [ ● ],,,
857,857,Jméno : [ ● ],name : [ ● ],,,
858,858,Funkce : [ ● ],function : [ ● ],,,


## 4. Metric Application

Now we can compare the real translation of terms to the pseudo-references. To do that, use `TermBasedMetric` class of the `statistics` module. You should call `make_metrics` method and specify the TORT file. You will obtain two numbers: 
 - termwise metric result (specified as `own` in paper)
 - overall metric (F1 score) in our case

In [2]:
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import f1_score

In [3]:
from statistics import TermBasedMetric

In [4]:
tbm = TermBasedMetric(f1_score)

In [4]:
tbm.make_metrics('./tort/2021/SLA_Online-G_first.csv')

(0.8127823291069866, 0.9224880382775119)

In [5]:
tbm.make_metrics('./tort/2021/SLA_Online-G_frequent.csv')

(0.8213293376539951, 0.9234449760765551)

In [5]:
tort_fpath = './tort/2022'
stats_path = './statistics'

In [6]:
tort_fpath[-4:]

'2022'

Below, I create the datasets for all metrics for 2021 and 2022 years to compare the rankings of the algorithms between each other and the mainstream metrics (BLEU, chrF, direct assessment)

In [37]:
version = 'frequent.csv'

docs_list = [file.split('_')[0] for file in os.listdir(tort_fpath)]
algs_list = [file.split('_')[1] for file in os.listdir(tort_fpath)]
#vers_list = [file.split('_')[2] for file in os.listdir(tort_fpath)]

docs_list = list(set(docs_list))
algs_list = list(set(algs_list))
#vers_list = list(set(vers_list))

frequent_own_df = pd.DataFrame(0, index=docs_list, columns=algs_list)
frequent_f1_df = pd.DataFrame(0, index=docs_list, columns=algs_list)

file_list = [file for file in os.listdir(tort_fpath) if file.endswith(version)]
for doc in docs_list:
    for alg in algs_list:
        filename = tort_fpath + '/' + doc + '_' + alg + '_' + version
        own, f1 = tbm.make_metrics(filename)
        frequent_own_df.at[doc, alg] = own
        frequent_f1_df.at[doc, alg] = f1

year = tort_fpath[-4:]
version_for_file = version[:-4]
filename_base = stats_path + '/' + year + '_' + version_for_file + '_'
frequent_own_df.to_csv(filename_base + 'own.csv', sep=';')
frequent_f1_df.to_csv(filename_base + 'f1.csv', sep=';')

In [38]:
version = 'first.csv'

docs_list = [file.split('_')[0] for file in os.listdir(tort_fpath)]
algs_list = [file.split('_')[1] for file in os.listdir(tort_fpath)]
#vers_list = [file.split('_')[2] for file in os.listdir(tort_fpath)]

docs_list = list(set(docs_list))
algs_list = list(set(algs_list))
#vers_list = list(set(vers_list))

first_own_df = pd.DataFrame(0, index=docs_list, columns=algs_list)
first_f1_df = pd.DataFrame(0, index=docs_list, columns=algs_list)

file_list = [file for file in os.listdir(tort_fpath) if file.endswith(version)]
for doc in docs_list:
    for alg in algs_list:
        filename = tort_fpath + '/' + doc + '_' + alg + '_' + version
        own, f1 = tbm.make_metrics(filename)
        first_own_df.at[doc, alg] = own
        first_f1_df.at[doc, alg] = f1

year = tort_fpath[-4:]
version_for_file = version[:-4]
filename_base = stats_path + '/' + year + '_' + version_for_file + '_'
first_own_df.to_csv(filename_base + 'own.csv', sep=';')
first_f1_df.to_csv(filename_base + 'f1.csv', sep=';')