In [32]:
import evaluate
import json

# Load the "bleu" metric object once; the metrics cell below calls
# `bleu.compute(predictions=..., references=...)` on it for every label/prompt pair.
bleu = evaluate.load("bleu")

In [33]:
# Read the reference/candidate dataset and print the prompts used for translation.

file_path = 'reference_candidate.json'

# Decode explicitly as UTF-8 (the encoding JSON is specified to use) instead of
# relying on the platform default, which can fail or garble non-ASCII text --
# the prompts ask for Russian output, so the file presumably contains Cyrillic.
with open(file_path, 'r', encoding='utf-8') as json_file:
    translation_data = json.load(json_file)

# Collect the distinct prompts in first-seen (file) order.
# A plain set must not be used here: string hashing is randomised per Python
# process, so enumerating a set could hand out "Prompt 1"/"Prompt 2" to
# different prompts on different kernel sessions, making runs incomparable.
# dict.fromkeys() de-duplicates while preserving insertion order.
prompts = list(dict.fromkeys(
    prompt
    for label_prompts in translation_data.values()
    for prompt in label_prompts
))

# Map each full prompt text to a short, stable display name and print the legend.
prompt_names = {}
for num, prompt in enumerate(prompts, start=1):
    prompt_name = 'Prompt ' + str(num)
    prompt_names[prompt] = prompt_name
    print(prompt_name, '\n', prompt)

Prompt 1 
 You are an expert translator. Translate the following text to Russian using vocabulary and expressions of a Russian native. The text to be translated is:
Prompt 2 
 You are an expert translator that will be tasked with translating a piece of text into Russian. The translation must be faithful to the original tone of voice and writing style. Ensure that the meaning of the original text is not changed. The text to be translated is:


In [27]:
# Build, for every (label, prompt) combination, the parallel lists of human
# (reference) and machine (candidate) translations used for evaluation.
#
# Each element appended to `output` has the shape
#     [label, prompt display name, [human_translations, machine_translations]]

output = []

# The inner mapping is named `label_prompts` (not `prompts`) so this loop does
# not overwrite the `prompts` collection created when the dataset was loaded.
for label, label_prompts in translation_data.items():
    for prompt, translations in label_prompts.items():
        # Each entry of `translations` is indexed as [0] = human, [1] = machine.
        human_translations = [pair[0] for pair in translations]
        machine_translations = [pair[1] for pair in translations]

        output.append([label, prompt_names[prompt], [human_translations, machine_translations]])

In [31]:
# Compute and report BLEU for every (label, prompt) entry collected in `output`.
# Each entry unpacks as: label, prompt display name, [references, predictions].

for label, prompt_name, (references, predictions) in output:
    # Header is printed before scoring, so it is visible even if scoring fails.
    print('label:', label)
    print(prompt_name)

    results = bleu.compute(predictions=predictions, references=references)

    print('BLEU score:', results['bleu'])
    print('Precisions:', results['precisions'])
    print('Brevity penalty:', results['brevity_penalty'])
    # The extra '\n' argument leaves a blank line between consecutive reports.
    print('Length ratio:', results['length_ratio'], '\n')


label: literature
Prompt 1
BLEU score: 0.1479391363697395
Precisions: [0.4540755467196819, 0.19491525423728814, 0.10158730158730159, 0.05704534373476353]
Brevity penalty: 0.9830479158592157
Length ratio: 0.9831899921813917 

label: literature
Prompt 2
BLEU score: 0.1476482987962879
Precisions: [0.4513970877607241, 0.1922110552763819, 0.09932885906040269, 0.056195965417867436]
Brevity penalty: 0.9952885835296004
Length ratio: 0.9952996474735605 

label: medical
Prompt 1
BLEU score: 0.20121532850099372
Precisions: [0.47480482611781405, 0.2545045045045045, 0.15219123505976095, 0.08913412563667232]
Brevity penalty: 1.0
Length ratio: 1.0429311621021466 

label: medical
Prompt 2
BLEU score: 0.19289480479294646
Precisions: [0.4764089121887287, 0.24740124740124741, 0.14705882352941177, 0.07987470634299139]
Brevity penalty: 1.0
Length ratio: 1.0423497267759563 

label: law
Prompt 1
BLEU score: 0.17864525092905062
Precisions: [0.4863169897377423, 0.24211778703152886, 0.1449004975124378, 0.088369