### This notebook evaluates the [BLEURT](https://github.com/google-research/bleurt) score of a translation file.

In [1]:
%%capture
# Install BLEURT and dependecies
! pip install --upgrade pip  # ensures that pip is current
! git clone https://github.com/google-research/bleurt.git
! cd bleurt; pip3 install .

# install checkpoint
! wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip .
! unzip BLEURT-20.zip
! rm BLEURT-20.zip

# wget for files downloading from git
! pip install wget

In [2]:
import os
import wget

from tqdm import tqdm
from timeit import default_timer as timer

In [3]:
# Download the files to score (candidate translation and reference) from GitHub.
# NOTE: git_path must end with '/' because the URLs below are built by concatenation.
git_path = r'https://raw.githubusercontent.com/tsimafeip/TV-distinction/main/'

candidate_filename = 'base_model_no_labels'
reference_filename = 'deixis_test_ru'

# URLs always use '/' as separator, so they are built explicitly instead of with
# os.path.join, which would insert '\' on Windows and produce an invalid URL.
candidate_git_path = f'{git_path}translations/{candidate_filename}'
reference_git_path = f'{git_path}data/{reference_filename}'

# Files that already exist locally are not downloaded again, which keeps re-runs cheap.
for url, local_filename in (
    (candidate_git_path, candidate_filename),
    (reference_git_path, reference_filename),
):
    if not os.path.isfile(local_filename):
        wget.download(url, local_filename)

In [4]:
# BLEURT demo: score a handful of hand-picked candidate/reference pairs
# to check that the checkpoint loads and the scorer works.
from bleurt import score

# Reference translations, aligned by position with the candidates below.
references = [
    "К моему удивлению , твоих пальчиков не оказалось в АДИС .",
    "Естественно . ",
    "Я никогда не нарушала закон . ",
    "Тебя не ловили .",
    "Вас не ловили .",
]
# Candidate (system) translations to be scored against the references.
candidates = [
    "К моему великолепному удивлению, ваш фингерпечатник запал в AFIS.",
    "Конечно, они не являются.",
    "Я никогда не совершил никаких преступлений.",
    "Вы никогда не увидели.",
    "Вы никогда не увидели.",
]

# Name of the checkpoint directory unpacked by the install cell.
bleurt_checkpoint = "BLEURT-20"
scorer = score.BleurtScorer(bleurt_checkpoint)

scores = scorer.score(candidates=candidates, references=references)

# Sanity checks: a plain list with exactly one score per sentence pair.
assert type(scores) is list
assert len(scores) == len(references)
print(scores)

INFO:tensorflow:Reading checkpoint BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: BLEURT-20/sent_piece.model.
INFO:tensorflow:SentencePiece tokenizer created.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.
[0.3393818140029907, 0.1906919777393341, 0.7563686966896057, 0.33146679401397705, 0.3932281732559204]


In [5]:
# Score the files in batches of BATCH_SIZE sentence pairs, so that every call to the
# scorer handles a bounded amount of data. Both files are still read into memory in full.
# Scores are written and flushed after each batch, so an interrupted run can be resumed
# by setting START_SENTENCE to the number of sentences that were already scored.

BATCH_SIZE = 32
START_SENTENCE = 0
scores_filename = candidate_filename + '.bleurt'
scores = []

# A fresh run overwrites the scores file; a resumed run appends to it so that the
# scores of the sentences before START_SENTENCE are kept.
scores_file_mode = 'a' if START_SENTENCE > 0 else 'w'

# Explicit encoding so that reading does not depend on the platform default
# (the inputs contain Russian text; assumed to be UTF-8 encoded).
with open(candidate_filename, encoding='utf-8') as cand_file, \
    open(reference_filename, encoding='utf-8') as ref_file, \
    open(scores_filename, scores_file_mode, encoding='utf-8') as scores_file:

    candidates = cand_file.read().splitlines()
    references = ref_file.read().splitlines()
    assert len(candidates) == len(references), \
        f"Line count mismatch: {len(candidates)} candidates vs {len(references)} references"

    start = timer()
    print(f"Starting BLEURT evaluation of '{candidate_filename}' file ...")
    for i in tqdm(range(START_SENTENCE, len(candidates), BATCH_SIZE)):
        cur_batch_candidates = candidates[i:i+BATCH_SIZE]
        cur_batch_references = references[i:i+BATCH_SIZE]
        batch_scores = scorer.score(references=cur_batch_references, candidates=cur_batch_candidates)
        scores.extend(batch_scores)

        # One score per line; flush so that finished batches survive an interruption.
        scores_file.writelines(f'{batch_score}\n' for batch_score in batch_scores)
        scores_file.flush()
    print("\nFinished evaluation in : %f seconds\n" % (timer() - start))

    # The average covers only the sentences scored in this run (from START_SENTENCE on).
    if scores:
        print(f"Average score: {sum(scores)/len(scores)}.")
    else:
        print("No sentences were scored.")

Starting BLEURT evaluation of 'base_model_no_labels' file ...


100%|██████████| 303/303 [1:21:28<00:00, 16.13s/it]


Finished evaluation in : 4888.077164 seconds




