In [494]:
import torch
import random
import numpy as np
import re

SEED = 1234

# Feed the same seed to every random number generator this notebook touches
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# cuDNN switches: backend turned off, benchmark mode off, deterministic flag on
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Working directory of this experiment (the notebook later changes into it with %cd)
PATH = "/export/data4/vzhekova/biases-data/Test_De/Statistics/BERT/Sampling/Manual/5th"
# Path to the fastBPE tool
FASTBPE = "/home/vzhekova/fastBPE/fast"
# Path to the fast_align tool
FAST_ALIGN = "/home/vzhekova/fast_align/build/fast_align"
TERCOM = "/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE"

In [495]:
# Pick the compute device: the current CUDA device index if PyTorch sees a GPU, otherwise the string 'cpu'
if not torch.cuda.is_available():
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'
else:
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))

Current device: GeForce GTX 1080 Ti


In [496]:
# Change into the experiment directory (Python variable PATH); the cells below use relative file names
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/BERT/Sampling/Manual/5th


# Translation English-German

In [336]:
# Tokenization
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

# Moses helpers for English: punctuation normalizer, tokenizer and detokenizer
mpn = MosesPunctNormalizer()
mt_en = MosesTokenizer(lang='en')
md_en = MosesDetokenizer(lang='en')

# Normalize punctuation and tokenize sen1.txt .. sen10.txt line by line
for file_no in range(1, 11):
    raw_path = 'sen{}.txt'.format(file_no)
    tok_path = 'sen{}.tok.en'.format(file_no)
    with open(raw_path) as fin, open(tok_path, 'w') as fout:
        for raw_line in fin:
            normalized = mpn.normalize(raw_line)
            tokenized = mt_en.tokenize(normalized, return_str=True)
            fout.write(tokenized + '\n')

print('Finished tokenizing.')

Finished tokenizing.


In [337]:
# Dividing text into subword units

for i in  range(1, 11):
    filename_in = 'sen' + str(i) + '.tok.en'
    filename_out = 'sen' + str(i) + '.bpe.en'
    !$FASTBPE applybpe $filename_out $filename_in bpecodes.en

print('Finished subword.')

Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from sen1.tok.en ...
Read 81 words (18 unique) from text file.
Applying BPE to sen1.tok.en ...
Modified 81 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from sen2.tok.en ...
Read 121 words (19 unique) from text file.
Applying BPE to sen2.tok.en ...
Modified 121 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from sen3.tok.en ...
Read 441 words (40 unique) from text file.
Applying BPE to sen3.tok.en ...
Modified 441 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from sen4.tok.en ...
Read 501 words (43 unique) from text file.
Applying BPE to sen4.tok.en ...
Modified 501 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from sen5.tok.en .

In [338]:
# Binarize text
# Runs fairseq-preprocess once per sample file (sen1 .. sen10), source side only,
# using the dictionaries shipped with the WMT19 en-de ensemble.

for i in  range(1, 11):
    # Prefix handed to --testpref; the BPE step wrote 'sen<i>.bpe.en', so fairseq
    # presumably appends the source-language suffix itself — TODO confirm
    src = 'sen' + str(i) + '.bpe'
    # One output directory per sample file
    destDir = 'sen' + str(i) + '_data-bin_en-de'
    !fairseq-preprocess \
        --source-lang en \
        --target-lang de \
        --testpref $src \
        --only-source \
        --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
        --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
        --destdir $destDir \
        --workers 8

print('Finished preprocessing.')

2023-10-09 18:14:39 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='sen1_data-bin_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcdict=

In [497]:
# Directory holding the four checkpoints (model1.pt .. model4.pt) passed to --path below
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"

# Generate translations
# Each binarized directory is decoded with --sampling, --beam 50 and --nbest 50;
# stdout of fairseq-generate is redirected into one log file per sample file.
# NOTE(review): the log names say 'Beam_10' although the command uses --sampling with --beam 50 — confirm the naming is intentional.

for i in  range(1, 11):
    srcDir = 'sen' + str(i) + '_data-bin_en-de'
    filename_out = 'sen' + str(i) + '_en-de.decode_Beam_10.log'
    !fairseq-generate $srcDir  \
        --task translation \
        --source-lang en \
        --target-lang de \
        --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
        --sampling \
        --beam 50 \
        --nbest 50 \
        --batch-size 4 \
        --memory-efficient-fp16 \
        --remove-bpe > $filename_out

print('Finished translation.')

# Re-run for sen8 only, identical except for --batch-size 2; this overwrites the sen8 log from the loop.
# NOTE(review): presumably the batch-size-4 run did not complete for sen8 — confirm.
!fairseq-generate 'sen8_data-bin_en-de'  \
        --task translation \
        --source-lang en \
        --target-lang de \
        --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
        --sampling \
        --beam 50 \
        --nbest 50 \
        --batch-size 2 \
        --memory-efficient-fp16 \
        --remove-bpe > 'sen8_en-de.decode_Beam_10.log'

2023-10-24 13:30:53 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [498]:
# 'LC_ALL=C sort -V' sorts the results in natural order 

for i in  range(1, 11):
    filename_in = 'sen' + str(i) + '_en-de.decode_Beam_10.log'
    filename_out = 'sen' + str(i) + '_hyp_50.txt'
    !grep ^H $filename_in | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > $filename_out

## Extract 10 unique sentences for every source

In [499]:
# For every source, keep the first 10 distinct hypotheses (in file order) out of its 50 candidates
for i in range(1, 11):
    filename_in = 'sen' + str(i) + '_hyp_50.txt'

    # Consecutive runs of 50 lines belong to one source; an incomplete trailing run is ignored
    nbest_original = []
    temp = []
    with open(filename_in, 'r') as fin:
        for line in fin:
            temp.append(line.strip())
            if len(temp) == 50:
                nbest_original.append(temp)
                temp = []

    print(len(nbest_original))

    sent_hyp = []
    for group_idx, elem in enumerate(nbest_original):
        # Membership is tracked in a set, but the sentences are collected in a list:
        # iterating a set of strings yields an order that can change between kernel
        # runs, which made the written file non-reproducible.
        seen = set()
        unique_sent = []
        for sent in elem:
            if sent in seen:
                continue
            seen.add(sent)
            unique_sent.append(sent)
            if len(unique_sent) == 10:
                break

        if len(unique_sent) < 10:
            # A short group is still left out (as before), but now visibly: every later
            # group moves up one position in the output file.
            print('Warning: {} group {} has only {} unique sentences; skipped'.format(
                filename_in, group_idx, len(unique_sent)))
            continue

        sent_hyp.extend(unique_sent)

    print(len(sent_hyp))

    filename_out = 'sen' + str(i) + '_hyp.txt'
    with open(filename_out, 'w') as fout:
        for sent in sent_hyp:
            print(sent, end='\n', file=fout)

9
90
11
110
21
210
21
210
27
270
30
300
27
270
40
400
20
200
39
390


# Backtranslation German-English

In [500]:
# Dividing tokenized text into subword units

for i in  range(1, 11):
    filename_in = 'sen' + str(i) + '_hyp.txt'
    filename_out = 'sen' + str(i) + '.bpe.de.de'
    !$FASTBPE applybpe $filename_out $filename_in bpecodes.de
    
print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from sen1_hyp.txt ...
Read 723 words (97 unique) from text file.
Applying BPE to sen1_hyp.txt ...
Modified 723 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from sen2_hyp.txt ...
Read 1172 words (147 unique) from text file.
Applying BPE to sen2_hyp.txt ...
Modified 1172 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from sen3_hyp.txt ...
Read 4171 words (619 unique) from text file.
Applying BPE to sen3_hyp.txt ...
Modified 4171 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from sen4_hyp.txt ...
Read 5793 words (865 unique) from text file.
Applying BPE to sen4_hyp.txt ...
Modified 5793 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary

In [501]:
# Binarize text
# Runs fairseq-preprocess once per sample file (sen1 .. sen10), source side only,
# using the dictionaries shipped with the WMT19 de-en ensemble.

for i in  range(1, 11):
    # Prefix handed to --testpref; the BPE step wrote 'sen<i>.bpe.de.de', so fairseq
    # presumably appends the source-language suffix '.de' itself — TODO confirm
    src = 'sen' + str(i) + '.bpe.de'
    # One output directory per sample file
    destDir = 'sen' + str(i) + '_data-bin_de-en'
    !fairseq-preprocess \
        --source-lang de \
        --target-lang en \
        --testpref $src \
        --only-source \
        --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
        --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
        --destdir $destDir \
        --workers 8

print('Finished preprocessing.')

2023-10-24 14:04:27 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='sen1_data-bin_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', srcdict=

In [502]:
# Directory holding the four de-en checkpoints (model1.pt .. model4.pt) passed to --path below.
# This rebinds MODELS, which an earlier cell pointed at the en-de ensemble.
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"

# Generate backtranslations
# Each binarized directory is decoded with --sampling, --beam 50 and --nbest 50;
# stdout of fairseq-generate is redirected into one log file per sample file.
# NOTE(review): the log names say 'Beam_10' although the command uses --sampling with --beam 50 — confirm the naming is intentional.

for i in  range(1, 11):
    srcDir = 'sen' + str(i) + '_data-bin_de-en'
    filename_out = 'sen' + str(i) + '_de-en.decode_Beam_10_backtranslation.log'
    !fairseq-generate $srcDir  \
        --task translation \
        --source-lang de \
        --target-lang en \
        --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
        --sampling \
        --beam 50 \
        --nbest 50 \
        --batch-size 4 \
        --memory-efficient-fp16 \
        --remove-bpe > $filename_out

print('Finished translation.')

2023-10-24 14:05:18 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [503]:
# 'LC_ALL=C sort -V' sorts the results in natural order 

for i in  range(1, 11):
    filename_in = 'sen' + str(i) + '_de-en.decode_Beam_10_backtranslation.log'
    filename_out = 'sen' + str(i) + '_hyp_back_50.txt'
    !grep ^H $filename_in | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > $filename_out

## Extract 10 unique sentences for every source



In [504]:
# For every German hypothesis, keep the first 10 distinct backtranslations (in file order) out of its 50 candidates
for i in range(1, 11):
    filename_in = 'sen' + str(i) + '_hyp_back_50.txt'

    # Consecutive runs of 50 lines belong to one input sentence; an incomplete trailing run is ignored
    nbest_original = []
    temp = []
    with open(filename_in, 'r') as fin:
        for line in fin:
            temp.append(line.strip())
            if len(temp) == 50:
                nbest_original.append(temp)
                temp = []

    print(len(nbest_original))

    sent_hyp = []
    for group_idx, elem in enumerate(nbest_original):
        # Membership is tracked in a set, but the sentences are collected in a list:
        # iterating a set of strings yields an order that can change between kernel
        # runs, which made the written file non-reproducible.
        seen = set()
        unique_sent = []
        for sent in elem:
            if sent in seen:
                continue
            seen.add(sent)
            unique_sent.append(sent)
            if len(unique_sent) == 10:
                break

        if len(unique_sent) < 10:
            # A short group is still left out (as before), but now visibly: every later
            # group moves up one position in the output file.
            print('Warning: {} group {} has only {} unique sentences; skipped'.format(
                filename_in, group_idx, len(unique_sent)))
            continue

        sent_hyp.extend(unique_sent)

    print(len(sent_hyp))

    filename_out = 'sen' + str(i) + '_hyp_back.txt'
    with open(filename_out, 'w') as fout:
        for sent in sent_hyp:
            print(sent, end='\n', file=fout)

90
900
110
1100
210
2100
210
2100
270
2700
300
3000
270
2700
400
4000
200
2000
390
3900


In [505]:
# Detokenize text        
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

# English Moses detokenizer used to turn the tokenized backtranslations back into plain text
md_en = MosesDetokenizer(lang='en')

for file_no in range(1, 11):
    tokenized_path = 'sen{}_hyp_back.txt'.format(file_no)
    plain_path = 'sen{}_back.txt'.format(file_no)
    with open(tokenized_path) as fin, open(plain_path, 'w') as fout:
        for tokenized_line in fin:
            # Split on whitespace, then let Moses re-join the tokens into a sentence
            words = tokenized_line.split()
            fout.write(md_en.detokenize(words, return_str=True) + '\n')

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

## Count unique sentences

In [506]:
# Count unique sentences in source nbest list for each source sentence
def count_unique_sentences(translations_file, nbest):
    """Count the distinct sentences inside each consecutive group of `nbest` lines.

    Args:
        translations_file: path of a text file with one sentence per line.
        nbest: number of consecutive lines that form one group.

    Returns:
        A list with one integer per complete group: the number of distinct
        (whitespace-stripped) lines in that group. Lines left over after the
        last complete group are not counted.
    """
    distinct_per_group = []
    current_group = []
    with open(translations_file, 'r') as fin:
        for raw_line in fin:
            current_group.append(raw_line.strip())
            if len(current_group) != nbest:
                continue
            # Group is complete: record its number of distinct sentences and start over
            distinct_per_group.append(len(set(current_group)))
            current_group = []

    return distinct_per_group

In [507]:
# Sanity check: every entry should be 10, because the extraction step above
# keeps exactly 10 distinct hypotheses per source.

for file_no in range(1, 11):
    hyp_path = 'sen{}_hyp.txt'.format(file_no)
    distinct_counts = count_unique_sentences(hyp_path, 10)
    print(distinct_counts)

[10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


## Count unique words


In [107]:
# Count unique words in source nbest list for each source sentence of original
import spacy

def count_unique_words(translations_file, nbest, spacy_tok):
    """Count the distinct word forms inside each consecutive group of `nbest` lines.

    Args:
        translations_file: path of a text file with one sentence per line.
        nbest: number of consecutive lines that form one group.
        spacy_tok: callable applied to every sentence; it must return an
            iterable of tokens that expose a `.text` attribute (for example a
            loaded spaCy pipeline).

    Returns:
        A list with one integer per complete group: the number of distinct
        token texts over all sentences of that group. Lines left over after
        the last complete group are not counted.
    """
    # List with nbest sentences for every source
    nbest_sentences = []
    temp = []
    with open(translations_file, 'r') as fin:
        for line in fin:
            temp.append(line.strip())
            if len(temp) == nbest:
                nbest_sentences.append(temp)
                temp = []

    unique_words = []
    for source_nbest in nbest_sentences:
        words = set()
        for sent in source_nbest:
            # Use the tokenizer that was passed in; the previous code called an
            # unrelated global `sp` and ignored this argument.
            for token in spacy_tok(sent):
                words.add(token.text)
        unique_words.append(len(words))

    return unique_words

In [108]:
# German spaCy pipeline, used to tokenize the translations
sp_de = spacy.load('de_core_news_sm')

# Distinct-word counts per group of 10 translations, one printed list per sample file
for file_no in range(1, 11):
    hyp_path = 'sen{}_hyp.txt'.format(file_no)
    word_counts = count_unique_words(hyp_path, 10, sp_de)
    print(word_counts)

[16, 16, 14, 15, 15, 16, 10, 11]
[21, 18, 20, 18, 15, 14, 18, 20, 22, 22]
[28, 28, 27, 24, 30, 28, 27, 28, 26, 26, 28, 25, 29, 25, 28, 25, 25, 30, 27]
[30, 34, 33, 35, 29, 34, 34, 32, 32, 35, 31, 33, 33, 35, 34, 32, 36, 34]
[26, 29, 28, 29, 27, 31, 27, 30, 32, 28, 30, 31, 30, 31, 27, 32, 30, 30, 28, 28, 28, 29, 30]
[28, 23, 28, 26, 20, 27, 29, 29, 29, 29, 21, 30, 27, 26, 28, 30, 29, 25, 28, 24, 23, 29, 25, 26, 30, 29, 25]
[38, 34, 35, 34, 39, 36, 35, 36, 37, 37, 37, 34, 35, 37, 36, 37, 35, 37, 37, 36, 36, 37, 35, 34, 32]
[23, 23, 23, 22, 24, 26, 22, 23, 23, 21, 24, 23, 21, 22]
[33, 27, 28, 26, 28, 30, 31, 29, 28, 27, 26, 27, 26, 28, 29, 29, 28, 30, 30]
[38, 38, 38, 38, 37, 38, 36, 40, 40, 40, 38, 41, 37, 37, 40, 38, 39, 37, 37, 37, 39, 44, 40, 38, 39, 35, 40, 39, 39, 38, 38, 39, 40]


# Statistics on backtranslations

## Count unique sentences


In [509]:
# Original source sentences, each stored as the list of its space-separated tokens
with open('samples_tok.txt', 'r') as fin:
    source = [line.strip().split(" ") for line in fin]

# For every sample file: distinct-sentence counts per group of 100 backtranslations,
# printed and also written to 'unique_5th.txt'
with open('unique_5th.txt', 'w') as fout:
    for file_no in range(1, 11):
        back_path = 'sen{}_hyp_back.txt'.format(file_no)
        unique = count_unique_sentences(back_path, 100)
        print(unique)
        source_tokens = source[file_no - 1]
        print(source_tokens)
        # First group with the fewest distinct backtranslations, and the source token at that index
        lowest = min(unique)
        idx = unique.index(lowest)
        print((lowest, idx, source_tokens[idx]), end='\n\n')
        fout.write(str(unique) + '\n')  # write to file

[89, 95, 99, 99, 85, 81, 77, 89, 91]
['So', 'now', 'Thomson', 'becomes', 'the', 'more', 'likely', 'suspect', '.']
(77, 6, 'likely')

[99, 95, 90, 96, 97, 98, 90, 92, 88, 83, 93]
['There', 'was', 'one', 'black', 'professor', 'and', 'one', 'black', 'assistant', 'dean', '.']
(83, 9, 'dean')

[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]
['We', 'have', 'our', 'cognitive', 'biases', ',', 'so', 'that', 'I', 'can', 'take', 'a', 'perfect', 'history', 'on', 'a', 'patient', 'with', 'chest', 'pain', '.']
(100, 0, 'We')

[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]
['That', '&apos;s', 'the', 'officer', 'who', 'emailed', 'me', 'back', ',', 'saying', 'I', 'think', 'you', 'can', 'have', 'a', 'few', 'classes', 'with', 'us', '.']
(100, 0, 'That')

[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]
['

## Count unique words


In [None]:
# English spaCy pipeline, used to tokenize the backtranslations
sp_en = spacy.load('en_core_web_sm')

# Distinct-word counts per group of 100 backtranslations, one printed list per sample file
for file_no in range(1, 11):
    back_path = 'sen{}_hyp_back.txt'.format(file_no)
    word_counts = count_unique_words(back_path, 100, sp_en)
    print(word_counts)

## Source sentence reoccurrence

In [None]:
# Count how many of the source sentences reoccur in the backtranslation
def count_sentence_reoccurrence(source_file, backtranslations_file, nbest=100):
    """Count how often each source sentence reappears verbatim among its backtranslations.

    The backtranslation file is read as consecutive groups of `nbest` lines;
    the k-th group is compared against the k-th line of the source file.
    Comparison is exact string equality after stripping surrounding whitespace.

    Args:
        source_file: path of a text file with one source sentence per line.
        backtranslations_file: path of a text file with one backtranslation per line.
        nbest: number of consecutive backtranslation lines per source sentence
            (defaults to 100, the value that used to be hard-coded).

    Returns:
        A list with one integer per source sentence: the number of lines in
        its group that equal the source sentence.

    Raises:
        IndexError: if the backtranslation file holds fewer complete groups
            than the source file has lines.
    """
    # List with original source sentences
    with open(source_file, 'r') as fin:
        source_sentences = [line.strip() for line in fin]

    # List with nbest sentences for every source; an incomplete trailing group is ignored
    nbest_sentences = []
    temp = []
    with open(backtranslations_file, 'r') as fin:
        for line in fin:
            temp.append(line.strip())
            if len(temp) == nbest:
                nbest_sentences.append(temp)
                temp = []

    # Same exception type as the bare list-index failure this used to produce, but with context
    if len(nbest_sentences) < len(source_sentences):
        raise IndexError(
            'expected {} groups of {} backtranslations in {}, found only {}'.format(
                len(source_sentences), nbest, backtranslations_file, len(nbest_sentences)))

    return [group.count(sent) for sent, group in zip(source_sentences, nbest_sentences)]

In [None]:
# Per sample file: how many backtranslations in each group equal the source sentence exactly.
# NOTE(review): 'sen<i>.txt' is the raw input of the tokenization step, whereas
# 'sen<i>_hyp_back.txt' is the tokenized text that the detokenization step reads;
# exact matches may therefore be missed. The detokenized 'sen<i>_back.txt' looks
# like the intended counterpart — TODO confirm.
for file_no in range(1, 11):
    source_path = 'sen{}.txt'.format(file_no)
    back_path = 'sen{}_hyp_back.txt'.format(file_no)
    reoccurrences = count_sentence_reoccurrence(source_path, back_path)
    print(reoccurrences)