In [171]:
import torch
import random
import numpy as np
import re

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/BERT"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool
TERCOM = "/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE"

In [172]:
# check if we can connect to the GPU with PyTorch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))
else:
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'

Current device: GeForce GTX 1080 Ti


In [173]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/BERT


# Translation English-German

In [137]:
# Tokenization
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

mpn = MosesPunctNormalizer()
mt_en = MosesTokenizer(lang='en')
md_en = MosesDetokenizer(lang='en')

with open('en_original.txt') as fin, open('tok.en_original.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout) 

print('Finished tokenizing.')

Finished tokenizing.


In [138]:
# Dividing text into subword units

!$FASTBPE applybpe bpe.en_original.en tok.en_original.en bpecodes.en

print('Finished subword.')

Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original.en ...
Read 245 words (147 unique) from text file.
Applying BPE to tok.en_original.en ...
Modified 245 words from text file.
Finished subword.


In [139]:
# Binarize text
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_en-de \
    --workers 8

print('Finished preprocessing.')

2023-10-09 15:10:42 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcd

In [140]:
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
NBEST = 10
BEAM = 10

In [141]:
# Generate translations
# Beam search
!fairseq-generate data-bin_original_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_en-de.decode_Beam_10.log

print('Finished translation.')

2023-10-09 15:10:47 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [142]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original.txt

# Backtranslation German-English

In [143]:
# Dividing tokenized text into subword units

!$FASTBPE applybpe bpe.hyp_original.de hyp_original.txt bpecodes.de

print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original.txt ...
Read 2516 words (207 unique) from text file.
Applying BPE to hyp_original.txt ...
Modified 2516 words from text file.
Finished subword.


In [144]:
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_de-en \
    --workers 8

print('Finished preprocessing.')

2023-10-09 15:11:37 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', srcd

In [145]:
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"
NBEST = 10
BEAM = 10

In [146]:
# Generate backtranslations
!fairseq-generate data-bin_original_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_de-en.decode_Beam_10_backtranslation.log

print('Finished translation.')

2023-10-09 15:11:43 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [147]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_back.txt

In [148]:
# Detokenize text        
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

md_en = MosesDetokenizer(lang='en')

with open('hyp_original_back.txt', encoding='utf8') as fin, open('original_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

In [149]:
# List with original source sentences
source = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source.append(line.strip())
          
# List with nbest sentences for every source in original
nbest_original = []
counter = 0
temp = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
print(len(source))
print(len(nbest_original))

10
10


In [150]:
# Count unique sentences in source nbest list for each source sentence
def count_unique_sentences(nbest_sentences):
    unique_sent = []
    for source_nbest in nbest_sentences:
        num_values = len(set(source_nbest))
        #print(num_values)
        unique_sent.append(num_values)

    #print(unique_sent)
    return unique_sent 

In [151]:
# Value should be 10, because beam search generates 10 unique sentences
print(count_unique_sentences(nbest_original))

[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


# Statistics on backtranslations

In [174]:
# List with original source sentences
source = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source.append(line.strip())      
    
# List with nbest sentences for every source in original 
nbest_original = []
counter = 0
temp = []
with open('original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
print(len(nbest_original))

10


In [175]:
print(count_unique_sentences(nbest_original))

[32, 63, 68, 61, 26, 67, 70, 48, 33, 48]


## Average results from masking

In [176]:
unique_1st = []
with open('unique_1st.txt', 'r') as fin:
    for line in fin:
        unique_1st.append([int(i) for i in line.strip().split(",")])    
        
print(unique_1st)

unique_2nd = []
with open('unique_2nd.txt', 'r') as fin:
    for line in fin:
        unique_2nd.append([int(i) for i in line.strip().split(",")])    
        
print(unique_2nd)

unique_3rd = []
with open('unique_3rd.txt', 'r') as fin:
    for line in fin:
        unique_3rd.append([int(i) for i in line.strip().split(",")])    
        
print(unique_3rd)

unique_4th = []
with open('unique_3rd.txt', 'r') as fin:
    for line in fin:
        unique_4th.append([int(i) for i in line.strip().split(",")])    
        
print(unique_4th)

unique_5th = []
with open('unique_3rd.txt', 'r') as fin:
    for line in fin:
        unique_5th.append([int(i) for i in line.strip().split(",")])    
        
print(unique_5th)


[[30, 66, 36, 33, 32, 20, 32, 36, 32], [54, 64, 64, 64, 79, 64, 64, 64, 64, 75, 63], [69, 70, 66, 83, 74, 68, 68, 68, 79, 68, 66, 68, 61, 38, 51, 68, 68, 68, 64, 74, 68], [53, 49, 40, 54, 58, 44, 58, 51, 57, 39, 69, 63, 57, 32, 61, 57, 57, 40, 57, 55, 57], [44, 26, 26, 28, 26, 26, 26, 26, 26, 29, 26, 22, 26, 26, 26, 26, 36, 26, 45, 26, 49, 26, 27, 45, 26, 67, 26], [44, 70, 70, 43, 54, 75, 59, 70, 70, 56, 70, 71, 70, 72, 67, 68, 70, 73, 70, 70, 70, 47, 59, 67, 66, 67, 72, 67, 67, 67], [70, 70, 66, 68, 70, 70, 67, 70, 51, 59, 70, 56, 56, 70, 70, 62, 70, 70, 73, 70, 70, 70, 70, 90, 60, 48, 70], [47, 48, 48, 47, 48, 60, 46, 51, 33, 50, 48, 66, 48, 48, 48], [37, 53, 63, 48, 31, 46, 31, 31, 39, 27, 31, 39, 32, 31, 31, 36, 35, 24, 19, 31], [44, 35, 35, 35, 35, 65, 35, 35, 35, 74, 56, 35, 36, 63, 48, 69, 35, 35, 35, 35, 35, 52, 35, 47, 73, 35, 35, 35, 38, 32, 35, 44, 37, 28, 35, 28, 39, 29, 34]]
[[39, 53, 38, 35, 50, 32, 29, 32, 32], [58, 70, 64, 64, 64, 74, 64, 63, 72, 48, 74], [77, 89, 70, 8

In [177]:
unique_average = []
for i in range(0, len(source)):
    #print(list(zip(unique_1st[i], unique_2nd[i], unique_3rd[i], unique_4th[i], unique_5th[i])))
    average = map(lambda x: (x[0] + x[1] + x[2] + x[3] + x[4])/5, zip(unique_1st[i], unique_2nd[i], unique_3rd[i], unique_4th[i], unique_5th[i]))
    unique_average.append(list(average))

print(unique_average)

[[38.4, 55.0, 46.0, 30.4, 49.4, 27.2, 24.8, 43.0, 30.2], [54.8, 68.2, 65.2, 65.2, 71.8, 72.0, 63.4, 60.8, 61.4, 63.0, 71.2], [70.0, 68.4, 68.6, 80.8, 68.6, 70.4, 65.4, 64.0, 75.0, 72.2, 65.4, 79.0, 69.4, 71.0, 63.6, 76.6, 73.2, 70.0, 78.4, 69.2, 74.2], [64.8, 54.8, 53.8, 49.0, 56.8, 56.0, 52.8, 59.6, 57.6, 45.2, 56.4, 58.4, 50.8, 52.8, 55.0, 59.8, 56.6, 36.2, 55.8, 55.8, 49.8], [38.2, 35.2, 27.8, 27.6, 26.8, 32.8, 26.8, 42.4, 28.4, 41.6, 26.8, 25.8, 34.0, 24.4, 39.6, 31.0, 37.8, 30.4, 43.8, 32.0, 40.8, 36.4, 24.8, 40.4, 47.4, 50.2, 33.0], [64.8, 62.8, 72.2, 52.0, 65.6, 60.6, 67.4, 58.8, 68.2, 65.6, 59.6, 58.2, 69.6, 67.4, 69.4, 49.8, 57.4, 65.6, 68.2, 72.4, 56.0, 67.4, 67.8, 59.8, 76.4, 68.6, 72.2, 61.2, 52.6, 60.0], [62.8, 66.2, 68.6, 64.8, 67.4, 65.2, 73.6, 55.6, 61.6, 64.0, 64.8, 55.2, 69.8, 71.2, 65.0, 54.0, 67.4, 78.6, 56.8, 73.0, 92.8, 75.0, 82.2, 77.8, 67.8, 69.2, 83.8], [44.8, 55.6, 53.4, 58.2, 41.2, 50.0, 68.6, 44.6, 43.4, 54.4, 44.0, 60.4, 44.2, 50.0, 54.8], [31.4, 44.4, 45.2

In [178]:
import statistics

unique_median = []
for i in range(0, len(source)):
    med = map(lambda x: statistics.median([x[0], x[1], x[2] + x[3] + x[4]]), zip(unique_1st[i], unique_2nd[i], unique_3rd[i], unique_4th[i], unique_5th[i]))
    unique_median.append(list(med))

print(unique_median)

[[39, 66, 38, 35, 50, 32, 32, 36, 32], [58, 70, 64, 64, 79, 74, 64, 64, 72, 75, 74], [77, 89, 70, 84, 77, 95, 68, 72, 79, 77, 66, 84, 61, 53, 63, 75, 76, 69, 91, 74, 78], [64, 54, 58, 54, 58, 47, 58, 58, 57, 46, 69, 63, 57, 61, 61, 68, 57, 40, 57, 55, 57], [44, 39, 26, 28, 36, 27, 27, 26, 38, 29, 27, 26, 27, 27, 37, 27, 36, 26, 45, 32, 49, 36, 27, 45, 28, 67, 28], [76, 70, 75, 70, 70, 75, 80, 70, 70, 71, 70, 71, 70, 72, 70, 70, 70, 73, 70, 70, 70, 68, 70, 67, 66, 67, 72, 67, 70, 67], [70, 70, 73, 68, 70, 70, 82, 70, 65, 69, 83, 76, 56, 70, 70, 62, 78, 71, 73, 70, 97, 71, 71, 90, 69, 70, 79], [47, 50, 66, 67, 53, 60, 48, 51, 43, 51, 48, 68, 48, 55, 55], [45, 53, 63, 48, 35, 46, 31, 37, 39, 29, 33, 39, 32, 32, 38, 36, 35, 24, 19, 31], [53, 35, 35, 44, 35, 65, 36, 41, 50, 74, 56, 53, 48, 63, 48, 69, 44, 90, 64, 60, 52, 52, 41, 64, 73, 40, 35, 40, 38, 45, 35, 44, 37, 35, 35, 35, 39, 29, 51]]


## Subtract original sentences value from average

In [179]:
unique_original = count_unique_sentences(nbest_original)

print(unique_original)

[32, 63, 68, 61, 26, 67, 70, 48, 33, 48]


In [180]:
# List with original source sentences
samples = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        samples.append(line.strip().split(" "))

unique_result = []
for c in range(0, len(unique_original)):
    subtracted = [round(abs(el - unique_original[c]), 2) for el in unique_average[c]] # calculate the absolute value
    print(subtracted)
    unique_result.append(subtracted)
    print(samples[c])
    idx = subtracted.index(max(subtracted)) # extracting the largest difference
    
    #Extract replacement words
    with open('1st/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    with open('2nd/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    with open('3rd/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    with open('4th/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    with open('5th/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    print((max(subtracted), idx, samples[c][idx]), end='\n\n')

[6.4, 23.0, 14.0, 1.6, 17.4, 4.8, 7.2, 11.0, 1.8]
['So', 'now', 'Thomson', 'becomes', 'the', 'more', 'likely', 'suspect', '.']
,
james
david
peter
john
(23.0, 1, 'now')

[8.2, 5.2, 2.2, 2.2, 8.8, 9.0, 0.4, 2.2, 1.6, 0.0, 8.2]
['There', 'was', 'one', 'black', 'professor', 'and', 'one', 'black', 'assistant', 'dean', '.']
and
,
.
;
&
(9.0, 5, 'and')

[2.0, 0.4, 0.6, 12.8, 0.6, 2.4, 2.6, 4.0, 7.0, 4.2, 2.6, 11.0, 1.4, 3.0, 4.4, 8.6, 5.2, 2.0, 10.4, 1.2, 6.2]
['We', 'have', 'our', 'cognitive', 'biases', ',', 'so', 'that', 'I', 'can', 'take', 'a', 'perfect', 'history', 'on', 'a', 'patient', 'with', 'chest', 'pain', '.']
own
usual
research
same
personal
(12.8, 3, 'cognitive')

[3.8, 6.2, 7.2, 12.0, 4.2, 5.0, 8.2, 1.4, 3.4, 15.8, 4.6, 2.6, 10.2, 8.2, 6.0, 1.2, 4.4, 24.8, 5.2, 5.2, 11.2]
['That', '&apos;s', 'the', 'officer', 'who', 'emailed', 'me', 'back', ',', 'saying', 'I', 'think', 'you', 'can', 'have', 'a', 'few', 'classes', 'with', 'us', '.']
words
drinks
minutes
questions
meetings
(24.8, 