In [1]:
import torch
import random
import numpy as np
import re

# One fixed seed for every random number generator used by this Python process.
# Commands launched with `!` run in separate processes and are not affected
# (the fairseq logs below report their own seed).
SEED = 1234

# Seed Python's `random`, NumPy, and PyTorch (CPU generator and all CUDA devices).
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# cuDNN settings: deterministic kernels requested, autotuning off, cuDNN itself disabled.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

# Working directory of this experiment; the notebook changes into it with `%cd $PATH`,
# and later cells refer to their input/output files by relative name.
PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/BERT"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool
# NOTE(review): presumably the directory of the TERCOM-based tooling; it is not used in the
# cells visible here -- confirm before relying on it.
TERCOM = "/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE"

In [2]:
# Pick the compute device: the current CUDA device index when PyTorch can see a GPU,
# otherwise the string 'cpu'.
if not torch.cuda.is_available():
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'
else:
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))

Current device: GeForce GTX 1080 Ti


In [3]:
# Change into the experiment directory stored in the Python variable PATH;
# the cells below read and write files relative to this directory.
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/BERT


# Translation English-German

In [4]:
# Dividing text into subword units
# fastBPE argument order: applybpe <output> <input> <codes>.
# Reads the tokenized English text tok.en_original.en, applies the BPE codes in
# bpecodes.en, and writes the result to bpe.en_original.en.

!$FASTBPE applybpe bpe.en_original.en tok.en_original.en bpecodes.en

print('Finished subword.')

Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original.en ...
Read 220 words (145 unique) from text file.
Applying BPE to tok.en_original.en ...
Modified 220 words from text file.
Finished subword.


In [5]:
# Binarize text
# Converts the BPE-encoded English test file (prefix bpe.en_original, language suffix .en)
# into fairseq's binary format under data-bin_original_en-de.
# --only-source: only the source side is processed.
# --srcdict / --tgtdict: the dictionaries that ship with the en-de model are reused.
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_en-de \
    --workers 8

print('Finished preprocessing.')

2023-10-05 11:23:16 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcd

In [6]:
# Settings for the English->German generation run below.
# Directory containing the model checkpoints (model1.pt ... model4.pt are loaded from it).
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
# Number of hypotheses kept per source sentence (passed as --nbest).
NBEST = 10
# Beam size (passed as --beam).
BEAM = 10

In [9]:
# Generate translations
# Beam search with beam size $BEAM; the $NBEST best hypotheses per sentence are output.
# The four checkpoints given to --path (colon-separated) are used together.
# Only stdout is redirected into the log file, which the next cell parses.
# NOTE(review): the log file name hard-codes "Beam_10" and does not follow a change of BEAM.
!fairseq-generate data-bin_original_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_en-de.decode_Beam_10.log

print('Finished translation.')

2023-10-05 11:23:49 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [10]:
# Extract the German hypotheses from the generation log into hyp_original.txt:
#   grep ^H            keep the lines that start with "H" (the hypothesis lines)
#   LC_ALL=C sort -V   sort the results in natural order
#   sed 's/^H-//g'     drop the leading "H-"
#   cut -f 3           keep the third tab-separated field (the hypothesis text)
#   sed 's/ @@//g'     remove any " @@" sequences that are still present
!grep ^H original_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original.txt

# Backtranslation German-English

In [11]:
# Dividing tokenized text into subword units
# fastBPE argument order: applybpe <output> <input> <codes>.
# Reads the German hypotheses in hyp_original.txt, applies the BPE codes in
# bpecodes.de, and writes the result to bpe.hyp_original.de.

!$FASTBPE applybpe bpe.hyp_original.de hyp_original.txt bpecodes.de

print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original.txt ...
Read 2235 words (198 unique) from text file.
Applying BPE to hyp_original.txt ...
Modified 2235 words from text file.
Finished subword.


In [12]:
# Binarize the BPE-encoded German hypotheses (prefix bpe.hyp_original, language suffix .de)
# for the German->English model; output goes to data-bin_original_de-en.
# --only-source: only the source side is processed.
# --srcdict / --tgtdict: the dictionaries that ship with the de-en model are reused.
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_de-en \
    --workers 8

print('Finished preprocessing.')

2023-10-05 11:25:57 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', srcd

In [13]:
# Settings for the German->English (backtranslation) run below.
# NOTE: this rebinds MODELS, NBEST and BEAM from the en-de section; after this cell
# the names refer to the de-en configuration.
# Directory containing the model checkpoints (model1.pt ... model4.pt are loaded from it).
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"
# Number of hypotheses kept per input sentence (passed as --nbest).
NBEST = 10
# Beam size (passed as --beam).
BEAM = 10

In [14]:
# Generate backtranslations
# Every German hypothesis is translated back into English with beam size $BEAM,
# keeping the $NBEST best hypotheses per input line.
# The four checkpoints given to --path (colon-separated) are used together.
# Only stdout is redirected into the log file, which the next cell parses.
# NOTE(review): the log file name hard-codes "Beam_10" and does not follow a change of BEAM.
!fairseq-generate data-bin_original_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_de-en.decode_Beam_10_backtranslation.log

print('Finished translation.')

2023-10-05 11:26:28 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [15]:
# Extract the English backtranslations from the generation log into hyp_original_back.txt:
#   grep ^H            keep the lines that start with "H" (the hypothesis lines)
#   LC_ALL=C sort -V   sort the results in natural order
#   sed 's/^H-//g'     drop the leading "H-"
#   cut -f 3           keep the third tab-separated field (the hypothesis text)
#   sed 's/ @@//g'     remove any " @@" sequences that are still present
!grep ^H original_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_back.txt

In [16]:
# Detokenize text
# The __future__ import comes first: Python only accepts it at the start of a module.
# In its previous position (after the sacremoses imports) the cell was not valid as a
# plain Python module and apparently ran only because IPython compiles the statements
# of a cell separately.
from __future__ import print_function

from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer

md_en = MosesDetokenizer(lang='en')

# Input: one tokenized English backtranslation per line.
# Output: the same sentences detokenized, one per line, in the same order.
with open('hyp_original_back.txt', encoding='utf8') as fin, open('original_back.txt', 'w', encoding='utf8') as fout:
    for line in fin:
        # split() also discards the trailing newline; print() writes one back per sentence
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, file=fout)

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

In [26]:
# English source sentences, one per line of en_original.txt.
# The encoding is given explicitly so decoding does not depend on the machine's locale
# (the detokenization cell also opens its files as utf8).
with open('en_original.txt', 'r', encoding='utf8') as fin:
    source = [line.strip() for line in fin]

# German n-best translations grouped by source sentence.
# hyp_original.txt holds the hypotheses of each source sentence on consecutive lines,
# so every run of `group_size` lines forms the n-best list of one source.
group_size = 10  # must equal the --nbest value used for the en-de generation run
with open('hyp_original.txt', 'r', encoding='utf8') as fin:
    hyp_lines = [line.strip() for line in fin]

full_groups = len(hyp_lines) // group_size
nbest_original = [
    hyp_lines[g * group_size:(g + 1) * group_size]
    for g in range(full_groups)
]

# A trailing incomplete group is still left out, but it is now reported instead of
# disappearing silently (it would mean the file and group_size do not match).
leftover = len(hyp_lines) - full_groups * group_size
if leftover:
    print('Warning: ignored', leftover, 'trailing line(s) that do not form a complete group')

print(len(source))
print(len(nbest_original))

10
10


In [27]:
def count_unique_sentences(nbest_sentences):
    """Count the distinct sentences in each n-best list.

    Args:
        nbest_sentences: iterable of n-best lists; each inner list holds the
            (hashable) sentences generated for one source sentence.

    Returns:
        A list with one int per inner list, in input order: the number of
        distinct items in that list.
    """
    return [len(set(candidates)) for candidates in nbest_sentences]

In [28]:
# Sanity check on the translations: every value should be 10, because beam search
# is expected to return 10 distinct hypotheses per source sentence.
print(count_unique_sentences(nbest_original))

[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


# Statistics on backtranslations

In [29]:
# English source sentences, one per line of en_original.txt.
# The encoding is given explicitly so decoding does not depend on the machine's locale.
with open('en_original.txt', 'r', encoding='utf8') as fin:
    source = [line.strip() for line in fin]

# Backtranslations grouped by source sentence.
# NOTE: this rebinds `nbest_original`; from here on it holds the English
# backtranslations, not the German translations.
# original_back.txt was written as utf8 by the detokenization cell, so it is read as utf8.
group_size = 100  # 10 translations per source x 10 backtranslations per translation
with open('original_back.txt', 'r', encoding='utf8') as fin:
    back_lines = [line.strip() for line in fin]

full_groups = len(back_lines) // group_size
nbest_original = [
    back_lines[g * group_size:(g + 1) * group_size]
    for g in range(full_groups)
]

# A trailing incomplete group is still left out, but it is now reported instead of
# disappearing silently (it would mean the file and group_size do not match).
leftover = len(back_lines) - full_groups * group_size
if leftover:
    print('Warning: ignored', leftover, 'trailing line(s) that do not form a complete group')

print(len(nbest_original))

10


In [30]:
# Number of distinct backtranslations among the 100 collected for each source sentence.
print(count_unique_sentences(nbest_original))

[32, 63, 68, 61, 26, 67, 70, 48, 33, 48]


## Average results from masking

In [35]:
def _read_int_rows(path):
    """Read a text file of comma-separated integers.

    Args:
        path: name of a text file in which every line is a comma-separated
            list of integers.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        ints parsed from that line.

    Raises:
        ValueError: if a field cannot be parsed by ``int`` (this includes an
            empty line).
    """
    rows = []
    with open(path, 'r') as fin:
        for line in fin:
            rows.append([int(field) for field in line.strip().split(",")])
    return rows


# The three result files are read with the same helper instead of three copies
# of the same loop; each list is printed right after it is loaded.
unique_1st = _read_int_rows('unique_1st.txt')
print(unique_1st)

unique_2nd = _read_int_rows('unique_2nd.txt')
print(unique_2nd)

unique_3rd = _read_int_rows('unique_3rd.txt')
print(unique_3rd)



[[30, 66, 36, 33, 32, 20, 32, 36, 32], [54, 64, 64, 64, 79, 64, 64, 64, 64, 75, 63], [69, 70, 66, 83, 74, 68, 68, 68, 79, 68, 66, 68, 61, 38, 51, 68, 68, 68, 64, 74, 68], [53, 49, 40, 54, 58, 44, 58, 51, 57, 39, 69, 63, 57, 32, 61, 57, 57, 40, 57, 55, 57], [44, 26, 26, 28, 26, 26, 26, 26, 26, 29, 26, 22, 26, 26, 26, 26, 36, 26, 45, 26, 49, 26, 27, 45, 26, 67, 26], [44, 70, 70, 43, 54, 75, 59, 70, 70, 56, 70, 71, 70, 72, 67, 68, 70, 73, 70, 70, 70, 47, 59, 67, 66, 67, 72, 67, 67, 67], [70, 70, 66, 68, 70, 70, 67, 70, 51, 59, 70, 56, 56, 70, 70, 62, 70, 70, 73, 70, 70, 70, 70, 90, 60, 48, 70], [47, 48, 48, 47, 48, 60, 46, 51, 33, 50, 48, 66, 48, 48, 48], [37, 53, 63, 48, 31, 46, 31, 31, 39, 27, 31, 39, 32, 31, 31, 36, 35, 24, 19, 31], [44, 35, 35, 35, 35, 65, 35, 35, 35, 74, 56, 35, 36, 63, 48, 69, 35, 35, 35, 35, 35, 52, 35, 47, 73, 35, 35, 35, 38, 32, 35, 44, 37, 28, 35, 28, 39, 29, 34]]
[[39, 53, 38, 35, 50, 32, 29, 28], [58, 70, 64, 64, 64, 74, 64, 63, 72, 28], [77, 89, 70, 84, 71, 6

In [68]:
# For every source sentence, average the three result lists position by position.
# zip() stops at the shortest of the three rows, so positions that are missing from
# any of the lists are left out of the average.
unique_average = [
    [
        (first + second + third) / 3
        for first, second, third in zip(unique_1st[row], unique_2nd[row], unique_3rd[row])
    ]
    for row in range(len(source))
]

print(unique_average)

[[36.666666666666664, 57.0, 42.0, 32.0, 45.666666666666664, 26.666666666666668, 27.333333333333332, 31.333333333333332], [55.333333333333336, 67.66666666666667, 64.66666666666667, 64.66666666666667, 71.66666666666667, 70.66666666666667, 63.666666666666664, 62.333333333333336, 64.33333333333333, 48.0], [71.33333333333333, 73.33333333333333, 68.33333333333333, 82.0, 71.66666666666667, 65.66666666666667, 66.66666666666667, 72.66666666666667, 76.0, 65.66666666666667, 77.0, 66.66666666666667, 67.33333333333333, 56.333333333333336, 68.66666666666667, 72.66666666666667, 69.33333333333333, 79.33333333333333, 78.66666666666667], [48.333333333333336, 50.333333333333336, 41.666666666666664, 44.666666666666664, 50.666666666666664, 43.333333333333336, 49.333333333333336, 50.333333333333336, 58.666666666666664, 52.666666666666664, 60.333333333333336, 56.333333333333336, 65.0, 47.333333333333336, 56.0, 44.333333333333336, 61.666666666666664, 56.666666666666664], [39.0, 27.0, 29.333333333333332, 30.66

In [74]:
import statistics

# For every source sentence, take the position-wise median of the three result lists.
# zip() stops at the shortest of the three rows, so positions that are missing from
# any of the lists are left out.
unique_median = [
    [
        statistics.median(triple)
        for triple in zip(unique_1st[row], unique_2nd[row], unique_3rd[row])
    ]
    for row in range(len(source))
]

print(unique_median)

[[39, 53, 38, 33, 50, 28, 29, 30], [54, 69, 64, 64, 72, 74, 64, 63, 64, 41], [69, 70, 69, 83, 71, 65, 68, 73, 77, 66, 81, 68, 61, 63, 75, 74, 69, 79, 83], [53, 49, 40, 45, 50, 44, 56, 51, 58, 51, 64, 61, 61, 53, 58, 43, 63, 65], [43, 26, 28, 28, 27, 26, 26, 26, 27, 27, 27, 23, 37, 27, 27, 26, 36, 26, 40, 25, 41, 28, 51], [68, 67, 72, 49, 68, 72, 66, 61, 68, 67, 60, 70, 70, 70, 70, 68, 70, 67, 63, 58, 70, 62, 61, 67, 67, 63, 59], [70, 70, 68, 67, 68, 67, 74, 65, 64, 59, 70, 66, 64, 68, 52, 63, 71, 70, 73, 97, 71, 71, 70, 70, 60], [45, 50, 51, 59, 48, 48, 48, 46, 46, 52, 46, 66, 44, 65], [37, 52, 42, 31, 35, 37, 31, 37, 36, 29, 33, 38, 32, 32, 31, 33, 35, 20, 26], [45, 48, 52, 53, 42, 60, 48, 48, 51, 62, 53, 35, 49, 63, 50, 69, 42, 64, 47, 48, 39, 51, 52, 47, 78, 62, 52, 58, 40, 36, 48, 48, 37]]


## Subtract the original sentence's value from the averages

In [50]:
# Baseline per source sentence: the number of distinct sentences in each group of
# `nbest_original` (which holds the backtranslation groups at this point).
unique_original = count_unique_sentences(nbest_original)

print(unique_original)

[32, 63, 68, 61, 26, 67, 70, 48, 33, 48]


In [80]:
# Tokenized source sentences: one list of space-separated tokens per line of
# tok.en_original.en. The encoding is given explicitly so decoding does not depend
# on the machine's locale.
with open('tok.en_original.en', 'r', encoding='utf8') as fin:
    samples = [line.strip().split(" ") for line in fin]

# For every source sentence, subtract its baseline count (unique_original) from each
# averaged value, then report the largest difference together with its position and
# the token found at that position.
# NOTE(review): this assumes position k of unique_average[c] corresponds to token k of
# samples[c]; the printed rows can be shorter than the token lists -- confirm the alignment.
unique_result = []
for c in range(len(unique_original)):
    subtracted = [el - unique_original[c] for el in unique_average[c]]
    print(subtracted)
    unique_result.append(subtracted)
    print(samples[c])
    # compute the maximum once; index() returns the first position where it occurs
    largest = max(subtracted)
    idx = subtracted.index(largest)
    print((largest, idx, samples[c][idx]), end='\n\n')

[4.666666666666664, 25.0, 10.0, 0.0, 13.666666666666664, -5.333333333333332, -4.666666666666668, -0.6666666666666679]
['So', 'now', 'Thomson', 'becomes', 'the', 'more', 'likely', 'suspect', '.']
(25.0, 1, 'now')

[-7.666666666666664, 4.666666666666671, 1.6666666666666714, 1.6666666666666714, 8.666666666666671, 7.666666666666671, 0.6666666666666643, -0.6666666666666643, 1.3333333333333286, -15.0]
['There', 'was', 'one', 'black', 'professor', 'and', 'one', 'black', 'assistant', 'dean', '.']
(8.666666666666671, 4, 'professor')

[3.3333333333333286, 5.333333333333329, 0.3333333333333286, 14.0, 3.6666666666666714, -2.3333333333333286, -1.3333333333333286, 4.666666666666671, 8.0, -2.3333333333333286, 9.0, -1.3333333333333286, -0.6666666666666714, -11.666666666666664, 0.6666666666666714, 4.666666666666671, 1.3333333333333286, 11.333333333333329, 10.666666666666671]
['We', 'have', 'our', 'cognitive', 'biases', ',', 'so', 'that', 'I', 'can', 'take', 'a', 'perfect', 'history', 'on', 'a', 'patien