In [414]:
import torch
import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool

In [415]:
# check if we can connect to the GPU with PyTorch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))
else:
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'

Current device: GeForce GTX 1080 Ti


In [416]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity


In [20]:
# Extract sentences
!cut -f3 -d'	' en.txt > en_sentences.txt

In [21]:
# Extract sentences containing 'because' and remove the second part of the clause
# 330 unique sentences in total
with open('en_sentences.txt', 'r') as fin, open('en_original.txt', 'w') as fout:
    for line in fin:
        sentence = ''
        tokens = line.split(" ")
        for token in tokens:
            if token == 'because':
                print(sentence + '.', end='\n', file=fout)
            sentence = sentence + token.replace(',', '') + ' '

In [23]:
# Modify gender ambiguous words with gender

# List with source words
words = set() # set forces uniqueness
with open('words.txt', 'r') as fin:
    for line in fin:
        words.add(line.strip())
        
with open('en_original.txt') as in_file, open('en_disambiguated.txt', 'w') as out_file: 
    for line in in_file:
        sentence = line.split(' ')
        for token in sentence:
            if (token.replace(',', '') in words): # tokens often contain ","
                token_pos = sentence.index(token)
                sentence[token_pos] = "male " + token # could also replace with "female"
        print(' '.join(sentence), end='', file=out_file)

# Translation English-German

In [24]:
# Tokenization
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

mpn = MosesPunctNormalizer()
mt_en = MosesTokenizer(lang='en')
md_en = MosesDetokenizer(lang='en')

with open('en_original.txt') as fin, open('tok.en_original.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout) 
        
with open('en_disambiguated.txt') as fin, open('tok.en_disambiguated.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished tokenizing.')

Finished tokenizing.


In [25]:
# Dividing text into subword units

!$FASTBPE applybpe bpe.en_original.en tok.en_original.en bpecodes.en
!$FASTBPE applybpe bpe.en_disambiguated.en tok.en_disambiguated.en bpecodes.en

print('Finished subword.')

Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original.en ...
Read 2733 words (470 unique) from text file.
Applying BPE to tok.en_original.en ...
Modified 2733 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_disambiguated.en ...
Read 3388 words (471 unique) from text file.
Applying BPE to tok.en_disambiguated.en ...
Modified 3388 words from text file.
Finished subword.


In [26]:
# Binarize text
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_en-de \
    --workers 8

2023-05-09 11:28:37 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcd

In [27]:
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_disambiguated \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_disambiguated_en-de \
    --workers 8

2023-05-09 11:29:02 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_disambiguated_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en',

In [28]:
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
NBEST = 10
BEAM = 10

In [30]:
# Generate N hypothesis
!fairseq-generate data-bin_original_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_en-de.decode_Beam_10.log

2023-05-09 11:30:46 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [31]:
# Generate N hypothesis
!fairseq-generate data-bin_disambiguated_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_en-de.decode_Beam_10.log

2023-05-09 11:34:54 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

# Backtranslation German-English

In [32]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original.txt
!grep ^H disambiguated_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated.txt

In [33]:
# Dividing tokenized text into subword units

!$FASTBPE applybpe bpe.hyp_original.de hyp_original.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_disambiguated.de hyp_disambiguated.txt bpecodes.de

print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original.txt ...
Read 27703 words (1212 unique) from text file.
Applying BPE to hyp_original.txt ...
Modified 27703 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_disambiguated.txt ...
Read 32952 words (1120 unique) from text file.
Applying BPE to hyp_disambiguated.txt ...
Modified 32952 words from text file.
Finished subword.


In [34]:
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_de-en \
    --workers 8

2023-05-09 11:37:00 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', srcd

In [35]:
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_disambiguated \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_disambiguated_de-en \
    --workers 8

2023-05-09 11:37:08 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_disambiguated_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de',

In [36]:
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"
NBEST = 10
BEAM = 10

In [38]:
# Generate N hypothesis
!fairseq-generate data-bin_original_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_de-en.decode_Beam_10_backtranslation.log

2023-05-09 12:09:43 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [39]:
# Generate N hypothesis
!fairseq-generate data-bin_disambiguated_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_de-en.decode_Beam_10_backtranslation.log

2023-05-09 12:21:17 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [40]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_back.txt
!grep ^H disambiguated_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated_back.txt

In [41]:
# Detokenize text        
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

md_en = MosesDetokenizer(lang='en')

with open('hyp_original_back.txt', encoding='utf8') as fin, open('original_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('hyp_disambiguated_back.txt', encoding='utf8') as fin, open('disambiguated_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

In [42]:
# List with original source sentences
source = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
# List with disambiguated source sentences
source_disambiguated = []
with open('en_disambiguated.txt', 'r') as fin:
    for line in fin:
        source_disambiguated.append(line.strip())
    
# List with nbest sentences for every source in original
nbest_original = []
counter = 0
temp = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
# List with nbest sentences for every source in disambiguated            
nbest_modified = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_modified.append(temp)
            counter = 0
            temp = []
            
print(len(source))
print(len(nbest_original))
print(len(nbest_modified))

330
330
330


## Count unique sentences

In [44]:
# Count unique sentences in source nbest list for each source sentence of original; 9.945454545454545
# Value should be 10, because beam search generates 10 unique sentences
for source_nbest in nbest_original:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)
    
#print(unique_sent)
print(sum(unique_sent)/330) # average

9.945454545454545


In [45]:
# Count unique sentences in source nbest list for each source sentence of modified; 9.954545454545455
unique_sent = []
for source_nbest in nbest_modified:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)
    
#print(unique_sent)
print(sum(unique_sent)/330) # average

9.954545454545455


## Count unique words

In [None]:
# Count unique words in source nbest list for each source sentence of original; 16.836363636363636
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for source_nbest in nbest_original:
    words = set()
    for sent in source_nbest:
        tokens = sp(sent)
        for token in tokens:
            if token.text not in stopwords:    # checking whether the word is a stop word
                words.add(token.text)
    num_values = len(words)
    unique_words.append(num_values)
    
    counter += 1
    print(counter)
    
#print(unique_words)
print(sum(unique_words)/330)

In [None]:
# Count unique words in source nbest list for each source sentence of modified; 17.64848484848485
# !!! This is normal to generate more unique words, because the disambiguated sentences have more words in total
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for source_nbest in nbest_modified:
    words = set()
    for sent in source_nbest:
        tokens = sp(sent)
        for token in tokens:
            if token.text not in stopwords:    # checking whether the word is a stop word
                words.add(token.text)
    num_values = len(words)
    unique_words.append(num_values)
    
    counter += 1
    print(counter)
    
#print(unique_words)
print(sum(unique_words)/330)

# Statistics on backtranslations

In [257]:
# List with original source sentences
source_original = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source_original.append(line.strip())
        
# List with disambiguated source sentences
source_disambiguated = []
with open('en_disambiguated.txt', 'r') as fin:
    for line in fin:
        source_disambiguated.append(line.strip())
    
# List with nbest sentences for every source in original 
nbest_original = []
counter = 0
temp = []
with open('original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
# List with nbest sentences for every source in disambiguated
nbest_disambiguated = []
with open('disambiguated_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []
            
print(len(nbest_original))
print(len(nbest_disambiguated))

330
330


## Source sentence reoccurrence

In [252]:
# Count how many times the source sentence occurs in the nbest list of original; 258
results = []
counter = 0
for sent in source_original:
    matches = 0
    for target in nbest_original[counter]: 
        if (sent == target):
            matches += 1
    results.append(matches)  
    counter += 1
    
print(sum(results)/330)
print(sum(x > 0 for x in results))

5.575757575757576
258


In [258]:
# Count how many times the source sentence occurs in the nbest list of disambiguated; 230
results = []
counter = 0
for sent in source_disambiguated:
    matches = 0
    for target in nbest_disambiguated[counter]: 
        if (sent == target):
            matches += 1
    results.append(matches)  
    counter += 1
    
print(sum(results)/330)
print(sum(x > 0 for x in results))

3.903030303030303
230


## Ambiguous source words reoccurrence


In [259]:
# List with source words
words = set() # set forces uniqueness
with open('words.txt', 'r') as fin:
    for line in fin:
        words.add(line.strip())

# Extract ambiguous words from source sentences
ambiguous_words = [] 
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break
        
print(ambiguous_words)
print(len(ambiguous_words))
        
# List with nbest sentences for every source
nbest_original = []
counter = 0
temp = []
with open('original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original.append(temp)
            counter = 0
            temp = []

nbest_disambiguated = []
with open('disambiguated_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []  

print(len(nbest_original))
print(len(nbest_modified))     

['developer', 'mechanic', 'mover', 'assistant', 'chief', 'salesperson', 'lawyer', 'cook', 'mover', 'farmer', 'CEO', 'hairdresser', 'developer', 'driver', 'auditor', 'CEO', 'guard', 'assistant', 'assistant', 'auditor', 'salesperson', 'manager', 'physician', 'laborer', 'physician', 'hairdresser', 'developer', 'farmer', 'receptionist', 'manager', 'cleaner', 'mechanic', 'writer', 'worker', 'editor', 'analyst', 'carpenter', 'cook', 'carpenter', 'cleaner', 'laborer', 'mechanic', 'mechanic', 'cook', 'farmer', 'CEO', 'librarian', 'chief', 'developer', 'nurse', 'lawyer', 'developer', 'mover', 'mover', 'worker', 'secretary', 'CEO', 'carpenter', 'sheriff', 'mechanic', 'analyst', 'assistant', 'chief', 'janitor', 'manager', 'supervisor', 'chief', 'worker', 'salesperson', 'lawyer', 'developer', 'sheriff', 'janitor', 'laborer', 'driver', 'mover', 'developer', 'janitor', 'salesperson', 'chief', 'laborer', 'guard', 'nurse', 'worker', 'laborer', 'lawyer', 'CEO', 'laborer', 'laborer', 'nurse', 'manager',

In [249]:
# Count how many times the source words occurs in the nbest list of original
results = []
counter = 0
for word in ambiguous_words:
    matches = 0
    for target in nbest_original[counter]: 
        if (word in target.split(" ")):
            matches += 1
    results.append(matches)  
    counter += 1
    
print(results)
print(sum(results)/330)
print(sum(x > 0 for x in results))

[53, 100, 17, 72, 14, 3, 56, 81, 60, 73, 16, 55, 41, 100, 49, 64, 82, 100, 100, 46, 7, 74, 0, 0, 10, 46, 87, 74, 100, 100, 60, 95, 40, 70, 86, 100, 70, 78, 74, 81, 0, 98, 96, 78, 83, 29, 98, 25, 93, 100, 78, 52, 5, 0, 73, 92, 24, 80, 93, 100, 90, 66, 27, 35, 80, 35, 4, 48, 0, 66, 82, 88, 34, 0, 100, 80, 34, 43, 5, 15, 3, 85, 90, 64, 0, 77, 7, 0, 7, 70, 94, 98, 24, 95, 100, 94, 100, 97, 0, 72, 61, 100, 99, 36, 11, 99, 79, 61, 58, 100, 56, 54, 43, 1, 3, 100, 76, 53, 11, 13, 78, 96, 12, 10, 0, 99, 91, 100, 83, 100, 90, 85, 99, 90, 45, 2, 88, 96, 8, 37, 39, 67, 91, 89, 100, 100, 6, 10, 96, 91, 37, 5, 100, 9, 17, 11, 87, 11, 42, 10, 50, 1, 62, 100, 67, 56, 17, 79, 72, 75, 0, 77, 81, 95, 30, 62, 100, 57, 100, 19, 97, 9, 68, 53, 22, 99, 1, 79, 36, 21, 79, 68, 21, 86, 86, 2, 92, 26, 32, 6, 17, 60, 47, 37, 100, 100, 84, 75, 81, 75, 94, 35, 29, 61, 100, 6, 100, 83, 100, 76, 34, 13, 39, 75, 63, 46, 97, 91, 0, 100, 100, 100, 85, 28, 16, 100, 50, 55, 62, 46, 93, 100, 59, 85, 52, 75, 94, 96, 96, 27,

In [260]:
# Count how many times the source words occurs in the nbest list of disambiguated
results = []
counter = 0
for word in ambiguous_words:
    matches = 0
    for target in nbest_disambiguated[counter]: 
        if (word in target.split(" ")):
            matches += 1
    results.append(matches)  
    counter += 1
    
print(results)
print(sum(results)/330)
print(sum(x > 0 for x in results))

[63, 100, 32, 94, 1, 5, 51, 63, 100, 75, 24, 50, 48, 100, 32, 60, 84, 100, 100, 33, 1, 68, 17, 0, 8, 40, 100, 71, 100, 93, 68, 91, 68, 80, 98, 100, 72, 65, 72, 81, 1, 100, 93, 43, 80, 24, 100, 25, 97, 61, 75, 54, 0, 16, 76, 95, 27, 75, 93, 100, 96, 70, 18, 46, 83, 37, 0, 52, 2, 58, 91, 90, 43, 0, 100, 0, 65, 47, 6, 11, 1, 87, 61, 59, 0, 63, 5, 0, 4, 39, 98, 96, 80, 100, 100, 93, 100, 99, 0, 69, 69, 96, 98, 56, 11, 99, 78, 71, 52, 100, 47, 60, 52, 0, 4, 100, 78, 60, 16, 21, 88, 97, 27, 1, 1, 91, 89, 100, 92, 100, 80, 84, 93, 88, 28, 46, 95, 96, 5, 46, 51, 91, 85, 86, 100, 100, 10, 16, 97, 89, 99, 16, 100, 32, 8, 20, 76, 10, 48, 10, 44, 6, 39, 100, 71, 58, 29, 73, 74, 91, 0, 71, 72, 99, 0, 69, 100, 38, 100, 25, 100, 8, 57, 41, 37, 100, 13, 73, 47, 42, 63, 73, 17, 76, 82, 18, 95, 38, 29, 9, 19, 87, 58, 30, 99, 100, 95, 83, 82, 85, 97, 70, 8, 73, 100, 7, 100, 85, 95, 53, 36, 15, 46, 80, 57, 50, 100, 83, 1, 100, 100, 100, 79, 17, 12, 96, 63, 72, 69, 64, 94, 100, 68, 90, 60, 68, 90, 99, 99, 

## Count unique sentences

In [66]:
# Count unique sentences in source nbest list for each source sentence of original; 46.06060606060606
unique_sent = []
for source_nbest in nbest_original:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)
    
#print(unique_sent)
print(sum(unique_sent)/330) # average

46.06060606060606


In [67]:
# Count unique sentences in source nbest list for each source sentence of modified; 51.77272727272727
unique_sent = []
for source_nbest in nbest_modified:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)
    
#print(unique_sent)
print(sum(unique_sent)/330) # average

51.77272727272727


## Count unique words

In [None]:
# Count unique words in source nbest list for each source sentence of original; 22.593939393939394
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for source_nbest in nbest_original:
    words = set()
    for sent in source_nbest:
        tokens = sp(sent)
        for token in tokens:
            if token.text not in stopwords:    # checking whether the word is a stop word
                words.add(token.text)
    num_values = len(words)
    unique_words.append(num_values)
    
    counter += 1
    print(counter)
    
#print(unique_words)
print(sum(unique_words)/330)

In [None]:
# Count unique words in source nbest list for each source sentence of modified; 22.348484848484848
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for source_nbest in nbest_modified:
    words = set()
    for sent in source_nbest:
        tokens = sp(sent)
        for token in tokens:
            if token.text not in stopwords:    # checking whether the word is a stop word
                words.add(token.text)
    num_values = len(words)
    unique_words.append(num_values)
    
    counter += 1
    print(counter)
    
#print(unique_words)
print(sum(unique_words)/330)

# Word alignement (source-translation)

- Count how many unique ambiguous words are in total in source text
- Extract the position of the first ambiguous word from each sentence

In [417]:
# List with source words
words = set() # set forces uniqueness
with open('words.txt', 'r') as fin:
    for line in fin:
        words.add(line.strip())
        
ambiguous_words = set() # set forces uniqueness
positions_ambiguous_words = []

with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.add(token)
                position = tokens.index(token)
                positions_ambiguous_words.append(position)
                break
        
print(ambiguous_words)
print(len(ambiguous_words))
print(positions_ambiguous_words)
print(len(positions_ambiguous_words))

{'developer', 'laborer', 'auditor', 'receptionist', 'driver', 'undergraduate', 'secretary', 'bartender', 'dietitian', 'practitioner', 'tailor', 'salesperson', 'editor', 'carpenter', 'mechanic', 'cook', 'writer', 'housekeeper', 'advisor', 'therapist', 'analyst', 'baker', 'guard', 'teenager', 'janitor', 'mover', 'accountant', 'physician', 'painter', 'hairdresser', 'librarian', 'examiner', 'broker', 'firefighter', 'cleaner', 'worker', 'specialist', 'assistant', 'patient', 'chief', 'programmer', 'manager', 'clerk', 'lawyer', 'CEO', 'farmer', 'counselor', 'scientist', 'supervisor', 'attendant', 'sheriff', 'customer', 'nurse'}
53
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

- Input to fast_align must be tokenized and aligned into parallel sentences. 
- Line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||)

In [418]:
# List with original source sentences
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
         
# List with nbest sentences for every source in original 
nbest_original = []
counter = 0
temp = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []

print(len(source))
print(len(nbest_original))           
        
count = 0
with open('original_source-target_en-de.txt', 'w') as fout:
    while count < 330:
        for hyp in nbest_original[count]:
            print(source[count] + ' ||| ' + hyp, end='\n', file=fout)
        count += 1

330
330


In [419]:
# List with disambiguated source sentences
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
         
# List with nbest sentences for every source in original 
nbest_disambiguated = []
counter = 0
temp = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []

print(len(source))
print(len(nbest_disambiguated))           
        
count = 0
with open('disambiguated_source-target_en-de.txt', 'w') as fout:
    while count < 330:
        for hyp in nbest_disambiguated[count]:
            print(source[count] + ' ||| ' + hyp, end='\n', file=fout)
        count += 1

330
330


## fast_align

In [None]:
!$FAST_ALIGN -i original_source-target_en-de.txt -d -o -v > original_source-target_en-de_fast-aligned.txt
!$FAST_ALIGN -i disambiguated_source-target_en-de.txt -d -o -v > disambiguated_source-target_en-de_fast-aligned.txt

In [420]:
# Extract target translated words to source words in original

import re

# List with original translations
translations_original = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        translations_original.append(line.strip())

              
lineNumber = 0
counter = 0
indices = []
with open('original_source-target_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            indices.append(int(re.search(regex, line).group(1)))
        else:
            indices.append(999)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_original:
    if (lineNumber == 10):
            lineNumber = 0
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    tokens = translation.split(' ')
    translation_index = translations_original.index(translation)
    if indices[translation_index] != 999:
        translated_ambiguous_words.add(tokens[indices[translation_index]])
    lineNumber += 1
    
#print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/329)

329
2.4711246200607904


In [421]:
# Extract target translated words to source words in disambiguated

import re

# List with original translations
translations_disambiguated = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        translations_disambiguated.append(line.strip())

              
lineNumber = 0
counter = 0
indices = []
with open('disambiguated_source-target_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            indices.append(int(re.search(regex, line).group(1)))
        else:
            indices.append(999)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_disambiguated:
    if (lineNumber == 10):
            lineNumber = 0
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    tokens = translation.split(' ')
    translation_index = translations_disambiguated.index(translation)
    if indices[translation_index] != 999:
        translated_ambiguous_words.add(tokens[indices[translation_index]])
    lineNumber += 1
    
#print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/329)

329
2.188449848024316


## awesome_align

In [137]:
# ??? How to set model correctly
# MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
!awesome-align \
    --output_file "original_source-target_en-de_awesome-aligned.txt" \
    --data_file "original_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

Downloading: 100%|██████████████████████████████| 625/625 [00:00<00:00, 656kB/s]
Downloading: 100%|███████████████████████████| 996k/996k [00:00<00:00, 1.31MB/s]
Downloading: 100%|███████████████████████████| 714M/714M [02:13<00:00, 5.34MB/s]
Loading the dataset...
Extracting: 3300it [00:17, 192.30it/s]


In [139]:
!awesome-align \
    --output_file "disambiguated_source-target_en-de_awesome-aligned.txt" \
    --data_file "disambiguated_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

Loading the dataset...
Extracting: 3300it [00:06, 513.55it/s]


In [424]:
# Extract target translated words to source words in original

import re

# List with original translations
translations_original = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        translations_original.append(line.strip())

              
lineNumber = 0
counter = 0
indices = []
with open('original_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            indices.append(int(re.search(regex, line).group(1)))
        else:
            indices.append(999)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_original:
    if (lineNumber == 10):
            lineNumber = 0
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    tokens = translation.split(' ')
    translation_index = translations_original.index(translation)
    if indices[translation_index] != 999:
        translated_ambiguous_words.add(tokens[indices[translation_index]])
    lineNumber += 1
    
#print(translations_ambiguous_words)
print(translations_ambiguous_words)

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/329)

[{'Entwickler', 'Bauunternehmer', 'Bauträger'}, {'Mechaniker'}, {'Umzugshelfer', 'Macher', 'Umzugsteilnehmer', 'Mover'}, {'Assistentin', 'Assistent'}, {'Chefin', 'Chef', 'Verwaltungschef', 'Häuptling'}, {'Verkäufer', 'Verkäuferin'}, {'Anwältin', 'Rechtsanwalt', 'Anwalt', 'Jurist', 'Rechtsanwältin'}, {'Koch', 'Köchin'}, {'Beweger', 'Umzugshelfer', 'Mover'}, {'Bauer', 'Landwirt', 'Bäuerin'}, {'Vorstandsvorsitzende', 'Chef', 'Vorstandschef', 'CEO', 'Geschäftsführerin', 'Firmenchef', 'Geschäftsführer'}, {'Frisör', 'Friseur', 'Friseurin'}, {'Bauunternehmerin', 'Bauträger', 'Bauherr', 'Entwickler', 'Bauunternehmer'}, {'Fahrer'}, {'Rechnungsprüfer', 'Prüfer', 'Wirtschaftsprüfer'}, {'Vorstandsvorsitzende', 'CEO', 'Geschäftsführer'}, {'Wärter', 'Wachmann', 'Wache'}, {'des', 'Assistent'}, {'des', 'Assistent'}, {'Rechnungsprüfer', 'Prüfer', 'Revisor', 'Kassenprüfer', 'Wirtschaftsprüfer'}, {'Verkäufer', 'Verkäuferin'}, {'Chef', 'Manager', 'Geschäftsführer'}, {'Arzt'}, {'Arbeiterin', 'Angestellte',

In [429]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0                
with open('unique-words_translations_original.txt', 'w') as fout:
    while count < 329:
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(translations_ambiguous_words[count]), end='\n', file=fout)
        count += 1

In [430]:
# Extract target translated words to source words in disambiguated

import re

# List with original translations
translations_disambiguated = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        translations_disambiguated.append(line.strip())

              
lineNumber = 0
counter = 0
indices = []
with open('disambiguated_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            indices.append(int(re.search(regex, line).group(1)))
        else:
            indices.append(999)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_disambiguated:
    if (lineNumber == 10):
            lineNumber = 0
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    tokens = translation.split(' ')
    translation_index = translations_disambiguated.index(translation)
    if indices[translation_index] != 999:
        translated_ambiguous_words.add(tokens[indices[translation_index]])
    lineNumber += 1
    
#print(translations_ambiguous_words)
print(translations_ambiguous_words)

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/329)

[{'Entwickler', 'Bauunternehmer'}, {'Mechaniker'}, {'Beweger', 'Umzugshelfer', 'Mover'}, {'Assistentin', 'Assistent'}, {'Chef', 'Chefin'}, {'Verkäufer', 'Verkäuferin'}, {'Anwältin', 'Mann', 'Rechtsanwalt', 'Anwalt', 'Jurist'}, {'Koch', 'Köchin'}, {'Beweger', 'Mover'}, {'Bauer', 'Landwirt'}, {'CEO', 'Vorstandsvorsitzende', 'Geschäftsführer'}, {'Friseur', 'Friseurin'}, {'Entwicklerin', 'Bauunternehmerin', 'Bauträger', 'Entwickler', 'Bauunternehmer'}, {'Fahrer'}, {'Auditor', 'Entwickler', 'Prüfer', 'Wirtschaftsprüfer'}, {'CEO', 'Geschäftsführer'}, {'Wärter', 'Wachmann', 'Garde', 'Wache'}, {'des', 'einer'}, {'des', 'einer'}, {'Revisor', 'Wachmann', 'Kassenprüfer', 'Prüfer'}, {'Verkäufer', 'Verkäuferin'}, {'Manager', 'Geschäftsführer'}, {'Arzt'}, {'Friseur', 'Arbeiter'}, {'Ärztin', 'Arzt'}, {'Frisör', 'Friseur', 'Friseurin'}, {'Entwickler'}, {'Bauer', 'Landwirt'}, {'Rezeptionistin', 'Empfangschef', 'Rezeptionist', 'Empfangsdame'}, {'Manager'}, {'Putzkraft', 'Putzfrau', 'Putzmann', 'Reinigun

In [438]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0                
with open('unique-words_translations_disambiguated.txt', 'w') as fout:
    while count < 329:
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(translations_ambiguous_words[count]), end='\n', file=fout)
        count += 1

# Word alignement (translation-backtranslation)

## fast_align

- Extract the position of the translated ambiguous word from each sentence

In [301]:
import re
             
lineNumber = 0
counter = 0
positions_ambiguous_words_original = []
with open('original_source-target_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            positions_ambiguous_words_original.append(int(re.search(regex, line).group(1)))
        else:
            positions_ambiguous_words_original.append(999)
        lineNumber += 1
        
#print(len(positions_ambiguous_words_original))
#print(positions_ambiguous_words_original)

In [302]:
import re
             
lineNumber = 0
counter = 0
positions_ambiguous_words_disambiguated = []
with open('disambiguated_source-target_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            positions_ambiguous_words_disambiguated.append(int(re.search(regex, line).group(1)))
        else:
            positions_ambiguous_words_disambiguated.append(999)
        lineNumber += 1
        
#print(len(positions_ambiguous_words_disambiguated))
#print(positions_ambiguous_words_disambiguated)

- Input to fast_align must be tokenized and aligned into parallel sentences. 
- Line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||)

In [303]:
# List with original translated sentences
translations = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        translations.append(line.strip())
         
# List with nbest sentences for every translation in original 
nbest_original = []
counter = 0
temp = []
with open('hyp_original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []

print(len(translations))
print(len(nbest_original))          
        
count = 0
with open('original_translation-back_en-de.txt', 'w') as fout:
    while count < 3300:
        for hyp in nbest_original[count]:
            print(translations[count] + ' ||| ' + hyp, end='\n', file=fout)
        count += 1

3300
3300


In [304]:
# List with original translated sentences
translations = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        translations.append(line.strip())
         
# List with nbest sentences for every translation in original 
nbest_disambiguated = []
counter = 0
temp = []
with open('hyp_disambiguated_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []

print(len(translations))
print(len(nbest_disambiguated))           
        
count = 0
with open('disambiguated_translation-back_en-de.txt', 'w') as fout:
    while count < 3300:
        for hyp in nbest_disambiguated[count]:
            print(translations[count] + ' ||| ' + hyp, end='\n', file=fout)
        count += 1

3300
3300


- Word alignement

In [None]:
!$FAST_ALIGN -i original_translation-back_en-de.txt -d -o -v > original_translation-back_en-de_fast-aligned.txt
!$FAST_ALIGN -i disambiguated_translation-back_en-de.txt -d -o -v > disambiguated_translation-back_en-de_fast-aligned.txt

- Extract target backtranslated words

In [321]:
import re

# List with original backtranslations
backtranslations_original = []
with open('hyp_original_back.txt', 'r') as fin:
    for line in fin:
        backtranslations_original.append(line.strip())
             
lineNumber = 0
counter = 0
indices = []
with open('original_translation-back_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words_original[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            indices.append(int(re.search(regex, line).group(1)))
        else:
            indices.append(999)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
backtranslated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations_original:
    if (lineNumber == 10):
            lineNumber = 0
            backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()
    tokens = backtranslation.split(' ')
    backtranslation_index = backtranslations_original.index(backtranslation)
    if indices[backtranslation_index] != 999:
        backtranslated_ambiguous_words.add(tokens[indices[translation_index]])
    lineNumber += 1
    
#print(backtranslations_ambiguous_words)
print(len(backtranslations_ambiguous_words))

# Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
backtranslations_ambiguous_words_reduced = []
backtranslated_ambiguous_words = set() # set forces uniqueness
counter = 0
for set_words in backtranslations_ambiguous_words:
    if (counter == 10):
        counter = 0
        backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
        backtranslated_ambiguous_words = set()
    backtranslated_ambiguous_words.update(set_words)
    counter += 1

print(len(backtranslations_ambiguous_words_reduced))   
    
unique_backtranslations = 0
for set_words in backtranslations_ambiguous_words_reduced:
    unique_backtranslations += len(set_words)
print(unique_backtranslations/329)

3299
329
5.553191489361702


In [322]:
import re

# List with disambiguated backtranslations
backtranslations_disambiguated = []
with open('hyp_disambiguated_back.txt', 'r') as fin:
    for line in fin:
        backtranslations_disambiguated.append(line.strip())
             
lineNumber = 0
counter = 0
indices = []
with open('disambiguated_translation-back_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words_disambiguated[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            indices.append(int(re.search(regex, line).group(1)))
        else:
            indices.append(999)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
backtranslated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations_disambiguated:
    if (lineNumber == 10):
            lineNumber = 0
            backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()
    tokens = backtranslation.split(' ')
    backtranslation_index = backtranslations_disambiguated.index(backtranslation)
    if indices[backtranslation_index] != 999:
        backtranslated_ambiguous_words.add(tokens[indices[translation_index]])
    lineNumber += 1
    
#print(backtranslations_ambiguous_words)
print(len(backtranslations_ambiguous_words))

# Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
backtranslations_ambiguous_words_reduced = []
backtranslated_ambiguous_words = set() # set forces uniqueness
counter = 0
for set_words in backtranslations_ambiguous_words:
    if (counter == 10):
        counter = 0
        backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
        backtranslated_ambiguous_words = set()
    backtranslated_ambiguous_words.update(set_words)
    counter += 1

print(len(backtranslations_ambiguous_words_reduced))   
    
unique_backtranslations = 0
for set_words in backtranslations_ambiguous_words_reduced:
    unique_backtranslations += len(set_words)
print(unique_backtranslations/329)

3299
329
6.27355623100304


## awesome_align

- Extract the position of the translated ambiguous word from each sentence

In [308]:
import re
             
lineNumber = 0
counter = 0
positions_ambiguous_words_original = []
with open('original_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            positions_ambiguous_words_original.append(int(re.search(regex, line).group(1)))
        else:
            positions_ambiguous_words_original.append(999)
        lineNumber += 1
        
#print(len(positions_ambiguous_words_original))
#print(positions_ambiguous_words_original)

In [309]:
import re
             
lineNumber = 0
counter = 0
positions_ambiguous_words_disambiguated = []
with open('disambiguated_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            positions_ambiguous_words_disambiguated.append(int(re.search(regex, line).group(1)))
        else:
            positions_ambiguous_words_disambiguated.append(999)
        lineNumber += 1
        
#print(len(positions_ambiguous_words_disambiguated))
#print(positions_ambiguous_words_disambiguated)

- Word alignement

In [312]:
# ??? How to set model correctly
# MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
!awesome-align \
    --output_file "original_translation-back_en-de_awesome-aligned.txt" \
    --data_file "original_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

Loading the dataset...
Extracting: 33000it [00:39, 828.12it/s]


In [313]:
!awesome-align \
    --output_file "disambiguated_translation-back_en-de_awesome-aligned.txt" \
    --data_file "disambiguated_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

Loading the dataset...
Extracting: 33000it [00:47, 699.58it/s]


- Extract target backtranslated words

In [432]:
import re

# List with original backtranslations
backtranslations_original = []
with open('hyp_original_back.txt', 'r') as fin:
    for line in fin:
        backtranslations_original.append(line.strip())
             
lineNumber = 0
counter = 0
indices = []
with open('original_translation-back_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words_original[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            indices.append(int(re.search(regex, line).group(1)))
        else:
            indices.append(999)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
backtranslated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations_original:
    if (lineNumber == 10):
            lineNumber = 0
            backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()
    tokens = backtranslation.split(' ')
    backtranslation_index = backtranslations_original.index(backtranslation)
    if indices[backtranslation_index] != 999:
        backtranslated_ambiguous_words.add(tokens[indices[translation_index]])
    lineNumber += 1
    
#print(backtranslations_ambiguous_words)
print(len(backtranslations_ambiguous_words))

# Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
backtranslations_ambiguous_words_reduced = []
backtranslated_ambiguous_words = set() # set forces uniqueness
counter = 0
for set_words in backtranslations_ambiguous_words:
    if (counter == 10):
        counter = 0
        backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
        backtranslated_ambiguous_words = set()
    backtranslated_ambiguous_words.update(set_words)
    counter += 1

print(len(backtranslations_ambiguous_words_reduced))   
    
unique_backtranslations = 0
for set_words in backtranslations_ambiguous_words_reduced:
    unique_backtranslations += len(set_words)
print(unique_backtranslations/329)

3299
329
5.620060790273556


In [439]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0                
with open('unique-words_backtranslations_original.txt', 'w') as fout:
    while count < 329:
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(backtranslations_ambiguous_words_reduced[count]), end='\n', file=fout)
        count += 1

In [435]:
import re

# List with disambiguated backtranslations
backtranslations_disambiguated = []
with open('hyp_disambiguated_back.txt', 'r') as fin:
    for line in fin:
        backtranslations_disambiguated.append(line.strip())
             
lineNumber = 0
counter = 0
indices = []
with open('disambiguated_translation-back_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words_disambiguated[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.search(regex, line): 
            indices.append(int(re.search(regex, line).group(1)))
        else:
            indices.append(999)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
backtranslated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations_disambiguated:
    if (lineNumber == 10):
            lineNumber = 0
            backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()
    tokens = backtranslation.split(' ')
    backtranslation_index = backtranslations_disambiguated.index(backtranslation)
    if indices[backtranslation_index] != 999:
        backtranslated_ambiguous_words.add(tokens[indices[translation_index]])
    lineNumber += 1
    
#print(backtranslations_ambiguous_words)
print(len(backtranslations_ambiguous_words))

# Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
backtranslations_ambiguous_words_reduced = []
backtranslated_ambiguous_words = set() # set forces uniqueness
counter = 0
for set_words in backtranslations_ambiguous_words:
    if (counter == 10):
        counter = 0
        backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
        backtranslated_ambiguous_words = set()
    backtranslated_ambiguous_words.update(set_words)
    counter += 1

print(len(backtranslations_ambiguous_words_reduced))   
    
unique_backtranslations = 0
for set_words in backtranslations_ambiguous_words_reduced:
    unique_backtranslations += len(set_words)
print(unique_backtranslations/329)

3299
329
6.291793313069909


In [440]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0                
with open('unique-words_backtranslations_disambiguated.txt', 'w') as fout:
    while count < 329:
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(backtranslations_ambiguous_words_reduced[count]), end='\n', file=fout)
        count += 1

# Word alignement (translation-translation)

## Tercom alignement (borrowed from Tu)
- https://github.com/TuAnh23/Perturbation-basedQE/blob/master/align_and_analyse_ambiguous_trans.py#L54-L92

### Extract target translated words to source words in original

In [369]:
# List with original source sentences; output 100 times to match backtranslation size
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        for i in range(100): # append the source sentence 100 times to match backtranslations later
            source.append(line.strip().split()) # split() tokenizes the sentence, because tercom expects tokens     

print(len(source))

# List with original backtranslations
backtranslations = []
with open('hyp_original_back.txt', 'r') as fin:
    for line in fin:
        backtranslations.append(line.strip().split())
        
print(len(backtranslations))

33000
33000


In [352]:
!git clone https://github.com/TuAnh23/Perturbation-basedQE.git

fatal: destination path 'Perturbation-basedQE' already exists and is not an empty directory.


In [370]:
%cd ./Perturbation-basedQE

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity/Perturbation-basedQE


In [None]:
import align_and_analyse_ambiguous_trans as tercom

alignments = tercom.tercom_alignment(source, backtranslations)

print(alignments)

In [405]:
import pandas as pd

#print(positions_ambiguous_words)

lineNumber = 0
counter = 0
indices = []
for align in alignments:
    if (lineNumber == 100):
        lineNumber = 0
        counter += 1
    position = positions_ambiguous_words[counter] # exact position of ambiguous word
    indices.append([item[1] for item in (item for item in align if not(pd.isna(item[0]))) if item[0] == position][0])
    lineNumber += 1

print(len(indices))

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations:
    if (lineNumber == 100):
        translations_ambiguous_words.append(translated_ambiguous_words)
        lineNumber = 0
        translated_ambiguous_words = set()
    backtranslation_index = backtranslations.index(backtranslation)
    if not(pd.isna(indices[backtranslation_index])):
        translated_ambiguous_words.add(backtranslation[indices[backtranslation_index]])
    lineNumber += 1
    
print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/329)


33000
[{'developer', 'contractor', 'designer', 'was', 'builder', 'developers'}, {'mechanic'}, {'creator', 'host', 'producer', 'doer', 'mover', 'maker', 'owner'}, {'Assistant', 'aide', 'scans', 'Wizard', 'assistant', 'wizard'}, {'manager', 'chieftain', 'head', 'boss', 'administration', 'administrator', 'director', 'Chief', 'hostess', 'left', 'chef', 'chief'}, {'saleswoman', 'vendor', 'shop', 'salesman', 'seller', 'salesperson', 'assistant'}, {'barrister', 'attorney', 'lawyer'}, {'chef', 'cook'}, {'man', 'assistant', 'worker', 'pageboy', 'mover'}, {'woman', 'peasant', 'farmer'}, {'manager', 'board', 'executive', 'head', 'president', 'boss', 'CEO', 'was', 'director', 'member', 'chairman', 'chef', 'chief'}, {'friend', 'hairdresser', 'barber', 'hairstylist', 'friends'}, {'developer', 'to', 'contractor', 'was', 'builder', 'owner', 'has', 'client', 'went'}, {'driver'}, {'auditor', 'examiner', 'inspector', 'investigator', 'auditors', 'accountant'}, {'manager', 'general', 'executive', 'CEO', 'd

### Extract target translated words to source words in disambiguated

In [407]:
%cd ..

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity


In [408]:
# List with original source sentences; output 100 times to match backtranslation size
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        for i in range(100): # append the source sentence 100 times to match backtranslations later
            source.append(line.strip().split()) # split() tokenizes the sentence, because tercom expects tokens     

print(len(source))

# List with original backtranslations
backtranslations = []
with open('hyp_disambiguated_back.txt', 'r') as fin:
    for line in fin:
        backtranslations.append(line.strip().split())
        
print(len(backtranslations))

33000
33000


In [409]:
%cd ./Perturbation-basedQE

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity/Perturbation-basedQE


In [410]:
import align_and_analyse_ambiguous_trans as tercom

alignments = tercom.tercom_alignment(source, backtranslations)

print(alignments)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [412]:
import pandas as pd

#print(positions_ambiguous_words)

lineNumber = 0
counter = 0
indices = []
for align in alignments:
    if (lineNumber == 100):
        lineNumber = 0
        counter += 1
    position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; skip gender word
    indices.append([item[1] for item in (item for item in align if not(pd.isna(item[0]))) if item[0] == position][0])
    lineNumber += 1

print(len(indices))

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations:
    if (lineNumber == 100):
        translations_ambiguous_words.append(translated_ambiguous_words)
        lineNumber = 0
        translated_ambiguous_words = set()
    backtranslation_index = backtranslations.index(backtranslation)
    if not(pd.isna(indices[backtranslation_index])):
        translated_ambiguous_words.add(backtranslation[indices[backtranslation_index]])
    lineNumber += 1
    
print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/329)


33000
[{'developer', 'engineer', 'contractor', 'designer', 'was', 'builder', 'constructor', 'developers'}, {'handed', 'mechanic'}, {'mover'}, {'assistant', 'wizard'}, {'manager', 'tip', 'boss', 'head', 'tipped', 'chef', 'chief'}, {'saleswoman', 'vendor', 'librarian', 'salesman', 'seller', 'salesperson', 'assistant'}, {'barrister', 'man', 'lawyer', 'attorney', 'was'}, {'was', 'has', 'chef', 'cook'}, {'mover'}, {'peasant', 'farmer'}, {'manager', 'board', 'helping', 'executive', 'CEO', 'was', 'director', 'member', 'chairman'}, {'friend', 'hairdresser', 'became', 'barber', 'hairstylist', 'friends', 'made'}, {'developer', 'to', 'contractor', 'designer', 'was', 'builder', 'went'}, {'driver'}, {'Auditor', 'auditor', 'accountant', 'prover', 'reviewer', 'tester', 'inspector', 'investigator', 'auditors', 'examiner', 'verifier'}, {'manager', 'director', 'executive', 'CEO'}, {'warden', 'Guard', 'guard', 'attendant', 'guards'}, {'assistant', 'from', 'recommendation'}, {'assistant', 'from', 'recomme