In [1]:
import torch
import random
import numpy as np
import re

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity/Masking"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool
TERCOM = "/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE"

In [2]:
# check if we can connect to the GPU with PyTorch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))
else:
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'

Current device: GeForce GTX 1080 Ti


In [3]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity/Masking


# Preprocessing

In [10]:
# Modify sentences

# List with source words
words = set() # set forces uniqueness
with open('words.txt', 'r') as fin:
    for line in fin:
        words.add(line.strip())
                
# Mask gender ambiguous words with most common words
with open('en_original.txt') as in_file, open('en_original_man.txt', 'w') as out_file: 
    for line in in_file:
        sentence = line.split(' ')
        for token in sentence:
            if (token.replace(',', '') in words or token == 'construction'): # tokens often contain ","
                token_pos = sentence.index(token)
                sentence[token_pos] = "man"
                if sentence[token_pos + 1] == 'worker':
                    sentence.remove(sentence[token_pos + 1])
                break
        print(' '.join(sentence), end='', file=out_file)

with open('en_original.txt') as in_file, open('en_original_woman.txt', 'w') as out_file: 
    for line in in_file:
        sentence = line.split(' ')
        for token in sentence:
            if (token.replace(',', '') in words or token == 'construction'): # tokens often contain ","
                token_pos = sentence.index(token)
                sentence[token_pos] = "woman"
                if sentence[token_pos + 1] == 'worker':
                    sentence.remove(sentence[token_pos + 1])
                break
        print(' '.join(sentence), end='', file=out_file)
        
with open('en_original.txt') as in_file, open('en_original_girl.txt', 'w') as out_file: 
    for line in in_file:
        sentence = line.split(' ')
        for token in sentence:
            if (token.replace(',', '') in words or token == 'construction'): # tokens often contain ","
                token_pos = sentence.index(token)
                sentence[token_pos] = "girl"
                if sentence[token_pos + 1] == 'worker':
                    sentence.remove(sentence[token_pos + 1])
                break
        print(' '.join(sentence), end='', file=out_file)
        
with open('en_original.txt') as in_file, open('en_original_guy.txt', 'w') as out_file: 
    for line in in_file:
        sentence = line.split(' ')
        for token in sentence:
            if (token.replace(',', '') in words or token == 'construction'): # tokens often contain ","
                token_pos = sentence.index(token)
                sentence[token_pos] = "guy"
                if sentence[token_pos + 1] == 'worker':
                    sentence.remove(sentence[token_pos + 1])
                break
        print(' '.join(sentence), end='', file=out_file)
        
with open('en_original.txt') as in_file, open('en_original_boy.txt', 'w') as out_file: 
    for line in in_file:
        sentence = line.split(' ')
        for token in sentence:
            if (token.replace(',', '') in words or token == 'construction'): # tokens often contain ","
                token_pos = sentence.index(token)
                sentence[token_pos] = "boy"
                if sentence[token_pos + 1] == 'worker':
                    sentence.remove(sentence[token_pos + 1])
                break
        print(' '.join(sentence), end='', file=out_file)


# Translation English-German

In [11]:
# Tokenization
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

mpn = MosesPunctNormalizer()
mt_en = MosesTokenizer(lang='en')
md_en = MosesDetokenizer(lang='en')

with open('en_original_man.txt') as fin, open('tok.en_original_man.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout) 
        
with open('en_original_woman.txt') as fin, open('tok.en_original_woman.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('en_original_girl.txt') as fin, open('tok.en_original_girl.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('en_original_guy.txt') as fin, open('tok.en_original_guy.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('en_original_boy.txt') as fin, open('tok.en_original_boy.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished tokenizing.')

Finished tokenizing.


In [13]:
# Dividing text into subword units

!$FASTBPE applybpe bpe.en_original_man.en tok.en_original_man.en bpecodes.en
!$FASTBPE applybpe bpe.en_original_woman.en tok.en_original_woman.en bpecodes.en
!$FASTBPE applybpe bpe.en_original_girl.en tok.en_original_girl.en bpecodes.en
!$FASTBPE applybpe bpe.en_original_guy.en tok.en_original_guy.en bpecodes.en
!$FASTBPE applybpe bpe.en_original_boy.en tok.en_original_boy.en bpecodes.en

print('Finished subword.')

Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original_man.en ...
Read 2406 words (422 unique) from text file.
Applying BPE to tok.en_original_man.en ...
Modified 2406 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original_woman.en ...
Read 2406 words (422 unique) from text file.
Applying BPE to tok.en_original_woman.en ...
Modified 2406 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original_girl.en ...
Read 2406 words (422 unique) from text file.
Applying BPE to tok.en_original_girl.en ...
Modified 2406 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original_guy.en ...
Read 2406 words (422 unique) from text file.
Applying BPE to tok.en_original_guy.en ...
Modified 2406 words from text file.
Lo

In [14]:
# Binarize text
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original_man \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_man_en-de \
    --workers 8

!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original_woman \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_woman_en-de \
    --workers 8

!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original_girl \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_girl_en-de \
    --workers 8

!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original_guy \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_guy_en-de \
    --workers 8

!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original_boy \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_boy_en-de \
    --workers 8

print('Finished preprocessing.')

2023-08-21 15:11:57 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_man_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', 

In [15]:
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
NBEST = 10
BEAM = 10

In [17]:
# Generate translations
# Beam search
!fairseq-generate data-bin_original_man_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_man_en-de.decode_Beam_10.log

!fairseq-generate data-bin_original_woman_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_woman_en-de.decode_Beam_10.log

!fairseq-generate data-bin_original_girl_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_girl_en-de.decode_Beam_10.log

!fairseq-generate data-bin_original_guy_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_guy_en-de.decode_Beam_10.log

!fairseq-generate data-bin_original_boy_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_boy_en-de.decode_Beam_10.log

print('Finished translation.')

2023-08-21 15:14:28 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [18]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_man_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_man.txt
!grep ^H original_woman_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_woman.txt
!grep ^H original_girl_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_girl.txt
!grep ^H original_guy_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_guy.txt
!grep ^H original_boy_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_boy.txt


# Backtranslation German-English

In [19]:
# Dividing tokenized text into subword units

!$FASTBPE applybpe bpe.hyp_original_man.de hyp_original_man.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_original_woman.de hyp_original_woman.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_original_girl.de hyp_original_girl.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_original_guy.de hyp_original_guy.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_original_boy.de hyp_original_boy.txt bpecodes.de

print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original_man.txt ...
Read 24841 words (1026 unique) from text file.
Applying BPE to hyp_original_man.txt ...
Modified 24841 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original_woman.txt ...
Read 24948 words (1048 unique) from text file.
Applying BPE to hyp_original_woman.txt ...
Modified 24948 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original_girl.txt ...
Read 24916 words (1129 unique) from text file.
Applying BPE to hyp_original_girl.txt ...
Modified 24916 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original_guy.txt ...
Read 24654 words (822 unique) from text file.
Applying BPE to hyp_original_guy.txt ...
Modified 24654 words from text file.
Loading

In [20]:
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original_man \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_man_de-en \
    --workers 8

!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original_woman \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_woman_de-en \
    --workers 8

!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original_girl \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_girl_de-en \
    --workers 8

!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original_guy \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_guy_de-en \
    --workers 8

!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original_boy \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_boy_de-en \
    --workers 8

print('Finished preprocessing.')

2023-08-21 15:22:48 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_man_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', 

In [21]:
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"
NBEST = 10
BEAM = 10

In [23]:
# Generate backtranslations
!fairseq-generate data-bin_original_man_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_man_de-en.decode_Beam_10_backtranslation.log

!fairseq-generate data-bin_original_woman_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_woman_de-en.decode_Beam_10_backtranslation.log

!fairseq-generate data-bin_original_girl_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_girl_de-en.decode_Beam_10_backtranslation.log

!fairseq-generate data-bin_original_guy_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_guy_de-en.decode_Beam_10_backtranslation.log

!fairseq-generate data-bin_original_boy_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_boy_de-en.decode_Beam_10_backtranslation.log

print('Finished translation.')

2023-08-21 15:38:36 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [25]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_man_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_man_back.txt
!grep ^H original_woman_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_woman_back.txt
!grep ^H original_girl_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_girl_back.txt
!grep ^H original_guy_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_guy_back.txt
!grep ^H original_boy_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_boy_back.txt


In [26]:
# Detokenize text        
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

md_en = MosesDetokenizer(lang='en')

with open('hyp_original_man_back.txt', encoding='utf8') as fin, open('original_man_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('hyp_original_woman_back.txt', encoding='utf8') as fin, open('original_woman_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('hyp_original_girl_back.txt', encoding='utf8') as fin, open('original_girl_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('hyp_original_guy_back.txt', encoding='utf8') as fin, open('original_guy_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('hyp_original_boy_back.txt', encoding='utf8') as fin, open('original_boy_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

In [32]:
# List with original source sentences
source_man = []
with open('en_original_man.txt', 'r') as fin:
    for line in fin:
        source_man.append(line.strip())
        
source_woman = []
with open('en_original_woman.txt', 'r') as fin:
    for line in fin:
        source_woman.append(line.strip())
        
source_girl = []
with open('en_original_girl.txt', 'r') as fin:
    for line in fin:
        source_girl.append(line.strip())
        
source_guy = []
with open('en_original_guy.txt', 'r') as fin:
    for line in fin:
        source_guy.append(line.strip())
        
source_boy = []
with open('en_original_boy.txt', 'r') as fin:
    for line in fin:
        source_boy.append(line.strip())
    
# List with nbest sentences for every source in original
nbest_original_man = []
counter = 0
temp = []
with open('hyp_original_man.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original_man.append(temp)
            counter = 0
            temp = []
            
nbest_original_woman = []
counter = 0
temp = []
with open('hyp_original_woman.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original_woman.append(temp)
            counter = 0
            temp = []
            
nbest_original_girl = []
counter = 0
temp = []
with open('hyp_original_girl.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original_girl.append(temp)
            counter = 0
            temp = []
            
nbest_original_guy = []
counter = 0
temp = []
with open('hyp_original_guy.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original_guy.append(temp)
            counter = 0
            temp = []
            
nbest_original_boy = []
counter = 0
temp = []
with open('hyp_original_boy.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original_boy.append(temp)
            counter = 0
            temp = []
        
            
print(len(source_man))
print(len(nbest_original_man))

335
335


## Count unique sentences

In [33]:
# Count unique sentences in source nbest list for each source sentence
def count_unique_sentences(nbest_sentences):
    unique_sent = []
    for source_nbest in nbest_sentences:
        num_values = len(set(source_nbest))
        #print(num_values)
        unique_sent.append(num_values)

    #print(unique_sent)
    return sum(unique_sent)/len(nbest_sentences) # average

In [37]:
# Value should be 10, because beam search generates 10 unique sentences
print(count_unique_sentences(nbest_original_man))
print(count_unique_sentences(nbest_original_woman))
print(count_unique_sentences(nbest_original_girl))
print(count_unique_sentences(nbest_original_guy))
print(count_unique_sentences(nbest_original_boy))
print("Average: ")
print((count_unique_sentences(nbest_original_man) + count_unique_sentences(nbest_original_woman) + 
      count_unique_sentences(nbest_original_girl) + count_unique_sentences(nbest_original_guy) + count_unique_sentences(nbest_original_boy))/5)

9.976119402985075
9.967164179104477
9.973134328358208
9.985074626865671
9.976119402985075
Average: 
9.975522388059701


## Count unique words

In [38]:
# Count unique words in source nbest list for each source sentence of original
# !!! Method is slow
import spacy

def count_unique_words(nbest_sentences):
    sp = spacy.load('en_core_web_sm')
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    
    unique_words = []
    normalizer = 0 # should normalize based on total number of words, because disambiguated sentences have more words overall
    counter = 0
    for source_nbest in nbest_sentences:
        words = set()
        for sent in source_nbest:
            tokens = sp(sent)
            normalizer += len(tokens)
            for token in tokens:
                # if token.text not in stopwords:    # checking whether the word is a stop word
                    words.add(token.text)
        num_values = len(words)
        unique_words.append(num_values)

        counter += 1
        #print(counter)

    #print(unique_words)
    print('Normalizer: ' + str(normalizer/len(nbest_sentences)))
    return (sum(unique_words)/len(nbest_sentences), (sum(unique_words)/len(nbest_sentences))/(normalizer/len(nbest_sentences))) # (average, norm average)

In [39]:
print(count_unique_words(nbest_original_man))
print(count_unique_words(nbest_original_woman))
print(count_unique_words(nbest_original_girl))
print(count_unique_words(nbest_original_guy))
print(count_unique_words(nbest_original_boy))

Normalizer: 74.26865671641791
(14.797014925373134, 0.19923633440514468)
Normalizer: 74.6
(15.0, 0.20107238605898126)
Normalizer: 74.4955223880597
(15.507462686567164, 0.20816637281615644)
Normalizer: 73.65074626865672
(14.35223880597015, 0.1948688850160094)
Normalizer: 74.1731343283582
(15.364179104477612, 0.20713940759819707)


# Statistics on backtranslations

In [51]:
# List with original source sentences
source_man = []
with open('en_original_man.txt', 'r') as fin:
    for line in fin:
        source_man.append(line.strip())
        
source_woman = []
with open('en_original_woman.txt', 'r') as fin:
    for line in fin:
        source_woman.append(line.strip())
        
source_girl = []
with open('en_original_girl.txt', 'r') as fin:
    for line in fin:
        source_girl.append(line.strip())
        
source_guy = []
with open('en_original_guy.txt', 'r') as fin:
    for line in fin:
        source_guy.append(line.strip())
        
source_boy = []
with open('en_original_boy.txt', 'r') as fin:
    for line in fin:
        source_boy.append(line.strip())
    
# List with nbest sentences for every source in original
nbest_original_man = []
counter = 0
temp = []
with open('original_man_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original_man.append(temp)
            counter = 0
            temp = []
            
nbest_original_woman = []
counter = 0
temp = []
with open('original_woman_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original_woman.append(temp)
            counter = 0
            temp = []
            
nbest_original_girl = []
counter = 0
temp = []
with open('original_girl_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original_girl.append(temp)
            counter = 0
            temp = []
            
nbest_original_guy = []
counter = 0
temp = []
with open('original_guy_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original_guy.append(temp)
            counter = 0
            temp = []
            
nbest_original_boy = []
counter = 0
temp = []
with open('original_boy_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original_boy.append(temp)
            counter = 0
            temp = []
        
            
print(len(source_man))
print(len(nbest_original_man))

335
335


## Source sentence reoccurrence

In [52]:
# Count how many of the source sentences reoccur in the backtranslation
def count_sentence_reoccurrence(source_sentences, nbest_sentences):
    results = []
    counter = 0
    for sent in source_sentences:
        matches = 0
        for target in nbest_sentences[counter]: 
            if (sent == target):
                matches += 1
        results.append(matches)  
        counter += 1

    return sum(x > 0 for x in results)

In [53]:
print(count_sentence_reoccurrence(source_man, nbest_original_man))
print(count_sentence_reoccurrence(source_woman, nbest_original_woman))
print(count_sentence_reoccurrence(source_girl, nbest_original_girl))
print(count_sentence_reoccurrence(source_guy, nbest_original_guy))
print(count_sentence_reoccurrence(source_boy, nbest_original_boy))
print("Average: ")
print((count_sentence_reoccurrence(source_man, nbest_original_man) + count_sentence_reoccurrence(source_woman, nbest_original_woman) + 
      count_sentence_reoccurrence(source_girl, nbest_original_girl) + count_sentence_reoccurrence(source_guy, nbest_original_guy) +
      count_sentence_reoccurrence(source_boy, nbest_original_boy))/5)

311
306
310
302
311
Average: 
308.0


## Ambiguous source words reoccurrence


In [56]:
# Count how many of the ambiguous words reoccur in the backtranslation
def count_words_reoccurrence(ambiguous_words, nbest_sentences):
    results = []
    counter = 0
    for word in ambiguous_words:
        matches = 0
        for target in nbest_sentences[counter]: 
            if (word in target.split(" ")):
                matches += 1
        results.append(matches)  
        counter += 1

    return sum(x > 0 for x in results)

In [61]:
print(count_words_reoccurrence(["man"] * 335, nbest_original_man))
print(count_words_reoccurrence(["woman"] * 335, nbest_original_woman))
print(count_words_reoccurrence(["girl"] * 335, nbest_original_girl))
print(count_words_reoccurrence(["guy"] * 335, nbest_original_guy))
print(count_words_reoccurrence(["boy"] * 335, nbest_original_boy))
print("Average: ")
print((count_words_reoccurrence(["man"] * 335, nbest_original_man) + count_words_reoccurrence(["woman"] * 335, nbest_original_woman) +
      count_words_reoccurrence(["girl"] * 335, nbest_original_girl) + count_words_reoccurrence(["guy"] * 335, nbest_original_guy) + 
      count_words_reoccurrence(["boy"] * 335, nbest_original_boy))/5)

335
335
335
335
335
Average: 
335.0


## Count unique sentences

In [62]:
print(count_unique_sentences(nbest_original_man))
print(count_unique_sentences(nbest_original_woman))
print(count_unique_sentences(nbest_original_girl))
print(count_unique_sentences(nbest_original_guy))
print(count_unique_sentences(nbest_original_boy))
print("Average: ")
print((count_unique_sentences(nbest_original_man) + count_unique_sentences(nbest_original_woman) + 
      count_unique_sentences(nbest_original_girl) + count_unique_sentences(nbest_original_guy) + count_unique_sentences(nbest_original_boy))/5)

44.6865671641791
49.47761194029851
48.10746268656717
49.15223880597015
43.90149253731343
Average: 
47.065074626865666


## Count unique words

In [63]:
print(count_unique_words(nbest_original_man))
print(count_unique_words(nbest_original_woman))
print(count_unique_words(nbest_original_girl))
print(count_unique_words(nbest_original_guy))
print(count_unique_words(nbest_original_boy))

Normalizer: 717.4835820895522
(32.48656716417911, 0.04527848159196529)
Normalizer: 718.3492537313433
(32.28059701492537, 0.04493719015819852)
Normalizer: 730.1014925373134
(34.31044776119403, 0.04699407974356458)
Normalizer: 721.6417910447761
(31.253731343283583, 0.04330920372285419)
Normalizer: 721.534328358209
(32.06865671641791, 0.044445087996557914)


# Word alignement (source-translation)

- Input to fast_align must be tokenized and aligned into parallel sentences. 
- Line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||)

In [64]:
def build_alignment_input(sentencesN, sourceIn, targetIn, output):
    # List with original source sentences
    source = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    # List with nbest sentences for every source in original 
    target = []
    counter = 0
    temp = []
    with open(targetIn, 'r') as fin:
        for line in fin:
            temp.append(line.strip())
            counter += 1
            if (counter == 10):
                target.append(temp)
                counter = 0
                temp = []

    #print(len(source))
    #print(len(target))           

    count = 0
    with open(output, 'w') as fout:
        while count < sentencesN:
            for hyp in target[count]:
                print(source[count] + ' ||| ' + hyp, end='\n', file=fout)
            count += 1

In [65]:
build_alignment_input(335, 'tok.en_original_man.en', 'hyp_original_man.txt', 'original_man_source-target_en-de.txt')
build_alignment_input(335, 'tok.en_original_woman.en', 'hyp_original_woman.txt', 'original_woman_source-target_en-de.txt')
build_alignment_input(335, 'tok.en_original_girl.en', 'hyp_original_girl.txt', 'original_girl_source-target_en-de.txt')
build_alignment_input(335, 'tok.en_original_guy.en', 'hyp_original_guy.txt', 'original_guy_source-target_en-de.txt')
build_alignment_input(335, 'tok.en_original_boy.en', 'hyp_original_boy.txt', 'original_boy_source-target_en-de.txt')


## fast_align

In [66]:
!$FAST_ALIGN -i original_man_source-target_en-de.txt -d -o -v > original_man_source-target_en-de_fast-aligned.txt
!$FAST_ALIGN -i original_woman_source-target_en-de.txt -d -o -v > original_woman_source-target_en-de_fast-aligned.txt
!$FAST_ALIGN -i original_girl_source-target_en-de.txt -d -o -v > original_girl_source-target_en-de_fast-aligned.txt
!$FAST_ALIGN -i original_guy_source-target_en-de.txt -d -o -v > original_guy_source-target_en-de_fast-aligned.txt
!$FAST_ALIGN -i original_boy_source-target_en-de.txt -d -o -v > original_boy_source-target_en-de_fast-aligned.txt

print("Finished alignment.")

ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
...
expected target length = source length * 1.0406
ITERATION 1
...
  log_e likelihood: -514787
  log_2 likelihood: -742680
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.196931
       size counts: 62
ITERATION 2
...
  log_e likelihood: -70168.6
  log_2 likelihood: -101232
     cross entropy: 4.0752
        perplexity: 16.8561
      posterior p0: 0.0340835
 posterior al-feat: -0.157038
       size counts: 62
  1  model al-feat: -0.159127 (tension=4)
  2  model al-feat: -0.158068 (tension=4.04177)
  3  model al-feat: -0.157549 (tension=4.06236)
  4  model al-feat: -0.157293 (tension=4.07258)
  5  model al-feat: -0.157165 (tension=4.07767)
  6  model al-feat: -0.157102 (tension=4.08021)
  7  model al-feat: -0.15707 (tension=4.08147)
  8  model al-feat: -0.157054 (tension=4.08211)
     final tension: 4.08242
ITERATION 3
...
  log_e likelihood: -55944.7
  log_2 likelihood: -80711.2
     cross entro

In [68]:
import re

# Count unique translated words to the ambiguous words in translations per source sentence
def count_unique_words_alignment_translations(position, sentencesN, sourceIn, translationsIn, alignmentsIn, output):
    
    # Get positions of ambigous words
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
        
    # List with translations
    translations = []
    with open(translationsIn, 'r') as fin:
        for line in fin:
            translations.append(line.strip())
            
            
    
    # Extract alginments of ambiguous words
    lineNumber = 0
    counter = 0
    indices = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                indices.append([int(index) for index in re.findall(regex, line)])
            else:
                indices.append([999])
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = set() # set forces uniqueness
    for translation in translations:
        tokens = translation.split(' ')
        if 999 not in indices[lineNumber]:
            for ind in indices[lineNumber]:
                #print(lineNumber)
                #print(tokens[ind])
                #print(ind)
                translated_ambiguous_words.add(tokens[ind])
        lineNumber += 1
        if (lineNumber % 10 == 0):
                translations_ambiguous_words.append(translated_ambiguous_words)
                translated_ambiguous_words = set()

    #print(translations_ambiguous_words)
    #print(len(translations_ambiguous_words))
    
    # Add results to file
    ambiguous_words = []
    source = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0                
    with open(output, 'w') as fout:
        while count < sentencesN:
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(translations_ambiguous_words[count]), end='\n', file=fout)
            count += 1

    unique_translations = 0
    for set_words in translations_ambiguous_words:
        
        ############################################################
        # remove gender info; removing "in" and "e" endings in words
        set_words_new = set()
        for word in set_words:
            word_new = re.sub("in$|e$", "", word)
            #print(word_new)
            set_words_new.add(word_new)
        #print(set_words_new)
        ############################################################
        
        unique_translations += len(set_words_new)
        
    #print(unique_translations)
    return unique_translations/sentencesN # average

In [69]:
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_man.en', 'hyp_original_man.txt', 'original_man_source-target_en-de_fast-aligned.txt', 'unique-words_translations_original_man.txt'))
print('======')
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_woman.en', 'hyp_original_woman.txt', 'original_woman_source-target_en-de_fast-aligned.txt', 'unique-words_translations_original_woman.txt'))
print('======')
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_girl.en', 'hyp_original_girl.txt', 'original_girl_source-target_en-de_fast-aligned.txt', 'unique-words_translations_original_girl.txt'))
print('======')
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_guy.en', 'hyp_original_guy.txt', 'original_guy_source-target_en-de_fast-aligned.txt', 'unique-words_translations_original_guy.txt'))
print('======')
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_boy.en', 'hyp_original_boy.txt', 'original_boy_source-target_en-de_fast-aligned.txt', 'unique-words_translations_original_boy.txt'))

1.1791044776119404
1.1343283582089552
1.2776119402985076
3.665671641791045
1.8895522388059702


## awesome_align

In [71]:
# ??? How to set model correctly
# MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
!awesome-align \
    --output_file "original_man_source-target_en-de_awesome-aligned.txt" \
    --data_file "original_man_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "original_woman_source-target_en-de_awesome-aligned.txt" \
    --data_file "original_woman_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "original_girl_source-target_en-de_awesome-aligned.txt" \
    --data_file "original_girl_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "original_guy_source-target_en-de_awesome-aligned.txt" \
    --data_file "original_guy_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "original_boy_source-target_en-de_awesome-aligned.txt" \
    --data_file "original_boy_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

print("Finished alignment.")

Loading the dataset...
Extracting: 3350it [00:04, 690.02it/s]
Loading the dataset...
Extracting: 3350it [00:04, 751.00it/s]
Loading the dataset...
Extracting: 3350it [00:04, 741.40it/s]
Loading the dataset...
Extracting: 3350it [00:04, 721.75it/s]
Loading the dataset...
Extracting: 3350it [00:04, 695.15it/s]
Finished alignment.


In [72]:
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_man.en', 'hyp_original_man.txt', 'original_man_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_original_man.txt'))
print('======')
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_woman.en', 'hyp_original_woman.txt', 'original_woman_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_original_woman.txt'))
print('======')
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_girl.en', 'hyp_original_girl.txt', 'original_girl_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_original_girl.txt'))
print('======')
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_guy.en', 'hyp_original_guy.txt', 'original_guy_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_original_guy.txt'))
print('======')
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original_boy.en', 'hyp_original_boy.txt', 'original_boy_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_original_boy.txt'))

1.0537313432835822
1.0238805970149254
1.0507462686567164
3.644776119402985
1.7343283582089553


# Word alignement (translation-backtranslation)

## fast_align

- Input to fast_align must be tokenized and aligned into parallel sentences. 
- Line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||)

In [73]:
build_alignment_input(3350, 'hyp_original_man.txt', 'hyp_original_man_back.txt', 'original_man_translation-back_en-de.txt')
build_alignment_input(3350, 'hyp_original_woman.txt', 'hyp_original_woman_back.txt', 'original_woman_translation-back_en-de.txt')
build_alignment_input(3350, 'hyp_original_girl.txt', 'hyp_original_girl_back.txt', 'original_girl_translation-back_en-de.txt')
build_alignment_input(3350, 'hyp_original_guy.txt', 'hyp_original_guy_back.txt', 'original_guy_translation-back_en-de.txt')
build_alignment_input(3350, 'hyp_original_boy.txt', 'hyp_original_boy_back.txt', 'original_boy_translation-back_en-de.txt')


- Word alignement

In [74]:
!$FAST_ALIGN -i original_man_translation-back_en-de.txt -d -o -v > original_man_translation-back_en-de_fast-aligned.txt
!$FAST_ALIGN -i original_woman_translation-back_en-de.txt -d -o -v > original_woman_translation-back_en-de_fast-aligned.txt
!$FAST_ALIGN -i original_girl_translation-back_en-de.txt -d -o -v > original_girl_translation-back_en-de_fast-aligned.txt
!$FAST_ALIGN -i original_guy_translation-back_en-de.txt -d -o -v > original_guy_translation-back_en-de_fast-aligned.txt
!$FAST_ALIGN -i original_boy_translation-back_en-de.txt -d -o -v > original_boy_translation-back_en-de_fast-aligned.txt

print("Finished alignment.")

ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
.................................
expected target length = source length * 0.980197
ITERATION 1
.................................
  log_e likelihood: -4.98109e+06
  log_2 likelihood: -7.18619e+06
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.198406
       size counts: 106
ITERATION 2
.................................
  log_e likelihood: -639690
  log_2 likelihood: -922877
     cross entropy: 3.83953
        perplexity: 14.3157
      posterior p0: 0.0331482
 posterior al-feat: -0.162198
       size counts: 106
  1  model al-feat: -0.170996 (tension=4)
  2  model al-feat: -0.166283 (tension=4.17596)
  3  model al-feat: -0.164156 (tension=4.25765)
  4  model al-feat: -0.16315 (tension=4.2968)
  5  model al-feat: -0.162664 (tension=4.31583)
  6  model al-feat: -0.162427 (tension=4.32515)
  7  model al-feat: -0.162311 (tension=4.32972)
  8  model al-feat: -0.162254 (tension=4.33196)
     final tens

- Extract target backtranslated words

In [75]:
import re

# Count unique translated words to the ambiguous words in backtranslations per source sentence
def count_unique_words_alignment_backtranslations(position_word, sentencesN, sourceIn, backtranslationsIn, alignmentsIn_translation, alignmentsIn_backtranslation, output):
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position_word)
       
    lineNumber = 0
    counter = 0
    positions_ambiguous_words_translations = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn_translation, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                positions_ambiguous_words_translations.append([int(index) for index in re.findall(regex, line)])
            else:
                positions_ambiguous_words_translations.append([999])
            lineNumber += 1
    
    # List with backtranslations
    backtranslations = []
    with open(backtranslationsIn, 'r') as fin:
        for line in fin:
            backtranslations.append(line.strip())

    lineNumber = 0
    counter = 0
    indices = []
    with open(alignmentsIn_backtranslation, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            positions = positions_ambiguous_words_translations[counter] # exact positions of ambiguous words
            list_indices = []
            for position in positions:
                regex = r"" + str(position) + r"-(\d)"
                if re.findall(regex, line): 
                    list_indices.extend([int(index) for index in re.findall(regex, line)])
                else:
                    list_indices.extend([999])
            indices.append(list_indices)
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    backtranslated_ambiguous_words = set() # set forces uniqueness
    for backtranslation in backtranslations:
        tokens = backtranslation.split(' ')
        if 999 not in indices[lineNumber]:
            for ind in indices[lineNumber]:
                #print(lineNumber)
                #print(tokens[ind])
                #print(ind)
                backtranslated_ambiguous_words.add(tokens[ind])
        lineNumber += 1
        if (lineNumber % 10 == 0):
                backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
                backtranslated_ambiguous_words = set()



    #print(backtranslations_ambiguous_words)
    print(len(backtranslations_ambiguous_words))

    # Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
    backtranslations_ambiguous_words_reduced = []
    backtranslated_ambiguous_words = set() # set forces uniqueness
    counter = 0
    for set_words in backtranslations_ambiguous_words:
        backtranslated_ambiguous_words.update(set_words)
        counter += 1
        if (counter % 10 == 0):
            backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()

    print(len(backtranslations_ambiguous_words_reduced)) 
    
    # Add results to file

    ambiguous_words = []
    source = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position_word])

    count = 0                
    with open(output, 'w') as fout:
        while count < sentencesN:
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(backtranslations_ambiguous_words_reduced[count]), end='\n', file=fout)
            count += 1

    unique_backtranslations = 0
    for set_words in backtranslations_ambiguous_words_reduced:
        unique_backtranslations += len(set_words)
        
    return unique_backtranslations/sentencesN

In [76]:
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_man.en', 'hyp_original_man_back.txt', 'original_man_source-target_en-de_fast-aligned.txt', 'original_man_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_original_man.txt'))
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_woman.en', 'hyp_original_woman_back.txt', 'original_woman_source-target_en-de_fast-aligned.txt', 'original_woman_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_original_woman.txt'))
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_girl.en', 'hyp_original_girl_back.txt', 'original_girl_source-target_en-de_fast-aligned.txt', 'original_girl_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_original_girl.txt'))
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_guy.en', 'hyp_original_guy_back.txt', 'original_guy_source-target_en-de_fast-aligned.txt', 'original_guy_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_original_guy.txt'))
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_boy.en', 'hyp_original_boy_back.txt', 'original_boy_source-target_en-de_fast-aligned.txt', 'original_boy_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_original_boy.txt'))


3350
335
3.683582089552239
3350
335
4.785074626865671
3350
335
3.8
3350
335
8.435820895522388
3350
335
5.53134328358209


## awesome_align

- Extract the position of the translated ambiguous word from each sentence

- Word alignement

In [78]:
# ??? How to set model correctly
# MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
!awesome-align \
    --output_file "original_man_translation-back_en-de_awesome-aligned.txt" \
    --data_file "original_man_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "original_woman_translation-back_en-de_awesome-aligned.txt" \
    --data_file "original_woman_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "original_girl_translation-back_en-de_awesome-aligned.txt" \
    --data_file "original_girl_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "original_guy_translation-back_en-de_awesome-aligned.txt" \
    --data_file "original_guy_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "original_boy_translation-back_en-de_awesome-aligned.txt" \
    --data_file "original_boy_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

print("Finished alignment.")

Loading the dataset...
Extracting: 33500it [00:35, 956.60it/s] 
Loading the dataset...
Extracting: 33500it [00:35, 934.27it/s] 
Loading the dataset...
Extracting: 33500it [00:36, 919.39it/s] 
Loading the dataset...
Extracting: 33500it [00:40, 823.35it/s] 
Loading the dataset...
Extracting: 33500it [00:35, 952.96it/s] 
Finished alignment.


- Extract target backtranslated words

In [79]:
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_man.en', 'hyp_original_man_back.txt', 'original_man_source-target_en-de_awesome-aligned.txt', 'original_man_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_original_man.txt'))
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_woman.en', 'hyp_original_woman_back.txt', 'original_woman_source-target_en-de_awesome-aligned.txt', 'original_woman_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_original_woman.txt'))
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_girl.en', 'hyp_original_girl_back.txt', 'original_girl_source-target_en-de_awesome-aligned.txt', 'original_girl_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_original_girl.txt'))
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_guy.en', 'hyp_original_guy_back.txt', 'original_guy_source-target_en-de_awesome-aligned.txt', 'original_guy_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_original_guy.txt'))
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original_boy.en', 'hyp_original_boy_back.txt', 'original_boy_source-target_en-de_awesome-aligned.txt', 'original_boy_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_original_boy.txt'))


3350
335
3.31044776119403
3350
335
5.04179104477612
3350
335
3.1044776119402986
3350
335
7.477611940298507
3350
335
5.080597014925373


# Word alignement (translation-translation)

## Tercom alignement (borrowed from Tu)
- https://github.com/TuAnh23/Perturbation-basedQE/blob/master/align_and_analyse_ambiguous_trans.py#L54-L92

In [None]:
!git clone https://github.com/TuAnh23/Perturbation-basedQE.git

In [80]:
%cd $TERCOM

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE


In [81]:
import align_and_analyse_ambiguous_trans as tercom
import pandas as pd

def count_unique_words_tercom_alignment(position, sentencesN, sourceIn, backtranslationsIn):
    # List with source sentences; output 100 times to match backtranslation size
    source = []
    with open(PATH + "/" + sourceIn, 'r') as fin:
        for line in fin:
            for i in range(100): # append the source sentence 100 times to match backtranslations later
                source.append(line.strip().split()) # split() tokenizes the sentence, because tercom expects tokens     

    print(len(source))

    # List with original backtranslations
    backtranslations = []
    with open(PATH + "/" + backtranslationsIn, 'r') as fin:
        for line in fin:
            backtranslations.append(line.strip().split())

    print(len(backtranslations))
    
    # Generate alignments
    alignments = tercom.tercom_alignment(source, backtranslations)
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
        
    # Extract target translated words to source words
    lineNumber = 0
    counter = 0
    indices = []
    for align in alignments:
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        indices.append([item[1] for item in (item for item in align if not(pd.isna(item[0]))) if item[0] == position][0])
        lineNumber += 1
        if (lineNumber % 100 == 0):
            counter += 1

    print(len(indices))

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = set() # set forces uniqueness
    for backtranslation in backtranslations:
        backtranslation_index = backtranslations.index(backtranslation)
        if not(pd.isna(indices[backtranslation_index])):
            translated_ambiguous_words.add(backtranslation[indices[backtranslation_index]])
        lineNumber += 1
        if (lineNumber % 100 == 0):
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()

    #print(translations_ambiguous_words)
    #print(len(translations_ambiguous_words))

    unique_translations = 0
    for set_words in translations_ambiguous_words:
        unique_translations += len(set_words)
        
    return unique_translations/sentencesN

In [82]:
print(count_unique_words_tercom_alignment(1, 335, 'tok.en_original_man.en', 'hyp_original_man_back.txt'))
print(count_unique_words_tercom_alignment(1, 335, 'tok.en_original_woman.en', 'hyp_original_woman_back.txt'))
print(count_unique_words_tercom_alignment(1, 335, 'tok.en_original_girl.en', 'hyp_original_girl_back.txt'))
print(count_unique_words_tercom_alignment(1, 335, 'tok.en_original_guy.en', 'hyp_original_guy_back.txt'))
print(count_unique_words_tercom_alignment(1, 335, 'tok.en_original_boy.en', 'hyp_original_boy_back.txt'))


33500
33500
33500
4.95820895522388
33500
33500
33500
6.582089552238806
33500
33500
33500
4.116417910447761
33500
33500
33500
10.480597014925372
33500
33500
33500
6.447761194029851


In [83]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity/Masking


# Word occurrence

## Translation

In [84]:
def extract_word_translations(filename_tokenized, filename_translations, filename_out, filename_alignments):
    """
    Match alignment indices from translation to backtranslation
    """
    
    # Extract alignement indices from translation
    indices_translation = []
    with open(filename_alignments, 'r') as alignments:
        for line in alignments:
            alignement_tokens = line.split()
            indices_line = []
            for i in range(0, len(alignement_tokens)):    
                regex = r"" + str(i) + r"-(\d)"
                if re.findall(regex, line): 
                    indices_line.append([int(index) for index in re.findall(regex, line)])
                else:
                    indices_line.append([999])
            indices_translation.append(indices_line)
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    translations = []
    with open(filename_translations, 'r') as fin:
         for line in fin:
                translations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 335
    counter = 0
    for i in range(0, 335): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 9):
                alignments = indices_translation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(translations[counter + f])):
                                 words_set.add(translations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 10

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0                
    with open(filename_out, 'w') as fout:
        while count < 335:
            print(source[count] + ' | ' + str(target_words[count]), end='\n', file=fout)
            count += 1

In [85]:
def count_word_translations(filename_tokenized, filename_translations, filename_out, filename_alignments):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # Extract alignement indices from translation
    indices_translation = []
    with open(filename_alignments, 'r') as alignments:
        for line in alignments:
            alignement_tokens = line.split()
            indices_line = []
            for i in range(0, len(alignement_tokens)):    
                regex = r"" + str(i) + r"-(\d)"
                if re.findall(regex, line): 
                    indices_line.append([int(index) for index in re.findall(regex, line)])
                else:
                    indices_line.append([999])
            indices_translation.append(indices_line)
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    translations = []
    with open(filename_translations, 'r') as fin:
         for line in fin:
                translations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 335
    counter = 0
    for i in range(0, 335): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 9):
                alignments = indices_translation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(translations[counter + f])):
                                 words_set.add(translations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 10

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0
    occurrences = []
    with open(filename_out, 'w') as fout:
        while count < 335:
            occurrences.append([len(target_set) for target_set in target_words[count]])
            print(source[count] + ' | ' + str([len(target_set) for target_set in target_words[count]]), end='\n', file=fout)
            count += 1
    
    return occurrences

- Investigate the variability of the remaining sentence without the ambiguos word

In [86]:
def uniqueness_rest_of_sentence(position, sentencesN, occurrences):
    
    # Get positions of ambigous words
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
        
    # Sum over all sentences and build average     
    counter = 0
    sum_sent = 0
    for occur in occurrences:
        position = positions_ambiguous_words[counter]
        sum_sent += (sum(occur) - occur[position])/(len(occur) - 1) # sum for every sentence
        counter += 1
        
    return sum_sent/sentencesN

In [87]:
extract_word_translations('tok.en_original_man.en', 'hyp_original_man.txt', 'translations_words_original_man.txt', 'original_man_source-target_en-de_awesome-aligned.txt')
occurrence_original_man = count_word_translations('tok.en_original_man.en', 'hyp_original_man.txt', 'translations_words_original_man_occurrence.txt', 'original_man_source-target_en-de_awesome-aligned.txt')
print(uniqueness_rest_of_sentence(1, 335, occurrence_original_man))

extract_word_translations('tok.en_original_woman.en', 'hyp_original_woman.txt', 'translations_words_original_woman.txt', 'original_woman_source-target_en-de_awesome-aligned.txt')
occurrence_original_woman = count_word_translations('tok.en_original_woman.en', 'hyp_original_woman.txt', 'translations_words_original_woman_occurrence.txt', 'original_woman_source-target_en-de_awesome-aligned.txt')
print(uniqueness_rest_of_sentence(1, 335, occurrence_original_woman))

extract_word_translations('tok.en_original_girl.en', 'hyp_original_girl.txt', 'translations_words_original_girl.txt', 'original_girl_source-target_en-de_awesome-aligned.txt')
occurrence_original_girl = count_word_translations('tok.en_original_girl.en', 'hyp_original_girl.txt', 'translations_words_original_girl_occurrence.txt', 'original_girl_source-target_en-de_awesome-aligned.txt')
print(uniqueness_rest_of_sentence(1, 335, occurrence_original_girl))

extract_word_translations('tok.en_original_guy.en', 'hyp_original_guy.txt', 'translations_words_original_guy.txt', 'original_guy_source-target_en-de_awesome-aligned.txt')
occurrence_original_guy = count_word_translations('tok.en_original_guy.en', 'hyp_original_guy.txt', 'translations_words_original_guy_occurrence.txt', 'original_guy_source-target_en-de_awesome-aligned.txt')
print(uniqueness_rest_of_sentence(1, 335, occurrence_original_guy))

extract_word_translations('tok.en_original_boy.en', 'hyp_original_boy.txt', 'translations_words_original_boy.txt', 'original_boy_source-target_en-de_awesome-aligned.txt')
occurrence_original_boy = count_word_translations('tok.en_original_boy.en', 'hyp_original_boy.txt', 'translations_words_original_boy_occurrence.txt', 'original_boy_source-target_en-de_awesome-aligned.txt')
print(uniqueness_rest_of_sentence(1, 335, occurrence_original_boy))


1.9965660303017645
2.010733396931817
2.1075552400969437
1.5920804495475265
1.971849309601723


## Backtranslation

In [88]:
def extract_alignment_indices_backtranslation(filename_translations, filename_backtranslations):
    """
    Extract alignment indices
    """
    
    # Extract alignement indices from translation
    indices_translation = []
    with open(filename_translations, 'r') as alignments:
        for line in alignments:
            alignement_tokens = line.split()
            indices_line = []
            for i in range(0, len(alignement_tokens)):    
                regex = r"" + str(i) + r"-(\d)"
                if re.findall(regex, line): 
                    indices_line.append([int(index) for index in re.findall(regex, line)])
                else:
                    indices_line.append([999])
            indices_translation.append(indices_line)
       
    # Match alignement indices from translation to backtranslation
    lineNumber = 0
    counter = 0
    indices_backtranslation = []
    with open(filename_backtranslations, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            alignement_tokens = line.split()
            indices_line = []
            for index_list in indices_translation[counter]:
                index_matches = []
                for index in index_list:
                    regex = r"" + str(index) + r"-(\d)"
                    if re.findall(regex, line): 
                        index_matches.extend([int(i) for i in re.findall(regex, line)])
                    else:
                        index_matches.extend([999])
                indices_line.append(index_matches)
            indices_backtranslation.append(indices_line)
            lineNumber += 1 
    return indices_backtranslation

In [89]:
def extract_word_backtranslations(filename_tokenized, filename_backtranslations, filename_out, indices_backtranslation):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    backtranslations = []
    with open(filename_backtranslations, 'r') as fin:
         for line in fin:
                backtranslations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 335
    counter = 0
    for i in range(0, 335): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 99):
                alignments = indices_backtranslation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(backtranslations[counter + f])):
                                 words_set.add(backtranslations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 100

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0                
    with open(filename_out, 'w') as fout:
        while count < 335:
            print(source[count] + ' | ' + str(target_words[count]), end='\n', file=fout)
            count += 1

In [90]:
def count_word_backtranslations(filename_tokenized, filename_backtranslations, filename_out, indices_backtranslation):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    backtranslations = []
    with open(filename_backtranslations, 'r') as fin:
         for line in fin:
                backtranslations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 335
    counter = 0
    for i in range(0, 335): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 99):
                alignments = indices_backtranslation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(backtranslations[counter + f])):
                                 words_set.add(backtranslations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 100

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())
       
    count = 0
    occurrences = []
    with open(filename_out, 'w') as fout:
        while count < 335:
            occurrences.append([len(target_set) for target_set in target_words[count]])
            print(source[count] + ' | ' + str([len(target_set) for target_set in target_words[count]]), end='\n', file=fout)
            count += 1
    
    return occurrences

In [91]:
indices_original_man = extract_alignment_indices_backtranslation('original_man_source-target_en-de_awesome-aligned.txt', 'original_man_translation-back_en-de_awesome-aligned.txt')
extract_word_backtranslations('tok.en_original_man.en', 'hyp_original_man_back.txt', 'backtranslations_words_original_man.txt', indices_original_man)
occurrences_original_man = count_word_backtranslations('tok.en_original_man.en', 'hyp_original_man_back.txt', 'backtranslations_words_original_man_occurrence.txt', indices_original_man)
print(uniqueness_rest_of_sentence(1, 335, occurrences_original_man))

indices_original_woman = extract_alignment_indices_backtranslation('original_woman_source-target_en-de_awesome-aligned.txt', 'original_woman_translation-back_en-de_awesome-aligned.txt')
extract_word_backtranslations('tok.en_original_woman.en', 'hyp_original_woman_back.txt', 'backtranslations_words_original_woman.txt', indices_original_woman)
occurrences_original_woman = count_word_backtranslations('tok.en_original_woman.en', 'hyp_original_woman_back.txt', 'backtranslations_words_original_woman_occurrence.txt', indices_original_woman)
print(uniqueness_rest_of_sentence(1, 335, occurrences_original_woman))

indices_original_girl = extract_alignment_indices_backtranslation('original_girl_source-target_en-de_awesome-aligned.txt', 'original_girl_translation-back_en-de_awesome-aligned.txt')
extract_word_backtranslations('tok.en_original_girl.en', 'hyp_original_girl_back.txt', 'backtranslations_words_original_girl.txt', indices_original_girl)
occurrences_original_girl = count_word_backtranslations('tok.en_original_girl.en', 'hyp_original_girl_back.txt', 'backtranslations_words_original_girl_occurrence.txt', indices_original_girl)
print(uniqueness_rest_of_sentence(1, 335, occurrences_original_girl))

indices_original_guy = extract_alignment_indices_backtranslation('original_guy_source-target_en-de_awesome-aligned.txt', 'original_guy_translation-back_en-de_awesome-aligned.txt')
extract_word_backtranslations('tok.en_original_guy.en', 'hyp_original_guy_back.txt', 'backtranslations_words_original_guy.txt', indices_original_guy)
occurrences_original_guy = count_word_backtranslations('tok.en_original_guy.en', 'hyp_original_guy_back.txt', 'backtranslations_words_original_guy_occurrence.txt', indices_original_guy)
print(uniqueness_rest_of_sentence(1, 335, occurrences_original_guy))

indices_original_boy = extract_alignment_indices_backtranslation('original_boy_source-target_en-de_awesome-aligned.txt', 'original_boy_translation-back_en-de_awesome-aligned.txt')
extract_word_backtranslations('tok.en_original_boy.en', 'hyp_original_boy_back.txt', 'backtranslations_words_original_boy.txt', indices_original_boy)
occurrences_original_boy = count_word_backtranslations('tok.en_original_boy.en', 'hyp_original_boy_back.txt', 'backtranslations_words_original_boy_occurrence.txt', indices_original_boy)
print(uniqueness_rest_of_sentence(1, 335, occurrences_original_boy))

4.900207695923237
4.754693276677471
5.126070741277062
3.9921624054415963
4.512226022928395


# Gender statistics

In [92]:
from enum import Enum

class GENDER(Enum):
    """
    Enumerate possible genders.
    Ignore option resolves to words that should be ignored in particular language
    """
    male = 0
    female = 1
    neutral = 2
    unknown = 3
    ignore = 4
    
# ??? These are not always correct; 'der' could be Dativ or Genitiv for female, 'die' could be plural
# !!! There isn't always an article
DE_DETERMINERS = {"der": GENDER.male, "ein": GENDER.male, "dem": GENDER.male, "den": GENDER.male, 
                  "einen": GENDER.male, "des": GENDER.male, "er": GENDER.male, "seiner": GENDER.male,
                  "ihn": GENDER.male, "seinen": GENDER.male, "ihm": GENDER.male, "ihren": GENDER.male,
                  "die": GENDER.female, "eine": GENDER.female, "einer": GENDER.female, "seinem": GENDER.male,
                  "ihrem": GENDER.male, "sein": GENDER.male,
                  "sie": GENDER.female, "seine": GENDER.female, "ihrer": GENDER.female, 
                  "ihr": GENDER.neutral, "ihre": GENDER.neutral, "das": GENDER.neutral,
                  "jemanden": GENDER.neutral}

def get_german_determiners(words):
    """
    Get a list of (gender)
    given a list of words.
    """
    determiners = []
    for (word_ind, word) in enumerate(words):
        word = word.lower()
        if word in DE_DETERMINERS:
            determiners.append((DE_DETERMINERS[word].name))
    return determiners

In [93]:
dets = get_german_determiners(["dem"])
print(dets)

['male']


- Calculate gender based on the articles of unique words: how many of the sentences produce both genders, female and male

In [94]:
import re

# Extract articles of target tranlsated words
def extract_articles(position, sentencesN, translationsIn, alignmentsIn, sourceIn, output):
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
    
    # List with original translations
    translations_original = []
    with open(translationsIn, 'r') as fin:
        for line in fin:
            translations_original.append(line.strip())


    lineNumber = 0
    counter = 0
    indices = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                indices.append([int(index) for index in re.findall(regex, line)])
            else:
                indices.append([999])
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = set() # set forces uniqueness
    for translation in translations_original:
        tokens = translation.split(' ')
        if 999 not in indices[lineNumber]:
            for ind in indices[lineNumber]:
                translated_ambiguous_words.add(tokens[0]) # extract articles; currently assume index 0 for article position, TODO
        lineNumber += 1
        if (lineNumber % 10 == 0):
                translations_ambiguous_words.append(translated_ambiguous_words)
                translated_ambiguous_words = set()


    #print(len(translations_ambiguous_words))
    
    # Add results to file

    # List with original source sentences
    source = []
    ambiguous_words = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0  
    genders = []
    male = []
    female = []
    with open(output, 'w') as fout:
        while count < sentencesN:
            #print(translations_ambiguous_words[count])
            genders.append(set(get_german_determiners(translations_ambiguous_words[count])))
            male.append("male" in get_german_determiners(translations_ambiguous_words[count]))
            female.append("female" in get_german_determiners(translations_ambiguous_words[count]))
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(get_german_determiners(translations_ambiguous_words[count])), end='\n', file=fout)
            count += 1
            
    return (sum(1 for i in genders if ('male' in i and 'female' in i)), 
            male.count(True), female.count(True))

In [95]:
print(extract_articles(1, 335, 'hyp_original_man.txt', 'original_man_source-target_en-de_awesome-aligned.txt', 'tok.en_original_man.en', 'unique-words_translations_original_man_articles.txt'))
print(extract_articles(1, 335, 'hyp_original_woman.txt', 'original_woman_source-target_en-de_awesome-aligned.txt', 'tok.en_original_woman.en', 'unique-words_translations_original_woman_articles.txt'))
print(extract_articles(1, 335, 'hyp_original_girl.txt', 'original_girl_source-target_en-de_awesome-aligned.txt', 'tok.en_original_girl.en', 'unique-words_translations_original_girl_articles.txt'))
print(extract_articles(1, 335, 'hyp_original_guy.txt', 'original_guy_source-target_en-de_awesome-aligned.txt', 'tok.en_original_guy.en', 'unique-words_translations_original_guy_articles.txt'))
print(extract_articles(1, 335, 'hyp_original_boy.txt', 'original_boy_source-target_en-de_awesome-aligned.txt', 'tok.en_original_boy.en', 'unique-words_translations_original_boy_articles.txt'))


(1, 335, 1)
(8, 8, 335)
(1, 8, 19)
(1, 335, 1)
(1, 335, 1)


- Calculate gender in percentage for each sentence: percent of "male" vs. female in translations for each sentence

In [96]:
import re

# Extract articles of target tranlsated words
def extract_articles_percent(position, sentencesN, translationsIn, alignmentsIn, sourceIn, output):
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
    
    # List with original translations
    translations_original = []
    with open(translationsIn, 'r') as fin:
        for line in fin:
            translations_original.append(line.strip())


    lineNumber = 0
    counter = 0
    indices = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                indices.append([int(index) for index in re.findall(regex, line)])
            else:
                indices.append([999])
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = [] 
    for translation in translations_original:
        tokens = translation.split(' ')
        if 999 not in indices[lineNumber]:
            for ind in indices[lineNumber]:
                translated_ambiguous_words.append(tokens[0]) # extract articles; currently assume index 0 for article position, TODO
        lineNumber += 1
        if (lineNumber % 10 == 0):
                translations_ambiguous_words.append(translated_ambiguous_words)
                translated_ambiguous_words = []


    #print(len(translations_ambiguous_words))
    
    # Add results to file

    # List with original source sentences
    source = []
    ambiguous_words = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0  
    genders = []
    male = []
    female = []
    with open(output, 'w') as fout:
        while count < sentencesN:
            #print(translations_ambiguous_words[count])
            genders.append(get_german_determiners(translations_ambiguous_words[count]))
            male.append("male" in get_german_determiners(translations_ambiguous_words[count]))
            female.append("female" in get_german_determiners(translations_ambiguous_words[count]))
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(get_german_determiners(translations_ambiguous_words[count])), end='\n', file=fout)
            count += 1
     
    #print(genders)
    return (sum([i.count('male')/10 for i in genders])/sentencesN*100, 
            sum([i.count('female')/10 for i in genders])/sentencesN*100)

In [97]:
print(extract_articles_percent(1, 335, 'hyp_original_man.txt', 'original_man_source-target_en-de_awesome-aligned.txt', 'tok.en_original_man.en', 'unique-words_translations_original_man_articles.txt'))
print(extract_articles_percent(1, 335, 'hyp_original_woman.txt', 'original_woman_source-target_en-de_awesome-aligned.txt', 'tok.en_original_woman.en', 'unique-words_translations_original_woman_articles.txt'))
print(extract_articles_percent(1, 335, 'hyp_original_girl.txt', 'original_girl_source-target_en-de_awesome-aligned.txt', 'tok.en_original_girl.en', 'unique-words_translations_original_girl_articles.txt'))
print(extract_articles_percent(1, 335, 'hyp_original_guy.txt', 'original_guy_source-target_en-de_awesome-aligned.txt', 'tok.en_original_guy.en', 'unique-words_translations_original_guy_articles.txt'))
print(extract_articles_percent(1, 335, 'hyp_original_boy.txt', 'original_boy_source-target_en-de_awesome-aligned.txt', 'tok.en_original_boy.en', 'unique-words_translations_original_boy_articles.txt'))


(95.01492537313433, 0.1791044776119403)
(0.5074626865671642, 96.68656716417912)
(0.38805970149253727, 0.8059701492537316)
(93.07462686567169, 0.1791044776119403)
(96.14925373134334, 0.26865671641791045)
