In [215]:
import torch
import random
import numpy as np
import re

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_female"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool
TERCOM = "/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE"

In [216]:
# check if we can connect to the GPU with PyTorch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))
else:
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'

Current device: GeForce GTX 1080 Ti


In [217]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_female


In [20]:
# Extract sentences
!cut -f3 -d'	' en.txt > en_sentences.txt

In [21]:
# Extract sentences containing 'because' and remove the second part of the clause
# 330 unique sentences in total
with open('en_sentences.txt', 'r') as fin, open('en_original.txt', 'w') as fout:
    for line in fin:
        sentence = ''
        tokens = line.split(" ")
        for token in tokens:
            if token == 'because':
                print(sentence + '.', end='\n', file=fout)
            sentence = sentence + token.replace(',', '') + ' '

In [225]:
# Modify gender ambiguous words with gender

# List with source words
words = set() # set forces uniqueness
with open('words.txt', 'r') as fin:
    for line in fin:
        words.add(line.strip())
        
with open('en_original.txt') as in_file, open('en_disambiguated.txt', 'w') as out_file: 
    for line in in_file:
        sentence = line.split(' ')
        for token in sentence:
            if (token.replace(',', '') in words): # tokens often contain ","
                token_pos = sentence.index(token)
                sentence[token_pos] = "female " + token # could also replace with "female"
                break
        print(' '.join(sentence), end='', file=out_file)

# Translation English-German

In [92]:
# Tokenization
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

mpn = MosesPunctNormalizer()
mt_en = MosesTokenizer(lang='en')
md_en = MosesDetokenizer(lang='en')

with open('en_original.txt') as fin, open('tok.en_original.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout) 
        
with open('en_disambiguated.txt') as fin, open('tok.en_disambiguated.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished tokenizing.')

Finished tokenizing.


In [93]:
# Dividing text into subword units

!$FASTBPE applybpe bpe.en_original.en tok.en_original.en bpecodes.en
!$FASTBPE applybpe bpe.en_disambiguated.en tok.en_disambiguated.en bpecodes.en

print('Finished subword.')

Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original.en ...
Read 2733 words (470 unique) from text file.
Applying BPE to tok.en_original.en ...
Modified 2733 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_disambiguated.en ...
Read 3167 words (471 unique) from text file.
Applying BPE to tok.en_disambiguated.en ...
Modified 3167 words from text file.
Finished subword.


In [94]:
# Binarize text
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_en-de \
    --workers 8

2023-07-18 11:37:59 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcd

In [95]:
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_disambiguated \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_disambiguated_en-de \
    --workers 8

2023-07-18 11:38:03 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_disambiguated_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en',

In [96]:
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
NBEST = 10
BEAM = 10

In [97]:
# Generate N hypothesis
!fairseq-generate data-bin_original_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_en-de.decode_Beam_10.log

2023-07-18 11:39:41 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [98]:
# Generate N hypothesis
!fairseq-generate data-bin_disambiguated_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_en-de.decode_Beam_10.log

2023-07-18 11:40:56 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

# Backtranslation German-English

In [99]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original.txt
!grep ^H disambiguated_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated.txt

In [100]:
# Dividing tokenized text into subword units

!$FASTBPE applybpe bpe.hyp_original.de hyp_original.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_disambiguated.de hyp_disambiguated.txt bpecodes.de

print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original.txt ...
Read 27703 words (1212 unique) from text file.
Applying BPE to hyp_original.txt ...
Modified 27703 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_disambiguated.txt ...
Read 28129 words (1209 unique) from text file.
Applying BPE to hyp_disambiguated.txt ...
Modified 28129 words from text file.
Finished subword.


In [101]:
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_de-en \
    --workers 8

2023-07-18 11:44:13 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', srcd

In [102]:
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_disambiguated \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_disambiguated_de-en \
    --workers 8

2023-07-18 11:44:18 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_disambiguated_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de',

In [103]:
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"
NBEST = 10
BEAM = 10

In [104]:
# Generate N hypothesis
!fairseq-generate data-bin_original_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_de-en.decode_Beam_10_backtranslation.log

2023-07-18 11:45:17 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [105]:
# Generate N hypothesis
!fairseq-generate data-bin_disambiguated_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_de-en.decode_Beam_10_backtranslation.log

2023-07-18 11:48:47 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [106]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_back.txt
!grep ^H disambiguated_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated_back.txt

In [107]:
# Detokenize text        
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

md_en = MosesDetokenizer(lang='en')

with open('hyp_original_back.txt', encoding='utf8') as fin, open('original_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('hyp_disambiguated_back.txt', encoding='utf8') as fin, open('disambiguated_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

In [108]:
# List with original source sentences
source = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
# List with disambiguated source sentences
source_disambiguated = []
with open('en_disambiguated.txt', 'r') as fin:
    for line in fin:
        source_disambiguated.append(line.strip())
    
# List with nbest sentences for every source in original
nbest_original = []
counter = 0
temp = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
# List with nbest sentences for every source in disambiguated            
nbest_disambiguated = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []
            
print(len(source))
print(len(nbest_original))
print(len(nbest_disambiguated))

330
330
330


## Count unique sentences

In [112]:
# Count unique sentences in source nbest list for each source sentence of original; 9.945454545454545
# Value should be 10, because beam search generates 10 unique sentences
unique_sent = []
for source_nbest in nbest_original:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)
    
#print(unique_sent)
print(sum(unique_sent)/330) # average

9.945454545454545


In [113]:
# Count unique sentences in source nbest list for each source sentence of modified; 9.954545454545455
unique_sent = []
for source_nbest in nbest_disambiguated:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)
    
#print(unique_sent)
print(sum(unique_sent)/330) # average

9.927272727272728


## Count unique words

In [114]:
# Count unique words in source nbest list for each source sentence of original; 16.836363636363636
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for source_nbest in nbest_original:
    words = set()
    for sent in source_nbest:
        tokens = sp(sent)
        for token in tokens:
            if token.text not in stopwords:    # checking whether the word is a stop word
                words.add(token.text)
    num_values = len(words)
    unique_words.append(num_values)
    
    counter += 1
    print(counter)
    
#print(unique_words)
print(sum(unique_words)/330)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [115]:
# Count unique words in source nbest list for each source sentence of modified; 17.64848484848485
# !!! This is normal to generate more unique words, because the disambiguated sentences have more words in total
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for source_nbest in nbest_disambiguated:
    words = set()
    for sent in source_nbest:
        tokens = sp(sent)
        for token in tokens:
            if token.text not in stopwords:    # checking whether the word is a stop word
                words.add(token.text)
    num_values = len(words)
    unique_words.append(num_values)
    
    counter += 1
    print(counter)
    
#print(unique_words)
print(sum(unique_words)/330)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


# Statistics on backtranslations

In [205]:
# List with original source sentences
source_original = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source_original.append(line.strip())
        
# List with disambiguated source sentences
source_disambiguated = []
with open('en_disambiguated.txt', 'r') as fin:
    for line in fin:
        source_disambiguated.append(line.strip())
    
# List with nbest sentences for every source in original 
nbest_original = []
counter = 0
temp = []
with open('original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
# List with nbest sentences for every source in disambiguated
nbest_disambiguated = []
with open('disambiguated_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []
            
print(len(nbest_original))
print(len(nbest_disambiguated))

330
330


## Source sentence reoccurrence

In [206]:
# Count how many times the source sentence occurs in the nbest list of original; 258
results = []
counter = 0
for sent in source_original:
    matches = 0
    for target in nbest_original[counter]: 
        if (sent == target):
            matches += 1
    results.append(matches)  
    counter += 1
    
print(sum(results)/330)
print(sum(x > 0 for x in results))

5.575757575757576
258


In [207]:
# Count how many times the source sentence occurs in the nbest list of disambiguated; 230
results = []
counter = 0
for sent in source_disambiguated:
    matches = 0
    for target in nbest_disambiguated[counter]: 
        if (sent == target):
            matches += 1
    results.append(matches)  
    counter += 1
    
print(sum(results)/330)
print(sum(x > 0 for x in results))

3.903030303030303
230


## Ambiguous source words reoccurrence


In [208]:
# List with source words
words = set() # set forces uniqueness
with open('words.txt', 'r') as fin:
    for line in fin:
        words.add(line.strip())

# Extract ambiguous words from source sentences
ambiguous_words = [] 
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break
        
print(ambiguous_words)
print(len(ambiguous_words))
        
# List with nbest sentences for every source
nbest_original = []
counter = 0
temp = []
with open('original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original.append(temp)
            counter = 0
            temp = []

nbest_disambiguated = []
with open('disambiguated_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []  

print(len(nbest_original))
print(len(nbest_disambiguated))     

['developer', 'mechanic', 'mover', 'assistant', 'chief', 'salesperson', 'lawyer', 'cook', 'mover', 'farmer', 'CEO', 'hairdresser', 'developer', 'driver', 'auditor', 'CEO', 'guard', 'assistant', 'assistant', 'auditor', 'salesperson', 'manager', 'physician', 'laborer', 'physician', 'hairdresser', 'developer', 'farmer', 'receptionist', 'manager', 'cleaner', 'mechanic', 'writer', 'worker', 'editor', 'analyst', 'carpenter', 'cook', 'carpenter', 'cleaner', 'laborer', 'mechanic', 'mechanic', 'cook', 'farmer', 'CEO', 'librarian', 'chief', 'developer', 'nurse', 'lawyer', 'developer', 'mover', 'mover', 'worker', 'secretary', 'CEO', 'carpenter', 'sheriff', 'mechanic', 'analyst', 'assistant', 'chief', 'janitor', 'manager', 'supervisor', 'chief', 'worker', 'salesperson', 'lawyer', 'developer', 'sheriff', 'janitor', 'laborer', 'driver', 'mover', 'developer', 'janitor', 'salesperson', 'chief', 'laborer', 'guard', 'nurse', 'worker', 'laborer', 'lawyer', 'CEO', 'laborer', 'laborer', 'nurse', 'manager',

In [209]:
# Count how many times the source words occurs in the nbest list of original
results = []
counter = 0
for word in ambiguous_words:
    matches = 0
    for target in nbest_original[counter]: 
        if (word in target.split(" ")):
            matches += 1
    results.append(matches)  
    counter += 1
    
print(results)
print(sum(results)/330)
print(sum(x > 0 for x in results))

[53, 100, 17, 72, 14, 3, 56, 81, 60, 73, 16, 55, 41, 100, 49, 64, 82, 100, 100, 46, 7, 74, 0, 0, 10, 46, 87, 74, 100, 100, 60, 95, 40, 70, 86, 100, 70, 78, 74, 81, 0, 98, 96, 78, 83, 29, 98, 25, 93, 100, 78, 52, 5, 0, 73, 92, 24, 80, 93, 100, 90, 66, 27, 35, 80, 35, 4, 48, 0, 66, 82, 88, 34, 0, 100, 80, 34, 43, 5, 15, 3, 85, 90, 64, 0, 77, 7, 0, 7, 70, 94, 98, 24, 95, 100, 94, 100, 97, 0, 72, 61, 100, 99, 36, 11, 99, 79, 61, 58, 100, 56, 54, 43, 1, 3, 100, 76, 53, 11, 13, 78, 96, 12, 10, 0, 99, 91, 100, 83, 100, 90, 85, 99, 90, 45, 2, 88, 96, 8, 37, 39, 67, 91, 89, 100, 100, 6, 10, 96, 91, 37, 5, 100, 9, 17, 11, 87, 11, 42, 10, 50, 1, 62, 100, 67, 56, 17, 79, 72, 75, 0, 77, 81, 95, 30, 62, 100, 57, 100, 19, 97, 9, 68, 53, 22, 99, 1, 79, 36, 21, 79, 68, 21, 86, 86, 2, 92, 26, 32, 6, 17, 60, 47, 37, 100, 100, 84, 75, 81, 75, 94, 35, 29, 61, 100, 6, 100, 83, 100, 76, 34, 13, 39, 75, 63, 46, 97, 91, 0, 100, 100, 100, 85, 28, 16, 100, 50, 55, 62, 46, 93, 100, 59, 85, 52, 75, 94, 96, 96, 27,

In [210]:
# Count how many times the source words occurs in the nbest list of disambiguated
results = []
counter = 0
for word in ambiguous_words:
    matches = 0
    for target in nbest_disambiguated[counter]: 
        if (word in target.split(" ")):
            matches += 1
    results.append(matches)  
    counter += 1
    
print(results)
print(sum(results)/330)
print(sum(x > 0 for x in results))

[63, 100, 32, 94, 1, 5, 51, 63, 100, 75, 24, 50, 48, 100, 32, 60, 84, 100, 100, 33, 1, 68, 17, 0, 8, 40, 100, 71, 100, 93, 68, 91, 68, 80, 98, 100, 72, 65, 72, 81, 1, 100, 93, 43, 80, 24, 100, 25, 97, 61, 75, 54, 0, 16, 76, 95, 27, 75, 93, 100, 96, 70, 18, 46, 83, 37, 0, 52, 2, 58, 91, 90, 43, 0, 100, 0, 65, 47, 6, 11, 1, 87, 61, 59, 0, 63, 5, 0, 4, 39, 98, 96, 80, 100, 100, 93, 100, 99, 0, 69, 69, 96, 98, 56, 11, 99, 78, 71, 52, 100, 47, 60, 52, 0, 4, 100, 78, 60, 16, 21, 88, 97, 27, 1, 1, 91, 89, 100, 92, 100, 80, 84, 93, 88, 28, 46, 95, 96, 5, 46, 51, 91, 85, 86, 100, 100, 10, 16, 97, 89, 99, 16, 100, 32, 8, 20, 76, 10, 48, 10, 44, 6, 39, 100, 71, 58, 29, 73, 74, 91, 0, 71, 72, 99, 0, 69, 100, 38, 100, 25, 100, 8, 57, 41, 37, 100, 13, 73, 47, 42, 63, 73, 17, 76, 82, 18, 95, 38, 29, 9, 19, 87, 58, 30, 99, 100, 95, 83, 82, 85, 97, 70, 8, 73, 100, 7, 100, 85, 95, 53, 36, 15, 46, 80, 57, 50, 100, 83, 1, 100, 100, 100, 79, 17, 12, 96, 63, 72, 69, 64, 94, 100, 68, 90, 60, 68, 90, 99, 99, 

## Count unique sentences

In [211]:
# Count unique sentences in source nbest list for each source sentence of original; 46.06060606060606
unique_sent = []
for source_nbest in nbest_original:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)
    
#print(unique_sent)
print(sum(unique_sent)/330) # average

46.06060606060606


In [212]:
# Count unique sentences in source nbest list for each source sentence of modified; 51.77272727272727
unique_sent = []
for source_nbest in nbest_disambiguated:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)
    
#print(unique_sent)
print(sum(unique_sent)/330) # average

51.77272727272727


## Count unique words

In [None]:
# Count unique words in source nbest list for each source sentence of original; 22.593939393939394
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for source_nbest in nbest_original:
    words = set()
    for sent in source_nbest:
        tokens = sp(sent)
        for token in tokens:
            if token.text not in stopwords:    # checking whether the word is a stop word
                words.add(token.text)
    num_values = len(words)
    unique_words.append(num_values)
    
    counter += 1
    print(counter)
    
#print(unique_words)
print(sum(unique_words)/330)

In [None]:
# Count unique words in source nbest list for each source sentence of modified; 22.348484848484848
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for source_nbest in nbest_disambiguated:
    words = set()
    for sent in source_nbest:
        tokens = sp(sent)
        for token in tokens:
            if token.text not in stopwords:    # checking whether the word is a stop word
                words.add(token.text)
    num_values = len(words)
    unique_words.append(num_values)
    
    counter += 1
    print(counter)
    
#print(unique_words)
print(sum(unique_words)/330)

# Word alignement (source-translation)

- Count how many unique ambiguous words are in total in source text
- Extract the position of the first ambiguous word from each sentence

In [124]:
# List with source words
words = set() # set forces uniqueness
with open('words.txt', 'r') as fin:
    for line in fin:
        words.add(line.strip())
        
ambiguous_words = set() # set forces uniqueness
positions_ambiguous_words = []

with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.add(token)
                position = tokens.index(token)
                positions_ambiguous_words.append(position)
                break
        
print(ambiguous_words)
print(len(ambiguous_words))
print(positions_ambiguous_words)
print(len(positions_ambiguous_words))

{'driver', 'attendant', 'cook', 'laborer', 'accountant', 'housekeeper', 'assistant', 'developer', 'writer', 'librarian', 'analyst', 'editor', 'manager', 'nurse', 'customer', 'programmer', 'advisor', 'supervisor', 'sheriff', 'painter', 'specialist', 'counselor', 'bartender', 'carpenter', 'practitioner', 'mechanic', 'dietitian', 'salesperson', 'farmer', 'tailor', 'secretary', 'broker', 'baker', 'clerk', 'mover', 'firefighter', 'auditor', 'patient', 'receptionist', 'CEO', 'teenager', 'therapist', 'janitor', 'chief', 'physician', 'scientist', 'lawyer', 'guard', 'undergraduate', 'worker', 'hairdresser', 'examiner', 'cleaner'}
53
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

- Input to fast_align must be tokenized and aligned into parallel sentences. 
- Line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||)

In [125]:
# List with original source sentences
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
         
# List with nbest sentences for every source in original 
nbest_original = []
counter = 0
temp = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []

print(len(source))
print(len(nbest_original))           
        
count = 0
with open('original_source-target_en-de.txt', 'w') as fout:
    while count < 330:
        for hyp in nbest_original[count]:
            print(source[count] + ' ||| ' + hyp, end='\n', file=fout)
        count += 1

330
330


In [126]:
# List with disambiguated source sentences
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
         
# List with nbest sentences for every source in original 
nbest_disambiguated = []
counter = 0
temp = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []

print(len(source))
print(len(nbest_disambiguated))           
        
count = 0
with open('disambiguated_source-target_en-de.txt', 'w') as fout:
    while count < 330:
        for hyp in nbest_disambiguated[count]:
            print(source[count] + ' ||| ' + hyp, end='\n', file=fout)
        count += 1

330
330


## fast_align

In [127]:
!$FAST_ALIGN -i original_source-target_en-de.txt -d -o -v > original_source-target_en-de_fast-aligned.txt
!$FAST_ALIGN -i disambiguated_source-target_en-de.txt -d -o -v > disambiguated_source-target_en-de_fast-aligned.txt

ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
...
expected target length = source length * 1.01967
ITERATION 1
...
  log_e likelihood: -574097
  log_2 likelihood: -828246
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.189235
       size counts: 67
ITERATION 2
...
  log_e likelihood: -89748.3
  log_2 likelihood: -129479
     cross entropy: 4.67384
        perplexity: 25.5251
      posterior p0: 0.0236035
 posterior al-feat: -0.146952
       size counts: 67
  1  model al-feat: -0.164113 (tension=4)
  2  model al-feat: -0.155528 (tension=4.34323)
  3  model al-feat: -0.151469 (tension=4.51475)
  4  model al-feat: -0.149392 (tension=4.6051)
  5  model al-feat: -0.148286 (tension=4.6539)
  6  model al-feat: -0.147687 (tension=4.68059)
  7  model al-feat: -0.147358 (tension=4.69529)
  8  model al-feat: -0.147177 (tension=4.70341)
     final tension: 4.70791
ITERATION 3
...
  log_e likelihood: -68388.7
  log_2 likelihood: -98664
     cross entrop

In [128]:
# Extract target translated words to source words in original

import re

# List with original translations
translations_original = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        translations_original.append(line.strip())

              
lineNumber = 0
counter = 0
indices = [] # a list of lists of indices of translated words for each ambiguous word
with open('original_source-target_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            indices.append([int(index) for index in re.findall(regex, line)])
        else:
            indices.append([999])
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_original:
    tokens = translation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            translated_ambiguous_words.add(tokens[ind])
    lineNumber += 1
    if (lineNumber % 10 == 0):
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    
print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/330)

[{'Bauunternehmer', 'Bauträger', 'Entwickler'}, {'Mechaniker'}, {'Mover', 'Umzugsteilnehmer', 'Macher', 'Umzugshelfer'}, {'Assistentin'}, {'Chefin', 'Häuptling', 'Chef', 'Verwaltungschef'}, {'Verkäuferin', 'Verkäufer'}, {'Jurist', 'Anwalt', 'Rechtsanwalt', 'Anwältin'}, {'Köchin', 'Koch'}, {'Mover', 'Beweger', 'Umzugshelfer'}, {'Bauer', 'Landwirt', 'Bäuerin'}, {'Vorstandschef', 'Geschäftsführer', 'Geschäftsführerin', 'Vorstandsvorsitzende', 'Firmenchef', 'Chef', 'CEO'}, {'Frisör', 'Friseur', 'Friseurin'}, {'Bauunternehmer', 'Entwickler', 'Bauunternehmerin', 'Bauträger', 'war', 'Bauherr'}, {'Fahrer'}, {'Rechnungsprüfer', 'Prüfer', 'Wirtschaftsprüfer'}, {'Geschäftsführer', 'Vorstandsvorsitzende'}, {'Wache', 'Wärter', 'Wachmann'}, set(), set(), {'Rechnungsprüfer', 'Prüfer', 'Revisor', 'Kassenprüfer', 'Wirtschaftsprüfer'}, {'Verkäuferin', 'Verkäufer'}, {'Geschäftsführer', 'Manager'}, {'Arzt'}, {'Angestellte', 'Arbeiterin', 'Arbeiter'}, {'Mediziner', 'Arzt', 'Ärztin'}, {'Frisör', 'Friseur', 

In [129]:
# Extract target translated words to source words in disambiguated

import re

# List with original translations
translations_disambiguated = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        translations_disambiguated.append(line.strip())

              
lineNumber = 0
counter = 0
indices = [] # a list of lists of indices of translated words for each ambiguous word
with open('disambiguated_source-target_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            indices.append([int(index) for index in re.findall(regex, line)])
        else:
            indices.append([999])
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_disambiguated:
    tokens = translation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            translated_ambiguous_words.add(tokens[ind])
    lineNumber += 1
    if (lineNumber % 10 == 0):
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    
print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/330)

[{'Bauunternehmerin', 'Designerin', 'Entwicklerin'}, {'Mechanikerin'}, {'Fahrerin', 'Umzugshelferin', 'Umzugsfrau'}, {'Assistentin'}, {'Chefin', 'Chef', 'Leiterin'}, {'Verkäuferin'}, {'Rechtsanwältin', 'Juristin', 'Anwältin'}, {'Köchin'}, {'Mover', 'Umzugsdame', 'Umzugshelferin', 'Umzugsfrau'}, {'Landwirtin', 'Bäuerin'}, {'Chefin', 'Geschäftsführerin', 'Vorstandsvorsitzende', 'Firmenchefin', 'Vorstandsfrau'}, {'Frisörin', 'Friseurin'}, {'Designerin', 'Bauunternehmerin', 'war', 'Bauträgerin', 'Entwicklerin'}, {'Autofahrerin', 'Fahrerin'}, {'Prüferin', 'Wirtschaftsprüferin'}, {'Geschäftsführerin', 'Chefin', 'Vorstandsvorsitzende', '@-@', 'CEO'}, {'Wachfrau', 'Wärterin', 'Wache', 'weibliche', 'Aufseherin', 'Garde', 'Wächterin'}, {'Assistentin'}, {'Assistentin'}, {'Revisorin', 'Rechnungsprüferin', 'Prüferin', 'Kassenprüferin'}, {'Verkäuferin'}, {'Geschäftsführerin', 'Managerin'}, {'Ärztin'}, {'Angestellte', 'Arbeiterin', 'Mitarbeiterin'}, {'Ärztin', 'Medizinerin'}, {'Frisörin', 'Friseurmei

## awesome_align

In [130]:
# ??? How to set model correctly
# MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
!awesome-align \
    --output_file "original_source-target_en-de_awesome-aligned.txt" \
    --data_file "original_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

Loading the dataset...
Extracting: 3300it [00:04, 666.31it/s]


In [131]:
!awesome-align \
    --output_file "disambiguated_source-target_en-de_awesome-aligned.txt" \
    --data_file "disambiguated_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

Loading the dataset...
Extracting: 3300it [00:05, 650.23it/s]


In [132]:
# Extract target translated words to source words in original

import re

# List with original translations
translations_original = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        translations_original.append(line.strip())

              
lineNumber = 0
counter = 0
indices = [] # a list of lists of indices of translated words for each ambiguous word
with open('original_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            indices.append([int(index) for index in re.findall(regex, line)])
        else:
            indices.append([999])
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_original:
    tokens = translation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            translated_ambiguous_words.add(tokens[ind])
    lineNumber += 1
    if (lineNumber % 10 == 0):
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    
#print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/330)

330
2.6636363636363636


In [133]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0                
with open('unique-words_translations_original.txt', 'w') as fout:
    while count < 330:
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(translations_ambiguous_words[count]), end='\n', file=fout)
        count += 1

In [201]:
# Extract target translated words to source words in disambiguated

import re

# List with original translations
translations_disambiguated = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        translations_disambiguated.append(line.strip())

              
lineNumber = 0
counter = 0
indices = [] # a list of lists of indices of translated words for each ambiguous word
with open('disambiguated_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line):
            indices.append([int(index) for index in re.findall(regex, line)])
        else:
            indices.append([999])
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_disambiguated:
    tokens = translation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            translated_ambiguous_words.add(tokens[ind])
    lineNumber += 1
    if (lineNumber % 10 == 0):
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    
print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/330)

[{'Entwicklerin'}, {'Mechanikerin'}, {'Umzugsteilnehmerin', 'Fahrerin', 'Umzugshelferin', 'Umzugsfrau'}, {'Assistentin'}, {'Chefin', 'Chef', 'Leiterin'}, {'Verkäuferin'}, {'Rechtsanwältin', 'Juristin', 'Anwältin'}, {'Köchin'}, {'Mover', 'Friseur', 'Umzugshelferin', 'Friseurin'}, {'Landwirtin', 'Bäuerin'}, {'Chefin', 'Geschäftsführerin', 'Vorstandsvorsitzende', 'Firmenchefin', 'Vorstandsfrau'}, {'Frisörin', 'Friseurin'}, {'Bauunternehmerin', 'Designerin', 'Bauträgerin', 'Entwicklerin'}, {'Autofahrerin', 'Fahrerin'}, {'Prüferin', 'Entwicklerin', 'Wirtschaftsprüferin'}, {'Geschäftsführerin', 'Chefin', 'Vorstandsvorsitzende', 'CEO'}, {'Wachfrau', 'Wärterin', 'Wache', 'Aufseherin', 'Garde', 'Wächterin'}, {'der', 'Assistentin', 'des'}, {'der', 'Assistentin', 'des'}, {'Revisorin', 'Prüferin', 'Wachfrau', 'Kassenprüferin', 'Rechnungsprüferin'}, {'Verkäuferin'}, {'Chefin', 'Managerin', 'Geschäftsführerin'}, {'Ärztin'}, {'Angestellte', 'Arbeiterin', 'Mitarbeiterin'}, {'Ärztin', 'Medizinerin'}, {

In [135]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0                
with open('unique-words_translations_disambiguated.txt', 'w') as fout:
    while count < 330:
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(translations_ambiguous_words[count]), end='\n', file=fout)
        count += 1

# Word alignement (translation-backtranslation)

## fast_align

- Extract the position of the translated ambiguous word from each sentence

In [136]:
import re
             
lineNumber = 0
counter = 0
positions_ambiguous_words_original = [] # a list of lists of indices of translated words for each ambiguous word
with open('original_source-target_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            positions_ambiguous_words_original.append([int(index) for index in re.findall(regex, line)])
        else:
            positions_ambiguous_words_original.append([999])
        lineNumber += 1
        
print(len(positions_ambiguous_words_original))
print(positions_ambiguous_words_original)

3300
[[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [999], [999], [999], [999], [999], [999], [999], [999], [999], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [999], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1, 2], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [999], [1], [999], [1], [999], [1], [1], [999], [999], [999], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [999], [999], [999], [999], [999], [999], [999], [999], [999], [999], [999], [999], [999], [999], [999], [999

In [137]:
import re
             
lineNumber = 0
counter = 0
positions_ambiguous_words_disambiguated = [] # a list of lists of indices of translated words for each ambiguous word
with open('disambiguated_source-target_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            positions_ambiguous_words_disambiguated.append([int(index) for index in re.findall(regex, line)])
        else:
            positions_ambiguous_words_disambiguated.append([999])
        lineNumber += 1
        
print(len(positions_ambiguous_words_disambiguated))
print(positions_ambiguous_words_disambiguated)

3300
[[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [999], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [1], [1], [2], [2], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [1], [1], [1], [1], [1], [2], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1, 2], [1], [1], [1], [1], [1], [1], [1, 2], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [1], [1], [2], [1, 2], [1], [1], [1], [1], [1, 2], [1], [1], [1], [1, 2], [1, 2], [1], [1], [1, 2], [1], [999], [1], [1], [1], [999], [999], [1], [999], [999], [999], [1], [1], [1], [999], [1], [1], [8], [7], [7], [999], [1], 

- Input to fast_align must be tokenized and aligned into parallel sentences. 
- Line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||)

In [138]:
# List with original translated sentences
translations = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        translations.append(line.strip())
         
# List with nbest sentences for every translation in original 
nbest_original = []
counter = 0
temp = []
with open('hyp_original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []

print(len(translations))
print(len(nbest_original))          
        
count = 0
with open('original_translation-back_en-de.txt', 'w') as fout:
    while count < 3300:
        for hyp in nbest_original[count]:
            print(translations[count] + ' ||| ' + hyp, end='\n', file=fout)
        count += 1

3300
3300


In [139]:
# List with disambiguated translated sentences
translations = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        translations.append(line.strip())
         
# List with nbest sentences for every translation in original 
nbest_disambiguated = []
counter = 0
temp = []
with open('hyp_disambiguated_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_disambiguated.append(temp)
            counter = 0
            temp = []

print(len(translations))
print(len(nbest_disambiguated))           
        
count = 0
with open('disambiguated_translation-back_en-de.txt', 'w') as fout:
    while count < 3300:
        for hyp in nbest_disambiguated[count]:
            print(translations[count] + ' ||| ' + hyp, end='\n', file=fout)
        count += 1

3300
3300


- Word alignement

In [140]:
!$FAST_ALIGN -i original_translation-back_en-de.txt -d -o -v > original_translation-back_en-de_fast-aligned.txt
!$FAST_ALIGN -i disambiguated_translation-back_en-de.txt -d -o -v > disambiguated_translation-back_en-de_fast-aligned.txt

ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
.................................
expected target length = source length * 1.01162
ITERATION 1
.................................
  log_e likelihood: -5.74155e+06
  log_2 likelihood: -8.2833e+06
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.189396
       size counts: 116
ITERATION 2
.................................
  log_e likelihood: -780039
  log_2 likelihood: -1.12536e+06
     cross entropy: 4.06182
        perplexity: 16.7005
      posterior p0: 0.0250382
 posterior al-feat: -0.146595
       size counts: 116
  1  model al-feat: -0.167054 (tension=4)
  2  model al-feat: -0.156813 (tension=4.40918)
  3  model al-feat: -0.152029 (tension=4.61356)
  4  model al-feat: -0.149571 (tension=4.72225)
  5  model al-feat: -0.148249 (tension=4.78177)
  6  model al-feat: -0.147522 (tension=4.81485)
  7  model al-feat: -0.147116 (tension=4.83339)
  8  model al-feat: -0.146889 (tension=4.84382)
     final

- Extract target backtranslated words

In [141]:
import re

# List with original backtranslations
backtranslations_original = []
with open('hyp_original_back.txt', 'r') as fin:
    for line in fin:
        backtranslations_original.append(line.strip())
             
lineNumber = 0
counter = 0
indices = []
with open('original_translation-back_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        positions = positions_ambiguous_words_original[counter] # exact positions of ambiguous words
        list_indices = []
        for position in positions:
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                list_indices.extend([int(index) for index in re.findall(regex, line)])
            else:
                list_indices.extend([999])
        indices.append(list_indices)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
backtranslated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations_original:
    tokens = backtranslation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            backtranslated_ambiguous_words.add(tokens[ind])
    lineNumber += 1
    if (lineNumber % 10 == 0):
            backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()
    
#print(backtranslations_ambiguous_words)
print(len(backtranslations_ambiguous_words))

# Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
backtranslations_ambiguous_words_reduced = []
backtranslated_ambiguous_words = set() # set forces uniqueness
counter = 0
for set_words in backtranslations_ambiguous_words:
    backtranslated_ambiguous_words.update(set_words)
    counter += 1
    if (counter % 10 == 0):
        backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
        backtranslated_ambiguous_words = set()

print(len(backtranslations_ambiguous_words_reduced))   
    
unique_backtranslations = 0
for set_words in backtranslations_ambiguous_words_reduced:
    unique_backtranslations += len(set_words)
print(unique_backtranslations/330)

3300
330
6.942424242424242


In [142]:
import re

# List with disambiguated backtranslations
backtranslations_disambiguated = []
with open('hyp_disambiguated_back.txt', 'r') as fin:
    for line in fin:
        backtranslations_disambiguated.append(line.strip())
             
lineNumber = 0
counter = 0
indices = []
with open('disambiguated_translation-back_en-de_fast-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        positions = positions_ambiguous_words_disambiguated[counter] # exact positions of ambiguous words
        list_indices = []
        for position in positions:
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                list_indices.extend([int(index) for index in re.findall(regex, line)])
            else:
                list_indices.extend([999])
        indices.append(list_indices)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
backtranslated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations_disambiguated:
    tokens = backtranslation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            backtranslated_ambiguous_words.add(tokens[ind])
    lineNumber += 1
    if (lineNumber % 10 == 0):
            backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()
    
#print(backtranslations_ambiguous_words)
print(len(backtranslations_ambiguous_words))

# Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
backtranslations_ambiguous_words_reduced = []
backtranslated_ambiguous_words = set() # set forces uniqueness
counter = 0
for set_words in backtranslations_ambiguous_words:
    backtranslated_ambiguous_words.update(set_words)
    counter += 1
    if (counter % 10 == 0):
        backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
        backtranslated_ambiguous_words = set()

print(len(backtranslations_ambiguous_words_reduced))   
    
unique_backtranslations = 0
for set_words in backtranslations_ambiguous_words_reduced:
    unique_backtranslations += len(set_words)
print(unique_backtranslations/330)

3300
330
7.678787878787879


## awesome_align

- Extract the position of the translated ambiguous word from each sentence

In [143]:
import re
             
lineNumber = 0
counter = 0
positions_ambiguous_words_original = [] # a list of lists of indices of translated words for each ambiguous word
with open('original_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            positions_ambiguous_words_original.append([int(index) for index in re.findall(regex, line)])
        else:
            positions_ambiguous_words_original.append([999])
        lineNumber += 1
        
print(len(positions_ambiguous_words_original))
#print(positions_ambiguous_words_original)

3300


In [144]:
import re
             
lineNumber = 0
counter = 0
positions_ambiguous_words_disambiguated = [] # a list of lists of indices of translated words for each ambiguous word
with open('disambiguated_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            positions_ambiguous_words_disambiguated.append([int(index) for index in re.findall(regex, line)])
        else:
            positions_ambiguous_words_disambiguated.append([999])
        lineNumber += 1
        
print(len(positions_ambiguous_words_disambiguated))
#print(positions_ambiguous_words_disambiguated)

3300


- Word alignement

In [145]:
# ??? How to set model correctly
# MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
!awesome-align \
    --output_file "original_translation-back_en-de_awesome-aligned.txt" \
    --data_file "original_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

Loading the dataset...
Extracting: 33000it [00:43, 760.46it/s]


In [146]:
!awesome-align \
    --output_file "disambiguated_translation-back_en-de_awesome-aligned.txt" \
    --data_file "disambiguated_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

Loading the dataset...
Extracting: 33000it [00:44, 744.33it/s]


- Extract target backtranslated words

In [147]:
import re

# List with original backtranslations
backtranslations_original = []
with open('hyp_original_back.txt', 'r') as fin:
    for line in fin:
        backtranslations_original.append(line.strip())
             
lineNumber = 0
counter = 0
indices = []
with open('original_translation-back_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        positions = positions_ambiguous_words_original[counter] # exact positions of ambiguous words
        list_indices = []
        for position in positions:
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                list_indices.extend([int(index) for index in re.findall(regex, line)])
            else:
                list_indices.extend([999])
        indices.append(list_indices)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
backtranslated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations_original:
    tokens = backtranslation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            backtranslated_ambiguous_words.add(tokens[ind])
    lineNumber += 1
    if (lineNumber % 10 == 0):
            backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()
    
#print(backtranslations_ambiguous_words)
print(len(backtranslations_ambiguous_words))

# Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
backtranslations_ambiguous_words_reduced = []
backtranslated_ambiguous_words = set() # set forces uniqueness
counter = 0
for set_words in backtranslations_ambiguous_words:
    backtranslated_ambiguous_words.update(set_words)
    counter += 1
    if (counter % 10 == 0):
        backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
        backtranslated_ambiguous_words = set()

print(len(backtranslations_ambiguous_words_reduced))   
    
unique_backtranslations = 0
for set_words in backtranslations_ambiguous_words_reduced:
    unique_backtranslations += len(set_words)
print(unique_backtranslations/330)

3300
330
6.66969696969697


In [148]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0                
with open('unique-words_backtranslations_original.txt', 'w') as fout:
    while count < 330:
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(backtranslations_ambiguous_words_reduced[count]), end='\n', file=fout)
        count += 1

In [149]:
import re

# List with disambiguated backtranslations
backtranslations_disambiguated = []
with open('hyp_disambiguated_back.txt', 'r') as fin:
    for line in fin:
        backtranslations_disambiguated.append(line.strip())
             
lineNumber = 0
counter = 0
indices = []
with open('disambiguated_translation-back_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        positions = positions_ambiguous_words_disambiguated[counter] # exact positions of ambiguous words
        list_indices = []
        for position in positions:
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                list_indices.extend([int(index) for index in re.findall(regex, line)])
            else:
                list_indices.extend([999])
        indices.append(list_indices)
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
backtranslated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations_disambiguated:
    tokens = backtranslation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            backtranslated_ambiguous_words.add(tokens[ind])
    lineNumber += 1
    if (lineNumber % 10 == 0):
            backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()
    
#print(backtranslations_ambiguous_words)
print(len(backtranslations_ambiguous_words))

# Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
backtranslations_ambiguous_words_reduced = []
backtranslated_ambiguous_words = set() # set forces uniqueness
counter = 0
for set_words in backtranslations_ambiguous_words:
    backtranslated_ambiguous_words.update(set_words)
    counter += 1
    if (counter % 10 == 0):
        backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
        backtranslated_ambiguous_words = set()

print(len(backtranslations_ambiguous_words_reduced))   
    
unique_backtranslations = 0
for set_words in backtranslations_ambiguous_words_reduced:
    unique_backtranslations += len(set_words)
print(unique_backtranslations/330)

3300
330
7.463636363636364


In [150]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0                
with open('unique-words_backtranslations_disambiguated.txt', 'w') as fout:
    while count < 330:
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(backtranslations_ambiguous_words_reduced[count]), end='\n', file=fout)
        count += 1

# Word alignement (translation-translation)

## Tercom alignement (borrowed from Tu)
- https://github.com/TuAnh23/Perturbation-basedQE/blob/master/align_and_analyse_ambiguous_trans.py#L54-L92

### Extract target translated words to source words in original

In [151]:
# List with original source sentences; output 100 times to match backtranslation size
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        for i in range(100): # append the source sentence 100 times to match backtranslations later
            source.append(line.strip().split()) # split() tokenizes the sentence, because tercom expects tokens     

print(len(source))

# List with original backtranslations
backtranslations = []
with open('hyp_original_back.txt', 'r') as fin:
    for line in fin:
        backtranslations.append(line.strip().split())
        
print(len(backtranslations))

33000
33000


In [352]:
!git clone https://github.com/TuAnh23/Perturbation-basedQE.git

fatal: destination path 'Perturbation-basedQE' already exists and is not an empty directory.


In [157]:
%cd $TERCOM

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE


In [158]:
import align_and_analyse_ambiguous_trans as tercom

alignments = tercom.tercom_alignment(source, backtranslations)

In [159]:
import pandas as pd

#print(positions_ambiguous_words)

lineNumber = 0
counter = 0
indices = []
for align in alignments:
    if (lineNumber == 100):
        lineNumber = 0
        counter += 1
    position = positions_ambiguous_words[counter] # exact position of ambiguous word
    indices.append([item[1] for item in (item for item in align if not(pd.isna(item[0]))) if item[0] == position][0])
    lineNumber += 1

print(len(indices))

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations:
    backtranslation_index = backtranslations.index(backtranslation)
    if not(pd.isna(indices[backtranslation_index])):
        translated_ambiguous_words.add(backtranslation[indices[backtranslation_index]])
    lineNumber += 1
    if (lineNumber % 100 == 0):
        translations_ambiguous_words.append(translated_ambiguous_words)
        translated_ambiguous_words = set()

    
#print(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/330)

33000
330
6.1030303030303035


### Extract target translated words to source words in disambiguated

In [161]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_female


In [162]:
# List with original source sentences; output 100 times to match backtranslation size
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        for i in range(100): # append the source sentence 100 times to match backtranslations later
            source.append(line.strip().split()) # split() tokenizes the sentence, because tercom expects tokens     

print(len(source))

# List with original backtranslations
backtranslations = []
with open('hyp_disambiguated_back.txt', 'r') as fin:
    for line in fin:
        backtranslations.append(line.strip().split())
        
print(len(backtranslations))

33000
33000


In [164]:
%cd $TERCOM

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE


In [165]:
import align_and_analyse_ambiguous_trans as tercom

alignments = tercom.tercom_alignment(source, backtranslations)

In [166]:
import pandas as pd

#print(positions_ambiguous_words)

lineNumber = 0
counter = 0
indices = []
for align in alignments:
    if (lineNumber == 100):
        lineNumber = 0
        counter += 1
    position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; skip gender word
    indices.append([item[1] for item in (item for item in align if not(pd.isna(item[0]))) if item[0] == position][0])
    lineNumber += 1

print(len(indices))

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for backtranslation in backtranslations:
    backtranslation_index = backtranslations.index(backtranslation)
    if not(pd.isna(indices[backtranslation_index])):
        translated_ambiguous_words.add(backtranslation[indices[backtranslation_index]])
    lineNumber += 1
    if (lineNumber % 100 == 0):
        translations_ambiguous_words.append(translated_ambiguous_words)
        translated_ambiguous_words = set()

    
#(translations_ambiguous_words)
print(len(translations_ambiguous_words))

unique_translations = 0
for set_words in translations_ambiguous_words:
    unique_translations += len(set_words)
print(unique_translations/330)

33000
330
7.3


# Word occurrence

## Translation

In [170]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_female


In [171]:
def extract_alignment_indices_translation(filename_translations):
    """
    Extract alignment indices
    """
    
    # Extract alignement indices from translation
    indices_translation = []
    with open(filename_translations, 'r') as alignments:
        for line in alignments:
            alignement_tokens = line.split()
            indices_line = []
            for i in range(0, len(alignement_tokens)):    
                regex = r"" + str(i) + r"-(\d)"
                if re.findall(regex, line): 
                    indices_line.append([int(index) for index in re.findall(regex, line)])
                else:
                    indices_line.append([999])
            indices_translation.append(indices_line)
    
    return indices_translation

In [172]:
indices_original = extract_alignment_indices_translation('original_source-target_en-de_awesome-aligned.txt')
indices_disambiguated = extract_alignment_indices_translation('disambiguated_source-target_en-de_awesome-aligned.txt')

In [173]:
def extract_word_translations(filename_tokenized, filename_translations, filename_out, indices_translation):
    """
    Match alignment indices from translation to backtranslation
    """
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    translations = []
    with open(filename_translations, 'r') as fin:
         for line in fin:
                translations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 330
    counter = 0
    for i in range(0, 330): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 9):
                alignments = indices_translation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(translations[counter + f])):
                                 words_set.add(translations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 10

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0                
    with open(filename_out, 'w') as fout:
        while count < 330:
            print(source[count] + ' | ' + str(target_words[count]), end='\n', file=fout)
            count += 1

In [174]:
def count_word_translations(filename_tokenized, filename_translations, filename_out, indices_translation):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    translations = []
    with open(filename_translations, 'r') as fin:
         for line in fin:
                translations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 330
    counter = 0
    for i in range(0, 330): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 9):
                alignments = indices_translation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(translations[counter + f])):
                                 words_set.add(translations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 10

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0                
    with open(filename_out, 'w') as fout:
        while count < 330:
            print(source[count] + ' | ' + str([len(target_set) for target_set in target_words[count]]), end='\n', file=fout)
            count += 1

In [175]:
extract_word_translations('tok.en_original.en', 'hyp_original.txt', 'translations_words_original.txt', indices_original)
count_word_translations('tok.en_original.en', 'hyp_original.txt', 'translations_words_original_occurrence.txt', indices_original)

extract_word_translations('tok.en_disambiguated.en', 'hyp_disambiguated.txt', 'translations_words_disambiguated.txt', indices_disambiguated)
count_word_translations('tok.en_disambiguated.en', 'hyp_disambiguated.txt', 'translations_words_disambiguated_occurrence.txt', indices_disambiguated)

## Backtranslation

In [176]:
def extract_alignment_indices_backtranslation(filename_translations, filename_backtranslations):
    """
    Extract alignment indices
    """
    
    # Extract alignement indices from translation
    indices_translation = []
    with open(filename_translations, 'r') as alignments:
        for line in alignments:
            alignement_tokens = line.split()
            indices_line = []
            for i in range(0, len(alignement_tokens)):    
                regex = r"" + str(i) + r"-(\d)"
                if re.findall(regex, line): 
                    indices_line.append([int(index) for index in re.findall(regex, line)])
                else:
                    indices_line.append([999])
            indices_translation.append(indices_line)
       
    # Match alignement indices from translation to backtranslation
    lineNumber = 0
    counter = 0
    indices_backtranslation = []
    with open(filename_backtranslations, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            alignement_tokens = line.split()
            indices_line = []
            for index_list in indices_translation[counter]:
                index_matches = []
                for index in index_list:
                    regex = r"" + str(index) + r"-(\d)"
                    if re.findall(regex, line): 
                        index_matches.extend([int(i) for i in re.findall(regex, line)])
                    else:
                        index_matches.extend([999])
                indices_line.append(index_matches)
            indices_backtranslation.append(indices_line)
            lineNumber += 1 
    return indices_backtranslation

In [177]:
indices_original = extract_alignment_indices_backtranslation('original_source-target_en-de_awesome-aligned.txt', 'original_translation-back_en-de_awesome-aligned.txt')
indices_disambiguated = extract_alignment_indices_backtranslation('disambiguated_source-target_en-de_awesome-aligned.txt', 'disambiguated_translation-back_en-de_awesome-aligned.txt')

In [178]:
def extract_word_backtranslations(filename_tokenized, filename_backtranslations, filename_out, indices_backtranslation):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    backtranslations = []
    with open(filename_backtranslations, 'r') as fin:
         for line in fin:
                backtranslations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 330
    counter = 0
    for i in range(0, 330): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 99):
                alignments = indices_backtranslation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(backtranslations[counter + f])):
                                 words_set.add(backtranslations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 100

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0                
    with open(filename_out, 'w') as fout:
        while count < 330:
            print(source[count] + ' | ' + str(target_words[count]), end='\n', file=fout)
            count += 1

In [179]:
def count_word_backtranslations(filename_tokenized, filename_backtranslations, filename_out, indices_backtranslation):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    backtranslations = []
    with open(filename_backtranslations, 'r') as fin:
         for line in fin:
                backtranslations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 330
    counter = 0
    for i in range(0, 330): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 99):
                alignments = indices_backtranslation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(backtranslations[counter + f])):
                                 words_set.add(backtranslations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 100

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0                
    with open(filename_out, 'w') as fout:
        while count < 330:
            print(source[count] + ' | ' + str([len(target_set) for target_set in target_words[count]]), end='\n', file=fout)
            count += 1

In [180]:
extract_word_backtranslations('tok.en_original.en', 'hyp_original_back.txt', 'backtranslations_words_original.txt', indices_original)
count_word_backtranslations('tok.en_original.en', 'hyp_original_back.txt', 'backtranslations_words_original_occurrence.txt', indices_original)

extract_word_backtranslations('tok.en_disambiguated.en', 'hyp_disambiguated_back.txt', 'backtranslations_words_disambiguated.txt', indices_disambiguated)
count_word_backtranslations('tok.en_disambiguated.en', 'hyp_disambiguated_back.txt', 'backtranslations_words_disambiguated_occurrence.txt', indices_disambiguated)

# Gender statistics

In [181]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_female


In [182]:
from enum import Enum

class GENDER(Enum):
    """
    Enumerate possible genders.
    Ignore option resolves to words that should be ignored in particular language
    """
    male = 0
    female = 1
    neutral = 2
    unknown = 3
    ignore = 4
    
DE_DETERMINERS = {"der": GENDER.male, "ein": GENDER.male, "dem": GENDER.male, #"den": GENDER.male, 
                  "einen": GENDER.male, "des": GENDER.male, "er": GENDER.male, "seiner": GENDER.male,
                  "ihn": GENDER.male, "seinen": GENDER.male, "ihm": GENDER.male, "ihren": GENDER.male,
                  "die": GENDER.female, "eine": GENDER.female, "einer": GENDER.female, "seinem": GENDER.male,
                  "ihrem": GENDER.male, "sein": GENDER.male,
                  "sie": GENDER.female, "seine": GENDER.female, "ihrer": GENDER.female, 
                  "ihr": GENDER.neutral, "ihre": GENDER.neutral, "das": GENDER.neutral,
                  "jemanden": GENDER.neutral}

def get_german_determiners(words):
    """
    Get a list of (gender)
    given a list of words.
    """
    determiners = []
    for (word_ind, word) in enumerate(words):
        word = word.lower()
        if word in DE_DETERMINERS:
            determiners.append((DE_DETERMINERS[word].name))
    return determiners

In [183]:
dets = get_german_determiners(["dem"])
print(dets)

['male']


- Extract positions of ambiguos words

In [184]:
# List with source words
words = set() # set forces uniqueness
with open('words.txt', 'r') as fin:
    for line in fin:
        words.add(line.strip())
        
ambiguous_words = set() # set forces uniqueness
positions_ambiguous_words = []

with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.add(token)
                position = tokens.index(token)
                positions_ambiguous_words.append(position)
                break
        
print(ambiguous_words)
print(len(ambiguous_words))
print(positions_ambiguous_words)
print(len(positions_ambiguous_words))

{'driver', 'attendant', 'cook', 'laborer', 'accountant', 'housekeeper', 'assistant', 'developer', 'writer', 'librarian', 'analyst', 'editor', 'manager', 'nurse', 'customer', 'programmer', 'advisor', 'supervisor', 'sheriff', 'painter', 'specialist', 'counselor', 'bartender', 'carpenter', 'practitioner', 'mechanic', 'dietitian', 'salesperson', 'farmer', 'tailor', 'secretary', 'broker', 'baker', 'clerk', 'mover', 'firefighter', 'auditor', 'patient', 'receptionist', 'CEO', 'teenager', 'therapist', 'janitor', 'chief', 'physician', 'scientist', 'lawyer', 'guard', 'undergraduate', 'worker', 'hairdresser', 'examiner', 'cleaner'}
53
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [185]:
# Extract target translated words to source words in original

import re

# List with original translations
translations_original = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        translations_original.append(line.strip())

              
lineNumber = 0
counter = 0
indices = [] # a list of lists of indices of translated words for each ambiguous word
with open('original_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            indices.append([int(index) for index in re.findall(regex, line)])
        else:
            indices.append([999])
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_original:
    if (lineNumber == 10):
            lineNumber = 0
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    tokens = translation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            translated_ambiguous_words.add(tokens[0]) # extract articles; currently assume index 0 for article position, TODO
    lineNumber += 1
translations_ambiguous_words.append(translated_ambiguous_words) # last lines

In [186]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0  
genders = []
male = []
female = []
with open('unique-words_translations_original_articles.txt', 'w') as fout:
    while count < 330:
        #print(translations_ambiguous_words[count])
        genders.append(set(get_german_determiners(translations_ambiguous_words[count])))
        male.append("male" in get_german_determiners(translations_ambiguous_words[count]))
        female.append("female" in get_german_determiners(translations_ambiguous_words[count]))
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(get_german_determiners(translations_ambiguous_words[count])), end='\n', file=fout)
        count += 1

In [187]:
print(genders)
print(sum(1 for i in genders if ('male' in i and 'female' in i))) # both genders

print(male)
print(male.count(True)) # only male gender

print(female)
print(female.count(True)) # only female gender

[{'male'}, {'male'}, {'male'}, {'female', 'male'}, {'female', 'male'}, {'female', 'male'}, {'female', 'male'}, {'female', 'male'}, {'male'}, {'female', 'male'}, {'female', 'male'}, {'female', 'male'}, {'female', 'male'}, {'male'}, {'male'}, {'male'}, {'female', 'male'}, {'male'}, {'male'}, {'male'}, {'female', 'male'}, {'male'}, {'male'}, {'female', 'male'}, {'female', 'male'}, {'female', 'male'}, {'male'}, {'male'}, {'female', 'male'}, {'male'}, {'female', 'male'}, {'male'}, {'female', 'male'}, {'male'}, {'female', 'male'}, {'male'}, {'male'}, {'female', 'male'}, {'male'}, {'female', 'male'}, {'male'}, {'male'}, {'male'}, {'female', 'male'}, {'male'}, {'male'}, {'female', 'male'}, {'female', 'male'}, {'male'}, {'female'}, {'male'}, {'male'}, {'male'}, {'female', 'male'}, {'female', 'male'}, {'female', 'male'}, {'male'}, {'male'}, {'male'}, {'male'}, {'male'}, {'female', 'male'}, {'female', 'male'}, {'male'}, {'male'}, {'female', 'male'}, {'male'}, {'female', 'male'}, {'female', 'male'

In [188]:
# Extract target translated words to source words in disambiguated

import re

# List with original translations
translations_disambiguated = []
with open('hyp_disambiguated.txt', 'r') as fin:
    for line in fin:
        translations_disambiguated.append(line.strip())

              
lineNumber = 0
counter = 0
indices = [] # a list of lists of indices of translated words for each ambiguous word
with open('disambiguated_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        position = positions_ambiguous_words[counter] + 1 # exact position of ambiguous word; !!! add 1 because of gender word
        regex = r"" + str(position) + r"-(\d)"
        if re.findall(regex, line): 
            indices.append([int(index) for index in re.findall(regex, line)])
        else:
            indices.append([999])
        lineNumber += 1
        
#print(len(indices))
#print(indices)

lineNumber = 0
translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
translated_ambiguous_words = set() # set forces uniqueness
for translation in translations_disambiguated:
    if (lineNumber == 10):
            lineNumber = 0
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()
    tokens = translation.split(' ')
    if 999 not in indices[lineNumber]:
        for ind in indices[lineNumber]:
            translated_ambiguous_words.add(tokens[0]) # extract articles; currently assume index 0 for article position, TODO
    lineNumber += 1
translations_ambiguous_words.append(translated_ambiguous_words) # last lines

In [189]:
# Add results to file

# List with original source sentences
source = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
ambiguous_words = []
with open('tok.en_disambiguated.en', 'r') as fin:
    for line in fin:
        tokens = line.split(' ')
        for token in tokens:
            if token in words:
                ambiguous_words.append(token)
                break

count = 0    
genders = []
male = []
female = []
with open('unique-words_translations_disambiguated_articles.txt', 'w') as fout:
    while count < 330:
        #print(translations_ambiguous_words[count])
        #print(get_german_determiners(translations_ambiguous_words[count]))
        genders.append(set(get_german_determiners(translations_ambiguous_words[count])))
        male.append("male" in get_german_determiners(translations_ambiguous_words[count]))
        female.append("female" in get_german_determiners(translations_ambiguous_words[count]))
        print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(get_german_determiners(translations_ambiguous_words[count])), end='\n', file=fout)
        count += 1

In [190]:
print(genders)
print(sum(1 for i in genders if ('male' in i and 'female' in i))) # both genders

print(male)
print(male.count(True)) # only male gender

print(female)
print(female.count(True)) # only female gender

[{'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'male'}, {'female'}, {'female'}, {'female'}, {'female', 'male'}, {'female', 'male'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female', 'male'}, {'female'}, {'male'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female'}, {'female', 'male'}, {'female'},