In [4]:
import torch
import random
import numpy as np
import re

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_100"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool
TERCOM = "/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE"

In [5]:
# check if we can connect to the GPU with PyTorch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))
else:
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'

Current device: GeForce GTX 1080 Ti


In [6]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_100


# Translation English-German

In [8]:
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
NBEST = 100
BEAM = 100

In [12]:
# Generate translations
# Beam search
!fairseq-generate data-bin_original_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 8 \
    --memory-efficient-fp16 \
    --remove-bpe > original_en-de.decode_Beam_10.log

2023-09-19 12:00:16 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [13]:
!fairseq-generate data-bin_disambiguated_male_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 8 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_male_en-de.decode_Beam_10.log

2023-09-19 12:03:30 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [14]:
!fairseq-generate data-bin_disambiguated_female_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 8 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_female_en-de.decode_Beam_10.log

print('Finished translation.')

2023-09-19 12:06:47 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [15]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original.txt
!grep ^H disambiguated_male_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated_male.txt
!grep ^H disambiguated_female_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated_female.txt

# Backtranslation German-English

In [16]:
# Dividing tokenized text into subword units

!$FASTBPE applybpe bpe.hyp_original.de hyp_original.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_disambiguated_male.de hyp_disambiguated_male.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_disambiguated_female.de hyp_disambiguated_female.txt bpecodes.de


print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original.txt ...
Read 249165 words (3939 unique) from text file.
Applying BPE to hyp_original.txt ...
Modified 249165 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_disambiguated_male.txt ...
Read 269736 words (3573 unique) from text file.
Applying BPE to hyp_disambiguated_male.txt ...
Modified 269736 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_disambiguated_female.txt ...
Read 255036 words (3895 unique) from text file.
Applying BPE to hyp_disambiguated_female.txt ...
Modified 255036 words from text file.
Finished subword.


In [17]:
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_de-en \
    --workers 8

!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_disambiguated_male \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_disambiguated_male_de-en \
    --workers 8

!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_disambiguated_female \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_disambiguated_female_de-en \
    --workers 8

print('Finished preprocessing.')

2023-09-19 12:10:23 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', srcd

In [18]:
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"
NBEST = 100
BEAM = 100

In [19]:
# Generate backtranslations
!fairseq-generate data-bin_original_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 8 \
    --memory-efficient-fp16 \
    --remove-bpe > original_de-en.decode_Beam_10_backtranslation.log

2023-09-19 12:11:06 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [85]:
!fairseq-generate data-bin_disambiguated_male_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 8 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_male_de-en.decode_Beam_10_backtranslation.log

In [85]:
!fairseq-generate data-bin_disambiguated_female_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 8 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_female_de-en.decode_Beam_10_backtranslation.log

print('Finished translation.')

2023-08-21 11:34:56 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [86]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_back.txt
!grep ^H disambiguated_male_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated_male_back.txt
!grep ^H disambiguated_female_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated_female_back.txt

In [87]:
# Detokenize text        
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

md_en = MosesDetokenizer(lang='en')

with open('hyp_original_back.txt', encoding='utf8') as fin, open('original_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('hyp_disambiguated_male_back.txt', encoding='utf8') as fin, open('disambiguated_male_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('hyp_disambiguated_female_back.txt', encoding='utf8') as fin, open('disambiguated_female_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

In [134]:
# List with original source sentences
source = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
# List with disambiguated source sentences male
source_disambiguated_male = []
with open('en_disambiguated_male.txt', 'r') as fin:
    for line in fin:
        source_disambiguated_male.append(line.strip())
        
# List with disambiguated source sentences female
source_disambiguated_female = []
with open('en_disambiguated_female.txt', 'r') as fin:
    for line in fin:
        source_disambiguated_female.append(line.strip())        
    
# List with nbest sentences for every source in original
nbest_original = []
counter = 0
temp = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
# List with nbest sentences for every source in disambiguated male        
nbest_disambiguated_male = []
with open('hyp_disambiguated_male.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_disambiguated_male.append(temp)
            counter = 0
            temp = []
            
# List with nbest sentences for every source in disambiguated female        
nbest_disambiguated_female = []
with open('hyp_disambiguated_female.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_disambiguated_female.append(temp)
            counter = 0
            temp = []
            
print(len(source))
print(len(nbest_original))
print(len(nbest_disambiguated_male))
print(len(nbest_disambiguated_female))

335
335
335
335


## Count unique sentences

In [135]:
# Count unique sentences in source nbest list for each source sentence
def count_unique_sentences(nbest_sentences):
    unique_sent = []
    for source_nbest in nbest_sentences:
        num_values = len(set(source_nbest))
        #print(num_values)
        unique_sent.append(num_values)

    #print(unique_sent)
    return sum(unique_sent)/len(nbest_sentences) # average

In [136]:
# Value should be 10, because beam search generates 10 unique sentences
print(count_unique_sentences(nbest_original))

print(count_unique_sentences(nbest_disambiguated_male))

print(count_unique_sentences(nbest_disambiguated_female))

9.943283582089553
9.949253731343283
9.874626865671642


## Count unique words

In [137]:
# Count unique words in source nbest list for each source sentence of original
# !!! Method is slow
import spacy

def count_unique_words(nbest_sentences):
    sp = spacy.load('en_core_web_sm')
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    
    unique_words = []
    normalizer = 0 # should normalize based on total number of words, because disambiguated sentences have more words overall
    counter = 0
    for source_nbest in nbest_sentences:
        words = set()
        for sent in source_nbest:
            tokens = sp(sent)
            normalizer += len(tokens)
            for token in tokens:
                # if token.text not in stopwords:    # checking whether the word is a stop word
                    words.add(token.text)
        num_values = len(words)
        unique_words.append(num_values)

        counter += 1
        #print(counter)

    #print(unique_words)
    print('Normalizer: ' + str(normalizer/len(nbest_sentences)))
    return (sum(unique_words)/len(nbest_sentences), (sum(unique_words)/len(nbest_sentences))/(normalizer/len(nbest_sentences))) # (average, norm average)

In [138]:
print(count_unique_words(nbest_original))

print(count_unique_words(nbest_disambiguated_male))

print(count_unique_words(nbest_disambiguated_female))

Normalizer: 74.17611940298508
(15.223880597014926, 0.20523964747072315)
Normalizer: 81.2
(15.349253731343284, 0.1890302183662966)
Normalizer: 75.74029850746268
(15.274626865671642, 0.20167106767035828)


# Statistics on backtranslations

In [139]:
# List with original source sentences
source = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
# List with disambiguated source sentences male
source_disambiguated_male = []
with open('en_disambiguated_male.txt', 'r') as fin:
    for line in fin:
        source_disambiguated_male.append(line.strip())
        
# List with disambiguated source sentences female
source_disambiguated_female = []
with open('en_disambiguated_female.txt', 'r') as fin:
    for line in fin:
        source_disambiguated_female.append(line.strip())        
    
# List with nbest sentences for every source in original 
nbest_original = []
counter = 0
temp = []
with open('original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
# List with nbest sentences for every source in disambiguated male
nbest_disambiguated_male = []
with open('disambiguated_male_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_disambiguated_male.append(temp)
            counter = 0
            temp = []
            
# List with nbest sentences for every source in disambiguated female
nbest_disambiguated_female = []
with open('disambiguated_female_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_disambiguated_female.append(temp)
            counter = 0
            temp = []
            
print(len(nbest_original))
print(len(nbest_disambiguated_male))
print(len(nbest_disambiguated_female))

335
335
335


## Source sentence reoccurrence

In [94]:
# Count how many of the source sentences reoccur in the backtranslation
def count_sentence_reoccurrence(source_sentences, nbest_sentences):
    results = []
    counter = 0
    for sent in source_sentences:
        matches = 0
        for target in nbest_sentences[counter]: 
            if (sent == target):
                matches += 1
        results.append(matches)  
        counter += 1

    return sum(x > 0 for x in results)

In [95]:
print(count_sentence_reoccurrence(source, nbest_original))

print(count_sentence_reoccurrence(source_disambiguated_male, nbest_disambiguated_male))

print(count_sentence_reoccurrence(source_disambiguated_female, nbest_disambiguated_female))

295
293
118


## Ambiguous source words reoccurrence


In [96]:
# Extract ambiguous words from source sentences
ambiguous_words = [] 
with open('en_original.txt', 'r') as fin:
    for line in fin:
        tokens = line.split(" ")
        ambiguous_words.append(tokens[1].replace('\n', ''))
        
print(ambiguous_words)
print(len(ambiguous_words))

['developer', 'mechanic', 'mover', 'assistant', 'chief', 'salesperson', 'lawyer', 'cook', 'mover', 'farmer', 'CEO', 'hairdresser', 'developer', 'driver', 'auditor', 'CEO', 'guard', 'assistant', 'assistant', 'auditor', 'salesperson', 'mechanic', 'manager', 'physician', 'laborer', 'physician', 'hairdresser', 'developer', 'farmer', 'receptionist', 'manager', 'cleaner', 'mechanic', 'writer', 'construction', 'editor', 'analyst', 'carpenter', 'cook', 'carpenter', 'cleaner', 'laborer', 'mechanic', 'mechanic', 'cook', 'farmer', 'CEO', 'librarian', 'chief', 'developer', 'nurse', 'lawyer', 'developer', 'mover', 'mover', 'construction', 'secretary', 'CEO', 'carpenter', 'sheriff', 'mechanic', 'analyst', 'clerk', 'assistant', 'chief', 'janitor', 'manager', 'supervisor', 'chief', 'construction', 'salesperson', 'lawyer', 'developer', 'sheriff', 'janitor', 'laborer', 'driver', 'mover', 'developer', 'janitor', 'salesperson', 'chief', 'laborer', 'guard', 'nurse', 'construction', 'laborer', 'lawyer', 'CE

In [97]:
# Count how many of the ambiguous words reoccur in the backtranslation
def count_words_reoccurrence(ambiguous_words, nbest_sentences):
    results = []
    counter = 0
    for word in ambiguous_words:
        matches = 0
        for target in nbest_sentences[counter]: 
            if (word in target.split(" ")):
                matches += 1
        results.append(matches)  
        counter += 1

    return sum(x > 0 for x in results)

In [98]:
print(count_words_reoccurrence(ambiguous_words, nbest_original))

print(count_words_reoccurrence(ambiguous_words, nbest_disambiguated_male))

print(count_words_reoccurrence(ambiguous_words, nbest_disambiguated_female))

329
330
314


## Count unique sentences

In [140]:
print(count_unique_sentences(nbest_original))

print(count_unique_sentences(nbest_disambiguated_male))

print(count_unique_sentences(nbest_disambiguated_female))

45.97910447761194
50.72835820895522
50.82089552238806


## Count unique words

In [141]:
print(count_unique_words(nbest_original))

print(count_unique_words(nbest_disambiguated_male))

print(count_unique_words(nbest_disambiguated_female))

Normalizer: 730.8298507462687
(32.34925373134328, 0.04426372800496675)
Normalizer: 803.5910447761194
(31.623880597014924, 0.0393532018588203)
Normalizer: 752.0268656716418
(32.017910447761196, 0.0425754875381556)


# Word alignement (source-translation)

- Input to fast_align must be tokenized and aligned into parallel sentences. 
- Line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||)

In [101]:
def build_alignment_input(sentencesN, sourceIn, targetIn, output):
    # List with original source sentences
    source = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    # List with nbest sentences for every source in original 
    target = []
    counter = 0
    temp = []
    with open(targetIn, 'r') as fin:
        for line in fin:
            temp.append(line.strip())
            counter += 1
            if (counter == 10):
                target.append(temp)
                counter = 0
                temp = []

    #print(len(source))
    #print(len(target))           

    count = 0
    with open(output, 'w') as fout:
        while count < sentencesN:
            for hyp in target[count]:
                print(source[count] + ' ||| ' + hyp, end='\n', file=fout)
            count += 1

In [102]:
build_alignment_input(335, 'tok.en_original.en', 'hyp_original.txt', 'original_source-target_en-de.txt')
build_alignment_input(335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male.txt', 'disambiguated_male_source-target_en-de.txt')
build_alignment_input(335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female.txt', 'disambiguated_female_source-target_en-de.txt')

## fast_align

In [None]:
!$FAST_ALIGN -i original_source-target_en-de.txt -d -o -v > original_source-target_en-de_fast-aligned.txt
!$FAST_ALIGN -i disambiguated_male_source-target_en-de.txt -d -o -v > disambiguated_male_source-target_en-de_fast-aligned.txt
!$FAST_ALIGN -i disambiguated_female_source-target_en-de.txt -d -o -v > disambiguated_female_source-target_en-de_fast-aligned.txt

print("Finished alignment.")

In [22]:
import re

# Count unique translated words to the ambiguous words in translations per source sentence
def count_unique_words_alignment_translations(position, sentencesN, sourceIn, translationsIn, alignmentsIn, output):
    
    # Get positions of ambigous words
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
        
    # List with translations
    translations = []
    with open(translationsIn, 'r') as fin:
        for line in fin:
            translations.append(line.strip())
            
            
    
    # Extract alginments of ambiguous words
    lineNumber = 0
    counter = 0
    indices = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                indices.append([int(index) for index in re.findall(regex, line)])
            else:
                indices.append([999])
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = set() # set forces uniqueness
    for translation in translations:
        tokens = translation.split(' ')
        if 999 not in indices[lineNumber]:
            for ind in indices[lineNumber]:
                #print(lineNumber)
                #print(tokens[ind])
                #print(ind)
                translated_ambiguous_words.add(tokens[ind])
        lineNumber += 1
        if (lineNumber % 10 == 0):
                translations_ambiguous_words.append(translated_ambiguous_words)
                translated_ambiguous_words = set()

    #print(translations_ambiguous_words)
    #print(len(translations_ambiguous_words))
    
    # Add results to file
    ambiguous_words = []
    source = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0                
    with open(output, 'w') as fout:
        while count < sentencesN:
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(translations_ambiguous_words[count]), end='\n', file=fout)
            count += 1

    unique_translations = 0
    for set_words in translations_ambiguous_words:
        
        ############################################################
        # remove gender info; removing "in" and "e" endings in words
        set_words_new = set()
        for word in set_words:
            word_new = re.sub("in$|e$", "", word)
            #print(word_new)
            set_words_new.add(word_new)
        #print(set_words_new)
        ############################################################
        
        unique_translations += len(set_words_new)
        
    #print(unique_translations)
    return unique_translations/sentencesN # average

In [23]:
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original.en', 'hyp_original.txt', 'original_source-target_en-de_fast-aligned.txt', 'unique-words_translations_original.txt'))
print('======')
print(count_unique_words_alignment_translations(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male.txt', 'disambiguated_male_source-target_en-de_fast-aligned.txt', 'unique-words_translations_disambiguated_male.txt')) # positions is 2 because of gender word
print('======')
print(count_unique_words_alignment_translations(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female.txt', 'disambiguated_female_source-target_en-de_fast-aligned.txt', 'unique-words_translations_disambiguated_female.txt'))

2.656716417910448
2.635820895522388
2.808955223880597


In [24]:
import re

# Count unique translated words to the ambiguous words in translations per source sentence
def count_unique_words_alignment_translations_TEST(position, sentencesN, sourceIn, translationsIn, alignmentsIn, output):
    
    # Get positions of ambigous words
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
        
    # List with translations
    translations = []
    with open(translationsIn, 'r') as fin:
        for line in fin:
            translations.append(line.strip())
            
            
    
    # Extract alginments of ambiguous words
    lineNumber = 0
    counter = 0
    indices = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                indices.append([int(index) for index in re.findall(regex, line)])
            else:
                indices.append([999])
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = set() # set forces uniqueness
    for translation in translations:
        tokens = translation.split(' ')
        if 999 not in indices[lineNumber]:
            translated_ambiguous_words.add(' '.join([tokens[ind] for ind in indices[lineNumber]])) # join multiple aligned words
        lineNumber += 1
        if (lineNumber % 10 == 0):
                translations_ambiguous_words.append(translated_ambiguous_words)
                translated_ambiguous_words = set()

    #print(translations_ambiguous_words)
    #print(len(translations_ambiguous_words))
    
    # Add results to file
    ambiguous_words = []
    source = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0                
    with open(output, 'w') as fout:
        while count < sentencesN:
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(translations_ambiguous_words[count]), end='\n', file=fout)
            count += 1

    unique_translations = 0
    for set_words in translations_ambiguous_words:
        
        ############################################################
        # remove gender info; removing "in" and "e" endings in words
        set_words_new = set()
        for word in set_words:
            word_new = re.sub("in$|e$", "", word)
            #print(word_new)
            set_words_new.add(word_new)
        #print(set_words_new)
        ############################################################
        
        unique_translations += len(set_words_new)
        
    #print(unique_translations)
    return unique_translations/sentencesN # average

In [25]:
print(count_unique_words_alignment_translations_TEST(1, 335, 'tok.en_original.en', 'hyp_original.txt', 'original_source-target_en-de_fast-aligned.txt', 'unique-words_translations_original_TEST.txt'))
print('======')
print(count_unique_words_alignment_translations_TEST(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male.txt', 'disambiguated_male_source-target_en-de_fast-aligned.txt', 'unique-words_translations_disambiguated_male_TEST.txt')) # positions is 2 because of gender word
print('======')
print(count_unique_words_alignment_translations_TEST(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female.txt', 'disambiguated_female_source-target_en-de_fast-aligned.txt', 'unique-words_translations_disambiguated_female_TEST.txt'))

2.6
2.6029850746268655
2.773134328358209


## awesome_align

In [106]:
# ??? How to set model correctly
# MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
!awesome-align \
    --output_file "original_source-target_en-de_awesome-aligned.txt" \
    --data_file "original_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "disambiguated_male_source-target_en-de_awesome-aligned.txt" \
    --data_file "disambiguated_male_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "disambiguated_female_source-target_en-de_awesome-aligned.txt" \
    --data_file "disambiguated_female_source-target_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

print("Finished alignment.")

Loading the dataset...
Extracting: 3350it [00:04, 721.46it/s]
Loading the dataset...
Extracting: 3350it [00:04, 686.67it/s]
Loading the dataset...
Extracting: 3350it [00:05, 659.90it/s]
Finished alignment.


In [26]:
print(count_unique_words_alignment_translations(1, 335, 'tok.en_original.en', 'hyp_original.txt', 'original_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_original.txt'))
print('======')
print(count_unique_words_alignment_translations(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male.txt', 'disambiguated_male_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_disambiguated_male.txt')) # positions is 2 because of gender word
print('======')
print(count_unique_words_alignment_translations(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female.txt', 'disambiguated_female_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_disambiguated_female.txt'))

2.382089552238806
2.426865671641791
2.334328358208955


In [27]:
print(count_unique_words_alignment_translations_TEST(1, 335, 'tok.en_original.en', 'hyp_original.txt', 'original_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_original_TEST.txt'))
print('======')
print(count_unique_words_alignment_translations_TEST(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male.txt', 'disambiguated_male_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_disambiguated_male_TEST.txt')) # positions is 2 because of gender word
print('======')
print(count_unique_words_alignment_translations_TEST(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female.txt', 'disambiguated_female_source-target_en-de_awesome-aligned.txt', 'unique-words_translations_disambiguated_female_TEST.txt'))

2.388059701492537
2.4238805970149255
2.334328358208955


# Word alignement (translation-backtranslation)

## fast_align

- Input to fast_align must be tokenized and aligned into parallel sentences. 
- Line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||)

In [108]:
build_alignment_input(3350, 'hyp_original.txt', 'hyp_original_back.txt', 'original_translation-back_en-de.txt')
build_alignment_input(3350, 'hyp_disambiguated_male.txt', 'hyp_disambiguated_male_back.txt', 'disambiguated_male_translation-back_en-de.txt')
build_alignment_input(3350, 'hyp_disambiguated_female.txt', 'hyp_disambiguated_female_back.txt', 'disambiguated_female_translation-back_en-de.txt')

- Word alignement

In [None]:
!$FAST_ALIGN -i original_translation-back_en-de.txt -d -o -v > original_translation-back_en-de_fast-aligned.txt
!$FAST_ALIGN -i disambiguated_male_translation-back_en-de.txt -d -o -v > disambiguated_male_translation-back_en-de_fast-aligned.txt
!$FAST_ALIGN -i disambiguated_female_translation-back_en-de.txt -d -o -v > disambiguated_female_translation-back_en-de_fast-aligned.txt

print("Finished alignment.")

- Extract target backtranslated words

In [34]:
import re

# Count unique translated words to the ambiguous words in backtranslations per source sentence
def count_unique_words_alignment_backtranslations(position, sentencesN, sourceIn, backtranslationsIn, alignmentsIn_translation, alignmentsIn_backtranslation, output):
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
       
    lineNumber = 0
    counter = 0
    positions_ambiguous_words_translations = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn_translation, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                positions_ambiguous_words_translations.append([int(index) for index in re.findall(regex, line)])
            else:
                positions_ambiguous_words_translations.append([999])
            lineNumber += 1
    
    # List with backtranslations
    backtranslations = []
    with open(backtranslationsIn, 'r') as fin:
        for line in fin:
            backtranslations.append(line.strip())

    lineNumber = 0
    counter = 0
    indices = []
    with open(alignmentsIn_backtranslation, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            positions = positions_ambiguous_words_translations[counter] # exact positions of ambiguous words
            list_indices = []
            for position in positions:
                regex = r"" + str(position) + r"-(\d)"
                if re.findall(regex, line): 
                    list_indices.extend([int(index) for index in re.findall(regex, line)])
                else:
                    list_indices.extend([999])
            indices.append(list_indices)
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    backtranslated_ambiguous_words = set() # set forces uniqueness
    for backtranslation in backtranslations:
        tokens = backtranslation.split(' ')
        if 999 not in indices[lineNumber]:
            for ind in indices[lineNumber]:
                #print(lineNumber)
                #print(tokens[ind])
                #print(ind)
                backtranslated_ambiguous_words.add(tokens[ind])
        lineNumber += 1
        if (lineNumber % 10 == 0):
                backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
                backtranslated_ambiguous_words = set()



    #print(backtranslations_ambiguous_words)
    print(len(backtranslations_ambiguous_words))

    # Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
    backtranslations_ambiguous_words_reduced = []
    backtranslated_ambiguous_words = set() # set forces uniqueness
    counter = 0
    for set_words in backtranslations_ambiguous_words:
        backtranslated_ambiguous_words.update(set_words)
        counter += 1
        if (counter % 10 == 0):
            backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()

    print(len(backtranslations_ambiguous_words_reduced)) 
    
    # Add results to file

    ambiguous_words = []
    source = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0                
    with open(output, 'w') as fout:
        while count < sentencesN:
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(backtranslations_ambiguous_words_reduced[count]), end='\n', file=fout)
            count += 1

    unique_backtranslations = 0
    for set_words in backtranslations_ambiguous_words_reduced:
        unique_backtranslations += len(set_words)
        
    return unique_backtranslations/sentencesN

In [35]:
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original.en', 'hyp_original_back.txt', 'original_source-target_en-de_fast-aligned.txt', 'original_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_original.txt'))
print(count_unique_words_alignment_backtranslations(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male_back.txt', 'disambiguated_male_source-target_en-de_fast-aligned.txt', 'disambiguated_male_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_disambiguated_male.txt')) # positions is 2 because of gender word
print(count_unique_words_alignment_backtranslations(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female_back.txt', 'disambiguated_female_source-target_en-de_fast-aligned.txt', 'disambiguated_female_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_disambiguated_female.txt'))

3350
335
8.844776119402985
3350
335
8.325373134328359
3350
335
9.794029850746268


In [36]:
import re

# Count unique translated words to the ambiguous words in backtranslations per source sentence
def count_unique_words_alignment_backtranslations_TEST(position, sentencesN, sourceIn, backtranslationsIn, alignmentsIn_translation, alignmentsIn_backtranslation, output):
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
       
    lineNumber = 0
    counter = 0
    positions_ambiguous_words_translations = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn_translation, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                positions_ambiguous_words_translations.append([int(index) for index in re.findall(regex, line)])
            else:
                positions_ambiguous_words_translations.append([999])
            lineNumber += 1
    
    # List with backtranslations
    backtranslations = []
    with open(backtranslationsIn, 'r') as fin:
        for line in fin:
            backtranslations.append(line.strip())

    lineNumber = 0
    counter = 0
    indices = []
    with open(alignmentsIn_backtranslation, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            positions = positions_ambiguous_words_translations[counter] # exact positions of ambiguous words
            list_indices = []
            for position in positions:
                regex = r"" + str(position) + r"-(\d)"
                if re.findall(regex, line): 
                    list_indices.extend([int(index) for index in re.findall(regex, line)])
                else:
                    list_indices.extend([999])
            indices.append(list_indices)
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    backtranslations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    backtranslated_ambiguous_words = set() # set forces uniqueness
    for backtranslation in backtranslations:
        tokens = backtranslation.split(' ')
        if 999 not in indices[lineNumber]:
            backtranslated_ambiguous_words.add(' '.join([tokens[ind] for ind in indices[lineNumber]])) # join multiple aligned words
        lineNumber += 1
        if (lineNumber % 10 == 0):
                backtranslations_ambiguous_words.append(backtranslated_ambiguous_words)
                backtranslated_ambiguous_words = set()



    #print(backtranslations_ambiguous_words)
    print(len(backtranslations_ambiguous_words))

    # Here we need to merge the sets for every 10 sets, because we want to see unique words in the nbest 100 backtranslation
    backtranslations_ambiguous_words_reduced = []
    backtranslated_ambiguous_words = set() # set forces uniqueness
    counter = 0
    for set_words in backtranslations_ambiguous_words:
        backtranslated_ambiguous_words.update(set_words)
        counter += 1
        if (counter % 10 == 0):
            backtranslations_ambiguous_words_reduced.append(backtranslated_ambiguous_words)
            backtranslated_ambiguous_words = set()

    print(len(backtranslations_ambiguous_words_reduced)) 
    
    # Add results to file

    ambiguous_words = []
    source = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0                
    with open(output, 'w') as fout:
        while count < sentencesN:
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(backtranslations_ambiguous_words_reduced[count]), end='\n', file=fout)
            count += 1

    unique_backtranslations = 0
    for set_words in backtranslations_ambiguous_words_reduced:
        unique_backtranslations += len(set_words)
        
    return unique_backtranslations/sentencesN

In [37]:
print(count_unique_words_alignment_backtranslations_TEST(1, 335, 'tok.en_original.en', 'hyp_original_back.txt', 'original_source-target_en-de_fast-aligned.txt', 'original_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_original_TEST.txt'))
print(count_unique_words_alignment_backtranslations_TEST(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male_back.txt', 'disambiguated_male_source-target_en-de_fast-aligned.txt', 'disambiguated_male_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_disambiguated_male_TEST.txt')) # positions is 2 because of gender word
print(count_unique_words_alignment_backtranslations_TEST(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female_back.txt', 'disambiguated_female_source-target_en-de_fast-aligned.txt', 'disambiguated_female_translation-back_en-de_fast-aligned.txt', 'unique-words_backtranslations_disambiguated_female_TEST.txt'))

3350
335
9.829850746268656
3350
335
9.322388059701492
3350
335
11.623880597014926


## awesome_align

- Extract the position of the translated ambiguous word from each sentence

- Word alignement

In [112]:
# ??? How to set model correctly
# MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
!awesome-align \
    --output_file "original_translation-back_en-de_awesome-aligned.txt" \
    --data_file "original_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "disambiguated_male_translation-back_en-de_awesome-aligned.txt" \
    --data_file "disambiguated_male_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

!awesome-align \
    --output_file "disambiguated_female_translation-back_en-de_awesome-aligned.txt" \
    --data_file "disambiguated_female_translation-back_en-de.txt" \
    --model_name_or_path bert-base-multilingual-cased \
    --extraction 'softmax' \
    --batch_size 32

print("Finished alignment.")

Loading the dataset...
Extracting: 33500it [00:39, 848.57it/s]
Loading the dataset...
Extracting: 33500it [00:40, 818.23it/s]
Loading the dataset...
Extracting: 33500it [00:39, 846.62it/s]
Finished alignment.


- Extract target backtranslated words

In [38]:
print(count_unique_words_alignment_backtranslations(1, 335, 'tok.en_original.en', 'hyp_original_back.txt', 'original_source-target_en-de_awesome-aligned.txt', 'original_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_original.txt'))
print(count_unique_words_alignment_backtranslations(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male_back.txt', 'disambiguated_male_source-target_en-de_awesome-aligned.txt', 'disambiguated_male_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_disambiguated_male.txt')) # positions is 2 because of gender word
print(count_unique_words_alignment_backtranslations(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female_back.txt', 'disambiguated_female_source-target_en-de_awesome-aligned.txt', 'disambiguated_female_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_disambiguated_female.txt'))

3350
335
7.477611940298507
3350
335
7.044776119402985
3350
335
7.847761194029851


In [39]:
print(count_unique_words_alignment_backtranslations_TEST(1, 335, 'tok.en_original.en', 'hyp_original_back.txt', 'original_source-target_en-de_awesome-aligned.txt', 'original_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_original_TEST.txt'))
print(count_unique_words_alignment_backtranslations_TEST(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male_back.txt', 'disambiguated_male_source-target_en-de_awesome-aligned.txt', 'disambiguated_male_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_disambiguated_male_TEST.txt')) # positions is 2 because of gender word
print(count_unique_words_alignment_backtranslations_TEST(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female_back.txt', 'disambiguated_female_source-target_en-de_awesome-aligned.txt', 'disambiguated_female_translation-back_en-de_awesome-aligned.txt', 'unique-words_backtranslations_disambiguated_female_TEST.txt'))

3350
335
8.071641791044776
3350
335
7.916417910447761
3350
335
8.868656716417911


# Word alignement (translation-translation)

## Tercom alignement (borrowed from Tu)
- https://github.com/TuAnh23/Perturbation-basedQE/blob/master/align_and_analyse_ambiguous_trans.py#L54-L92

In [None]:
!git clone https://github.com/TuAnh23/Perturbation-basedQE.git

In [114]:
%cd $TERCOM

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE


In [115]:
import align_and_analyse_ambiguous_trans as tercom
import pandas as pd

def count_unique_words_tercom_alignment(position, sentencesN, sourceIn, backtranslationsIn):
    # List with source sentences; output 100 times to match backtranslation size
    source = []
    with open(PATH + "/" + sourceIn, 'r') as fin:
        for line in fin:
            for i in range(100): # append the source sentence 100 times to match backtranslations later
                source.append(line.strip().split()) # split() tokenizes the sentence, because tercom expects tokens     

    print(len(source))

    # List with original backtranslations
    backtranslations = []
    with open(PATH + "/" + backtranslationsIn, 'r') as fin:
        for line in fin:
            backtranslations.append(line.strip().split())

    print(len(backtranslations))
    
    # Generate alignments
    alignments = tercom.tercom_alignment(source, backtranslations)
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
        
    # Extract target translated words to source words
    lineNumber = 0
    counter = 0
    indices = []
    for align in alignments:
        position = positions_ambiguous_words[counter] # exact position of ambiguous word
        indices.append([item[1] for item in (item for item in align if not(pd.isna(item[0]))) if item[0] == position][0])
        lineNumber += 1
        if (lineNumber % 100 == 0):
            counter += 1

    print(len(indices))

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = set() # set forces uniqueness
    for backtranslation in backtranslations:
        backtranslation_index = backtranslations.index(backtranslation)
        if not(pd.isna(indices[backtranslation_index])):
            translated_ambiguous_words.add(backtranslation[indices[backtranslation_index]])
        lineNumber += 1
        if (lineNumber % 100 == 0):
            translations_ambiguous_words.append(translated_ambiguous_words)
            translated_ambiguous_words = set()

    #print(translations_ambiguous_words)
    #print(len(translations_ambiguous_words))

    unique_translations = 0
    for set_words in translations_ambiguous_words:
        unique_translations += len(set_words)
        
    return unique_translations/sentencesN

In [116]:
print(count_unique_words_tercom_alignment(1, 335, 'tok.en_original.en', 'hyp_original_back.txt'))
print(count_unique_words_tercom_alignment(2, 335, 'tok.en_disambiguated_male.en', 'hyp_disambiguated_male_back.txt'))
print(count_unique_words_tercom_alignment(2, 335, 'tok.en_disambiguated_female.en', 'hyp_disambiguated_female_back.txt'))

33500
33500
33500
8.017910447761194
33500
33500
33500
7.898507462686567
33500
33500
33500
9.817910447761195


In [117]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity


# Word occurrence

## Translation

In [118]:
def extract_word_translations(filename_tokenized, filename_translations, filename_out, filename_alignments):
    """
    Match alignment indices from translation to backtranslation
    """
    
    # Extract alignement indices from translation
    indices_translation = []
    with open(filename_alignments, 'r') as alignments:
        for line in alignments:
            alignement_tokens = line.split()
            indices_line = []
            for i in range(0, len(alignement_tokens)):    
                regex = r"" + str(i) + r"-(\d)"
                if re.findall(regex, line): 
                    indices_line.append([int(index) for index in re.findall(regex, line)])
                else:
                    indices_line.append([999])
            indices_translation.append(indices_line)
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    translations = []
    with open(filename_translations, 'r') as fin:
         for line in fin:
                translations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 335
    counter = 0
    for i in range(0, 335): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 9):
                alignments = indices_translation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(translations[counter + f])):
                                 words_set.add(translations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 10

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0                
    with open(filename_out, 'w') as fout:
        while count < 335:
            print(source[count] + ' | ' + str(target_words[count]), end='\n', file=fout)
            count += 1

In [119]:
def count_word_translations(filename_tokenized, filename_translations, filename_out, filename_alignments):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # Extract alignement indices from translation
    indices_translation = []
    with open(filename_alignments, 'r') as alignments:
        for line in alignments:
            alignement_tokens = line.split()
            indices_line = []
            for i in range(0, len(alignement_tokens)):    
                regex = r"" + str(i) + r"-(\d)"
                if re.findall(regex, line): 
                    indices_line.append([int(index) for index in re.findall(regex, line)])
                else:
                    indices_line.append([999])
            indices_translation.append(indices_line)
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    translations = []
    with open(filename_translations, 'r') as fin:
         for line in fin:
                translations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 335
    counter = 0
    for i in range(0, 335): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 9):
                alignments = indices_translation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(translations[counter + f])):
                                 words_set.add(translations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 10

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0
    occurrences = []
    with open(filename_out, 'w') as fout:
        while count < 335:
            occurrences.append([len(target_set) for target_set in target_words[count]])
            print(source[count] + ' | ' + str([len(target_set) for target_set in target_words[count]]), end='\n', file=fout)
            count += 1
    
    return occurrences

- Investigate the variability of the remaining sentence without the ambiguos word

In [120]:
def uniqueness_rest_of_sentence(position, sentencesN, occurrences):
    
    # Get positions of ambigous words
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
        
    # Sum over all sentences and build average     
    counter = 0
    sum_sent = 0
    for occur in occurrences:
        position = positions_ambiguous_words[counter]
        sum_sent += (sum(occur) - occur[position])/(len(occur) - 1) # sum for every sentence
        counter += 1
        
    return sum_sent/sentencesN

In [121]:
extract_word_translations('tok.en_original.en', 'hyp_original.txt', 'translations_words_original.txt', 'original_source-target_en-de_awesome-aligned.txt')
occurrence_original = count_word_translations('tok.en_original.en', 'hyp_original.txt', 'translations_words_original_occurrence.txt', 'original_source-target_en-de_awesome-aligned.txt')
print(uniqueness_rest_of_sentence(1, 335, occurrence_original))

extract_word_translations('tok.en_disambiguated_male.en', 'hyp_disambiguated_male.txt', 'translations_words_disambiguated_male.txt', 'disambiguated_male_source-target_en-de_awesome-aligned.txt')
occurrence_male = count_word_translations('tok.en_disambiguated_male.en', 'hyp_disambiguated_male.txt', 'translations_words_disambiguated_male_occurrence.txt', 'disambiguated_male_source-target_en-de_awesome-aligned.txt')
print(uniqueness_rest_of_sentence(2, 335, occurrence_male))

extract_word_translations('tok.en_disambiguated_female.en', 'hyp_disambiguated_female.txt', 'translations_words_disambiguated_female.txt', 'disambiguated_female_source-target_en-de_awesome-aligned.txt')
occurrence_female = count_word_translations('tok.en_disambiguated_female.en', 'hyp_disambiguated_female.txt', 'translations_words_disambiguated_female_occurrence.txt', 'disambiguated_female_source-target_en-de_awesome-aligned.txt')
print(uniqueness_rest_of_sentence(2, 335, occurrence_female))


1.8704661186048366
1.6735271941988372
1.6452370266549383


## Backtranslation

In [122]:
def extract_alignment_indices_backtranslation(filename_translations, filename_backtranslations):
    """
    Extract alignment indices
    """
    
    # Extract alignement indices from translation
    indices_translation = []
    with open(filename_translations, 'r') as alignments:
        for line in alignments:
            alignement_tokens = line.split()
            indices_line = []
            for i in range(0, len(alignement_tokens)):    
                regex = r"" + str(i) + r"-(\d)"
                if re.findall(regex, line): 
                    indices_line.append([int(index) for index in re.findall(regex, line)])
                else:
                    indices_line.append([999])
            indices_translation.append(indices_line)
       
    # Match alignement indices from translation to backtranslation
    lineNumber = 0
    counter = 0
    indices_backtranslation = []
    with open(filename_backtranslations, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            alignement_tokens = line.split()
            indices_line = []
            for index_list in indices_translation[counter]:
                index_matches = []
                for index in index_list:
                    regex = r"" + str(index) + r"-(\d)"
                    if re.findall(regex, line): 
                        index_matches.extend([int(i) for i in re.findall(regex, line)])
                    else:
                        index_matches.extend([999])
                indices_line.append(index_matches)
            indices_backtranslation.append(indices_line)
            lineNumber += 1 
    return indices_backtranslation

In [123]:
def extract_word_backtranslations(filename_tokenized, filename_backtranslations, filename_out, indices_backtranslation):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    backtranslations = []
    with open(filename_backtranslations, 'r') as fin:
         for line in fin:
                backtranslations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 335
    counter = 0
    for i in range(0, 335): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 99):
                alignments = indices_backtranslation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(backtranslations[counter + f])):
                                 words_set.add(backtranslations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 100

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())

    count = 0                
    with open(filename_out, 'w') as fout:
        while count < 335:
            print(source[count] + ' | ' + str(target_words[count]), end='\n', file=fout)
            count += 1

In [124]:
def count_word_backtranslations(filename_tokenized, filename_backtranslations, filename_out, indices_backtranslation):
    """
    Match alignement indices from translation to backtranslation
    """
    
    # List with lengths of the source sentences
    source_lengths = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source_lengths.append(len(line.strip().split()))

    #print(source_lengths)

    # List with backtranslations
    backtranslations = []
    with open(filename_backtranslations, 'r') as fin:
         for line in fin:
                backtranslations.append(line.split())

    #print(backtranslations)

    target_words = [] # list containing lists with translation sets for every word in the source sentences; length 335
    counter = 0
    for i in range(0, 335): # for every source sentence
        source_sent = []
        for j in range(0, source_lengths[i]): # for every word in the source sentence
            words_set = set()
            for  f in range(0, 99):
                alignments = indices_backtranslation[counter + f]        
                if (j < len(alignments)):
                    for index in alignments[j]:
                        if index != 999:
                            if (index < len(backtranslations[counter + f])):
                                 words_set.add(backtranslations[counter + f][index])
            source_sent.append(words_set)
        target_words.append(source_sent)
        counter += 100

    #print(len(target_words))

    # Add results to file

    # List with source sentences
    source = []
    with open(filename_tokenized, 'r') as fin:
        for line in fin:
            source.append(line.strip())
       
    count = 0
    occurrences = []
    with open(filename_out, 'w') as fout:
        while count < 335:
            occurrences.append([len(target_set) for target_set in target_words[count]])
            print(source[count] + ' | ' + str([len(target_set) for target_set in target_words[count]]), end='\n', file=fout)
            count += 1
    
    return occurrences

In [125]:
indices_original = extract_alignment_indices_backtranslation('original_source-target_en-de_awesome-aligned.txt', 'original_translation-back_en-de_awesome-aligned.txt')
extract_word_backtranslations('tok.en_original.en', 'hyp_original_back.txt', 'backtranslations_words_original.txt', indices_original)
occurrences_original = count_word_backtranslations('tok.en_original.en', 'hyp_original_back.txt', 'backtranslations_words_original_occurrence.txt', indices_original)
print(uniqueness_rest_of_sentence(1, 335, occurrences_original))

indices_disambiguated_male = extract_alignment_indices_backtranslation('disambiguated_male_source-target_en-de_awesome-aligned.txt', 'disambiguated_male_translation-back_en-de_awesome-aligned.txt')
extract_word_backtranslations('tok.en_disambiguated_male.en', 'hyp_disambiguated_male_back.txt', 'backtranslations_words_disambiguated_male.txt', indices_disambiguated_male)
occurrences_male = count_word_backtranslations('tok.en_disambiguated_male.en', 'hyp_disambiguated_male_back.txt', 'backtranslations_words_disambiguated_male_occurrence.txt', indices_disambiguated_male)
print(uniqueness_rest_of_sentence(2, 335, occurrences_male))

indices_disambiguated_female = extract_alignment_indices_backtranslation('disambiguated_female_source-target_en-de_awesome-aligned.txt', 'disambiguated_female_translation-back_en-de_awesome-aligned.txt')
extract_word_backtranslations('tok.en_disambiguated_female.en', 'hyp_disambiguated_female_back.txt', 'backtranslations_words_disambiguated_female.txt', indices_disambiguated_female)
occurrences_female = count_word_backtranslations('tok.en_disambiguated_female.en', 'hyp_disambiguated_female_back.txt', 'backtranslations_words_disambiguated_female_occurrence.txt', indices_disambiguated_female)
print(uniqueness_rest_of_sentence(2, 335, occurrences_female))


4.1377987260032905
3.72257932780321
3.734217109590242


# Gender statistics

In [9]:
from enum import Enum

class GENDER(Enum):
    """
    Enumerate possible genders.
    Ignore option resolves to words that should be ignored in particular language
    """
    male = 0
    female = 1
    neutral = 2
    unknown = 3
    ignore = 4
    
# ??? These are not always correct; 'der' could be Dativ or Genitiv for female, 'die' could be plural
# !!! There isn't always an article
DE_DETERMINERS = {"der": GENDER.male, "ein": GENDER.male, "dem": GENDER.male, "den": GENDER.male, 
                  "einen": GENDER.male, "des": GENDER.male, "er": GENDER.male, "seiner": GENDER.male,
                  "ihn": GENDER.male, "seinen": GENDER.male, "ihm": GENDER.male, "ihren": GENDER.male,
                  "die": GENDER.female, "eine": GENDER.female, "einer": GENDER.female, "seinem": GENDER.male,
                  "ihrem": GENDER.male, "sein": GENDER.male,
                  "sie": GENDER.female, "seine": GENDER.female, "ihrer": GENDER.female, 
                  "ihr": GENDER.neutral, "ihre": GENDER.neutral, "das": GENDER.neutral,
                  "jemanden": GENDER.neutral}

def get_german_determiners(words):
    """
    Get a list of (gender)
    given a list of words.
    """
    determiners = []
    for (word_ind, word) in enumerate(words):
        word = word.lower()
        if word in DE_DETERMINERS:
            determiners.append((DE_DETERMINERS[word].name))
    return determiners

In [10]:
dets = get_german_determiners(["dem"])
print(dets)

['male']


- Calculate gender based on the articles of unique words: how many of the sentences produce both genders, female and male

In [11]:
import re

# Extract articles of target tranlsated words
def extract_articles(position, sentencesN, translationsIn, alignmentsIn, sourceIn, output):
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
    
    # List with original translations
    translations_original = []
    with open(translationsIn, 'r') as fin:
        for line in fin:
            translations_original.append(line.strip())


    lineNumber = 0
    counter = 0
    indices = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                indices.append([int(index) for index in re.findall(regex, line)])
            else:
                indices.append([999])
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = set() # set forces uniqueness
    for translation in translations_original:
        tokens = translation.split(' ')
        if 999 not in indices[lineNumber]:
            for ind in indices[lineNumber]:
                translated_ambiguous_words.add(tokens[0]) # extract articles; currently assume index 0 for article position, TODO
        lineNumber += 1
        if (lineNumber % 10 == 0):
                translations_ambiguous_words.append(translated_ambiguous_words)
                translated_ambiguous_words = set()


    #print(len(translations_ambiguous_words))
    
    # Add results to file

    # List with original source sentences
    source = []
    ambiguous_words = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0  
    genders = []
    male = []
    female = []
    with open(output, 'w') as fout:
        while count < sentencesN:
            #print(translations_ambiguous_words[count])
            genders.append(set(get_german_determiners(translations_ambiguous_words[count])))
            male.append("male" in get_german_determiners(translations_ambiguous_words[count]))
            female.append("female" in get_german_determiners(translations_ambiguous_words[count]))
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(get_german_determiners(translations_ambiguous_words[count])), end='\n', file=fout)
            count += 1
            
    return (sum(1 for i in genders if ('male' in i and 'female' in i)), 
            male.count(True), female.count(True))

In [12]:
print(extract_articles(1, 335, 'hyp_original.txt', 'original_source-target_en-de_awesome-aligned.txt', 'tok.en_original.en', 'unique-words_translations_original_articles.txt'))
print(extract_articles(2, 335, 'hyp_disambiguated_male.txt', 'disambiguated_male_source-target_en-de_awesome-aligned.txt', 'tok.en_disambiguated_male.en', 'unique-words_translations_disambiguated_male_articles.txt'))
print(extract_articles(2, 335, 'hyp_disambiguated_female.txt', 'disambiguated_female_source-target_en-de_awesome-aligned.txt', 'tok.en_disambiguated_female.en', 'unique-words_translations_disambiguated_female_articles.txt'))

(128, 327, 136)
(118, 332, 121)
(94, 94, 335)


- Calculate gender in percentage for each sentence: percent of "male" vs. female in translations for each sentence

In [13]:
import re

# Extract articles of target tranlsated words
def extract_articles_percent(position, sentencesN, translationsIn, alignmentsIn, sourceIn, output):
    
    # Extract the position of the translated ambiguous word from each sentence
    positions_ambiguous_words = []
    for i in range(sentencesN):
        positions_ambiguous_words.append(position)
    
    # List with original translations
    translations_original = []
    with open(translationsIn, 'r') as fin:
        for line in fin:
            translations_original.append(line.strip())


    lineNumber = 0
    counter = 0
    indices = [] # a list of lists of indices of translated words for each ambiguous word
    with open(alignmentsIn, 'r') as alignments:
        for line in alignments:
            if (lineNumber == 10):
                lineNumber = 0
                counter += 1
            position = positions_ambiguous_words[counter] # exact position of ambiguous word
            regex = r"" + str(position) + r"-(\d)"
            if re.findall(regex, line): 
                indices.append([int(index) for index in re.findall(regex, line)])
            else:
                indices.append([999])
            lineNumber += 1

    #print(len(indices))
    #print(indices)

    lineNumber = 0
    translations_ambiguous_words = [] # a list of set of translations to each ambiguous word in source
    translated_ambiguous_words = [] 
    for translation in translations_original:
        tokens = translation.split(' ')
        if 999 not in indices[lineNumber]:
            for ind in indices[lineNumber]:
                translated_ambiguous_words.append(tokens[0]) # extract articles; currently assume index 0 for article position, TODO
        lineNumber += 1
        if (lineNumber % 10 == 0):
                translations_ambiguous_words.append(translated_ambiguous_words)
                translated_ambiguous_words = []


    #print(len(translations_ambiguous_words))
    
    # Add results to file

    # List with original source sentences
    source = []
    ambiguous_words = []
    with open(sourceIn, 'r') as fin:
        for line in fin:
            source.append(line.strip())
            tokens = line.split(' ')
            ambiguous_words.append(tokens[position])

    count = 0  
    genders = []
    male = []
    female = []
    with open(output, 'w') as fout:
        while count < sentencesN:
            #print(translations_ambiguous_words[count])
            genders.append(get_german_determiners(translations_ambiguous_words[count]))
            male.append("male" in get_german_determiners(translations_ambiguous_words[count]))
            female.append("female" in get_german_determiners(translations_ambiguous_words[count]))
            print(source[count] + ' | ' + ambiguous_words[count] + ' | ' + str(get_german_determiners(translations_ambiguous_words[count])), end='\n', file=fout)
            count += 1
     
    print(genders)
    return (sum([i.count('male')/10 for i in genders])/sentencesN*100, 
            sum([i.count('female')/10 for i in genders])/sentencesN*100)

In [None]:
print(extract_articles_percent(1, 335, 'hyp_original.txt', 'original_source-target_en-de_awesome-aligned.txt', 'tok.en_original.en', 'unique-words_translations_original_articles.txt'))
print(extract_articles_percent(2, 335, 'hyp_disambiguated_male.txt', 'disambiguated_male_source-target_en-de_awesome-aligned.txt', 'tok.en_disambiguated_male.en', 'unique-words_translations_disambiguated_male_articles.txt'))
print(extract_articles_percent(2, 335, 'hyp_disambiguated_female.txt', 'disambiguated_female_source-target_en-de_awesome-aligned.txt', 'tok.en_disambiguated_female.en', 'unique-words_translations_disambiguated_female_articles.txt'))

# Stemming

In [51]:
import spacy

nlp = spacy.load('de_core_news_sm')
doc = nlp('Entwicklerin')
for token in doc:
    print(token, token.lemma, token.lemma_)
    
doc = nlp('Entwickler')
for token in doc:
    print(token, token.lemma, token.lemma_)

Entwicklerin 12449385729285917437 Entwicklerin
Entwickler 17644558927020980110 Entwickler


In [48]:
from nltk.stem.cistem import Cistem
 
stemmer = Cistem(case_insensitive=True)
s1 = "Entwicklerin"
print(stemmer.stem(s1))
s2 = "Entwickler"
print(stemmer.stem(s2))

entwickleri
entwickl


In [586]:
!pip install german-lemmatizer

Collecting german-lemmatizer
  Downloading german_lemmatizer-0.1.1-py3-none-any.whl (4.5 kB)
Collecting docker
  Downloading docker-6.1.3-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.1/148.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: docker, german-lemmatizer
Successfully installed docker-6.1.3 german-lemmatizer-0.1.1


In [623]:
from german_lemmatizer import lemmatize

lemmatize(
    ['Johannes war ein guter Schüler', 'Sabiene sang zahlreiche Lieder'],
    working_dir='*',
    chunk_size=10000,
    n_jobs=1,
    escape=False,
    remove_stop=False)

<generator object lemmatize at 0x7f229d89be40>

In [596]:
!pip install HanTa

Collecting HanTa
  Downloading HanTa-1.1.1-py3-none-any.whl (15.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.0/15.0 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: HanTa
Successfully installed HanTa-1.1.1


In [632]:
from HanTa import HanoverTagger as ht

tagger = ht.HanoverTagger('morphmodel_ger.pgz')

print(tagger.analyze('Ärztin'))
print(tagger.analyze('Arzt'))

('Ärztin', 'NN')
('Arzt', 'NN')


In [609]:
!pip install -U textblob-de

Collecting textblob-de
  Downloading textblob_de-0.4.3-py2.py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.9/468.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting textblob>=0.9.0
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: textblob, textblob-de
Successfully installed textblob-0.17.1 textblob-de-0.4.3


In [610]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /home/vzhekova/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /home/vzhekova/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/vzhekova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vzhekova/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data]     /home/vzhekova/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/vzhekova/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [621]:
from textblob_de import TextBlobDE

word = 'Ärztin'
w = TextBlobDE(word)
print(w.words.lemmatize())

word = 'Arzt'
w = TextBlobDE(word)
print(w.words.lemmatize())

['Ärztin']
['Arzt']


In [None]:
!pip install spacy-udpipe

In [54]:
import spacy_udpipe

spacy_udpipe.download("de") # download German model

text = "Entwicklerin"
nlp = spacy_udpipe.load("de")

doc = nlp(text)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_)

Already downloaded a model for the 'de' language
Entwicklerin Entwicklerin NOUN ROOT
