In [79]:
import torch

# Working directory: the notebook changes into it and reads/writes every data file relative to it.
# NOTE: IPython substitutes `$PATH` in `%cd` / `!` lines from this Python variable, so a shell
# command in this notebook that refers to `$PATH` sees this directory, not the executable search path.
PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool

In [18]:
# Report which device PyTorch can use: `device` ends up as the current CUDA device index,
# or the string 'cpu' when no GPU is visible.
if not torch.cuda.is_available():
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'
else:
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))

Current device: GeForce GTX 1080 Ti


In [80]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity


In [20]:
# Extract the sentence text: third column (cut -f3) of en.txt
!cut -f3 -d'	' en.txt > en_sentences.txt

In [21]:
# Keep only the main clause of sentences containing 'because': whenever a 'because' token is
# reached, everything accumulated before it (commas removed) is written out followed by '.'.
# 330 unique sentences in total
with open('en_sentences.txt', 'r') as fin, open('en_original.txt', 'w') as fout:
    for line in fin:
        prefix_parts = []  # comma-free tokens seen so far, each followed by a space
        for token in line.split(" "):
            if token == 'because':
                print(''.join(prefix_parts) + '.', end='\n', file=fout)
            prefix_parts.append(token.replace(',', '') + ' ')

In [23]:
# Prefix every gender-ambiguous word with an explicit gender marker

# Source words to mark, one per line of words.txt (a set gives uniqueness and fast lookup)
with open('words.txt', 'r') as fin:
    words = {line.strip() for line in fin}

with open('en_original.txt') as in_file, open('en_disambiguated.txt', 'w') as out_file:
    for line in in_file:
        marked = [
            # tokens often contain ","; could also replace "male" with "female"
            "male " + token if token.replace(',', '') in words else token
            for token in line.split(' ')
        ]
        # the last token still carries the line's newline, hence end=''
        print(' '.join(marked), end='', file=out_file)

# Translation English-German

In [24]:
# Tokenization
# (`from __future__ import print_function` was dropped: it is a no-op on Python 3 and, placed
# after other imports, is a SyntaxError as soon as the cell is exported to a plain script.)
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer

mpn = MosesPunctNormalizer()
mt_en = MosesTokenizer(lang='en')
md_en = MosesDetokenizer(lang='en')


def tokenize_file(src_path, dst_path):
    """Punctuation-normalize and Moses-tokenize `src_path` into `dst_path`.

    Every input line produces exactly one output line holding the
    space-joined tokens.
    """
    with open(src_path) as fin, open(dst_path, 'w') as fout:
        for line in fin:
            print(mt_en.tokenize(mpn.normalize(line), return_str=True), end='\n', file=fout)


tokenize_file('en_original.txt', 'tok.en_original.en')
tokenize_file('en_disambiguated.txt', 'tok.en_disambiguated.en')

print('Finished tokenizing.')

Finished tokenizing.


In [25]:
# Dividing text into subword units: apply the BPE codes in bpecodes.en to both tokenized English files.
# Argument order as used here: output file, input file, codes file.

!$FASTBPE applybpe bpe.en_original.en tok.en_original.en bpecodes.en
!$FASTBPE applybpe bpe.en_disambiguated.en tok.en_disambiguated.en bpecodes.en

print('Finished subword.')

Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original.en ...
Read 2733 words (470 unique) from text file.
Applying BPE to tok.en_original.en ...
Modified 2733 words from text file.
Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_disambiguated.en ...
Read 3388 words (471 unique) from text file.
Applying BPE to tok.en_disambiguated.en ...
Modified 3388 words from text file.
Finished subword.


In [26]:
# Binarize text: preprocess the BPE-encoded original sentences (test prefix bpe.en_original, source side only)
# into data-bin_original_en-de, using the dictionaries stored in the wmt19.en-de.joined-dict.ensemble directory
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_en-de \
    --workers 8

2023-05-09 11:28:37 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcd

In [27]:
# Binarize text: same preprocessing for the disambiguated sentences (test prefix bpe.en_disambiguated),
# written to data-bin_disambiguated_en-de
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_disambiguated \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_disambiguated_en-de \
    --workers 8

2023-05-09 11:29:02 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_disambiguated_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en',

In [28]:
# En-De generation settings, consumed by the fairseq-generate cells below via $MODELS / $NBEST / $BEAM.
# NOTE: the same three names are assigned again in the De-En section; re-run this cell before
# re-running the En-De generation cells, otherwise they pick up the De-En model directory.
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"
NBEST = 10  # passed to --nbest
BEAM = 10  # passed to --beam

In [30]:
# Generate N hypotheses (N = $NBEST, beam = $BEAM) for the original sentences with the four
# checkpoints model1..model4 under $MODELS; stdout is redirected to original_en-de.decode_Beam_10.log
# NOTE: the log name hard-codes "Beam_10" and does not follow $BEAM.
!fairseq-generate data-bin_original_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_en-de.decode_Beam_10.log

2023-05-09 11:30:46 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [31]:
# Generate N hypotheses (N = $NBEST, beam = $BEAM) for the disambiguated sentences with the four
# checkpoints model1..model4 under $MODELS; stdout is redirected to disambiguated_en-de.decode_Beam_10.log
# NOTE: the log name hard-codes "Beam_10" and does not follow $BEAM.
!fairseq-generate data-bin_disambiguated_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_en-de.decode_Beam_10.log

2023-05-09 11:34:54 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

# Backtranslation German-English

In [32]:
# Extract the hypothesis text from the En-De generation logs:
#   grep ^H             keep the lines starting with "H"
#   LC_ALL=C sort -V    sort them in natural (version) order, so that H-2 comes before H-10
#   sed 's/^H-//g'      drop the leading "H-"
#   cut -f 3            keep the third tab-separated field
#   sed 's/ @@//g'      remove any remaining " @@" sequences
# NOTE(review): sort compares whole lines, so the hypotheses that share one sentence id are ordered by
# the rest of the line rather than by their position in the log — confirm this is fine if rank order matters.
!grep ^H original_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original.txt
!grep ^H disambiguated_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated.txt

In [33]:
# Dividing tokenized text into subword units: apply the BPE codes in bpecodes.de to the German hypotheses.
# Argument order as used here: output file, input file, codes file.

!$FASTBPE applybpe bpe.hyp_original.de hyp_original.txt bpecodes.de
!$FASTBPE applybpe bpe.hyp_disambiguated.de hyp_disambiguated.txt bpecodes.de

print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original.txt ...
Read 27703 words (1212 unique) from text file.
Applying BPE to hyp_original.txt ...
Modified 27703 words from text file.
Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_disambiguated.txt ...
Read 32952 words (1120 unique) from text file.
Applying BPE to hyp_disambiguated.txt ...
Modified 32952 words from text file.
Finished subword.


In [34]:
# Binarize the BPE-encoded German hypotheses of the original sentences (test prefix bpe.hyp_original,
# source side only) into data-bin_original_de-en, using the dictionaries in the wmt19.de-en.joined-dict.ensemble directory
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_de-en \
    --workers 8

2023-05-09 11:37:00 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', srcd

In [35]:
# Binarize the BPE-encoded German hypotheses of the disambiguated sentences (test prefix
# bpe.hyp_disambiguated, source side only) into data-bin_disambiguated_de-en
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_disambiguated \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_disambiguated_de-en \
    --workers 8

2023-05-09 11:37:08 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_disambiguated_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de',

In [36]:
# De-En (backtranslation) generation settings, consumed by the fairseq-generate cells below.
# NOTE: this rebinds the MODELS / NBEST / BEAM names that the En-De section assigned earlier.
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"
NBEST = 10  # passed to --nbest
BEAM = 10  # passed to --beam

In [38]:
# Backtranslate: generate N hypotheses (N = $NBEST, beam = $BEAM) for every German hypothesis of the
# original sentences; stdout is redirected to original_de-en.decode_Beam_10_backtranslation.log
!fairseq-generate data-bin_original_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > original_de-en.decode_Beam_10_backtranslation.log

2023-05-09 12:09:43 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [39]:
# Backtranslate: generate N hypotheses (N = $NBEST, beam = $BEAM) for every German hypothesis of the
# disambiguated sentences; stdout is redirected to disambiguated_de-en.decode_Beam_10_backtranslation.log
!fairseq-generate data-bin_disambiguated_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --beam $BEAM \
    --nbest $NBEST \
    --batch-size 64 \
    --memory-efficient-fp16 \
    --remove-bpe > disambiguated_de-en.decode_Beam_10_backtranslation.log

2023-05-09 12:21:17 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [40]:
# Extract the backtranslated hypothesis text from the De-En generation logs:
# keep the lines starting with "H", sort them in natural order ('LC_ALL=C sort -V'), drop the "H-"
# prefix, keep the third tab-separated field and remove any remaining " @@" sequences.
# NOTE(review): sort compares whole lines, so hypotheses sharing one sentence id are ordered by the
# rest of the line rather than by their position in the log — confirm if rank order matters.
!grep ^H original_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_back.txt
!grep ^H disambiguated_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_disambiguated_back.txt

In [41]:
# Detokenize text
# (`from __future__ import print_function` was dropped: it is a no-op on Python 3 and, placed
# after other imports, is a SyntaxError as soon as the cell is exported to a plain script.)
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer

md_en = MosesDetokenizer(lang='en')


def detokenize_file(src_path, dst_path):
    """Moses-detokenize every line of `src_path` into `dst_path`.

    Both files are handled as UTF-8; every input line produces exactly one
    output line.
    """
    with open(src_path, encoding='utf8') as fin, open(dst_path, 'w', encoding='utf8') as fout:
        for line in fin:
            print(md_en.detokenize(line.split(), return_str=True), end='\n', file=fout)


detokenize_file('hyp_original_back.txt', 'original_back.txt')
detokenize_file('hyp_disambiguated_back.txt', 'disambiguated_back.txt')

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

In [42]:
# List with original source sentences
with open('en_original.txt', 'r') as fin:
    source = [line.strip() for line in fin]

# List with disambiguated source sentences
with open('en_disambiguated.txt', 'r') as fin:
    source_disambiguated = [line.strip() for line in fin]

# Hypotheses per source sentence in the En-De output (the generation cells pass --nbest 10)
GROUP_SIZE = 10

# List with nbest sentences for every source in original.
# Each file is read and chunked on its own, so a file whose line count is not a multiple of
# GROUP_SIZE can no longer leak leftover lines into the next list; an incomplete trailing
# group is dropped.
with open('hyp_original.txt', 'r') as fin:
    hyp_lines = [line.strip() for line in fin]
nbest_original = [hyp_lines[start:start + GROUP_SIZE]
                  for start in range(0, len(hyp_lines) - GROUP_SIZE + 1, GROUP_SIZE)]

# List with nbest sentences for every source in disambiguated
with open('hyp_disambiguated.txt', 'r') as fin:
    hyp_lines = [line.strip() for line in fin]
nbest_modified = [hyp_lines[start:start + GROUP_SIZE]
                  for start in range(0, len(hyp_lines) - GROUP_SIZE + 1, GROUP_SIZE)]

# The three numbers are expected to be equal (one n-best list per source sentence)
print(len(source))
print(len(nbest_original))
print(len(nbest_modified))

330
330
330


## Count unique sentences

In [44]:
# Count unique sentences in source nbest list for each source sentence of original; 9.945454545454545
# Value should be 10, because beam search generates 10 unique sentences
# Start from an empty list: without this the cell raises NameError on a fresh kernel, or keeps
# appending to whatever `unique_sent` another cell left behind.
unique_sent = []
for source_nbest in nbest_original:
    num_values = len(set(source_nbest))
    #print(num_values)
    unique_sent.append(num_values)

#print(unique_sent)
print(sum(unique_sent)/330) # average

9.945454545454545


In [45]:
# Number of distinct sentences in each n-best list of the modified sources, then the
# average over the 330 source sentences; 9.954545454545455
unique_sent = [len(set(source_nbest)) for source_nbest in nbest_modified]

#print(unique_sent)
print(sum(unique_sent)/330) # average

9.954545454545455


## Count unique words

In [None]:
# Count unique non-stop-word tokens in the nbest list of each original source sentence; 16.836363636363636
# NOTE(review): in this section nbest_original is read from hyp_original.txt (the En-De hypotheses),
# while the model and the stop-word list below are English — confirm whether that is intended.
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for counter, source_nbest in enumerate(nbest_original, start=1):
    # every token text of the 10 hypotheses that is not a stop word
    words = {
        token.text
        for sent in source_nbest
        for token in sp(sent)
        if token.text not in stopwords
    }
    unique_words.append(len(words))

    print(counter)  # progress: number of source sentences processed so far

#print(unique_words)
print(sum(unique_words)/330)

In [None]:
# Count unique non-stop-word tokens in the nbest list of each modified source sentence; 17.64848484848485
# !!! It is normal to get more unique words here, because the disambiguated sentences have more words in total
# NOTE(review): in this section nbest_modified is read from hyp_disambiguated.txt (the En-De hypotheses),
# while the model and the stop-word list below are English — confirm whether that is intended.
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for counter, source_nbest in enumerate(nbest_modified, start=1):
    # every token text of the 10 hypotheses that is not a stop word
    words = {
        token.text
        for sent in source_nbest
        for token in sp(sent)
        if token.text not in stopwords
    }
    unique_words.append(len(words))

    print(counter)  # progress: number of source sentences processed so far

#print(unique_words)
print(sum(unique_words)/330)

# Statistics on backtranslations

In [60]:
# List with original source sentences
with open('en_original.txt', 'r') as fin:
    source = [line.strip() for line in fin]

# List with disambiguated source sentences
with open('en_disambiguated.txt', 'r') as fin:
    source_disambiguated = [line.strip() for line in fin]

# Backtranslations per source sentence: 10 En-De hypotheses, each backtranslated with --nbest 10
GROUP_SIZE = 100

# List with nbest sentences for every source in original.
# Each file is read and chunked on its own, so a file whose line count is not a multiple of
# GROUP_SIZE can no longer leak leftover lines into the next list; an incomplete trailing
# group is dropped.
with open('original_back.txt', 'r') as fin:
    back_lines = [line.strip() for line in fin]
nbest_original = [back_lines[start:start + GROUP_SIZE]
                  for start in range(0, len(back_lines) - GROUP_SIZE + 1, GROUP_SIZE)]

# List with nbest sentences for every source in disambiguated
with open('disambiguated_back.txt', 'r') as fin:
    back_lines = [line.strip() for line in fin]
nbest_modified = [back_lines[start:start + GROUP_SIZE]
                  for start in range(0, len(back_lines) - GROUP_SIZE + 1, GROUP_SIZE)]

# The three numbers are expected to be equal (one list of backtranslations per source sentence)
print(len(source))
print(len(nbest_original))
print(len(nbest_modified))

330
330
330


## Source sentence occurrence

In [61]:
# For every original source sentence, count its exact matches among its own backtranslations.
# First number printed: total matches; second: sources recovered at least once (258)
results = []
for position, sent in enumerate(source):
    results.append(sum(1 for target in nbest_original[position] if target == sent))

print(sum(results))
print(sum(x > 0 for x in results))

1840
258


In [62]:
# For every disambiguated source sentence, count its exact matches among its own backtranslations.
# First number printed: total matches; second: sources recovered at least once (230)
results = []
for position, sent in enumerate(source_disambiguated):
    results.append(sum(1 for target in nbest_modified[position] if target == sent))

print(sum(results))
print(sum(x > 0 for x in results))

1288
230


## Ambiguous source words occurrence


In [63]:
# List with source words (one entry per line of words.txt, duplicates kept)
with open('words.txt', 'r') as fin:
    source = [line.strip() for line in fin]

# Backtranslations per source sentence: 10 En-De hypotheses, each backtranslated with --nbest 10
GROUP_SIZE = 100

# List with nbest sentences for every source.
# Each file is read and chunked on its own, so a file whose line count is not a multiple of
# GROUP_SIZE can no longer leak leftover lines into the next list; an incomplete trailing
# group is dropped.
with open('original_back.txt', 'r') as fin:
    back_lines = [line.strip() for line in fin]
nbest_original = [back_lines[start:start + GROUP_SIZE]
                  for start in range(0, len(back_lines) - GROUP_SIZE + 1, GROUP_SIZE)]

with open('disambiguated_back.txt', 'r') as fin:
    back_lines = [line.strip() for line in fin]
nbest_modified = [back_lines[start:start + GROUP_SIZE]
                  for start in range(0, len(back_lines) - GROUP_SIZE + 1, GROUP_SIZE)]

# NOTE: here `source` is the word list, so its length is not expected to equal the other two
print(len(source))
print(len(nbest_original))
print(len(nbest_modified))

3888
330
330


In [64]:
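# # Count how many times the source words occur in the nbest list of original
# # NOTE: disabled. `source` holds 3888 words but there are only 330 nbest lists, so indexing
# # nbest_original with a per-word counter raises "IndexError: list index out of range".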
# results = []
# counter = 0
# for word in source:
#     matches = 0
#     for target in nbest_original[counter]: 
#         if (word in target.split(" ")):
#             matches += 1
#     results.append(matches)  
#     counter += 1
    
# print(sum(results))
# print(sum(x > 0 for x in results))

IndexError: list index out of range

In [56]:
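# # Count how many times the source words occur in the nbest list of modified
# # NOTE: disabled. `source` holds 3888 words but there are only 330 nbest lists, so indexing
# # nbest_modified with a per-word counter raises "IndexError: list index out of range".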
# results = []
# counter = 0
# for word in source:
#     matches = 0
#     for target in nbest_modified[counter]: 
#         if (word in target.split(" ")):
#             matches += 1
#     results.append(matches)  
#     counter += 1
    
# print(sum(results))
# print(sum(x > 0 for x in results))

IndexError: list index out of range

## Count unique sentences

In [66]:
# Number of distinct sentences in each list of backtranslations of the original sources, then
# the average over the 330 source sentences; 46.06060606060606
unique_sent = [len(set(source_nbest)) for source_nbest in nbest_original]

#print(unique_sent)
print(sum(unique_sent)/330) # average

46.06060606060606


In [67]:
# Number of distinct sentences in each list of backtranslations of the modified sources, then
# the average over the 330 source sentences; 51.77272727272727
unique_sent = [len(set(source_nbest)) for source_nbest in nbest_modified]

#print(unique_sent)
print(sum(unique_sent)/330) # average

51.77272727272727


## Count unique words

In [None]:
# Count unique non-stop-word tokens in the backtranslations of each original source sentence; 22.593939393939394
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for counter, source_nbest in enumerate(nbest_original, start=1):
    # every token text of this sentence's backtranslations that is not a stop word
    words = {
        token.text
        for sent in source_nbest
        for token in sp(sent)
        if token.text not in stopwords
    }
    unique_words.append(len(words))

    print(counter)  # progress: number of source sentences processed so far

#print(unique_words)
print(sum(unique_words)/330)

In [None]:
# Count unique non-stop-word tokens in the backtranslations of each modified source sentence; 22.348484848484848
import spacy

sp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS


unique_words = []
counter = 0
for counter, source_nbest in enumerate(nbest_modified, start=1):
    # every token text of this sentence's backtranslations that is not a stop word
    words = {
        token.text
        for sent in source_nbest
        for token in sp(sent)
        if token.text not in stopwords
    }
    unique_words.append(len(words))

    print(counter)  # progress: number of source sentences processed so far

#print(unique_words)
print(sum(unique_words)/330)

# Word alignment

In [75]:
# Input to fast_align must be tokenized and aligned into parallel sentences.
# Each line is a source language sentence and its target language translation,
# separated by a triple pipe symbol with leading and trailing white space ( ||| )

!$FAST_ALIGN

Usage: /home/vzhekova/fast_align/build/fast_align -i file.fr-en
 Standard options ([USE] = strongly recommended):
  -i: [REQ] Input parallel corpus
  -v: [USE] Use Dirichlet prior on lexical translation distributions
  -d: [USE] Favor alignment points close to the monotonic diagonoal
  -o: [USE] Optimize how close to the diagonal alignment points should be
  -r: Run alignment in reverse (condition on target and predict source)
  -c: Output conditional probability table
 Advanced options:
  -I: number of iterations in EM training (default = 5)
  -q: p_null parameter (default = 0.08)
  -N: No null word
  -a: alpha parameter for optional Dirichlet prior (default = 0.01)
  -T: starting lambda for diagonal distance parameter (default = 4)
  -s: print alignment scores (alignment ||| score, disabled by default)


In [77]:
# List with tokenized original source sentences
with open('tok.en_original.en', 'r') as fin:
    source = [line.strip() for line in fin]

# Hypotheses per source sentence in the En-De output (the generation cells pass --nbest 10)
GROUP_SIZE = 10

# List with nbest sentences for every source in original (an incomplete trailing group is dropped)
with open('hyp_original.txt', 'r') as fin:
    hyp_lines = [line.strip() for line in fin]
nbest_original = [hyp_lines[start:start + GROUP_SIZE]
                  for start in range(0, len(hyp_lines) - GROUP_SIZE + 1, GROUP_SIZE)]

print(len(source))
print(len(nbest_original))

# Merge source and target sentences for alignment: one "source ||| hypothesis" line per hypothesis.
# Iterating over the paired lists replaces the hard-coded bound of 330 sentences, so the cell
# keeps working when the corpus size changes.
with open('original_source-target_en-de.txt', 'w') as fout:
    for src_sentence, hypotheses in zip(source, nbest_original):
        for hyp in hypotheses:
            print(src_sentence + ' ||| ' + hyp, end='\n', file=fout)

330
330


In [78]:
# List with tokenized disambiguated source sentences
with open('tok.en_disambiguated.en', 'r') as fin:
    source = [line.strip() for line in fin]

# Hypotheses per source sentence in the En-De output (the generation cells pass --nbest 10)
GROUP_SIZE = 10

# List with nbest sentences for every source in disambiguated (an incomplete trailing group is dropped)
with open('hyp_disambiguated.txt', 'r') as fin:
    hyp_lines = [line.strip() for line in fin]
nbest_disambiguated = [hyp_lines[start:start + GROUP_SIZE]
                       for start in range(0, len(hyp_lines) - GROUP_SIZE + 1, GROUP_SIZE)]

print(len(source))
print(len(nbest_disambiguated))

# Merge source and target sentences for alignment: one "source ||| hypothesis" line per hypothesis.
# Iterating over the paired lists replaces the hard-coded bound of 330 sentences, so the cell
# keeps working when the corpus size changes.
with open('disambiguated_source-target_en-de.txt', 'w') as fout:
    for src_sentence, hypotheses in zip(source, nbest_disambiguated):
        for hyp in hypotheses:
            print(src_sentence + ' ||| ' + hyp, end='\n', file=fout)

330
330


In [None]:
# Align every "source ||| hypothesis" pair with fast_align, using the -d -o -v options that the usage
# text above marks as strongly recommended; stdout is redirected to the *_aligned.txt files
!$FAST_ALIGN -i original_source-target_en-de.txt -d -o -v > original_source-target_en-de_aligned.txt
!$FAST_ALIGN -i disambiguated_source-target_en-de.txt -d -o -v > disambiguated_source-target_en-de_aligned.txt

In [116]:
# Extract target translated words to source words in original
# assume index 1 of source word for test, but should extract them more elegantly after

import re

SOURCE_INDEX = 1  # assumed position of the ambiguous word in the tokenized source sentence

# First alignment link "<SOURCE_INDEX>-<target index>" on a line.
# (?:^|\s) stops links such as "11-3" or "21-4" from being mistaken for source index 1,
# and \d+ keeps target indices of two or more digits intact ("1-12" is 12, not 1).
link_pattern = re.compile(r'(?:^|\s)%d-(\d+)' % SOURCE_INDEX)

# List with original translations
with open('hyp_original.txt', 'r') as fin:
    translations_original = [line.strip() for line in fin]

# Target index aligned to the source word for every alignment line (None when it is unaligned)
indices = []
with open('original_source-target_en-de_aligned.txt', 'r') as alignments:
    for line in alignments:
        link = link_pattern.search(line)
        indices.append(int(link.group(1)) if link else None)

#print(indices)

ambiguous_words = set() # set forces uniqueness
# Translation i belongs to alignment line i, so the two lists are walked in parallel.
# Looking the position up with list.index() returned the first of several identical
# hypotheses (i.e. the wrong alignment line) and was quadratic.
for translation, target_index in zip(translations_original, indices):
    tokens = translation.split()
    if target_index is not None and target_index < len(tokens):
        ambiguous_words.add(tokens[target_index])

#print(ambiguous_words)
print(len(ambiguous_words))

196


In [120]:
# Extract target translated words to source words in disambiguated
# assume index 2 of source word for test, but should extract them more elegantly after

import re

SOURCE_INDEX = 2  # assumed position of the ambiguous word in the tokenized source sentence

# First alignment link "<SOURCE_INDEX>-<target index>" on a line.
# (?:^|\s) stops links such as "12-3" or "22-4" from being mistaken for source index 2,
# and \d+ keeps target indices of two or more digits intact ("2-12" is 12, not 1).
link_pattern = re.compile(r'(?:^|\s)%d-(\d+)' % SOURCE_INDEX)

# List with disambiguated translations
with open('hyp_disambiguated.txt', 'r') as fin:
    translations_disambiguated = [line.strip() for line in fin]

# Target index aligned to the source word for every alignment line (None when it is unaligned)
indices = []
with open('disambiguated_source-target_en-de_aligned.txt', 'r') as alignments:
    for line in alignments:
        link = link_pattern.search(line)
        indices.append(int(link.group(1)) if link else None)

#print(indices)

ambiguous_words = set() # set forces uniqueness
# Translation i belongs to alignment line i, so the two lists are walked in parallel.
# Looking the position up with list.index() returned the first of several identical
# hypotheses (i.e. the wrong alignment line) and was quadratic.
for translation, target_index in zip(translations_disambiguated, indices):
    tokens = translation.split()
    if target_index is not None and target_index < len(tokens):
        ambiguous_words.add(tokens[target_index])

#print(ambiguous_words)
print(len(ambiguous_words))

167
