In [2]:
import torch
import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool

In [3]:
# check if we can connect to the GPU with PyTorch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))
else:
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'

Current device: GeForce GTX 1080 Ti


In [4]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity


In [None]:
# Extract alignement indices from translation
import re

indices_translation = []
with open('original_source-target_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        alignement_tokens = line.split()
        indices_line = []
        for i in range(0, len(alignement_tokens)):    
            regex = r"" + str(i) + r"-(\d)"
            if re.findall(regex, line): 
                indices_line.append([int(index) for index in re.findall(regex, line)])
            else:
                indices_line.append([999])
        indices_translation.append(indices_line)
        
print(len(indices_translation))
print(indices_translation)

In [None]:
# Match alignement indices from translation to backtranslation
import re

lineNumber = 0
counter = 0
indices_backtranslation = []
with open('original_translation-back_en-de_awesome-aligned.txt', 'r') as alignments:
    for line in alignments:
        if (lineNumber == 10):
            lineNumber = 0
            counter += 1
        alignement_tokens = line.split()
        indices_line = []
        for index_list in indices[counter]:
            index_matches = []
            for index in index_list:
                regex = r"" + str(index) + r"-(\d)"
                if re.findall(regex, line): 
                    index_matches.extend([int(i) for i in re.findall(regex, line)])
                else:
                    index_matches.extend([999])
            indices_line.append(index_matches)
        indices_backtranslation.append(indices_line)
        lineNumber += 1
        
print(len(indices_backtranslation))
print(indices_backtranslation)

In [73]:
# List with lengths of the original source sentences
source_lengths = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source_lengths.append(len(line.strip().split()))

#print(source_lengths)

# List with backtranslations
backtranslations = []
with open('hyp_original_back.txt', 'r') as fin:
     for line in fin:
            backtranslations.append(line.split())

#print(backtranslations)

#TODO
target_words = [] # list containing lists with translation sets for every word in the source sentences; length 330
counter = 0
for i in range(0, 330): # for every source sentence
    source_sent = []
    for j in range(0, source_lengths[i]): # for every word in the source sentence
        words_set = set()
        for  f in range(0, 100):
            alignments = indices_backtranslation[counter + f]        
            if (j < len(alignments)):
                for index in alignments[j]:
                    if index != 999:
                        if (index < len(backtranslations[counter + f])):
                             words_set.add(backtranslations[counter + f][index])
        source_sent.append(words_set)
    target_words.append(source_sent)
    counter += 1

print(target_words)

# Add results to file

# List with original source sentences
source = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        source.append(line.strip())
        
count = 0                
with open('backtranslations_words_original.txt', 'w') as fout:
    while count < 330:
        print(source[count] + ' | ' + str(target_words[count]), end='\n', file=fout)
        count += 1

[[{'The'}, {'developers', 'contractor', 'developer', 'designer', 'real', 'property', 'builder', 'building', 'estate'}, {'fought', 'quarreled', 'disagreed', 'had', 'disputed', 'reasoned', 'quarrelled', 'dispute', 'arguing', 'clashed', 'was', 'argued', 'argues'}, {'with'}, {'the'}, {'constructor', 'engineer', 'designer', 'developer', 'planner', 'builder', 'design', 'designers'}, {'.'}], [{'The'}, {'developers', 'contractor', 'developer', 'designer', 'real', 'mechanic', 'property', 'builder', 'building', 'estate'}, {'fought', 'quarreled', 'disagreed', 'had', 'disputed', 'reasoned', 'quarrelled', 'gave', 'dispute', 'arguing', 'clashed', 'was', 'argued', 'argues'}, {'the', 'with'}, {'shop', 'the'}, {'constructor', 'engineer', 'designer', 'developer', 'planner', 'builder', 'a', 'design', 'designers'}, {'gift', '.'}, {'.'}], [{'The'}, {'developers', 'contractor', 'developer', 'designer', 'real', 'mechanic', 'property', 'builder', 'building', 'estate'}, {'fought', 'quarreled', 'disagreed', 'ha