In [31]:
import torch
import random
import numpy as np
import re

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

PATH="/export/data4/vzhekova/biases-data/Test_De/Statistics/BERT/Sampling"
FASTBPE="/home/vzhekova/fastBPE/fast" # path to the fastBPE tool
FAST_ALIGN="/home/vzhekova/fast_align/build/fast_align" # path to the fast_align tool
TERCOM = "/export/data4/vzhekova/biases-data/Test_De/Statistics/Full_ambiguity_male/Perturbation-basedQE"

In [32]:
# check if we can connect to the GPU with PyTorch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))
else:
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'

Current device: GeForce GTX 1080 Ti


In [33]:
%cd $PATH

/export/data4/vzhekova/biases-data/Test_De/Statistics/BERT/Sampling


# Translation English-German

In [137]:
# Tokenization
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

mpn = MosesPunctNormalizer()
mt_en = MosesTokenizer(lang='en')
md_en = MosesDetokenizer(lang='en')

with open('en_original.txt') as fin, open('tok.en_original.en','w') as fout:
    for line in fin:
        tokens = mt_en.tokenize(mpn.normalize(line), return_str=True)
        print(tokens, end='\n', file=fout) 

print('Finished tokenizing.')

Finished tokenizing.


In [138]:
# Dividing text into subword units

!$FASTBPE applybpe bpe.en_original.en tok.en_original.en bpecodes.en

print('Finished subword.')

Loading codes from bpecodes.en ...
Read 30000 codes from the codes file.
Loading vocabulary from tok.en_original.en ...
Read 245 words (147 unique) from text file.
Applying BPE to tok.en_original.en ...
Modified 245 words from text file.
Finished subword.


In [139]:
# Binarize text
!fairseq-preprocess \
    --source-lang en \
    --target-lang de \
    --testpref bpe.en_original \
    --only-source \
    --srcdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.en.txt \
    --tgtdict /export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble/dict.de.txt \
    --destdir data-bin_original_en-de \
    --workers 8

print('Finished preprocessing.')

2023-10-09 15:10:42 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_en-de', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcd

In [34]:
MODELS="/export/data4/vzhekova/biases-data/En-De/wmt19.en-de.joined-dict.ensemble"

# Generate translations
!fairseq-generate data-bin_original_en-de  \
    --task translation \
    --source-lang en \
    --target-lang de \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --sampling \
    --temperature 0.9 \
    --beam 50 \
    --nbest 50 \
    --batch-size 4 \
    --memory-efficient-fp16 \
    --remove-bpe > original_en-de.decode_Beam_10.log

print('Finished translation.')

2023-10-26 10:57:56 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [35]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_en-de.decode_Beam_10.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_50.txt

## Extract 10 unique sentences for every source

In [36]:
# List with nbest sentences for every source in original
nbest_original = []
counter = 0
temp = []
with open('hyp_original_50.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 50):
            nbest_original.append(temp)
            counter = 0
            temp = []

#print(nbest_original)

sent_hyp = []
for elem in nbest_original:
    unique_sent = set()
    for sent in elem:
        unique_sent.add(sent)
        if (len(unique_sent) == 10):
            sent_hyp.extend(list(unique_sent))
            break
        
print (len(sent_hyp))

with open('hyp_original.txt','w') as fout:
    for sent in sent_hyp:
        print(sent, end='\n', file=fout)

100


# Backtranslation German-English

In [37]:
# Dividing tokenized text into subword units

!$FASTBPE applybpe bpe.hyp_original.de hyp_original.txt bpecodes.de

print('Finished subword.')

Loading codes from bpecodes.de ...
Read 30000 codes from the codes file.
Loading vocabulary from hyp_original.txt ...
Read 2189 words (363 unique) from text file.
Applying BPE to hyp_original.txt ...
Modified 2189 words from text file.
Finished subword.


In [38]:
!fairseq-preprocess \
    --source-lang de \
    --target-lang en \
    --only-source \
    --testpref bpe.hyp_original \
    --srcdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.de.txt \
    --tgtdict /export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble/dict.en.txt \
    --destdir data-bin_original_de-en \
    --workers 8

print('Finished preprocessing.')

2023-10-26 11:07:24 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin_original_de-en', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='de', srcd

In [55]:
MODELS="/export/data4/vzhekova/biases-data/De-En/wmt19.de-en.joined-dict.ensemble"

# Generate backtranslations
!fairseq-generate data-bin_original_de-en  \
    --task translation \
    --source-lang de \
    --target-lang en \
    --path $MODELS/model1.pt:$MODELS/model2.pt:$MODELS/model3.pt:$MODELS/model4.pt \
    --sampling \
    --beam 50 \
    --nbest 50 \
    --batch-size 4 \
    --memory-efficient-fp16 \
    --remove-bpe > original_de-en.decode_Beam_10_backtranslation.log

print('Finished translation.')

2023-10-26 11:19:32 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [56]:
# 'LC_ALL=C sort -V' sorts the results in natural order 
!grep ^H original_de-en.decode_Beam_10_backtranslation.log | LC_ALL=C sort -V | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_original_back_50.txt

## Extract 10 unique sentences for every source

In [57]:
# List with nbest sentences for every source in original
nbest_original = []
counter = 0
temp = []
with open('hyp_original_back_50.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 50):
            nbest_original.append(temp)
            counter = 0
            temp = []

#print(nbest_original)

sent_hyp = []
for elem in nbest_original:
    unique_sent = set()
    for sent in elem:
        unique_sent.add(sent)
        if (len(unique_sent) == 10):
            sent_hyp.extend(list(unique_sent))
            break
        
print (len(sent_hyp))

with open('hyp_original_back.txt','w') as fout:
    for sent in sent_hyp:
        print(sent, end='\n', file=fout)

1000


In [58]:
# Detokenize text        
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from __future__ import print_function

md_en = MosesDetokenizer(lang='en')

with open('hyp_original_back.txt', encoding='utf8') as fin, open('original_back.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished detokenizing.')

Finished detokenizing.


# Statistics on translations

In [72]:
# List with original source sentences
source = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source.append(line.strip())
          
# List with nbest sentences for every source in original
nbest_original = []
counter = 0
temp = []
with open('hyp_original.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 10):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
print(len(source))
print(len(nbest_original))

10
10


In [73]:
# Count unique sentences in source nbest list for each source sentence
def count_unique_sentences(nbest_sentences):
    unique_sent = []
    for source_nbest in nbest_sentences:
        num_values = len(set(source_nbest))
        #print(num_values)
        unique_sent.append(num_values)

    #print(unique_sent)
    return unique_sent 

In [74]:
# Value should be 10, because beam search generates 10 unique sentences
print(count_unique_sentences(nbest_original))

[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


# Statistics on backtranslations

In [79]:
# List with original source sentences
source = []
with open('en_original.txt', 'r') as fin:
    for line in fin:
        source.append(line.strip())      
    
# List with nbest sentences for every source in original 
nbest_original = []
counter = 0
temp = []
with open('original_back.txt', 'r') as fin:
    for line in fin:
        temp.append(line.strip())
        counter += 1
        if (counter == 100):
            nbest_original.append(temp)
            counter = 0
            temp = []
            
print(len(nbest_original))

10


In [80]:
print(count_unique_sentences(nbest_original))

[66, 78, 98, 100, 100, 100, 100, 100, 100, 100]


## Average results from masking

In [30]:
unique_1st = []
with open('unique_1st.txt', 'r') as fin:
    for line in fin:
        unique_1st.append([int(i) for i in line.strip().split(",")])    
        
print(unique_1st)

unique_2nd = []
with open('unique_2nd.txt', 'r') as fin:
    for line in fin:
        unique_2nd.append([int(i) for i in line.strip().split(",")])    
        
print(unique_2nd)

unique_3rd = []
with open('unique_3rd.txt', 'r') as fin:
    for line in fin:
        unique_3rd.append([int(i) for i in line.strip().split(",")])    
        
print(unique_3rd)

unique_4th = []
with open('unique_3rd.txt', 'r') as fin:
    for line in fin:
        unique_4th.append([int(i) for i in line.strip().split(",")])    
        
print(unique_4th)

unique_5th = []
with open('unique_3rd.txt', 'r') as fin:
    for line in fin:
        unique_5th.append([int(i) for i in line.strip().split(",")])    
        
print(unique_5th)


ValueError: invalid literal for int() with base 10: '[96'

In [282]:
unique_average_5 = []
for i in range(0, len(source)):
    #print(list(zip(unique_1st[i], unique_2nd[i], unique_3rd[i], unique_4th[i], unique_5th[i])))
    average = map(lambda x: (x[0] + x[1] + x[2] + x[3] + x[4])/5, zip(unique_1st[i], unique_2nd[i], unique_3rd[i], unique_4th[i], unique_5th[i]))
    unique_average_5.append(list(average))

print(unique_average_5)

[[38.4, 55.0, 46.0, 30.4, 49.4, 27.2, 24.8, 42.8, 30.2], [54.8, 68.2, 65.2, 65.2, 60.8, 71.8, 63.0, 60.6, 61.2, 37.2, 71.2], [70.0, 68.4, 68.8, 80.8, 68.6, 70.4, 65.4, 64.0, 75.0, 72.2, 65.4, 79.0, 69.4, 71.0, 63.6, 76.6, 74.4, 70.0, 78.4, 69.2, 74.2], [64.8, 54.8, 53.8, 49.0, 56.8, 56.0, 52.8, 59.6, 57.8, 45.2, 56.4, 58.4, 51.0, 52.8, 55.0, 60.0, 56.8, 36.2, 56.0, 55.8, 49.8], [38.2, 35.2, 27.8, 29.4, 26.8, 32.8, 26.8, 42.4, 28.4, 50.0, 26.8, 25.8, 34.0, 24.4, 39.6, 31.0, 37.8, 30.4, 43.8, 32.0, 40.8, 36.4, 24.8, 40.4, 47.4, 50.2, 33.0], [64.8, 62.8, 72.2, 52.0, 65.6, 60.6, 67.4, 58.8, 68.2, 65.6, 59.6, 58.2, 69.6, 67.4, 69.4, 49.8, 57.4, 65.8, 68.2, 72.4, 56.0, 71.0, 67.8, 59.8, 50.6, 68.6, 72.2, 61.2, 52.6, 60.0], [62.8, 66.2, 68.6, 64.8, 67.4, 65.2, 73.6, 55.6, 61.6, 73.8, 64.8, 55.2, 69.8, 71.2, 65.0, 54.0, 67.4, 78.6, 56.8, 73.0, 92.8, 75.0, 82.2, 77.8, 67.8, 69.4, 83.8], [40.6, 52.6, 45.0, 46.6, 47.4, 35.2, 48.0, 39.0, 44.6, 47.6, 41.4, 46.2, 69.0, 39.2, 44.0, 36.4, 43.6, 45.8, 

In [283]:
unique_average_3 = []
for i in range(0, len(source)):
    #print(list(zip(unique_1st[i], unique_2nd[i], unique_3rd[i])))
    average = map(lambda x: (x[0] + x[1] + x[2])/3, zip(unique_1st[i], unique_2nd[i], unique_3rd[i]))
    unique_average_3.append(list(average))

print(unique_average_3)

[[36.666666666666664, 57.0, 42.0, 32.0, 45.666666666666664, 26.666666666666668, 27.333333333333332, 38.666666666666664, 31.0], [55.333333333333336, 67.66666666666667, 64.66666666666667, 64.66666666666667, 61.333333333333336, 70.33333333333333, 63.0, 61.666666666666664, 64.0, 35.333333333333336, 70.0], [71.33333333333333, 73.33333333333333, 68.66666666666667, 82.0, 71.66666666666667, 75.33333333333333, 65.66666666666667, 66.66666666666667, 76.33333333333333, 72.33333333333333, 65.0, 77.66666666666667, 64.33333333333333, 59.666666666666664, 60.666666666666664, 74.33333333333333, 74.66666666666667, 69.33333333333333, 78.0, 70.0, 73.66666666666667], [62.0, 53.333333333333336, 51.666666666666664, 50.333333333333336, 52.666666666666664, 51.333333333333336, 52.0, 57.333333333333336, 55.666666666666664, 44.0, 61.333333333333336, 59.333333333333336, 50.333333333333336, 50.0, 57.0, 61.333333333333336, 54.0, 35.666666666666664, 55.333333333333336, 55.0, 49.666666666666664], [40.333333333333336, 3

In [284]:
import statistics

unique_median = []
for i in range(0, len(source)):
    med = map(lambda x: statistics.median([x[0], x[1], x[2] + x[3] + x[4]]), zip(unique_1st[i], unique_2nd[i], unique_3rd[i], unique_4th[i], unique_5th[i]))
    unique_median.append(list(med))

print(unique_median)

[[39, 66, 38, 35, 50, 32, 32, 35, 32], [58, 70, 64, 64, 66, 74, 63, 63, 72, 38, 74], [77, 89, 70, 84, 77, 95, 68, 72, 79, 77, 66, 84, 61, 53, 63, 75, 76, 69, 91, 74, 78], [64, 54, 58, 54, 58, 47, 58, 58, 58, 46, 69, 63, 58, 61, 61, 68, 58, 40, 58, 55, 57], [44, 39, 26, 34, 36, 27, 27, 26, 38, 48, 27, 26, 27, 27, 37, 27, 36, 26, 45, 32, 49, 36, 27, 45, 28, 67, 28], [76, 70, 75, 70, 70, 75, 80, 70, 70, 71, 70, 71, 70, 72, 70, 70, 70, 74, 70, 70, 70, 70, 70, 67, 54, 67, 72, 67, 70, 67], [70, 70, 73, 68, 70, 70, 82, 70, 65, 77, 83, 76, 56, 70, 70, 62, 78, 71, 73, 70, 97, 71, 71, 90, 69, 70, 79], [44, 56, 58, 46, 41, 40, 40, 47, 54, 48, 56, 40, 47, 41, 52, 40, 52, 40, 47, 49, 44, 50, 40, 51, 42, 55, 63, 50, 47, 48, 46, 49, 41, 53, 50, 40, 44, 40, 40, 52], [45, 53, 63, 48, 35, 46, 31, 37, 39, 29, 33, 39, 32, 32, 38, 36, 35, 24, 19, 31], [53, 35, 35, 44, 35, 65, 36, 41, 50, 74, 56, 53, 48, 63, 48, 69, 44, 90, 64, 60, 52, 52, 41, 64, 73, 40, 35, 40, 38, 45, 35, 44, 37, 35, 35, 35, 39, 29, 51]]

## Subtract original sentences value from average

In [285]:
unique_original = count_unique_sentences(nbest_original)

print(unique_original)

[32, 63, 68, 61, 26, 67, 70, 46, 31, 48]


In [286]:
# List with original source sentences
samples = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        samples.append(line.strip().split(" "))

unique_result = []
for c in range(0, len(unique_original)):
    subtracted = [round(abs(el - unique_original[c]), 2) for el in unique_average_3[c]] # calculate the absolute value
    print(subtracted)
    unique_result.append(subtracted)
    print(samples[c])
    idx = subtracted.index(max(subtracted)) # extracting the largest difference
    
    #Extract replacement words
    with open('1st/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    with open('2nd/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    with open('3rd/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    with open('4th/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    with open('5th/sen' + str(c+1) + '.txt','r') as fin:
        for i, line in enumerate(fin):
            if i == idx:
                print(line.strip().split(" ")[idx])
                
    print((max(subtracted), idx, samples[c][idx]), end='\n\n')

[4.67, 25.0, 10.0, 0.0, 13.67, 5.33, 4.67, 6.67, 1.0]
['So', 'now', 'Thomson', 'becomes', 'the', 'more', 'likely', 'suspect', '.']
,
james
david
peter
john
(25.0, 1, 'now')

[7.67, 4.67, 1.67, 1.67, 1.67, 7.33, 0.0, 1.33, 1.0, 27.67, 7.0]
['There', 'was', 'one', 'black', 'professor', 'and', 'one', 'black', 'assistant', 'dean', '.']
man
woman
girl
guy
boy
(27.67, 9, 'dean')

[3.33, 5.33, 0.67, 14.0, 3.67, 7.33, 2.33, 1.33, 8.33, 4.33, 3.0, 9.67, 3.67, 8.33, 7.33, 6.33, 6.67, 1.33, 10.0, 2.0, 5.67]
['We', 'have', 'our', 'cognitive', 'biases', ',', 'so', 'that', 'I', 'can', 'take', 'a', 'perfect', 'history', 'on', 'a', 'patient', 'with', 'chest', 'pain', '.']
own
usual
research
same
personal
(14.0, 3, 'cognitive')

[1.0, 7.67, 9.33, 10.67, 8.33, 9.67, 9.0, 3.67, 5.33, 17.0, 0.33, 1.67, 10.67, 11.0, 4.0, 0.33, 7.0, 25.33, 5.67, 6.0, 11.33]
['That', '&apos;s', 'the', 'officer', 'who', 'emailed', 'me', 'back', ',', 'saying', 'I', 'think', 'you', 'can', 'have', 'a', 'few', 'classes', 'with', 

- Extract 5 best from the list

In [287]:
# List with original source sentences
samples = []
with open('tok.en_original.en', 'r') as fin:
    for line in fin:
        samples.append(line.strip().split(" "))

unique_result = []
for c in range(0, len(unique_original)):
    subtracted = [round(abs(el - unique_original[c]), 2) for el in unique_average_3[c]] # calculate the absolute value
    print(subtracted)
    unique_result.append(subtracted)
    print(samples[c])
    
    subtracted_sorted = sorted(subtracted, reverse=True)
    best_5 = subtracted_sorted[:5] # extracting the 5 largest differences
    idx_5 = [subtracted.index(i) for i in best_5]
    result_5 = [samples[c][idx] for idx in idx_5]
    
#     #Extract replacement words
#     with open('1st/sen' + str(c+1) + '.txt','r') as fin:
#         for i, line in enumerate(fin):
#             if i == idx:
#                 print(line.strip().split(" ")[idx])
                
#     with open('2nd/sen' + str(c+1) + '.txt','r') as fin:
#         for i, line in enumerate(fin):
#             if i == idx:
#                 print(line.strip().split(" ")[idx])
                
#     with open('3rd/sen' + str(c+1) + '.txt','r') as fin:
#         for i, line in enumerate(fin):
#             if i == idx:
#                 print(line.strip().split(" ")[idx])
                
#     with open('4th/sen' + str(c+1) + '.txt','r') as fin:
#         for i, line in enumerate(fin):
#             if i == idx:
#                 print(line.strip().split(" ")[idx])
                
#     with open('5th/sen' + str(c+1) + '.txt','r') as fin:
#         for i, line in enumerate(fin):
#             if i == idx:
#                 print(line.strip().split(" ")[idx])
                
    print((best_5, idx_5, result_5), end='\n\n')

[4.67, 25.0, 10.0, 0.0, 13.67, 5.33, 4.67, 6.67, 1.0]
['So', 'now', 'Thomson', 'becomes', 'the', 'more', 'likely', 'suspect', '.']
([25.0, 13.67, 10.0, 6.67, 5.33], [1, 4, 2, 7, 5], ['now', 'the', 'Thomson', 'suspect', 'more'])

[7.67, 4.67, 1.67, 1.67, 1.67, 7.33, 0.0, 1.33, 1.0, 27.67, 7.0]
['There', 'was', 'one', 'black', 'professor', 'and', 'one', 'black', 'assistant', 'dean', '.']
([27.67, 7.67, 7.33, 7.0, 4.67], [9, 0, 5, 10, 1], ['dean', 'There', 'and', '.', 'was'])

[3.33, 5.33, 0.67, 14.0, 3.67, 7.33, 2.33, 1.33, 8.33, 4.33, 3.0, 9.67, 3.67, 8.33, 7.33, 6.33, 6.67, 1.33, 10.0, 2.0, 5.67]
['We', 'have', 'our', 'cognitive', 'biases', ',', 'so', 'that', 'I', 'can', 'take', 'a', 'perfect', 'history', 'on', 'a', 'patient', 'with', 'chest', 'pain', '.']
([14.0, 10.0, 9.67, 8.33, 8.33], [3, 18, 11, 8, 8], ['cognitive', 'chest', 'a', 'I', 'I'])

[1.0, 7.67, 9.33, 10.67, 8.33, 9.67, 9.0, 3.67, 5.33, 17.0, 0.33, 1.67, 10.67, 11.0, 4.0, 0.33, 7.0, 25.33, 5.67, 6.0, 11.33]
['That', '&apos