# Onception

Online learning x (Active Learning x (Online learning x Machine Translation))

In [None]:
from pathlib import Path # only supported in Python 3
import os
import re

from io_utils import *
from nlp_utils import *
from OnlineAlgorithm import *
from QueryStrategy import *
from TaskModel import *
from Instance import *
from qs_params import *

print("Done.")

### Parameters

Online learning x Machine Translation

In [None]:
# --- Inner online learner (its arms are the MT systems) ---
# Algorithm name; alternatives noted by the author: "EXP3", "EWAF".
algorithm_mt = "EXP3"
# Eta for the weight update (noted as relevant for EWAF).
eta_param_mt = 8
# Number of decimal places kept for rewards.
dp_mt = 2
# Reward source; options noted by the author:
# "human", "human-avg", "human-comet", "comet", "bleu".
reward_function_mt = "human"

# --- Language pair ---
src_lang = "lt"
mt_lang = "en"

# "<src>-<mt>" key (here "lt-en"); reused below for folder and file names.
lang = "-".join((src_lang, mt_lang))

Online learning x Active Learning

In [None]:
# --- Outer online learner (its arms are the query strategies) ---
# Algorithm name ("EWAF" or "EXP3") and eta for the weight update (EWAF).
algorithm_al, eta_param_al = "EWAF", 8
# Number of decimal places kept for rewards.
dp_al = 2
# Name of the reward signal handed to the outer learner.
reward_function_al = "regret"

# AL

# Query-strategy preset for the current language pair and MT algorithm.
# Preset keys noted by the author: "All", "NoDen", "NoPrism".
query_strategies_exp = query_strategies_params[lang][algorithm_mt]["All"]

# Characters stripped from str(params) to get a compact label: quotes,
# parentheses, commas and spaces. Written as a raw-string character class so
# that no backslash escapes are needed (a non-raw "\(" is an invalid string
# escape and triggers a warning on recent Python versions). Compiled once,
# outside the loop.
_qs_params_noise = re.compile(r"['(), ]")

# One label per strategy: its name immediately followed by its cleaned params.
query_strategies_exp_str = []
for qs in query_strategies_exp:
    qs_name, qs_params = qs
    qs_params2 = _qs_params_noise.sub("", str(qs_params))
    query_strategies_exp_str.append(qs_name + qs_params2)

# "+"-joined label of all strategies; it ends up in the result file names.
query_strategies_str = "+".join(query_strategies_exp_str)
print(query_strategies_str)

Setup

In [None]:
# Identifier of this run; it becomes the "run<N>" suffix of the experiment tag.
run = "1"

# Input and output locations, relative to the working directory.
data_folder = Path("datasets/{}/".format(lang))
emb_folder = Path("embeddings/")
results_folder = Path("results/{}/".format(lang))

# Result files are opened for writing further down; create the folder up front
# so a fresh checkout does not fail with FileNotFoundError after the (slow)
# data loading. No-op when the folder already exists.
results_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Experiment tag used in every result file name: outer (AL) settings, the
# query-strategy label, inner (MT) settings, then the run id, joined by "_".
_settings_fields = (
    algorithm_al, dp_al, eta_param_al, query_strategies_str,
    algorithm_mt, reward_function_mt, dp_mt, eta_param_mt,
)
exp_settings = "_".join(str(field) for field in _settings_fields) + "_run" + str(run)

### Data loading

In [None]:
# Input files for the selected language pair.
learn_sent_ids_filepath = data_folder / "shuf_ids.txt"
corpus_filepath = data_folder / (lang + "_prism.pickle")

# Sentence ids, in the order in which the learning loop visits them.
learning_ids = load_int_list_from_txt(learn_sent_ids_filepath)
print("First:", learning_ids[0])

# Corpus dataframe; presumably one row per (sentence, MT system) pair -- TODO confirm.
full_corpus = load_dataframe_from_pickle(corpus_filepath)

# Show the first row as a quick sanity check of the loaded columns.
print(full_corpus.iloc[0])

print("Done.")

In [None]:
# Pickled embedding files: one for the source sentences, one for the MT outputs.
src_emb_filepath = emb_folder / ("BERT_" + lang + "_src_" + src_lang + ".pickle")
mt_emb_filepath = emb_folder / ("BERT_" + lang + "_mt_" + mt_lang + ".pickle")

# src_embeddings is later indexed by sentence id,
# mt_embeddings by model name and then sentence id.
src_embeddings = load_dict_from_pickle(src_emb_filepath)
mt_embeddings = load_dict_from_pickle(mt_emb_filepath)

In [None]:
# Name of the sentence-id column: 'sent_id' for the "en-de" corpus, 'sid' otherwise.
idx = 'sent_id' if lang == "en-de" else 'sid'

# Every MT system present in the corpus becomes one TaskModel.
all_model_names = list(full_corpus.system.unique())
num_of_models = len(all_model_names)
all_models = []

for model_name in all_model_names:

    # Corpus rows produced by this MT system.
    system_info = full_corpus.loc[full_corpus['system'] == model_name]
    sentence_ids = system_info[idx]

    # Per-sentence lookups (sentence id -> value), one per corpus column.
    translations, human_scores, bleu_scores, comet_scores = (
        dict(zip(sentence_ids, system_info[column]))
        for column in ('mt', 'raw_score', 'bleu_score', 'comet_score')
    )

    model = TaskModel(
        model_name,
        translations,
        human_scores,
        bleu_scores,
        comet_scores,
        num_of_models,
        reward_function_mt,
    )
    all_models.append(model)

    print(model)


### Outer online learning init

In [None]:
# The outer learner gets one arm per query strategy.
num_qs = len(query_strategies_exp)
print(num_qs)

# Outer online learner, configured from the "Online learning x Active Learning" parameters.
oa_al = init_online_algorithm(
    algorithm_al,
    num_qs,
    decimal_places=dp_al,
    eta_value=eta_param_al,
    reward_function=reward_function_al,
)
print(oa_al)

### Active Learning init

In [None]:
# Instantiate one query-strategy object per (name, params) entry of the preset,
# echoing each one as it is created.
query_strategies = []

for qs_name, params in query_strategies_exp:
    new_strategy = init_query_strategy(qs_name, params, num_qs, reward_function_al)
    print(new_strategy)
    query_strategies.append(new_strategy)

print(len(query_strategies))
print("Done.")

In [None]:
# L: instances already used for learning (filled by the learning loop).
# U: pool of instances not used yet; built here, one Instance per sentence id.
L = []
U = []

for sent_id in learning_ids:

    # All corpus rows that belong to this sentence (one per MT system).
    full_info = full_corpus.loc[full_corpus[idx] == sent_id]

    # The source side is taken from the first matching row.
    src_sentence = str(full_info['src'].iloc[0])
    src_sentence_pp = remove_punctuation(src_sentence)
    src_embedding = src_embeddings[sent_id]

    inst = Instance(sent_id, src_sentence, src_sentence_pp, src_embedding)

    model_names = list(full_info.system.unique())

    for model_name in model_names:
        # Build the mask from full_info itself. A mask derived from the whole
        # corpus has a different index than full_info, so pandas would have to
        # realign a corpus-sized mask for every (sentence, system) pair, and
        # that realignment is not reliable when index labels repeat.
        system_info = full_info.loc[full_info['system'] == model_name]

        mt_sentence = str(system_info['mt'].iloc[0])
        mt_sentence_pp = remove_punctuation(mt_sentence)
        mt_embedding = mt_embeddings[model_name][sent_id]

        mt_prism_score = float(system_info['prism_score'].iloc[0])

        # Attach this system's translation and its features to the instance.
        inst.add_mt(model_name, mt_sentence, mt_sentence_pp, mt_embedding, mt_prism_score)

    U.append(inst)

print(len(U))

### Inner Online Learning init

In [None]:
# Inner online learner: one arm per MT system, configured from the
# "Online learning x Machine Translation" parameters.
oa_mt = init_online_algorithm(
    algorithm_mt,
    num_of_models,
    decimal_places=dp_mt,
    eta_value=eta_param_mt,
    reward_function=reward_function_mt,
)
print(oa_mt)

In [None]:
# CSV tracking the inner learner's weights over time.
weights_filepath = results_folder / ("weights_" + exp_settings + ".csv")

# Header row: model names. First data row: the initial weights.
models_str = [str(m.model_name) for m in all_models]
weights_str = list(map(str, oa_mt.weights_as_probabilities))

# "w" starts the file afresh; the learning loop appends further rows.
with weights_filepath.open("w", encoding="utf8") as f:
    for row in (models_str, weights_str):
        f.write(','.join(row) + "\n")

In [None]:
# CSV tracking the outer learner's weights over time.
weights_qs_filepath = results_folder / ("weights_qs_" + exp_settings + ".csv")

# Header row: "<name>-<sm>" per query strategy. First data row: the initial weights.
qs_str = ["{}-{}".format(strategy.name, str(strategy.sm)) for strategy in query_strategies]
weights_qs_str = list(map(str, oa_al.weights_as_probabilities))

# "w" starts the file afresh; the learning loop appends further rows.
with weights_qs_filepath.open("w", encoding="utf8") as f:
    for row in (qs_str, weights_qs_str):
        f.write(','.join(row) + "\n")

In [None]:
# CSV with the inner learner's regret; only the header line is written here,
# the learning loop appends the values.
regret_mt_filepath = results_folder / ("regret_mt_" + exp_settings + ".csv")

with regret_mt_filepath.open("w", encoding="utf8") as f:
    f.write("Regret OL_MT" + "\n")

In [None]:
# CSV with the outer learner's regret; only the header line is written here,
# the learning loop appends the values.
regret_al_filepath = results_folder / ("regret_al_" + exp_settings + ".csv")

with regret_al_filepath.open("w", encoding="utf8") as f:
    f.write("Regret OL_AL" + "\n")

### Online learning process

In [None]:
# Main loop: visit the sentences in learning_ids order. At each step the outer
# learner (oa_al) votes on whether the current instance is used; if so, both
# learners are updated and their weights and regret are appended to the CSVs.

# 1-based iteration counter; t-1 is also the position of the current instance.
t = 1

# Snapshot of the pool: U shrinks inside the loop (U.remove), so positional
# access goes through this copy instead.
U_copy = U.copy()
# Instances that were visited but not selected.
U_log = []

for sent_id in learning_ids:

    # U was filled in learning_ids order, so position t-1 matches sent_id.
    current_inst = U_copy[t-1]

    print(" ------------------------- ITERATION {} -------------------------".format(t))

    # Query both learners before any update.
    # NOTE(review): f_prediction is not read again in this cell.
    f_prediction = oa_mt.forecaster(all_models, params=sent_id)
    vote = oa_al.forecaster(query_strategies, params=(current_inst, L, U_log))

    # The first iteration always updates, regardless of the vote.
    if t == 1 or vote:

        # Update the inner learner first: the outer update below reads its regret.
        oa_mt.update(all_models, t, params=sent_id)

        # Move the instance from the unused pool to the used set.
        L.append(current_inst)
        U.remove(current_inst)

        # The outer learner is given the inner learner's previous and current regret.
        oa_al.update(query_strategies, t, params=(oa_mt.previous_regret, oa_mt.regret, algorithm_mt))

        # Append the inner learner's (MT) current weights as one CSV row.
        weights_str = [str(w) for w in oa_mt.weights_as_probabilities]
        with weights_filepath.open("a", encoding="utf8") as f:
            print(','.join(weights_str), file=f)

        # Append the outer learner's (AL) current weights as one CSV row.
        weights_qs_str = [str(w) for w in oa_al.weights_as_probabilities]
        with weights_qs_filepath.open("a", encoding="utf8") as f:
            print(','.join(weights_qs_str), file=f)

        # Append the inner learner's (MT) regret.
        with regret_mt_filepath.open("a", encoding="utf8") as f:
            print(oa_mt.regret, file=f)

        # Append the outer learner's (AL) regret.
        with regret_al_filepath.open("a", encoding="utf8") as f:
            print(oa_al.regret, file=f)
    else:
        # Not selected: remember the instance; U_log is passed to later votes.
        U_log.append(current_inst)


    t = t + 1

print("Done.")