# Truecasing

In [1]:
import os

def truecase(experiment_folder):
    script = "/home/samuel/Software/mosesdecoder/scripts/recaser/train-truecaser.perl"
    dev_set = "/home/samuel/Documents/research_and_development/tweet-norm-dev_"
    
    print("Learning the truecaser model...")
    for extension in ["source", "target"]:
        !{script} --corpus dev_set{extension}.txt --corpus {experiment_folder}final_paraphrases.{extension}

# Cleaning

In [2]:
def clean(experiment_folder):
    script = "/home/samuel/Software/mosesdecoder/scripts/training/clean-corpus-n.perl"
    !{script} {experiment_folder}final_paraphrases source target paraphrases.clean 1 80
    print("Cleaned file.")

# Language Model

In [3]:
def create_lm_counts(input_dir, output_dir):
    ''' Create language models counts for all the files in a directory '''
    
    filenames = os.listdir(input_dir)
    for filename in filenames:
        !/local/kurs/mt/srilm/bin/i686-m64/ngram-count -kndiscount -text {input_dir}{filename} -lm {ouput_dir}/{filename}.counts -order 5 
        #!head n1 {input_dir}{filename}
        print("Created LM counts for file {filename}".format(**locals()))


# Translation Model

In [4]:
def train_smt(experiment_folder):
    script = "/home/samuel/Software/mosesdecoder/scripts/training/train-model.perl"
    giza = "/home/samuel/Software/mosesdecoder/tools"
    lm_file = "/home/samuel/Documents/research_and_development/language_model/lm.gz"
    corpus = "{experiment_folder}final_paraphrases".format(**locals())
    
    print("Training SMT model...")
    !{script} --corpus {corpus} --f source --e target --root-dir {experiment_folder}translation_model/ --lm 0:5:{lm_file} --external-bin-dir {giza} >{experiment_folder}logfile 2>&1
    print("SMT model trained.")

# Tuning

In [5]:
def tune_smt(experiment_folder):
    script = "/home/samuel/Software/mosesdecoder/scripts/training/mert-moses.pl"
    dev_source = "/home/samuel/Documents/research_and_development/dev_set/tweet-norm-dev_source.txt"
    dev_target = "/home/samuel/Documents/research_and_development/dev_set/tweet-norm-dev_target.txt"
    ini_file = "{experiment_folder}translation_model/model/moses.ini".format(**locals())
    tuning_dir = "{experiment_folder}tuning/".format(**locals())
    moses = "/home/samuel/Software/mosesdecoder/bin/moses"
    !mkdir -p {tuning_dir}
    
    print("Tuning SMT model...")
    !{script} {dev_source} {dev_target} {moses} {ini_file} --working-dir {tuning_dir} --mertdir /home/samuel/Software/mosesdecoder/bin >{tuning_dir}logfile.mert 2>&1 --decoder-flags="-threads 4"
    print("SMT model tuned.")

# Significance Test

In [6]:
def significance_test_pruning(experiment_folder):
    file = f"{experiment_folder}final_paraphrases."
    index_script = "/home/samuel/Software/salm/Bin/Linux/Index/IndexSA.O64"
    prune_script = "/home/samuel/Software/mosesdecoder/contrib/sigtest-filter/filter-pt"
    phrase_table_dir = f"{experiment_folder}/tuning/filtered/"
    
    print("Getting indices...")
    for extension in ["source", "target"]:
        !{index_script} {file}{extension}
    
    print("Unziping phrasetable...")
    !zcat {phrase_table_dir}phrase-table.0-0.1.1.gz > {phrase_table_dir}phrase-table
    
    print("Prunning phrasetable...")
    !cat {phrase_table_dir}phrase-table | {prune_script} -e {file}target -f {file}source -l a+e -n 30 > {phrase_table_dir}phrase-table.pruned
    print("Prunned Phrasetable.")
    !wc l 

In [7]:
def get_lines(experiment_folder):
    path_tuned = f"{experiment_folder}tuning/filtered/phrase-table"
    path_pruned = f"{experiment_folder}tuning/filtered/phrase-table.pruned"
    print("Tunned:")
    !wc -l {path_tuned}
    print("Pruned:")
    !wc -l {path_pruned}
    print()
    

# Testing Tuned Translation Model

In [8]:
def test_tuned(experiment_folder):
    script = "/home/samuel/Software/mosesdecoder/bin/moses"
    test_set_source = "/home/samuel/Documents/research_and_development/test_set/tweet-norm-test_source"
    !{script} -f {experiment_folder}tuning/moses.ini < {test_set_source} > results.tuned
    print("Tuned Translation Model.")


# Evaluate CER and Accuracy

In [12]:
def evaluate_cer_acc(experiment_folder):
    script = "/home/samuel/Software/mosesdecoder/bin/moses"
    test_set_source = "/home/samuel/Documents/research_and_development/test_set/tweet-norm-test_source"
    test_set_target = "/home/samuel/Documents/research_and_development/test_set/tweet-norm-test_target"
    modes = ["tuning"]
    for mode in modes:
        !{script} -f {experiment_folder}tuning/moses.ini < {test_set_source} > {experiment_folder}result.{mode} 2>/dev/null
        acc = !python3 eval_acc.py {experiment_folder}result.{mode}  {test_set_target} {test_set_source} {experiment_folder}acc.error.{mode}
        cer = !python3 eval_cer.py {experiment_folder}result.{mode}  {test_set_target} {test_set_source} {experiment_folder}cer.error.{mode}
        print("CER:\n{cer[0]}\nAcc:\n {acc[0]}\n\n".format(**locals()))
        return acc, cer

# Run experiments

In [33]:
import os
def run_experiments(experiment_folder):
    """Run the enabled pipeline stages for the selected experiment.

    Scans the entries of ``experiment_folder`` and processes only the one
    named ``jaccard_0.5``; every other entry is skipped.

    Args:
        experiment_folder: Directory containing one sub-folder per
            experiment. Must end with "/" because the experiment path is
            built by string concatenation.
    """
    selected = "jaccard_0.5"
    for experiment in os.listdir(experiment_folder):
        if experiment != selected:
            continue

        print(f"Runing experiment for {experiment}")
        experiment_path = f"{experiment_folder}{experiment}/"

        # Pipeline stages; uncomment the ones that should be (re)run.
        #clean(experiment_path)
        #train_smt(experiment_path)
        #tune_smt(experiment_path)
        #significance_test_pruning(experiment_path)
        evaluate_cer_acc(experiment_path)
        #get_lines(experiment_path)

        

In [34]:
# Root directory whose sub-folders are the individual experiments. The trailing
# "/" is required because run_experiments builds paths by string concatenation.
experiment_folder = "/home/samuel/Documents/research_and_development/new_experiments_delete/"
run_experiments(experiment_folder)
# NOTE(review): unlabeled value left by the author — presumably a score from an
# earlier run; confirm what it refers to.
#0.5024509803921569

Runing experiment for jaccard_0.5
CER:
0.17167001080413644
Acc:
 accuracy: 0.16176470588235295


