In [1]:
from google.colab import drive
drive.mount("/content/drive")
%cd "/content/drive/MyDrive/Courses/Fall 2021/dlsys/DeepLearningSystems-Fall2021/HW3"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Courses/Fall 2021/dlsys/DeepLearningSystems-Fall2021/HW3


In [2]:
import os
import pickle
import pprint
from glob import glob

from src.routines import *

Since nonbreaking prefixes shouldn't be that common in Vietnamese, I created a prefix file containing just the Vietnamese alphabet, following the way some of the files in [mosesdecoder-nonbreaking_prefixes](https://github.com/moses-smt/mosesdecoder/tree/master/scripts/share/nonbreaking_prefixes) begin. I left out a few vowels that, when followed by a period, question mark, or exclamation mark, could signify the end of a sentence (these are usually used to express emotions, for example).

In [3]:
# Only need to run once 
DATA_PATH = 'data/eng_vie_translations'
OUTPUT_PATH = 'output'
MODEL_PATH = 'model'
MODEL_CONFIG_PATH = os.path.join(MODEL_PATH, 'config')
TRAIN_FILENAME = 'vie.txt'

URL_DATA = 'https://www.manythings.org/anki/vie-eng.zip'
URL_NONBREAKING_ROOT = 'https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/share/nonbreaking_prefixes/'
URL_NONBREAKING_FILES = ['nonbreaking_prefix.en', 'nonbreaking_prefix.vn']

if not os.path.exists(os.path.join(DATA_PATH, TRAIN_FILENAME)):
    !wget -nc "$URL_DATA" -P "$DATA_PATH"
    !unzip -f "$DATA_PATH"/*.zip -d "$DATA_PATH"

    url_nf_eng = URL_NONBREAKING_FILES[0]
    !wget -nc "$URL_NONBREAKING_ROOT$url_nf_eng" -P "$DATA_PATH"

In [5]:
# Build the baseline experiment configuration and pickle it, unless a baseline
# file from a previous run already exists (delete that file to regenerate it).
MODEL_PREFIX_NAME = 'engviet-transformer'
BASELINE_CONFIG_FILE = os.path.join(MODEL_CONFIG_PATH, MODEL_PREFIX_NAME + '_baseline.pkl')

if not os.path.exists(BASELINE_CONFIG_FILE):

    # File locations for the training corpus and the nonbreaking-prefix lists.
    data_files = configure_datafiles(
        data_path               = DATA_PATH,
        train_filename          = TRAIN_FILENAME,
        nonbreaking_filenames   = URL_NONBREAKING_FILES
    )

    # Dataset / tokenization settings.
    data_config = dict(
        num_samples         = 80000,
        max_vocab_size      = 2**14,
        max_length          = 15,
        batch_size          = 32
    )

    # Transformer hyperparameters for the baseline model.
    # NOTE(review): n_heads=32 with d_model=128 would leave 4 dimensions per
    # head if d_model is split evenly across heads -- confirm this is intended.
    model_config = dict(
        d_model             = 128,
        n_layers            = 4,
        FFN_units           = 512,
        n_heads             = 32,
        dropout_rate        = 0.1,
        act_fun             = 'swish',
        include_pos_enc     = True,
        vocab_size_factor   = 1
    )

    # Training-loop settings.
    train_config = dict(
        ckpt_max2keep       = 5,
        num_epochs          = 50,
        print_every         = 25
    )

    # Fixed English sentences stored with the config under
    # 'translator_sentences'; they range from short phrases to long,
    # ambiguous sentences.
    translator_sentences = [
        "you should pay for it.",
        "we have no extra money.",
        "This is a problem to deal with.",
        "This is a really powerful method!",
        "This is an interesting course about Natural Language Processing",
        "Why is deep learning so popular?",
        "Three frogs walk into the bar and demand six beers. They threaten to tell a joke so bad that it could physically hurt the bartender.",
        "The weather is getting crazily hot these days. Let's go for a swim!",
        "Three summers ago, the managers decided to halt all operations related to amphibians.",
        "In order to save the planet, humans will need to agree to the frog king's terms, which are insane.",
        "I am a translator that is translating four words while being physically translated 5 centimers to the left",
        "They can throw everything in the can next to the bank near the river after they withdraw all their money from the local bank."]

    # open(..., 'wb') does not create parent directories, so make sure the
    # config folder exists before writing (no-op when it is already there).
    os.makedirs(MODEL_CONFIG_PATH, exist_ok=True)

    with open(BASELINE_CONFIG_FILE, 'wb') as f:
        pickle.dump(dict(
            data_files              = data_files,
            data_config             = data_config,
            model_config            = model_config,
            train_config            = train_config,
            translator_sentences    = translator_sentences,
            ), f, protocol=pickle.HIGHEST_PROTOCOL)



In [6]:
# Derive one config file per experiment run from the baseline config.
# Each entry maps an experiment name to at most ONE model_config key and the
# list of values to try for it; an empty dict means "baseline, unchanged".
exp_variations = dict(
    baseline        = dict(),
    vary_ffnunits   = dict(FFN_units = [256, 1024]),
    vary_dropout    = dict(dropout_rate = [0.05, 0.3])
)

# Running counter used to give every generated model a unique, ordered name.
model_id = 0

for exp_name, param_dict in exp_variations.items():
    if len(param_dict) == 0:
        # Baseline: nothing to override. The string 'None' (not the None
        # object) is the sentinel stored in model_info and checked below.
        param_name = 'None'
        param_vals = [None]
    else:
        if len(param_dict) != 1:
            # Raise a real exception: `raise('...')` with a bare string is
            # itself a TypeError in Python 3 and hides the intended message.
            raise ValueError('Not allowed, only one parameter to vary per experiment')
        param_name, param_vals = next(iter(param_dict.items()))

    for v in param_vals:
        model_name = '%s-%02d' %(MODEL_PREFIX_NAME, model_id)
        model_config_file = os.path.join(MODEL_CONFIG_PATH, '%s.pkl' %(model_name))

        # Re-read the baseline for every run so each config starts from a
        # fresh copy and overrides from one run never leak into the next.
        # (The pickle is produced by this notebook; do not point this at
        # untrusted files.)
        with open(BASELINE_CONFIG_FILE, 'rb') as f:
            model_exp_config = pickle.load(f)

        # Bookkeeping that identifies this run and what was varied.
        model_exp_config['model_info'] = dict(
            desc        = MODEL_PREFIX_NAME,
            exp_name    = exp_name,
            exp_param   = param_dict,
            param_name  = param_name,
            param_val   = v,
            model_name  = model_name,
            model_id    = model_id,
            config_file = model_config_file
        )

        # Apply the single hyperparameter override (skipped for the baseline).
        if param_name != 'None':
            model_exp_config['model_config'][param_name] = v

        model_exp_config['output_files'] = configure_outputfiles(model_name, output_path=OUTPUT_PATH, checkpoint_path=MODEL_PATH)

        with open(model_config_file, 'wb') as f:
            pickle.dump(model_exp_config, f, protocol=pickle.HIGHEST_PROTOCOL)

        model_id += 1