In [1]:
# Mount Google Drive inside the Colab VM and switch the working directory to
# the HW3 project folder, so the relative paths used by the cells below
# (e.g. 'data/...', 'model/...', 'output/...') resolve inside that folder.
from google.colab import drive
drive.mount("/content/drive")
%cd "/content/drive/MyDrive/Courses/Fall 2021/dlsys/DeepLearningSystems-Fall2021/HW3"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Courses/Fall 2021/dlsys/DeepLearningSystems-Fall2021/HW3


In [18]:
import os
import pickle
from glob import glob

import pandas as pd
import tensorflow as tf

from src.routines import *

In [3]:
# Only need to run once 
DATA_PATH = 'data/eng_spa_translations'
OUTPUT_PATH = 'output'
MODEL_PATH = 'model'
MODEL_CONFIG_PATH = os.path.join(MODEL_PATH, 'config')
TRAIN_FILENAME = 'spa.txt'

URL_DATA = 'https://www.manythings.org/anki/spa-eng.zip'
URL_NONBREAKING_ROOT = 'https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/share/nonbreaking_prefixes/'
URL_NONBREAKING_FILES = ['nonbreaking_prefix.en', 'nonbreaking_prefix.es']

if not os.path.exists(os.path.join(DATA_PATH, TRAIN_FILENAME)):
    !wget -nc "$URL_DATA" -P "$DATA_PATH"
    !unzip -f "$DATA_PATH"/*.zip -d "$DATA_PATH"

    for url_nf in URL_NONBREAKING_FILES: 
        !wget -nc "$URL_NONBREAKING_ROOT$url_nf" -P "$DATA_PATH"


In [71]:
# Location of the cached baseline configuration shared by every experiment.
BASELINE_CONFIG_FILE = os.path.join(MODEL_CONFIG_PATH, 'baseline.pkl')

# Build and persist the baseline only when no cached copy exists on disk.
if not os.path.exists(BASELINE_CONFIG_FILE):
    # Data file locations as resolved by the project helper
    # (its return structure is defined in src.routines, not shown here).
    data_files = configure_datafiles(
        data_path=DATA_PATH,
        train_filename=TRAIN_FILENAME,
        nonbreaking_filenames=URL_NONBREAKING_FILES,
    )

    # Dataset settings.
    data_config = {
        'num_samples': 80000,
        'max_vocab_size': 2**14,
        'max_length': 15,
        'batch_size': 64,
    }

    # Baseline model hyperparameters; experiments override one key at a time.
    model_config = {
        'd_model': 512,
        'n_layers': 4,
        'FFN_units': 512,
        'n_heads': 8,
        'dropout_rate': 0.1,
        'act_fun': 'relu',
        'include_pos_enc': True,
        'vocab_size_factor': 1,
    }

    # Training-loop settings.
    train_config = {
        'ckpt_max2keep': 5,
        'num_epochs': 1,
        'print_every': 100,
    }

    # English sentences stored alongside the config under 'translator_sentences'.
    translator_sentences = [
        "you should pay for it.",
        "we have no extra money.",
        "This is a problem to deal with.",
        "This is a really powerful method!",
        "This is an interesting course about Natural Language Processing",
        "Why is deep learning so popular?",
        "I am a translator that is translating four words while being physically translated 5 centimers to the left",
        "They can throw everything in the can next to the bank near the river after they withdraw all their money from the local bank.",
    ]

    # Bundle everything into a single payload and pickle it.
    baseline_config = {
        'data_files': data_files,
        'data_config': data_config,
        'model_config': model_config,
        'train_config': train_config,
        'translator_sentences': translator_sentences,
    }

    with open(BASELINE_CONFIG_FILE, 'wb') as f:
        pickle.dump(baseline_config, f, protocol=pickle.HIGHEST_PROTOCOL)



In [72]:
# Experiment grid: experiment name -> {hyperparameter: [values to try]}.
# An empty dict means "use the baseline model_config unchanged".
# Each experiment may vary exactly one hyperparameter.
exp_variations = dict(
    baseline        = dict(),
    vary_depth      = dict(n_layers = [2, 6, 8]),
    vary_embeddim   = dict(d_model = [128, 256, 1024]),
    vary_numheads   = dict(n_heads = [4, 16, 32]),
    half_vocab      = dict(vocab_size_factor = [0.5]),
    remove_posenc   = dict(include_pos_enc = [False]),
    vary_actfun     = dict(act_fun = ['gelu', 'swish'])
)

# Running counter used to number the generated configs (transformer-00, -01, ...).
model_id = 0

for exp_name, param_dict in exp_variations.items():
    if len(param_dict) == 0:
        # Baseline run: no override. The string 'None' is the sentinel that is
        # stored in model_info and checked again before applying an override.
        param_name = 'None'
        param_vals = [None]
    else:
        if len(param_dict) != 1:
            # Must raise a real exception object: raising a plain string fails
            # with "TypeError: exceptions must derive from BaseException".
            raise ValueError('Not allowed, only one parameter to vary per experiment')
        param_name, param_vals = next(iter(param_dict.items()))

    # One config file per value of the varied hyperparameter.
    for v in param_vals:
        model_name = 'transformer-%02d' %(model_id)
        model_config_file = os.path.join(MODEL_CONFIG_PATH, '%s.pkl' %(model_name))

        # Re-load the baseline for every model so each config starts from a
        # fresh, independent copy (the override below mutates it in place).
        with open(BASELINE_CONFIG_FILE, 'rb') as f:
            model_exp_config = pickle.load(f)

        # Bookkeeping describing which experiment/value this config belongs to.
        model_exp_config['model_info'] = dict(
            exp_name    = exp_name,
            exp_param   = param_dict,
            param_name  = param_name,
            param_val   = v,
            model_name  = model_name,
            model_id    = model_id,
            config_file = model_config_file
        )

        # Apply the single-hyperparameter override (skipped for the baseline).
        if param_name != 'None':
            model_exp_config['model_config'][param_name] = v

        # Output locations come from the project helper defined in src.routines.
        model_exp_config['output_files'] = configure_outputfiles(model_name, output_path=OUTPUT_PATH, checkpoint_path=MODEL_PATH)

        with open(model_config_file, 'wb') as f:
            pickle.dump(model_exp_config, f, protocol=pickle.HIGHEST_PROTOCOL)

        model_id += 1

In [73]:
# Double check: read every generated config back and show them as one table.
# glob returns matches in arbitrary order, so sort for a stable listing and
# stable row order across runs / filesystems.
config_files = sorted(glob(MODEL_CONFIG_PATH +'/transformer*.pkl'))

# Sub-dicts of each config that are flattened into a single table row.
sel_keys = ['model_info', 'model_config', 'output_files']

conf_rows = []
for fn in config_files:
    with open(fn, 'rb') as f:
        conf = pickle.load(f)

    # Merge the selected sub-dicts into one flat dict. dict(**a, **b) raises
    # TypeError if two sub-dicts share a key, so a name clash cannot pass silently.
    cat_conf = {}
    for k in sel_keys:
        cat_conf = dict(**cat_conf, **conf[k])

    # 'exp_param' holds a dict rather than a scalar, so it is left out of the table.
    del cat_conf['exp_param']
    conf_rows.append(cat_conf)

# Build the frame once from the list of row dicts (one row per config file).
all_confs = pd.DataFrame(conf_rows)
all_confs

Unnamed: 0,exp_name,param_name,param_val,model_name,model_id,config_file,d_model,n_layers,FFN_units,n_heads,dropout_rate,act_fun,include_pos_enc,vocab_size_factor,results,checkpoint
0,baseline,,,transformer-00,0,model/config/transformer-00.pkl,512,4,512,8,0.1,relu,True,1.0,output/transformer-00.pkl,model/transformer-00
1,vary_depth,n_layers,2,transformer-01,1,model/config/transformer-01.pkl,512,2,512,8,0.1,relu,True,1.0,output/transformer-01.pkl,model/transformer-01
2,vary_depth,n_layers,6,transformer-02,2,model/config/transformer-02.pkl,512,6,512,8,0.1,relu,True,1.0,output/transformer-02.pkl,model/transformer-02
3,vary_depth,n_layers,8,transformer-03,3,model/config/transformer-03.pkl,512,8,512,8,0.1,relu,True,1.0,output/transformer-03.pkl,model/transformer-03
4,vary_embeddim,d_model,128,transformer-04,4,model/config/transformer-04.pkl,128,4,512,8,0.1,relu,True,1.0,output/transformer-04.pkl,model/transformer-04
5,vary_embeddim,d_model,256,transformer-05,5,model/config/transformer-05.pkl,256,4,512,8,0.1,relu,True,1.0,output/transformer-05.pkl,model/transformer-05
6,vary_embeddim,d_model,1024,transformer-06,6,model/config/transformer-06.pkl,1024,4,512,8,0.1,relu,True,1.0,output/transformer-06.pkl,model/transformer-06
7,vary_numheads,n_heads,4,transformer-07,7,model/config/transformer-07.pkl,512,4,512,4,0.1,relu,True,1.0,output/transformer-07.pkl,model/transformer-07
8,vary_numheads,n_heads,16,transformer-08,8,model/config/transformer-08.pkl,512,4,512,16,0.1,relu,True,1.0,output/transformer-08.pkl,model/transformer-08
9,vary_numheads,n_heads,32,transformer-09,9,model/config/transformer-09.pkl,512,4,512,32,0.1,relu,True,1.0,output/transformer-09.pkl,model/transformer-09


In [74]:
# Display the config pickle paths that the glob over MODEL_CONFIG_PATH found.
config_files

['model/config/transformer-00.pkl',
 'model/config/transformer-01.pkl',
 'model/config/transformer-02.pkl',
 'model/config/transformer-03.pkl',
 'model/config/transformer-04.pkl',
 'model/config/transformer-05.pkl',
 'model/config/transformer-06.pkl',
 'model/config/transformer-07.pkl',
 'model/config/transformer-08.pkl',
 'model/config/transformer-09.pkl',
 'model/config/transformer-10.pkl',
 'model/config/transformer-11.pkl',
 'model/config/transformer-12.pkl',
 'model/config/transformer-13.pkl']

In [None]:
# Hand-picked subset of the generated experiment configs to run.
select_config_files_to_run = [
    'model/config/transformer-00.pkl',
    'model/config/transformer-04.pkl',
]

# Load each selected config and pass its top-level entries to run_each_model
# as keyword arguments.
for config_path in select_config_files_to_run:
    with open(config_path, 'rb') as handle:
        conf = pickle.load(handle)
    run_each_model(**conf)

Starting epoch 1


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch 1 Batch 0 Loss 4.7441 Accuracy 0.0000


  8%|▊         | 100/1250 [00:47<09:03,  2.12it/s]

Epoch 1 Batch 100 Loss 4.4875 Accuracy 0.0533


 16%|█▌        | 200/1250 [01:35<08:10,  2.14it/s]

Epoch 1 Batch 200 Loss 4.1702 Accuracy 0.0712


 24%|██▍       | 300/1250 [02:22<07:50,  2.02it/s]

Epoch 1 Batch 300 Loss 3.8229 Accuracy 0.0952


 32%|███▏      | 400/1250 [03:10<06:48,  2.08it/s]

Epoch 1 Batch 400 Loss 3.5437 Accuracy 0.1122


 40%|████      | 500/1250 [03:57<06:04,  2.06it/s]

Epoch 1 Batch 500 Loss 3.3383 Accuracy 0.1255


 48%|████▊     | 600/1250 [04:44<05:02,  2.15it/s]

Epoch 1 Batch 600 Loss 3.1759 Accuracy 0.1365


 56%|█████▌    | 700/1250 [05:31<04:27,  2.06it/s]

Epoch 1 Batch 700 Loss 3.0450 Accuracy 0.1455


 64%|██████▍   | 800/1250 [06:19<03:37,  2.07it/s]

Epoch 1 Batch 800 Loss 2.9318 Accuracy 0.1537


 65%|██████▍   | 811/1250 [06:24<03:26,  2.13it/s]