# Prepare

## References
* 94.99% accuracy, from the paper: [Universal Language Model Fine-tuning for Text Classification](https://aclweb.org/anthology/P18-1031)
* 92–93% accuracy, from [anubhavmaity's notebook](https://github.com/anubhavmaity/Ag-News-Category-Classifier/blob/master/news_category_classifier.ipynb)
* **95.16%** accuracy, by Mike

In [0]:
# Ensure GPU spec; T4 is for colab and one can change it for another env.
gpu_list = !nvidia-smi -L
if gpu_list[0].startswith('NVIDIA-SMI has failed'):
  print('Runtime type should be GPU.')
elif not gpu_list[0].startswith('GPU 0: Tesla T4'):
  display(gpu_list)
  print('Please reset all runtimes. We need a Tesla T4 to reproduce the experiments!')
else:
  display(gpu_list)

## Dependency

### Install

In [0]:
# Ensure no surprises from conflicting packages: report dependency conflicts
# that already exist in the runtime before anything is installed.
!pip check

In [0]:
%%capture pip_logs
# Install/upgrade the dependencies. The cell output is captured into `pip_logs`
# so that a later cell can re-display selected parts of it.
!pip install -U fastai==1.0.57 ipyexperiments jupyter-console==5.2.0 coverage==4.5.3 coveralls datascience albumentations

In [0]:
# Re-display only those captured outputs that carry a Colab `pip_warning`
# payload (presumably the "restart runtime" notice -- confirm), then check
# again for dependency conflicts now that the install has run.
colab_vnd = 'application/vnd.colab-display-data+json'
for o in pip_logs.outputs:
  if colab_vnd in o.data and 'pip_warning' in o.data[colab_vnd]:
    o.display()
!pip check

### Import

In [0]:
import gc
import math
from pathlib import Path
import pickle
import random
from shutil import copytree
from typing import Optional, Tuple

from google.colab import drive
import numpy as np
import pandas as pd
import torch

from fastai import basic_data, basic_train, core
from fastai import *
from fastai.callbacks import CSVLogger, MixedPrecision
from fastai.core import plt
from fastai.text import *
from fastprogress import fastprogress

from ipyexperiments import *

### Init


In [0]:
# Session timestamp (output of `date +%Y%m%dT%H%M`), used as a prefix for the
# CSV log file names below.
# Not set earlier because pip may require a restart.
SESSN_START, = !date +%Y%m%dT%H%M

In [0]:
# Reload imported modules automatically before executing code.
%load_ext autoreload
%autoreload 2

# Ask the inline backend for both 'png' and 'retina' figure formats.
%config InlineBackend.figure_formats = {'png', 'retina'}

In [0]:
# Stylize the plot of `lr_find()`: dark theme plus a dotted, semi-transparent
# lime-green grid drawn on the x axis only, for both major and minor ticks.
plt.style.use(['dark_background','seaborn-poster','seaborn-deep'])
plt.rcParams['axes.grid'] = True
plt.rcParams['axes.grid.axis'] = 'x'
plt.rcParams['axes.grid.which'] = 'both'
plt.rcParams['grid.alpha'] = 0.5
plt.rcParams['grid.color'] = 'xkcd:lime green'
plt.rcParams['grid.linestyle'] = ':'

In [0]:
# A special treatment for colab to decrease network traffic: switch
# fastprogress to its console behaviour and patch the resulting
# `master_bar`/`progress_bar` into every module listed below.
fastprogress.NO_BAR = True
master_bar, progress_bar = fastprogress.force_console_behavior()
basic_train.master_bar, basic_train.progress_bar = master_bar, progress_bar
basic_data.master_bar, basic_data.progress_bar = master_bar, progress_bar
# NOTE(review): `dataclass` is not imported explicitly in this notebook, so it
# presumably resolves to the decorator brought in by the star imports; setting
# attributes on it likely has no effect on fastai -- confirm.
dataclass.master_bar, dataclass.progress_bar = master_bar, progress_bar
text.master_bar, text.progress_bar = master_bar, progress_bar
text.data.master_bar, text.data.progress_bar = master_bar, progress_bar
core.master_bar, core.progress_bar = master_bar, progress_bar

In [0]:
# Mount Google Drive at /content/gdrive; everything persistent (data bunches,
# models, logs) is stored below this mount point.
COLAB_CONTENT_DIR = Path('/content')
GD_DIR = COLAB_CONTENT_DIR / 'gdrive'
drive.mount(str(GD_DIR), force_remount=True)

In [0]:
# Persistent directory layout on Google Drive:
#   <My Drive>/<CORPUS>/{data,models,logs}
CORPUS = 'ag_news_csv'
BASE_DIR = GD_DIR / 'My Drive' / CORPUS
DATA_DIR, MDLS_DIR, LOGS_DIR = (BASE_DIR / sub
                                for sub in ('data', 'models', 'logs'))
# Create the base directory first, then each sub-directory (idempotent).
for _dir in (BASE_DIR, DATA_DIR, MDLS_DIR, LOGS_DIR):
  _dir.mkdir(parents=True, exist_ok=True)

# Directory under /root/.fastai that receives the downloaded corpus.
FASTAI_DATA_DIR = Path('/root/.fastai/data')
FASTAI_DATA_DIR.mkdir(parents=True, exist_ok=True)
# Expose that directory as /content/data through a symlink (created once).
COLAB_DATA_DIR = COLAB_CONTENT_DIR / 'data'
if not COLAB_DATA_DIR.is_symlink():
  COLAB_DATA_DIR.symlink_to(FASTAI_DATA_DIR)
# Remove Colab's bundled sample data if it is present.
if (COLAB_CONTENT_DIR / 'sample_data').exists():
  !rm -rf /content/sample_data/

# Where the extracted corpus is expected to live.
CORPUS_IN_COLAB_DATA_DIR = COLAB_DATA_DIR / CORPUS

In [0]:
# Fetch and extract the AG News archive into COLAB_DATA_DIR, and make sure it
# ended up in the directory the rest of the notebook expects.
downloaded_corpus_dir = untar_data(URLs.AG_NEWS, dest=COLAB_DATA_DIR)
assert downloaded_corpus_dir == CORPUS_IN_COLAB_DATA_DIR

# Assign

In [0]:
#@title Hyper-parameters

# Batch sizes for the language model (lm) and the classifier (cf).
lm_bs = 64  #@param {type: "number"}
cf_bs = 64  #@param {type: "number"}
# `bptt` value handed to the LM data bunch and to the classifier learner.
bptt = 70  #@param {type: "number"}
# Momentum pair passed to `fit_one_cycle(moms=...)`.
moms = (0.8, 0.7)  #@param

# Reference learning rates, scaled linearly with the batch size relative to 48.
REFERRED_LM_LR = 1e-2 * lm_bs / 48
REFERRED_CF_LR = 2e-2 * cf_bs / 48

#@markdown ---

# Weight decay and dropout multiplier for the LM and the classifier.
lm_wd = 0.01  #@param {type: "number"}
cf_wd = 0.01  #@param {type: "number"}
lm_drop_mult = 1.0  #@param {type: "number"}
cf_drop_mult = 0.5  #@param {type: "number"}

# File names of the cached data bunches; they encode batch size and bptt.
FW_LM_DBNCH_FNAME = f'fw_lm_dbnch-b{lm_bs}-bptt{bptt}.pkl'
FW_CF_DBNCH_FNAME = f'fw_cf_dbnch-b{cf_bs}-bptt{bptt}.pkl'

# Vocabulary saved from the LM data bunch and loaded again for the classifier.
VOCAB_FILE = DATA_DIR / 'vocab.pkl'

In [0]:
# Default for the `tags` argument of `set_cf_databunch`, which never reads it.
CLASSES = []  # Unused

In [0]:
# Passed as `num_workers` to every data bunch built or loaded below.
# Set num_workers to main process since the training set will be shuffled.
N_DBNCH_WRKRS = 0

In [0]:
# One seed to rule pseudo-random number generators all.
# It is the default seed of `reset_prng_states` and `set_lm_databunch`.
SEED = 42

# Define

## PRNG State Fixer

In [0]:
@dataclass
class PseudoRandomStatesHolder:
  """Snapshot of the PRNG states captured by `get_prng_states`."""
  # Value of `random.getstate()`.
  py3_state: Tuple[int, Tuple[int, ...], Optional[float]]
  # Value of `np.random.get_state()`.
  np_state: Tuple[str, np.ndarray, int, int, float]
  # Value of `torch.get_rng_state()`.
  torch_state: torch.ByteTensor
  # Value of `torch.cuda.get_rng_state_all()`.
  cuda_states: List[torch.ByteTensor]

In [0]:
def reset_prng_states(seed=SEED):
  """Re-seed the Python, NumPy and PyTorch PRNGs and pin cuDNN behaviour.

  Args:
    seed: Integer seed applied to `random`, `np.random` and `torch`.
      Defaults to the notebook-wide `SEED`.
  """
  # Use the caller's `seed`. The global `SEED` used to be hard-coded here,
  # which silently ignored the argument.
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)  # This implies torch.cuda.manual_seed_all(seed) now
  # if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True  # About 15% slower but...
  torch.backends.cudnn.benchmark = False

In [0]:
def get_prng_states():
  """Capture the current Python, NumPy, torch and CUDA PRNG states.

  Returns:
    A `PseudoRandomStatesHolder` filled with the four states.
  """
  py3_state = random.getstate()
  np_state = np.random.get_state()
  torch_state = torch.get_rng_state()
  cuda_states = torch.cuda.get_rng_state_all()
  return PseudoRandomStatesHolder(py3_state=py3_state,
                                  np_state=np_state,
                                  torch_state=torch_state,
                                  cuda_states=cuda_states)

In [0]:
def set_prng_states(prng_states: PseudoRandomStatesHolder):
  """Restore the Python, NumPy, torch and CUDA PRNGs from a snapshot.

  Args:
    prng_states: Snapshot as produced by `get_prng_states`.
  """
  # (setter, state) pairs, applied in the order Python, NumPy, torch, CUDA.
  restorers = (
      (random.setstate, prng_states.py3_state),
      (np.random.set_state, prng_states.np_state),
      (torch.set_rng_state, prng_states.torch_state),
      (torch.cuda.set_rng_state_all, prng_states.cuda_states),
  )
  for restore, state in restorers:
    restore(state)

In [0]:
def save_prng_states(name, prng_states, data_dir=DATA_DIR):
  """Pickle `prng_states` to `<data_dir>/ps-<name>.pkl`.

  Args:
    name: Event name used to build the file name.
    prng_states: Object to pickle (a `PseudoRandomStatesHolder`).
    data_dir: Directory that receives the pickle file.
  """
  target = data_dir / f'ps-{name}.pkl'
  with target.open('wb') as fh:
    pickle.dump(prng_states, fh)

In [0]:
def load_prng_states(name, data_dir=DATA_DIR):
  """Load the PRNG states pickled under `<data_dir>/ps-<name>.pkl`.

  Args:
    name: Event name used to build the file name.
    data_dir: Directory that holds the pickle file.

  Returns:
    The unpickled object.

  Raises:
    FileNotFoundError: If the pickle file does not exist as a regular file.
  """
  source = data_dir / f'ps-{name}.pkl'
  # `is_file()` is already False for a missing path.
  if not source.is_file():
    raise FileNotFoundError(f'No {source} to load!')
  with open(source, 'rb') as fh:
    return pickle.load(fh)

## Checkpoint Helpers

In [0]:
@dataclass
class Checkpoint:
  """Metadata that `preserve_event` pickles next to a saved learner."""
  # Name the learner was saved under (`learner.save(name, ...)`).
  name: str
  # `frozen_to` value recorded by `preserve_event`; `retain_event` passes it
  # to `learner.freeze_to` before loading the weights.
  frozen_to: int
  # Loss scale of the `MixedPrecision` callback at save time; `None` when the
  # learner had no such callback.
  mp_loss_scale: Optional[float]

In [0]:
def preserve_event(name, learner, frozen_to=-1, data_dir=DATA_DIR):
  """Persist a named training event so that it can be resumed later.

  Three things are written: the current PRNG states (`ps-<name>.pkl`), the
  learner including its optimizer state (`learner.save`), and a `Checkpoint`
  record (`cp-<name>.pkl`).

  Args:
    name: Event name; used for the learner file and both pickle files.
    learner: fastai learner to save.
    frozen_to: Value recorded in the checkpoint's `frozen_to` field.
    data_dir: Directory that receives the two pickle files.
  """
  # Keep the PRNG snapshot in the same directory as the checkpoint record;
  # `data_dir` used to be ignored for the snapshot.
  save_prng_states(name, get_prng_states(), data_dir)

  learner.save(name, with_opt=True)

  # Remember the loss scale of the first MixedPrecision callback, if any.
  mp_loss_scale = None
  for cb in learner.callbacks:
    if isinstance(cb, MixedPrecision):
      print(f'Found MixedPrecision loss_scale={cb.loss_scale}')
      mp_loss_scale = cb.loss_scale
      break

  checkpoint = Checkpoint(name, frozen_to, mp_loss_scale)
  checkpoint_fpath = data_dir / f'cp-{name}.pkl'
  with open(checkpoint_fpath, 'wb') as f:
    pickle.dump(checkpoint, f)
    print(f'{checkpoint} saved to {checkpoint_fpath}')

In [0]:
def retain_event(name, learner, data_dir=DATA_DIR):
  """Restore the PRNG states and, if available, the checkpoint of an event.

  Args:
    name: Event name to restore. With `None` the PRNGs are simply re-seeded
      via `reset_prng_states()`.
    learner: fastai learner that receives the saved weights/optimizer state.
    data_dir: Directory that holds `ps-<name>.pkl` and `cp-<name>.pkl`.

  Returns:
    The learner. It is returned unchanged when no checkpoint record exists.

  Raises:
    FileNotFoundError: If `name` is given but its PRNG snapshot is missing.
  """
  if name is None:
    reset_prng_states()
  else:
    # Read the snapshot from the same directory as the checkpoint record;
    # `data_dir` used to be ignored here.
    set_prng_states(load_prng_states(name, data_dir))

  checkpoint_fpath = data_dir / f'cp-{name}.pkl'
  if not (checkpoint_fpath.exists() and checkpoint_fpath.is_file()):
    print(f'No {checkpoint_fpath} to load!')
    return learner

  # NOTE: pickle executes arbitrary code on load; only use files written by
  # `preserve_event`.
  with open(checkpoint_fpath, 'rb') as f:
    checkpoint = pickle.load(f)
  print(f'{checkpoint} loaded from {checkpoint_fpath}')

  learner.freeze_to(checkpoint.frozen_to)  # This must be before `load()`.
  print(f'Frozen to {checkpoint.frozen_to}')

  learner = learner.load(checkpoint.name, with_opt=True)

  # Carry the saved loss scale over to the first MixedPrecision callback.
  if checkpoint.mp_loss_scale:
    for cb in learner.callbacks:
      if isinstance(cb, MixedPrecision):
        cb.loss_scale = checkpoint.mp_loss_scale
        print(f'Retained mb_loss_scale={cb.loss_scale}')
        break

  return learner

In [0]:
def fit_a_named_cycle(name, learner, lrs, moms, wd, clbks, cyc_len, freeze_to,
                      prev_event_name):
  """Resume from `prev_event_name`, fit one cycle, and save it as `name`.

  Args:
    name: Event name under which the result is preserved.
    learner: fastai learner to train.
    lrs: Passed to `fit_one_cycle` as `max_lr`.
    moms: Passed to `fit_one_cycle` as `moms`.
    wd: Passed to `fit_one_cycle` as `wd`.
    clbks: Passed to `fit_one_cycle` as `callbacks`.
    cyc_len: Passed to `fit_one_cycle` as `cyc_len`.
    freeze_to: Layer-group index handed to `learner.freeze_to` before fitting
      and recorded in the checkpoint afterwards.
    prev_event_name: Event restored through `retain_event` before fitting.

  Returns:
    The trained learner.
  """
  mp_callbacks = [cb for cb in learner.callbacks
                  if isinstance(cb, MixedPrecision)]
  for mp_cb in mp_callbacks:
    print(f'Found init. loss_scale={mp_cb.loss_scale}')

  learner = retain_event(prev_event_name, learner)

  # `unfreeze()` does just `freeze_to(0)`
  # `freeze()` does `freeze_to(-1)` with a size-assertion of layer groups
  learner.freeze_to(freeze_to)
  fit_kwargs = dict(cyc_len=cyc_len, max_lr=lrs, moms=moms, wd=wd,
                    callbacks=clbks)
  learner.fit_one_cycle(**fit_kwargs)

  preserve_event(name, learner, freeze_to)
  return learner

## LM-specific Helpers

In [0]:
def set_lm_databunch(fname, bs, bptt, seed=SEED, presort=True,
                     n_wrkrs=N_DBNCH_WRKRS, data_dir=DATA_DIR,
                     raw_data_dir=CORPUS_IN_COLAB_DATA_DIR):
  """Build the language-model data bunch from the raw CSV files and cache it.

  Columns 1 and 2 of `train.csv` and `test.csv` are combined into one text
  list, 10% of which is split off at random for validation.

  Args:
    fname: File name under which the data bunch is saved in `data_dir`.
    bs: Batch size.
    bptt: `bptt` value passed to the data bunch.
    seed: Seed for the PRNG reset and for the random split. `None` keeps the
      split unseeded and falls back to `SEED` for the reset.
    presort: Accepted but not used by this function.
    n_wrkrs: `num_workers` passed to the data bunch.
    data_dir: Directory in which the data bunch is saved.
    raw_data_dir: Directory that holds `train.csv` and `test.csv`.

  Returns:
    The data bunch that was built and saved.
  """
  # Honour `seed` for the reset too; it was only applied to the split before.
  reset_prng_states(SEED if seed is None else seed)

  # tl = TextList.from_folder(raw_data_dir, presort=presort)
  trn_tl = TextList.from_csv(raw_data_dir, 'train.csv', cols=[1,2], header=None)
  tst_tl = TextList.from_csv(raw_data_dir, 'test.csv', cols=[1,2], header=None)

  # il = tl.filter_by_folder(include=['train', 'test', 'unsup'])
  il = trn_tl.add(tst_tl)

  # Set the seed again since in theory one may have called np.random before
  # this.
  ils = il.split_by_rand_pct(0.1, seed)
  lls = ils.label_for_lm()
  dbnch = lls.databunch(bs=bs, bptt=bptt, num_workers=n_wrkrs)
  dbnch.save(data_dir / fname)
  return dbnch

In [0]:
def get_lm_databunch(fname, bs, bptt, backwards=False, n_wrkrs=N_DBNCH_WRKRS,
                     data_dir=DATA_DIR):
  """Re-seed the PRNGs and load a cached language-model data bunch.

  Args:
    fname: File name of the cached data bunch inside `data_dir`.
    bs: Batch size.
    bptt: `bptt` value passed through to `load_data`.
    backwards: Passed through to `load_data`.
    n_wrkrs: `num_workers` passed through to `load_data`.
    data_dir: Directory that holds the cached data bunch.

  Returns:
    Whatever `load_data` returns.
  """
  reset_prng_states()
  load_kwargs = dict(num_workers=n_wrkrs, backwards=backwards, bptt=bptt)
  return load_data(data_dir, fname, bs, **load_kwargs)

In [0]:
def new_lm_learner_with_ulmfit(name, dbnch, drop_mult, base_dir=BASE_DIR):
  """Create a fresh fp16 AWD_LSTM language-model learner.

  The PRNGs are re-seeded before the learner is built, and the PRNG states
  reached afterwards are saved under `name`.

  Args:
    name: Event name for the saved PRNG states.
    dbnch: Language-model data bunch.
    drop_mult: Dropout multiplier passed to `language_model_learner`.
    base_dir: Passed to the learner as `path`.

  Returns:
    The new learner.
  """
  reset_prng_states()
  # `to_fp16()` gives roughly a 2x speed-up; `to_fp16(clip=0.1)` was tried too.
  lrnr = language_model_learner(dbnch, AWD_LSTM, drop_mult=drop_mult,
                                path=base_dir).to_fp16()
  save_prng_states(name, get_prng_states())
  return lrnr

## CF-specific Helpers

In [0]:
def set_cf_databunch(fname, bs, vocab, tags=CLASSES, presort=True,
                     n_wrkrs=N_DBNCH_WRKRS, data_dir=DATA_DIR,
                     raw_data_dir=CORPUS_IN_COLAB_DATA_DIR):
  """Build the classifier data bunch from the raw CSV files and cache it.

  `train.csv` is used for training and `test.csv` is passed in the position
  of the validation frame. Column 0 holds the label, columns 1 and 2 the text.

  Args:
    fname: File name under which the data bunch is saved in `data_dir`.
    bs: Batch size.
    vocab: Vocabulary to numericalize with (the one saved from the LM data).
    tags: Accepted but not used by this function.
    presort: Accepted but not used by this function.
    n_wrkrs: `num_workers` passed to the data bunch.
    data_dir: Directory in which the data bunch is saved.
    raw_data_dir: Directory that holds `train.csv` and `test.csv`.

  Returns:
    The data bunch that was built and saved.
  """
  reset_prng_states()
  # The CSV files have no header row (`set_lm_databunch` reads the very same
  # files with `header=None`). Without it pandas turns the first record of
  # each file into column names and drops it from the data.
  trn_df = pd.read_csv(raw_data_dir/'train.csv', header=None)
  tst_df = pd.read_csv(raw_data_dir/'test.csv', header=None)
  dbnch = TextClasDataBunch.from_df(raw_data_dir, trn_df, tst_df, vocab=vocab,
                                    text_cols=[1,2], label_cols=0, bs=bs,
                                    num_workers=n_wrkrs)
  dbnch.save(data_dir / fname)
  return dbnch

In [0]:
def get_cf_databunch(fname, bs, backwards=False, n_wrkrs=N_DBNCH_WRKRS,
                     data_dir=DATA_DIR):
  """Re-seed the PRNGs and load a cached classifier data bunch.

  Args:
    fname: File name of the cached data bunch inside `data_dir`.
    bs: Batch size.
    backwards: Passed through to `load_data`.
    n_wrkrs: `num_workers` passed through to `load_data`.
    data_dir: Directory that holds the cached data bunch.

  Returns:
    Whatever `load_data` returns.
  """
  reset_prng_states()
  load_kwargs = dict(num_workers=n_wrkrs, backwards=backwards)
  return load_data(data_dir, fname, bs, **load_kwargs)

In [0]:
def new_cf_learner_with_encoder(name, dbnch, drop_mult, enc_name, bptt,
                                base_dir=BASE_DIR):
  """Create a fresh fp16 AWD_LSTM classifier and load a saved encoder into it.

  The PRNGs are re-seeded before the learner is built, and the PRNG states
  reached after loading the encoder are saved under `name`.

  Args:
    name: Event name for the saved PRNG states.
    dbnch: Classifier data bunch.
    drop_mult: Dropout multiplier passed to `text_classifier_learner`.
    enc_name: Name of the encoder to load via `load_encoder`.
    bptt: `bptt` value passed to `text_classifier_learner`.
    base_dir: Passed to the learner as `path`.

  Returns:
    The new learner.
  """
  reset_prng_states()
  # `pretrained=False`: the weights come from the fine-tuned encoder instead.
  fresh_lrnr = text_classifier_learner(dbnch, AWD_LSTM, drop_mult=drop_mult,
                                       path=base_dir, bptt=bptt,
                                       pretrained=False)
  lrnr = fresh_lrnr.to_fp16().load_encoder(enc_name)
  save_prng_states(name, get_prng_states())
  return lrnr

# Fit

## Forward LM

### Process Data Once

In [0]:
# Build and cache the LM data bunch only if it is not on Drive yet.
# NOTE(review): the vocabulary is saved only inside this branch, so an existing
# data bunch with a missing VOCAB_FILE leaves the vocabulary unsaved.
if not (DATA_DIR / FW_LM_DBNCH_FNAME).exists():
  fw_lm_dbnch = set_lm_databunch(FW_LM_DBNCH_FNAME, lm_bs, bptt)
  print(f'Built and saved {DATA_DIR / FW_LM_DBNCH_FNAME}')
  # fw_lm_dbnch.show_batch()
  if not VOCAB_FILE.exists():
    fw_lm_dbnch.vocab.save(VOCAB_FILE)
    print(f'Saved {VOCAB_FILE}')

### Init-fit

In [0]:
# Learning rate of the first (frozen) LM cycle; the event name encodes the
# batch size, bptt and learning rate.
lm_lr = 0.01
init_fw_lm_name = f'init_fw_lm-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
print(f'designated lm_lr = {lm_lr}')

designated lm_lr = 0.01


In [0]:
%%capture init_fw_lm_scope_begin_log
# Open an IPyExperiments scope, load the cached forward LM data bunch and build
# a fresh learner; its PRNG states are saved under the event 'new-fw_lm'.
init_fw_lm_scope = IPyExperimentsPytorch(cl_enable=False)
fw_lm_dbnch = get_lm_databunch(FW_LM_DBNCH_FNAME, lm_bs, bptt)
fw_lm_lrnr = new_lm_learner_with_ulmfit('new-fw_lm', fw_lm_dbnch, lm_drop_mult)

In [0]:
# One cycle with `freeze_to=-1`, starting from the PRNG states of 'new-fw_lm'.
# No checkpoint record exists for that event, so only the PRNGs are restored.
print(init_fw_lm_name)
init_fw_lm_log_name = LOGS_DIR / f'{SESSN_START}-{init_fw_lm_name}'  # w/o .csv
# Third positional argument is presumably `append` (the backward-LM cells pass
# `append=True` by keyword) -- confirm.
init_fw_lm_clbks = [CSVLogger(fw_lm_lrnr, init_fw_lm_log_name, True)]
fw_lm_lrnr = fit_a_named_cycle(init_fw_lm_name, fw_lm_lrnr, lm_lr, moms, lm_wd,
                               init_fw_lm_clbks, cyc_len=1, freeze_to=-1,
                               prev_event_name='new-fw_lm')
# init_fw_lm_lrnr.csv_logger.read_logged_file()
# (init_fw_lm_lrnr.path / init_fw_lm_lrnr.model_dir).ls()

init_fw_lm-b64-bptt70-lr0.01
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-fw_lm.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         3.914846    3.503064    0.385084  02:54     
Found MixedPrecision loss_scale=131072
Checkpoint(name='init_fw_lm-b64-bptt70-lr0.01', frozen_to=-1, mp_loss_scale=131072) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_lm-b64-bptt70-lr0.01.pkl


init_fw_lm-b64-bptt70-lr0.01125
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-fw_lm.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         3.919724    3.504137    0.385012  02:53     
Found MixedPrecision loss_scale=131072
Checkpoint(name='init_fw_lm-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=131072) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_lm-b64-bptt70-lr0.01125.pkl


In [0]:
%%capture init_fw_lm_scope_end_log
# Drop the experiment scope (presumably triggering its memory clean-up --
# confirm against ipyexperiments) and force a garbage collection.
del init_fw_lm_scope
gc.collect()

### Fine-tune

In [0]:
# Names are re-derived here, presumably so this section can be run on its own
# after a restart. `init_fw_lm_name` identifies the event to resume from and
# `fw_enc_name` the encoder file to write.
lm_lr = 0.01
init_fw_lm_name = f'init_fw_lm-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
fw_enc_name = f'fw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Fine-tune at a tenth of the initial rate; rounded so the name stays short.
finer_lm_lr = round(lm_lr / 10, 6)
tune_fw_lm_name = f'tune_fw_lm-b{lm_bs}-bptt{bptt}-lr{finer_lm_lr}'
print(f'designated finer_lm_lr = {finer_lm_lr}')

designated finer_lm_lr = 0.001


In [0]:
%%capture tune_fw_lm_scope_begin_log
# New scope with a fresh learner; the weights are loaded from the checkpoint of
# the previous event inside `fit_a_named_cycle`.
tune_fw_lm_scope = IPyExperimentsPytorch(cl_enable=False)
fw_lm_dbnch = get_lm_databunch(FW_LM_DBNCH_FNAME, lm_bs, bptt)
fw_lm_lrnr = new_lm_learner_with_ulmfit('new-fw_lm', fw_lm_dbnch, lm_drop_mult)

In [0]:
# Resume from the init-fit event, train 10 epochs with `freeze_to=0`, then save
# the encoder for the classifier.
print(tune_fw_lm_name)
tune_fw_lm_log_name = LOGS_DIR / f'{SESSN_START}-{tune_fw_lm_name}'
tune_fw_lm_clbks = [CSVLogger(fw_lm_lrnr, tune_fw_lm_log_name, True)]
fw_lm_lrnr = fit_a_named_cycle(tune_fw_lm_name, fw_lm_lrnr, finer_lm_lr, moms,
                               lm_wd, tune_fw_lm_clbks, cyc_len=10, freeze_to=0,
                               prev_event_name=init_fw_lm_name)
fw_lm_lrnr.save_encoder(fw_enc_name)

tune_fw_lm-b64-bptt70-lr0.001
Found init. loss_scale=65536
Checkpoint(name='init_fw_lm-b64-bptt70-lr0.01', frozen_to=-1, mp_loss_scale=131072) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_lm-b64-bptt70-lr0.01.pkl
Frozen to -1
Retained mb_loss_scale=131072
epoch     train_loss  valid_loss  accuracy  time    
0         3.488972    3.263032    0.412775  03:25     
1         3.291793    3.115633    0.432071  03:31     
2         3.196101    3.017808    0.445388  03:33     
3         3.068143    2.947401    0.454521  03:32     
4         2.982534    2.896328    0.461378  03:34     
5         2.912928    2.857755    0.467164  03:32     
6         2.817010    2.828697    0.471728  03:35     
7         2.757135    2.810780    0.474898  03:32     
8         2.724917    2.801478    0.476289  03:33     
9         2.710906    2.800801    0.476421  03:32     
Found MixedPrecision loss_scale=1048576.0
Checkpoint(name='tune_fw_lm-b64-bptt70-lr0.001', frozen_to=0, mp_loss_scale=104

tune_fw_lm-b64-bptt70-lr0.001125
Found init. loss_scale=65536
Checkpoint(name='init_fw_lm-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=131072) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_lm-b64-bptt70-lr0.01125.pkl
Frozen to -1
Retained mb_loss_scale=131072
epoch     train_loss  valid_loss  accuracy  time    
0         3.477912    3.255535    0.413854  03:30     
1         3.282680    3.109124    0.432999  03:30     
2         3.189605    3.013494    0.445783  03:30     
3         3.061557    2.943755    0.454912  03:30     
4         2.974828    2.892176    0.461967  03:30     
5         2.903400    2.852659    0.468161  03:30     
6         2.803582    2.822880    0.472577  03:30     
7         2.739901    2.803609    0.476157  03:30     
8         2.704510    2.793952    0.477598  03:30     
9         2.688962    2.793329    0.477845  03:30     
Found MixedPrecision loss_scale=1048576.0
Checkpoint(name='tune_fw_lm-b64-bptt70-lr0.001125', frozen_to=0, mp_lo

In [0]:
%%capture tune_fw_lm_scope_end_log
# Drop the experiment scope and force a garbage collection.
del tune_fw_lm_scope
gc.collect()

## Forward CF

### Process Data Once

In [0]:
# Build and cache the classifier data bunch only if it is not on Drive yet,
# re-using the vocabulary saved from the LM data bunch.
if not (DATA_DIR / FW_CF_DBNCH_FNAME).exists():
  VOC = Vocab.load(VOCAB_FILE)
  print(f'Loaded {VOCAB_FILE}')
  fw_cf_dbnch = set_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs, VOC)
  print(f'Built and saved {DATA_DIR / FW_CF_DBNCH_FNAME}')

# fw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs)
# print(f'Loaded {DATA_DIR / FW_CF_DBNCH_FNAME}')

### Init-fit

In [0]:
# `lm_lr` selects which fine-tuned encoder to load (it is part of its name).
lm_lr = 0.01
fw_enc_name = f'fw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Learning rate of the first classifier cycle; the event name encodes both the
# classifier settings and the encoder it was built on.
cf_lr = 0.075
init_fw_cf_name = f'init_fw_cf-b{cf_bs}-bptt{bptt}-lr{cf_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
print(f'designated cf_lr={cf_lr}')

designated cf_lr=0.075


In [0]:
%%capture init_fw_cf_scope_begin_log
# Open a scope, load the cached classifier data bunch and build a fresh learner
# on top of the saved encoder; PRNG states are saved under 'new-fw_cf'.
init_fw_cf_scope = IPyExperimentsPytorch(cl_enable=False)
fw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs)
fw_cf_lrnr = new_cf_learner_with_encoder('new-fw_cf', fw_cf_dbnch, cf_drop_mult,
                                         fw_enc_name, bptt)

In [0]:
# One cycle with `freeze_to=-1`, starting from the PRNG states of 'new-fw_cf'.
print(init_fw_cf_name)
init_fw_cf_log_name = LOGS_DIR / f'{SESSN_START}-{init_fw_cf_name}'
init_fw_cf_clbks = [CSVLogger(fw_cf_lrnr, init_fw_cf_log_name, True)]
fw_cf_lrnr = fit_a_named_cycle(init_fw_cf_name, fw_cf_lrnr, cf_lr, moms, cf_wd,
                               init_fw_cf_clbks, cyc_len=1, freeze_to=-1,
                               prev_event_name='new-fw_cf')

init_fw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-fw_cf.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         0.284361    0.249493    0.912094  00:59     
Found MixedPrecision loss_scale=131072
Checkpoint(name='init_fw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01', frozen_to=-1, mp_loss_scale=131072) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01.pkl


init_fw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-fw_cf.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         0.285937    0.249204    0.912094  00:58     
Found MixedPrecision loss_scale=131072
Checkpoint(name='init_fw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=131072) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125.pkl


init_fw_cf-b64-bptt70-lr0.0723_enc-b64-bptt70-lr0.01125
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-fw_cf.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         0.285542    0.247191    0.911173  00:59     
Found MixedPrecision loss_scale=65536.0
Checkpoint(name='init_fw_cf-b64-bptt70-lr0.0723_enc-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=65536.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_cf-b64-bptt70-lr0.0723_enc-b64-bptt70-lr0.01125.pkl


init_fw_cf-b64-bptt70-lr0.07_enc-b64-bptt70-lr0.01125
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-fw_cf.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         0.285121    0.246714    0.914068  00:57     
Found MixedPrecision loss_scale=131072
Checkpoint(name='init_fw_cf-b64-bptt70-lr0.07_enc-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=131072) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_cf-b64-bptt70-lr0.07_enc-b64-bptt70-lr0.01125.pkl


In [0]:
%%capture init_fw_cf_scope_end_log
# Drop the experiment scope and force a garbage collection.
del init_fw_cf_scope; gc.collect()

### Fine-tune

#### Act-1

In [0]:
# Names of the encoder and of the event to resume from (init-fit).
lm_lr = 0.01
fw_enc_name = f'fw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

cf_lr = 0.075
init_fw_cf_name = f'init_fw_cf-b{cf_bs}-bptt{bptt}-lr{cf_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Act-1 runs at half the initial rate. The slice spans lr / 2.6**4 .. lr and is
# passed as `max_lr` to `fit_one_cycle`.
finer_cf_1_lr = cf_lr / 2
finer_cf_1_lrs = slice(finer_cf_1_lr / (2.6 ** 4), finer_cf_1_lr)
tune_fw_cf_1_name = f'tune_fw_cf-b{cf_bs}-bptt{bptt}-lr{finer_cf_1_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
print(f'designated finer_cf_lr={finer_cf_1_lr}')

designated finer_cf_lr=0.0375


In [0]:
%%capture tune_fw_cf_1_scope_begin_log
# New scope with a fresh learner; the weights are loaded from the checkpoint of
# the previous event inside `fit_a_named_cycle`.
tune_fw_cf_1_scope = IPyExperimentsPytorch(cl_enable=False)
fw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs)
fw_cf_lrnr = new_cf_learner_with_encoder('new-fw_cf', fw_cf_dbnch, cf_drop_mult,
                                         fw_enc_name, bptt)

In [0]:
# Resume from the init-fit event and train one cycle with `freeze_to=-2`.
print(tune_fw_cf_1_name)
tune_fw_cf_1_log_name = LOGS_DIR / f'{SESSN_START}-{tune_fw_cf_1_name}'
tune_fw_cf_1_clbks = [CSVLogger(fw_cf_lrnr, tune_fw_cf_1_log_name, True)]
fw_cf_lrnr = fit_a_named_cycle(tune_fw_cf_1_name, fw_cf_lrnr, finer_cf_1_lrs,
                               moms, cf_wd, tune_fw_cf_1_clbks, cyc_len=1,
                               freeze_to=-2, prev_event_name=init_fw_cf_name)

tune_fw_cf-b64-bptt70-lr0.0375_enc-b64-bptt70-lr0.01
Found init. loss_scale=65536
Checkpoint(name='init_fw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01', frozen_to=-1, mp_loss_scale=131072) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01.pkl
Frozen to -1
Retained mb_loss_scale=131072
epoch     train_loss  valid_loss  accuracy  time    
0         0.239697    0.192150    0.933017  01:06     
Found MixedPrecision loss_scale=32768.0
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.0375_enc-b64-bptt70-lr0.01', frozen_to=-2, mp_loss_scale=32768.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.0375_enc-b64-bptt70-lr0.01.pkl


tune_fw_cf-b64-bptt70-lr0.03617_enc-b64-bptt70-lr0.01125
Found init. loss_scale=65536
Checkpoint(name='init_fw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=131072) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-init_fw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125.pkl
Frozen to -1
Retained mb_loss_scale=131072
epoch     train_loss  valid_loss  accuracy  time    
0         0.239709    0.225405    0.933017  01:09     
Found MixedPrecision loss_scale=32768.0
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.03617_enc-b64-bptt70-lr0.01125', frozen_to=-2, mp_loss_scale=32768.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.03617_enc-b64-bptt70-lr0.01125.pkl


In [0]:
%%capture tune_fw_cf_1_scope_end_log
# Drop the experiment scope and force a garbage collection.
del tune_fw_cf_1_scope; gc.collect()

#### Act-2

In [0]:
# Names of the encoder and of the event to resume from (Act-1).
lm_lr = 0.01
fw_enc_name = f'fw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

cf_lr = 0.075
finer_cf_1_lr = cf_lr / 2
tune_fw_cf_1_name = f'tune_fw_cf-b{cf_bs}-bptt{bptt}-lr{finer_cf_1_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Act-2 runs at a quarter of the initial rate, with the same 2.6**4 spread.
finer_cf_2_lr = cf_lr / (2 * 2)
finer_cf_2_lrs = slice(finer_cf_2_lr / (2.6 ** 4), finer_cf_2_lr)
tune_fw_cf_2_name = f'tune_fw_cf-b{cf_bs}-bptt{bptt}-lr{finer_cf_2_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
print(f'designated finer_cf_lr={finer_cf_2_lr}')

designated finer_cf_lr=0.01875


In [0]:
%%capture tune_fw_cf_2_scope_begin_log
# New scope with a fresh learner; the weights are loaded from the checkpoint of
# the previous event inside `fit_a_named_cycle`.
tune_fw_cf_2_scope = IPyExperimentsPytorch(cl_enable=False)
fw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs)
fw_cf_lrnr = new_cf_learner_with_encoder('new-fw_cf', fw_cf_dbnch, cf_drop_mult,
                                         fw_enc_name, bptt)

In [0]:
# Resume from the Act-1 event and train one cycle with `freeze_to=-3`.
print(tune_fw_cf_2_name)
tune_fw_cf_2_log_name = LOGS_DIR / f'{SESSN_START}-{tune_fw_cf_2_name}'
tune_fw_cf_2_clbks = [CSVLogger(fw_cf_lrnr, tune_fw_cf_2_log_name, True)]
fw_cf_lrnr = fit_a_named_cycle(tune_fw_cf_2_name, fw_cf_lrnr, finer_cf_2_lrs,
                               moms, cf_wd, tune_fw_cf_2_clbks, cyc_len=1,
                               freeze_to=-3, prev_event_name=tune_fw_cf_1_name)

tune_fw_cf-b64-bptt70-lr0.01875_enc-b64-bptt70-lr0.01
Found init. loss_scale=65536
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.0375_enc-b64-bptt70-lr0.01', frozen_to=-2, mp_loss_scale=32768.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.0375_enc-b64-bptt70-lr0.01.pkl
Frozen to -2
Retained mb_loss_scale=32768.0
epoch     train_loss  valid_loss  accuracy  time    
0         0.210162    0.283485    0.936439  01:44     
Found MixedPrecision loss_scale=32768.0
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.01875_enc-b64-bptt70-lr0.01', frozen_to=-3, mp_loss_scale=32768.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.01875_enc-b64-bptt70-lr0.01.pkl


tune_fw_cf-b64-bptt70-lr0.018085_enc-b64-bptt70-lr0.01125
Found init. loss_scale=65536
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.03617_enc-b64-bptt70-lr0.01125', frozen_to=-2, mp_loss_scale=32768.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.03617_enc-b64-bptt70-lr0.01125.pkl
Frozen to -2
Retained mb_loss_scale=32768.0
epoch     train_loss  valid_loss  accuracy  time    
0         0.205372    0.173472    0.942229  01:47     
Found MixedPrecision loss_scale=16384.0
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.018085_enc-b64-bptt70-lr0.01125', frozen_to=-3, mp_loss_scale=16384.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.018085_enc-b64-bptt70-lr0.01125.pkl


In [0]:
%%capture tune_fw_cf_2_scope_end_log
# Drop the experiment scope and force a garbage collection.
del tune_fw_cf_2_scope; gc.collect()

#### Act-3

In [0]:
# Names of the encoder and of the event to resume from (Act-2).
lm_lr = 0.01
fw_enc_name = f'fw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

cf_lr = 0.075
finer_cf_2_lr = cf_lr / (2 * 2)
tune_fw_cf_2_name = f'tune_fw_cf-b{cf_bs}-bptt{bptt}-lr{finer_cf_2_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Act-3 runs at a twentieth of the initial rate; rounded so the event name
# stays free of floating-point noise.
finer_cf_3_lr = round(cf_lr / (2 * 2 * 5), 6)
finer_cf_3_lrs = slice(finer_cf_3_lr / (2.6 ** 4), finer_cf_3_lr)
tune_fw_cf_3_name = f'tune_fw_cf-b{cf_bs}-bptt{bptt}-lr{finer_cf_3_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
print(f'designated finer_cf_lr={finer_cf_3_lr}')

designated finer_cf_lr=0.00375


In [0]:
%%capture tune_fw_cf_3_scope_begin_log
# New scope with a fresh learner; the weights are loaded from the checkpoint of
# the previous event inside `fit_a_named_cycle`.
tune_fw_cf_3_scope = IPyExperimentsPytorch(cl_enable=False)
fw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs)
fw_cf_lrnr = new_cf_learner_with_encoder('new-fw_cf', fw_cf_dbnch, cf_drop_mult,
                                         fw_enc_name, bptt)

In [0]:
# Resume from the Act-2 event and train two epochs with `freeze_to=0`.
print(tune_fw_cf_3_name)
tune_fw_cf_3_log_name = LOGS_DIR / f'{SESSN_START}-{tune_fw_cf_3_name}'
tune_fw_cf_3_clbks = [CSVLogger(fw_cf_lrnr, tune_fw_cf_3_log_name, True)]
fw_cf_lrnr = fit_a_named_cycle(tune_fw_cf_3_name, fw_cf_lrnr, finer_cf_3_lrs,
                               moms, cf_wd, tune_fw_cf_3_clbks, cyc_len=2,
                               freeze_to=0, prev_event_name=tune_fw_cf_2_name)

tune_fw_cf-b64-bptt70-lr0.00375_enc-b64-bptt70-lr0.01
Found init. loss_scale=65536
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.01875_enc-b64-bptt70-lr0.01', frozen_to=-3, mp_loss_scale=32768.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.01875_enc-b64-bptt70-lr0.01.pkl
Frozen to -3
Retained mb_loss_scale=32768.0
epoch     train_loss  valid_loss  accuracy  time    
0         0.174698    0.180005    0.942492  02:27     
1         0.123461    0.230101    0.946440  02:22     
Found MixedPrecision loss_scale=32768.0
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.00375_enc-b64-bptt70-lr0.01', frozen_to=0, mp_loss_scale=32768.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.00375_enc-b64-bptt70-lr0.01.pkl


tune_fw_cf-b64-bptt70-lr0.003617_enc-b64-bptt70-lr0.01125
Found init. loss_scale=65536
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.018085_enc-b64-bptt70-lr0.01125', frozen_to=-3, mp_loss_scale=16384.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.018085_enc-b64-bptt70-lr0.01125.pkl
Frozen to -3
Retained mb_loss_scale=16384.0
epoch     train_loss  valid_loss  accuracy  time    
0         0.170520    0.164761    0.945651  02:27     
1         0.122431    0.165208    0.947888  02:21     
Found MixedPrecision loss_scale=32768.0
Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.003617_enc-b64-bptt70-lr0.01125', frozen_to=0, mp_loss_scale=32768.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.003617_enc-b64-bptt70-lr0.01125.pkl


In [0]:
%%capture tune_fw_cf_3_scope_end_log
del tune_fw_cf_3_scope
gc.collect()

## Backward LM

In [0]:
# The backward LM re-uses the forward LM data bunch file (it is loaded with
# `backwards=True` below), so build it here only if it is still missing.
# Unlike the forward section, the vocabulary is saved here without checking
# whether VOCAB_FILE already exists.
if not (DATA_DIR / FW_LM_DBNCH_FNAME).exists():
  fw_lm_dbnch = set_lm_databunch(FW_LM_DBNCH_FNAME, lm_bs, bptt)
  print(f'Built and saved {DATA_DIR / FW_LM_DBNCH_FNAME}')
  fw_lm_dbnch.vocab.save(VOCAB_FILE)
  print(f'Saved {VOCAB_FILE}')  

In [0]:
# Learning rate and event name of the first (frozen) backward LM cycle.
lm_lr = 0.01
init_bw_lm_name = f'init_bw_lm-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

In [0]:
%%capture init_lm_scope_begin_log
# Open a scope, load the cached LM data bunch with `backwards=True` and build a
# fresh learner; its PRNG states are saved under the event 'new-bw_lm'.
init_lm_scope = IPyExperimentsPytorch(cl_enable=False)
bw_lm_dbnch = get_lm_databunch(FW_LM_DBNCH_FNAME, lm_bs, bptt, backwards=True)
bw_lm_lrnr = new_lm_learner_with_ulmfit('new-bw_lm', bw_lm_dbnch, lm_drop_mult)

In [0]:
# One cycle with `freeze_to=-1`, starting from the PRNG states of 'new-bw_lm'.
print(init_bw_lm_name)
init_bw_lm_log_name = LOGS_DIR / f'{SESSN_START}-{init_bw_lm_name}'
init_bw_lm_clbks = [CSVLogger(bw_lm_lrnr, init_bw_lm_log_name, append=True)]
bw_lm_lrnr = fit_a_named_cycle(init_bw_lm_name, bw_lm_lrnr, lm_lr, moms, lm_wd,
                               init_bw_lm_clbks, cyc_len=1, freeze_to=-1,
                               prev_event_name='new-bw_lm')

init_bw_lm-b64-bptt70-lr0.01
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-bw_lm.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         3.915792    3.509392    0.424315  03:02     
Found MixedPrecision loss_scale=131072
Checkpoint(name='init_bw_lm-b64-bptt70-lr0.01', frozen_to=-1, mp_loss_scale=131072) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_bw_lm-b64-bptt70-lr0.01.pkl


init_bw_lm-b64-bptt70-lr0.01125
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-bw_lm.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         3.919958    3.509583    0.424120  03:01     
Found MixedPrecision loss_scale=131072
Checkpoint(name='init_bw_lm-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=131072) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_bw_lm-b64-bptt70-lr0.01125.pkl


In [0]:
%%capture init_lm_scope_end_log
# Drop the experiment scope and force a garbage collection.
del init_lm_scope
gc.collect()

In [0]:
# Name of the event to resume from (backward init-fit).
lm_lr = 0.01
init_bw_lm_name = f'init_bw_lm-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Name under which the fine-tuned backward encoder is saved.
bw_enc_name = f'bw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Fine-tune at a tenth of the initial rate; the bare last line displays it.
tune_lm_lr = round(lm_lr/10, 6)
tune_bw_lm_name = f'tune_bw_lm-b{lm_bs}-bptt{bptt}-lr{tune_lm_lr}'
tune_lm_lr

0.001

In [0]:
%%capture tune_lm_scope_begin_log
# New scope with a fresh backward learner; the weights are loaded from the
# checkpoint of the previous event inside `fit_a_named_cycle`.
tune_lm_scope = IPyExperimentsPytorch(cl_enable=False)
bw_lm_dbnch = get_lm_databunch(FW_LM_DBNCH_FNAME, lm_bs, bptt, backwards=True)
bw_lm_lrnr = new_lm_learner_with_ulmfit('new-bw_lm', bw_lm_dbnch, lm_drop_mult)

In [0]:
# Resume from the backward init-fit event, train 10 epochs with `freeze_to=0`,
# then save the backward encoder.
print(tune_bw_lm_name)
tune_bw_lm_log_name = LOGS_DIR / f'{SESSN_START}-{tune_bw_lm_name}'
tune_bw_lm_clbks = [CSVLogger(bw_lm_lrnr, tune_bw_lm_log_name, append=True)]
bw_lm_lrnr = fit_a_named_cycle(tune_bw_lm_name, bw_lm_lrnr, tune_lm_lr, moms,
                               lm_wd, tune_bw_lm_clbks, cyc_len=10, freeze_to=0,
                               prev_event_name=init_bw_lm_name)
bw_lm_lrnr.save_encoder(bw_enc_name)

tune_bw_lm-b64-bptt70-lr0.001
Found init. loss_scale=65536
Checkpoint(name='init_bw_lm-b64-bptt70-lr0.01', frozen_to=-1, mp_loss_scale=131072) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-init_bw_lm-b64-bptt70-lr0.01.pkl
Frozen to -1
Retained mb_loss_scale=131072
epoch     train_loss  valid_loss  accuracy  time    
0         3.490535    3.275046    0.452523  03:31     
1         3.313985    3.130130    0.469138  03:31     
2         3.210563    3.037704    0.479883  03:34     
3         3.101941    2.967201    0.488420  03:34     
4         3.000156    2.917279    0.494431  03:32     
5         2.917671    2.878529    0.499641  03:33     
6         2.853822    2.850992    0.503628  03:31     
7         2.773312    2.833041    0.506211  03:32     
8         2.750083    2.824233    0.507407  03:34     
9         2.728345    2.823306    0.507528  03:32     
Found MixedPrecision loss_scale=2097152.0
Checkpoint(name='tune_bw_lm-b64-bptt70-lr0.001', frozen_to=0, mp_loss_scale=209

tune_bw_lm-b64-bptt70-lr0.001125
Found init. loss_scale=65536
Checkpoint(name='init_bw_lm-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=131072) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-init_bw_lm-b64-bptt70-lr0.01125.pkl
Frozen to -1
Retained mb_loss_scale=131072
epoch     train_loss  valid_loss  accuracy  time    
0         3.478573    3.267550    0.453320  03:30     
1         3.305273    3.124413    0.469611  03:30     
2         3.204501    3.034152    0.480451  03:30     
3         3.095929    2.964124    0.488613  03:30     
4         2.993055    2.913858    0.494942  03:30     
5         2.908212    2.874102    0.500363  03:30     
6         2.841247    2.845684    0.504504  03:30     
7         2.755752    2.826777    0.507199  03:30     
8         2.730006    2.817731    0.508515  03:30     
9         2.706766    2.816746    0.508666  03:30     
Found MixedPrecision loss_scale=2097152.0
Checkpoint(name='tune_bw_lm-b64-bptt70-lr0.001125', frozen_to=0, mp_lo

In [0]:
%%capture tune_lm_scope_end_log
# Tear down the `tune_lm_scope` experiment scope opened before the backward-LM
# fine-tuning. The cell's output is captured into `tune_lm_scope_end_log`
# rather than displayed.
# NOTE(review): deleting the scope object presumably lets ipyexperiments
# release the databunch and learner created inside it -- confirm.
del tune_lm_scope
# Run a garbage-collection pass right away instead of waiting for the next one.
gc.collect()

## Backward CF

In [0]:
# Hyper-parameters: `lm_lr` selects which saved backward encoder to use,
# `cf_lr` is the learning rate of the initial classifier fit.
lm_lr = 0.01
cf_lr = 0.075
print(f'designated cf_lr={cf_lr}')

# Names derived from the hyper-parameters above.
bw_enc_name = f'bw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
init_bw_cf_name = f'init_bw_cf-b{cf_bs}-bptt{bptt}-lr{cf_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

designated cf_lr=0.075


In [0]:
%%capture init_bw_cf_scope_begin_log
# Open the `init_bw_cf_scope` experiment scope *before* building the databunch
# and learner, so both are created while the scope is alive. The cell's output
# is captured into `init_bw_cf_scope_begin_log` rather than displayed.
init_bw_cf_scope = IPyExperimentsPytorch(cl_enable=False)
# The backward classifier databunch is loaded from the forward databunch file
# with `backwards=True`.
# NOTE(review): presumably `get_cf_databunch` reverses the token order itself;
# confirm against its definition.
bw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs, backwards=True)
# New classifier learner under the event name 'new-bw_cf', given the name of
# the saved backward encoder (`bw_enc_name`).
bw_cf_lrnr = new_cf_learner_with_encoder('new-bw_cf', bw_cf_dbnch, cf_drop_mult,
                                         bw_enc_name, bptt)

In [0]:
# Announce the run so the log below can be matched to its configuration.
print(init_bw_cf_name)

# Per-session CSV log; `append=True` keeps rows already written to that file.
init_bw_cf_log_name = LOGS_DIR / f'{SESSN_START}-{init_bw_cf_name}'
init_bw_cf_clbks = [CSVLogger(bw_cf_lrnr, init_bw_cf_log_name, append=True)]

# One cycle of one epoch with `freeze_to=-1`, starting from the freshly built
# learner (event 'new-bw_cf').
bw_cf_lrnr = fit_a_named_cycle(
    init_bw_cf_name, bw_cf_lrnr, cf_lr, moms, cf_wd, init_bw_cf_clbks,
    cyc_len=1,
    freeze_to=-1,
    prev_event_name='new-bw_cf',
)

init_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-bw_cf.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         0.346686    0.294429    0.899066  00:59     
Found MixedPrecision loss_scale=131072
Checkpoint(name='init_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01', frozen_to=-1, mp_loss_scale=131072) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01.pkl


init_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125
Found init. loss_scale=65536
No /content/gdrive/My Drive/ag_news_csv/data/cp-new-bw_cf.pkl to load!
epoch     train_loss  valid_loss  accuracy  time    
0         0.344697    0.294021    0.900250  00:59     
Found MixedPrecision loss_scale=65536.0
Checkpoint(name='init_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=65536.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-init_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125.pkl


In [0]:
%%capture init_bw_cf_scope_end_log
# Tear down the `init_bw_cf_scope` experiment scope opened before the initial
# backward-classifier fit. The cell's output is captured into
# `init_bw_cf_scope_end_log` rather than displayed.
# NOTE(review): deleting the scope object presumably lets ipyexperiments
# release the databunch and learner created inside it -- confirm.
del init_bw_cf_scope
# Run a garbage-collection pass right away instead of waiting for the next one.
gc.collect()

In [0]:
# Hyper-parameters: `lm_lr` selects the saved backward encoder, `cf_lr` is the
# base classifier learning rate the fine-tuning stages are derived from.
lm_lr = 0.01
cf_lr = 0.075

# Names derived from the hyper-parameters above: the encoder to load, the
# checkpoint to resume from, and the event name of the fine-tuning run.
bw_enc_name = f'bw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
init_bw_cf_name = f'init_bw_cf-b{cf_bs}-bptt{bptt}-lr{cf_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
tune_bw_cf_name = f'tune_bw_cf-b{cf_bs}-bptt{bptt}-lr{cf_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

In [0]:
%%capture tune_bw_cf_scope_begin_log
# Open the `tune_bw_cf_scope` experiment scope *before* building the databunch
# and learner, so both are created while the scope is alive. The cell's output
# is captured into `tune_bw_cf_scope_begin_log` rather than displayed.
tune_bw_cf_scope = IPyExperimentsPytorch(cl_enable=False)
# The backward classifier databunch is loaded from the forward databunch file
# with `backwards=True`.
# NOTE(review): presumably `get_cf_databunch` reverses the token order itself;
# confirm against its definition.
bw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs, backwards=True)
# New classifier learner under the event name 'new-bw_cf'; the following cell
# resumes it from the `init_bw_cf_name` checkpoint via `prev_event_name`.
bw_cf_lrnr = new_cf_learner_with_encoder('new-bw_cf', bw_cf_dbnch, cf_drop_mult,
                                         bw_enc_name, bptt)

In [0]:
# Announce the run so the log below can be matched to its configuration.
print(tune_bw_cf_name)

# One CSV log shared by all stages; `append=True` lets each stage add its rows.
tune_bw_cf_log_name = LOGS_DIR / f'{SESSN_START}-{tune_bw_cf_name}'
tune_bw_cf_clbks = [CSVLogger(bw_cf_lrnr, tune_bw_cf_log_name, append=True)]

# Peak learning rate of each stage, shrinking as more of the model is trained.
finer_cf_1_lr = cf_lr / 2
finer_cf_2_lr = cf_lr / 2 / 2
finer_cf_3_lr = round(cf_lr / 2 / 2 / 5, 6)

# Each stage: (peak lr, cyc_len, freeze_to, checkpoint to resume from).
# The first stage resumes from the initial fit; later stages resume from the
# checkpoint the previous stage saved under the same `tune_bw_cf_name`.
tune_bw_cf_stages = [
  (finer_cf_1_lr, 1, -2, init_bw_cf_name),
  (finer_cf_2_lr, 1, -3, tune_bw_cf_name),
  (finer_cf_3_lr, 2, 0, tune_bw_cf_name),
]

for stage_lr, stage_cyc_len, stage_freeze_to, stage_prev_name in tune_bw_cf_stages:
  print(stage_lr)
  # Learning-rate range from stage_lr / 2.6**4 up to stage_lr.
  # NOTE(review): 2.6 is presumably the per-layer factor from the ULMFiT paper
  # cited at the top of the notebook -- confirm.
  cf_lrs = slice(stage_lr / (2.6 ** 4), stage_lr)
  bw_cf_lrnr = fit_a_named_cycle(
      tune_bw_cf_name, bw_cf_lrnr, cf_lrs, moms, cf_wd, tune_bw_cf_clbks,
      cyc_len=stage_cyc_len,
      freeze_to=stage_freeze_to,
      prev_event_name=stage_prev_name,
  )

# Score the fully fine-tuned backward classifier.
# NOTE(review): `ordered=True` presumably returns predictions in dataset order
# -- confirm against the fastai docs.
bw_preds, tgt_lbls = bw_cf_lrnr.get_preds(ordered=True)
accuracy(bw_preds, tgt_lbls)

tune_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01
0.0375
Found init. loss_scale=65536
Checkpoint(name='init_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01', frozen_to=-1, mp_loss_scale=131072) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-init_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01.pkl
Frozen to -1
Retained mb_loss_scale=131072
epoch     train_loss  valid_loss  accuracy  time    
0         0.253172    0.232787    0.929728  01:09     
Found MixedPrecision loss_scale=16384.0
Checkpoint(name='tune_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01', frozen_to=-2, mp_loss_scale=16384.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-tune_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01.pkl
0.01875
Found init. loss_scale=16384.0
Checkpoint(name='tune_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01', frozen_to=-2, mp_loss_scale=16384.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01.pkl
Frozen to -2
Retained m

tensor(0.9453)

tune_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125
0.03617
Found init. loss_scale=65536
Checkpoint(name='init_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125', frozen_to=-1, mp_loss_scale=65536.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-init_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125.pkl
Frozen to -1
Retained mb_loss_scale=65536.0
epoch     train_loss  valid_loss  accuracy  time    
0         0.264795    0.205384    0.928675  01:10     
Found MixedPrecision loss_scale=16384.0
Checkpoint(name='tune_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125', frozen_to=-2, mp_loss_scale=16384.0) saved to /content/gdrive/My Drive/ag_news_csv/data/cp-tune_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125.pkl
0.018085
Found init. loss_scale=16384.0
Checkpoint(name='tune_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125', frozen_to=-2, mp_loss_scale=16384.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt7

tensor(0.9478)

In [0]:
%%capture tune_bw_cf_scope_end_log
# Tear down the `tune_bw_cf_scope` experiment scope opened before the backward
# classifier fine-tuning. The cell's output is captured into
# `tune_bw_cf_scope_end_log` rather than displayed.
# NOTE(review): deleting the scope object presumably lets ipyexperiments
# release the databunch and learner created inside it -- confirm.
del tune_bw_cf_scope
# Run a garbage-collection pass right away instead of waiting for the next one.
gc.collect()

# Ensemble

In [0]:
# Hyper-parameters identifying the forward classifier checkpoint to ensemble.
lm_lr = 0.01
cf_lr = 0.075
finer_cf_3_lr = round(cf_lr / (2 * 2 * 5), 6)

# Names of the forward encoder and of the final forward fine-tuning event.
fw_enc_name = f'fw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
tune_fw_cf_3_name = f'tune_fw_cf-b{cf_bs}-bptt{bptt}-lr{finer_cf_3_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Rebuild the forward databunch and an empty classifier learner around it.
fw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs)
fw_cf_lrnr = new_cf_learner_with_encoder(
    'new-fw_cf', fw_cf_dbnch, cf_drop_mult, fw_enc_name, bptt)

# Restore the PRNG states stored for that event, then load its checkpoint into
# the learner.
fw_prng_states = load_prng_states(tune_fw_cf_3_name)
set_prng_states(fw_prng_states)
fw_cf_lrnr = retain_event(tune_fw_cf_3_name, fw_cf_lrnr)

# Forward predictions and their accuracy.
# NOTE(review): `ordered=True` presumably returns predictions in dataset order
# so they line up with the backward model's -- confirm against the fastai docs.
fw_preds, tgt_lbls = fw_cf_lrnr.get_preds(ordered=True)
accuracy(fw_preds, tgt_lbls)

Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.00375_enc-b64-bptt70-lr0.01', frozen_to=0, mp_loss_scale=32768.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.00375_enc-b64-bptt70-lr0.01.pkl
Frozen to 0
Retained mb_loss_scale=32768.0


tensor(0.9464)

Checkpoint(name='tune_fw_cf-b64-bptt70-lr0.003617_enc-b64-bptt70-lr0.01125', frozen_to=0, mp_loss_scale=32768.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_fw_cf-b64-bptt70-lr0.003617_enc-b64-bptt70-lr0.01125.pkl
Frozen to 0
Retained mb_loss_scale=32768.0


tensor(0.9478)

In [0]:
# Hyper-parameters identifying the backward classifier checkpoint to ensemble.
lm_lr = 0.01
cf_lr = 0.075

# Names of the backward encoder and of the backward fine-tuning event.
bw_enc_name = f'bw_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'
tune_bw_cf_name = f'tune_bw_cf-b{cf_bs}-bptt{bptt}-lr{cf_lr}_enc-b{lm_bs}-bptt{bptt}-lr{lm_lr}'

# Rebuild the backward databunch and an empty classifier learner around it.
bw_cf_dbnch = get_cf_databunch(FW_CF_DBNCH_FNAME, cf_bs, backwards=True)
bw_cf_lrnr = new_cf_learner_with_encoder(
    'new-bw_cf', bw_cf_dbnch, cf_drop_mult, bw_enc_name, bptt)

# Restore the PRNG states stored for that event, then load its checkpoint into
# the learner.
bw_prng_states = load_prng_states(tune_bw_cf_name)
set_prng_states(bw_prng_states)
bw_cf_lrnr = retain_event(tune_bw_cf_name, bw_cf_lrnr)

# Backward predictions and their accuracy.
# NOTE(review): `ordered=True` presumably returns predictions in dataset order
# so they line up with the forward model's -- confirm against the fastai docs.
bw_preds, tgt_lbls = bw_cf_lrnr.get_preds(ordered=True)
accuracy(bw_preds, tgt_lbls)

Checkpoint(name='tune_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01', frozen_to=0, mp_loss_scale=16384.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_bw_cf-b64-bptt70-lr0.075_enc-b64-bptt70-lr0.01.pkl
Frozen to 0
Retained mb_loss_scale=16384.0


tensor(0.9453)

Checkpoint(name='tune_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125', frozen_to=0, mp_loss_scale=32768.0) loaded from /content/gdrive/My Drive/ag_news_csv/data/cp-tune_bw_cf-b64-bptt70-lr0.07234_enc-b64-bptt70-lr0.01125.pkl
Frozen to 0
Retained mb_loss_scale=32768.0


tensor(0.9478)

In [0]:
# Record which hyper-parameter pair the ensemble score below belongs to.
print(f'lm_lr={lm_lr}\tcf_lr={cf_lr}')

# Ensemble by taking the element-wise mean of the forward and backward
# predictions, then score it against the target labels.
avg_preds = (fw_preds + bw_preds).div(2)
accuracy(avg_preds, tgt_lbls)

lm_lr=0.01	cf_lr=0.075


tensor(0.9505)

lm_lr=0.01125	cf_lr=0.07234


tensor(0.9516)