<a href="https://colab.research.google.com/github/tianjianjiang/nlp_data_aug/blob/%231-control_random_factors/imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare

An example as baseline: [ULMFit](https://nbviewer.jupyter.org/github/fastai/fastai/blob/master/examples/ULMFit.ipynb) tutorial.

> Fine-tuning a forward and backward language model to get to 95.4% accuracy on the IMDB movie reviews dataset. This tutorial is done with fastai v1.0.53.

> The example was run on a Titan RTX (24 GB of RAM) so you will probably need to adjust the batch size accordingly. If you divide it by 2, don't forget to divide the learning rate by 2 as well in the following cells. You can also reduce the bptt a little to save some memory.

In [0]:
# Ensure GPU spec; T4 is for colab and one can change it for another env.
gpu_list = !nvidia-smi -L
if gpu_list[0].startswith('NVIDIA-SMI has failed'):
  print('Runtime type should be GPU.')
elif not gpu_list[0].startswith('GPU 0: Tesla T4'):
  display(gpu_list)
  print('Please reset all runtimes. We need a Tesla T4 to reproduce the experiments!')
else:
  display(gpu_list)

## Dependency

### Install

In [0]:
# Ensure no surprises from conflicting packages: report broken or incompatible
# requirements in the environment before anything is installed.
!pip check

In [0]:
%%capture pip_logs
# Install/upgrade the dependencies; the output is captured into `pip_logs`
# instead of being printed. Only fastai and jupyter-console are pinned here.
# NOTE(review): the other packages are unpinned, so a later run may resolve
# different versions - consider pinning them for reproducibility.
!pip install -U fastai==1.0.55 ipyexperiments jupyter-console==5.2.0 coveralls coverage datascience albumentations

In [0]:
# Run the consistency check again, now against the freshly installed packages.
!pip check

### Import

In [0]:
import gc
import math
from pathlib import Path
import random

import numpy as np
import torch
from google.colab import drive

from fastai import basic_data, basic_train, core
from fastai import *
from fastai.callbacks import CSVLogger
from fastai.core import plt
from fastai.text import *
from fastprogress import fastprogress

from ipyexperiments import *

### Init


In [0]:
# Session timestamp taken from the shell `date` command (format %Y%m%dT%H%M);
# it prefixes the training log file names further down. The trailing comma
# unpacks the single line returned by the `!` capture into a plain string.
# Not set earlier because pip may require a restart.
SESSN_START_T, = !date +%Y%m%dT%H%M

In [0]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [0]:
# A special treatment for colab to decrease network traffic: turn off
# fastprogress' bars and switch it to console behaviour.
fastprogress.NO_BAR = True
master_bar, progress_bar = fastprogress.force_console_behavior()
# Rebind the console-style bars on every name below, so each of them uses the
# same master_bar/progress_bar pair returned above.
basic_train.master_bar, basic_train.progress_bar = master_bar, progress_bar
basic_data.master_bar, basic_data.progress_bar = master_bar, progress_bar
dataclass.master_bar, dataclass.progress_bar = master_bar, progress_bar
text.master_bar, text.progress_bar = master_bar, progress_bar
text.data.master_bar, text.data.progress_bar = master_bar, progress_bar
core.master_bar, core.progress_bar = master_bar, progress_bar

In [0]:
# Mount Google Drive at /content/gdrive. force_remount=True is passed so the
# mount is requested again even when re-running this cell.
COLAB_CONTENT_DIR_P = Path('/content')
GD_DIR_P = COLAB_CONTENT_DIR_P / 'gdrive'
drive.mount(str(GD_DIR_P), force_remount=True)

In [0]:
# Persistent project folders on the mounted Drive ('My Drive/imdb'), with
# sub-folders for data, models and logs. All are created if missing.
BASE_DIR_P = GD_DIR_P / 'My Drive/imdb'
BASE_DIR_P.mkdir(parents=True, exist_ok=True)
DATA_DIR_P = BASE_DIR_P / 'data'
DATA_DIR_P.mkdir(parents=True, exist_ok=True)
MDLS_DIR_P = BASE_DIR_P / 'models'
MDLS_DIR_P.mkdir(parents=True, exist_ok=True)
LOGS_DIR_P = BASE_DIR_P / 'logs'
LOGS_DIR_P.mkdir(parents=True, exist_ok=True)

# Local folder that is later used as the destination when the dataset is
# downloaded and extracted.
FASTAI_DATA_DIR_P = Path('/root/.fastai/data')
FASTAI_DATA_DIR_P.mkdir(parents=True, exist_ok=True)

# Expose that folder as /content/data through a symlink; the link is only
# created when /content/data is not already a symlink.
COLAB_DATA_DIR_P = COLAB_CONTENT_DIR_P / 'data'
if not COLAB_DATA_DIR_P.is_symlink():
  COLAB_DATA_DIR_P.symlink_to(FASTAI_DATA_DIR_P)
# Remove the /content/sample_data folder when present (`set -x` echoes the command).
if (COLAB_CONTENT_DIR_P / 'sample_data').exists():
  !set -x; rm -rf /content/sample_data/

# Assign

## Shared Hyperparams

In [0]:
# Batch sizes: the classifier batch size is half of the language-model one.
lm_bs = 128
cf_bs = round(lm_bs / 2)
# bptt 80 comes from the example (fastai defaults to 70); moms is the momentum
# pair handed to every fit_one_cycle call.
bptt, moms = 80, (0.8, 0.7)
print(f'Our lm_bs: {lm_bs}; cf_bs: {cf_bs}')

# Databunch pickle file names, each tagged with its batch size.
FW_LM_DBNCH_FILE_S, BW_LM_DBNCH_FILE_S = (
    f'fw_lm_dbnch-b{lm_bs}.pkl',
    f'bw_lm_dbnch-b{lm_bs}.pkl',
)
FW_CF_DBNCH_FILE_S, BW_CF_DBNCH_FILE_S = (
    f'fw_cf_dbnch-b{cf_bs}.pkl',
    f'bw_cf_dbnch-b{cf_bs}.pkl',
)

## LM-specific Hyperparams

In [0]:
# Reference values from the example: LM batch size 256 with learning rate 2e-2.
# The lines that would decrease the lr proportionally to our lm_bs are kept
# below for reference but are disabled; lm_lr is assigned later in the
# notebook, in the forward LM init-fit section.
ORIG_LM_BS = 256
ORIG_LM_LR = 2e-2
# lm_lr = ORIG_LM_LR
# lm_lr = lm_bs / ORIG_LM_BS * ORIG_LM_LR
# lm_lr = round(lm_lr, 7)
# print(f'In proportion to our lm_bs, our lm_lr : {lm_lr}')

lm_drop_mult = 1.0
lm_wd = 0.1  # From the example, except forward classifier uses fastai default 1e-2.

# Encoder names embed lm_lr, which is not known at this point, so they are
# disabled here.
# FW_ENC_NAME = f'fw_enc-b{lm_bs}-lr{lm_lr}'
# BW_ENC_NAME = f'bw_enc-b{lm_bs}-lr{lm_lr}'

## CF-specific Hyperparams

In [0]:
# Reference values for the classifier: half of the example's LM batch size,
# with learning rate 1e-1. The proportional lr computation is kept below for
# reference but is disabled; cf_lr is assigned later in the notebook.
ORIG_CF_BS = round(ORIG_LM_BS / 2)
ORIG_CF_LR = 1e-1
# cf_lr = ORIG_CF_LR
# cf_lr = cf_bs / ORIG_CF_BS * ORIG_CF_LR * 1.2
# cf_lr = round(cf_lr, 7)
# print(f'In proportion to our cf_bs, our cf_lr: {cf_lr}')

# The classifier uses half of the LM dropout multiplier.
cf_drop_mult = lm_drop_mult / 2
cf_wd = 0.1

# Classifier checkpoint names embed cf_lr, which is not known at this point,
# so they are disabled here.
# FW_CF_NAME = f'fw_cf-b{cf_bs}-lr{cf_lr}'
# BW_CF_NAME = f'bw_cf-b{cf_bs}-lr{cf_lr}'

## Args

In [0]:
# Number of data-loading workers passed as `num_workers` to every databunch.
# 0 keeps loading in the main process, chosen because the training set will be
# shuffled (presumably to keep the batch order reproducible - confirm).
n_dbnch_wrkrs = 0

In [0]:
# Plot appearance: stacked style sheets plus a dotted lime-green grid drawn on
# the x axis for both major and minor ticks.
plt.style.use(['dark_background','seaborn-poster','seaborn-deep'])
plt.rcParams.update({
    'axes.grid': True,
    'axes.grid.axis': 'x',
    'axes.grid.which': 'both',
    'grid.alpha': 0.5,
    'grid.color': 'xkcd:lime green',
    'grid.linestyle': ':',
})

# Define

## Random State Fixer

In [0]:
# Set a constant seed for every random number generator.
SEED = 42

def reset_all_nondeterministic_states(seed=SEED):
  """Re-seed every random number generator and force deterministic cuDNN.

  Args:
    seed: value given to Python's `random`, NumPy and PyTorch (CPU, plus all
      CUDA devices when CUDA is available). Defaults to SEED.
  """
  # Seed with the `seed` argument (not the global constant), so a caller that
  # passes an explicit seed actually gets it.
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True  # About 15% slower but...
  torch.backends.cudnn.benchmark = False

## LM-specific Helpers

In [0]:
def build_lm_databunch(data_dir_p, n_workers, bs, bptt):
  """Build the language-model databunch from the text files under `data_dir_p`.

  The RNG states are reset first and the validation split is seeded with SEED.

  Args:
    data_dir_p: root folder; only its 'train', 'test' and 'unsup' sub-folders are used.
    n_workers: forwarded to `.databunch` as `num_workers`.
    bs: batch size, forwarded to `.databunch`.
    bptt: forwarded to `.databunch`.

  Returns:
    Whatever `.databunch(...)` returns for the labelled data.
  """
  reset_all_nondeterministic_states()
  # Inputs: all the text files in the path, restricted to the three folders we
  # want because other temp folders may contain text files too.
  texts = TextList.from_folder(data_dir_p)
  texts = texts.filter_by_folder(include=['train', 'test', 'unsup'])
  # Randomly keep 10% (10,000 reviews) for validation. The seed is set again
  # since in theory one can call np.random before this.
  split_texts = texts.split_by_rand_pct(0.1, seed=SEED)
  # We want a language model, so label accordingly.
  labelled = split_texts.label_for_lm()
  return labelled.databunch(bs=bs, bptt=bptt, num_workers=n_workers)

In [0]:
def init_lm_learner_with_ulmfit(dbnch, drop_mult, base_path=BASE_DIR_P):
  """Create an AWD_LSTM language-model learner on `dbnch`, converted with to_fp16.

  Args:
    dbnch: databunch handed to `language_model_learner`.
    drop_mult: dropout multiplier handed to `language_model_learner`.
    base_path: forwarded as the learner's `path`; defaults to BASE_DIR_P.

  Returns:
    The learner returned by `to_fp16(clip=0.1)`.
  """
  reset_all_nondeterministic_states()
  learner = language_model_learner(dbnch, AWD_LSTM, drop_mult=drop_mult, path=base_path)
  # Half precision: 2x faster.
  return learner.to_fp16(clip=0.1)

In [0]:
def init_lm_cycles(learner, lr, moms, wd, clbks=None, n_cycles=1):
  """Run the initial one-cycle fit of a language-model learner.

  Args:
    learner: object exposing `fit_one_cycle`.
    lr: learning rate for the cycle (also printed).
    moms: momentum pair forwarded to `fit_one_cycle`.
    wd: weight decay forwarded to `fit_one_cycle`.
    clbks: optional list of callbacks; None (the default) means no callbacks.
    n_cycles: number of cycles, 1 by default.

  Returns:
    The same `learner`, after fitting.
  """
  # Build a fresh list per call rather than sharing one mutable default list
  # object across every call that omits `clbks`.
  callbacks = [] if clbks is None else clbks
  print(f'init lm lr: {lr}')
  reset_all_nondeterministic_states()
  learner.fit_one_cycle(n_cycles, lr, moms=moms, wd=wd, callbacks=callbacks)
  return learner

In [0]:
def tune_lm_cycles(learner, lr, moms, wd, clbks=None, n_cycles=10):
  """Unfreeze a language-model learner and fine-tune it with one-cycle fitting.

  Args:
    learner: object exposing `unfreeze` and `fit_one_cycle`.
    lr: learning rate for the cycles (also printed).
    moms: momentum pair forwarded to `fit_one_cycle`.
    wd: weight decay forwarded to `fit_one_cycle`.
    clbks: optional list of callbacks; None (the default) means no callbacks.
    n_cycles: number of cycles, 10 by default.

  Returns:
    The same `learner`, after fitting.
  """
  # Build a fresh list per call rather than sharing one mutable default list
  # object across every call that omits `clbks`.
  callbacks = [] if clbks is None else clbks
  print(f'tune lm lr: {lr}')
  reset_all_nondeterministic_states()
  learner.unfreeze()
  learner.fit_one_cycle(n_cycles, lr, moms=moms, wd=wd, callbacks=callbacks)
  return learner

## CF-specific Helpers

In [0]:
def build_cf_databunch(data_dir_p, n_workers, bs, vocab):
  """Build the classifier databunch from the text files under `data_dir_p`.

  Args:
    data_dir_p: root folder of the dataset.
    n_workers: forwarded to `.databunch` as `num_workers`.
    bs: batch size, forwarded to `.databunch`.
    vocab: vocabulary handed to `TextList.from_folder`.

  Returns:
    Whatever `.databunch(...)` returns for the labelled data.
  """
  reset_all_nondeterministic_states()
  # Grab all the text files in the path, using the given vocab.
  texts = TextList.from_folder(data_dir_p, vocab=vocab)
  # Split by train and valid folder, with 'test' as validation (that only
  # keeps 'train' and 'test', so no need to filter).
  split_texts = texts.split_by_folder(valid='test')
  # Label them all with their folders.
  labelled = split_texts.label_from_folder(classes=['neg', 'pos'])
  return labelled.databunch(bs=bs, num_workers=n_workers)

In [0]:
def init_cf_learner_with_encoder(dbnch, drop_mult, enc_name, base_path=BASE_DIR_P):
  """Create an AWD_LSTM text classifier on `dbnch` and load a saved encoder into it.

  Args:
    dbnch: databunch handed to `text_classifier_learner`.
    drop_mult: dropout multiplier handed to `text_classifier_learner`.
    enc_name: name passed to `load_encoder`.
    base_path: forwarded as the learner's `path`; defaults to BASE_DIR_P.

  Returns:
    The classifier learner with the encoder loaded.
  """
  reset_all_nondeterministic_states()
  # pretrained=False: the encoder weights come from `enc_name` just below.
  learner = text_classifier_learner(
      dbnch, AWD_LSTM, drop_mult=drop_mult, path=base_path, pretrained=False)
  learner.load_encoder(enc_name)
  return learner

In [0]:
def init_cf_cycles(learner, lr, moms, wd, clbks, n_cycles=1):
  """Run the initial one-cycle fit of a classifier learner.

  Args:
    learner: object exposing `fit_one_cycle`.
    lr: learning rate for the cycle (also printed).
    moms: momentum pair forwarded to `fit_one_cycle`.
    wd: weight decay forwarded to `fit_one_cycle`.
    clbks: callbacks forwarded to `fit_one_cycle`.
    n_cycles: number of cycles, 1 by default.

  Returns:
    The same `learner`, after fitting.
  """
  print(f'init cf lr: {lr}')
  # Reset the RNG states right before fitting.
  reset_all_nondeterministic_states()
  fit_kwargs = dict(moms=moms, wd=wd, callbacks=clbks)
  learner.fit_one_cycle(n_cycles, lr, **fit_kwargs)
  return learner

In [0]:
def tune_cf_cycles(
    learner,
    lr,
    moms,
    wd,
    clbks_tuple,
    n_cycles_tuple=(1,1,2),
    freeze_steps=(-2,-3,None),
    lr_decays=(2,2,5)
):
  """Fine-tune a classifier learner in successive stages.

  The stages are the zipped entries of `n_cycles_tuple`, `freeze_steps`,
  `lr_decays` and `clbks_tuple`, so the shortest of them sets the stage count.
  The learning rate is divided cumulatively: each stage divides the previous
  stage's rate by its own decay.

  Args:
    learner: object exposing `freeze_to`, `unfreeze` and `fit_one_cycle`.
    lr: starting learning rate, before the first decay is applied.
    moms: momentum pair forwarded to `fit_one_cycle`.
    wd: weight decay forwarded to `fit_one_cycle`.
    clbks_tuple: iterable giving one callback list per stage.
    n_cycles_tuple: number of cycles per stage.
    freeze_steps: per-stage argument for `freeze_to`; None means `unfreeze()`.
    lr_decays: per-stage divisor applied to the running learning rate.

  Returns:
    The same `learner`, after all stages.
  """
  reset_all_nondeterministic_states()
  stages = zip(n_cycles_tuple, freeze_steps, lr_decays, clbks_tuple)
  for cycles, step, decay, stage_clbks in stages:
    if step is None:
      learner.unfreeze()
    else:
      learner.freeze_to(step)
    lr = lr / decay
    print(f'tune cf lr: {lr}')
    # Learning rates are given as a slice from lr/(2.6**4) up to lr.
    lr_range = slice(lr/(2.6**4), lr)
    learner.fit_one_cycle(cycles, lr_range, moms=moms, wd=wd, callbacks=stage_clbks)
  return learner

# Fit

## Forward LM

### Process Data Once

In [0]:
# Expected location of the IMDB dataset, seen through the /content/data symlink.
IMDB_DATA_IN_COLAB_DIR_P = COLAB_DATA_DIR_P / 'imdb'

In [0]:
# Untar into colab disk so no latency to GDrive.
downloaded_imdb_data_dir_p = untar_data(URLs.IMDB, dest=FASTAI_DATA_DIR_P)
# Sanity check: the symlinked path must resolve to the folder untar_data returned.
assert IMDB_DATA_IN_COLAB_DIR_P.resolve() == downloaded_imdb_data_dir_p

In [0]:
# Build the forward language-model databunch from the local copy of the dataset.
fw_lm_dbnch = build_lm_databunch(IMDB_DATA_IN_COLAB_DIR_P, n_dbnch_wrkrs, lm_bs, bptt)
# fw_lm_dbnch.show_batch()

### Use Persistent Path

In [0]:
# Save the databunch to a non-volatile path (e.g.: GDrive).
fw_lm_dbnch.save(DATA_DIR_P / FW_LM_DBNCH_FILE_S)

In [0]:
# Reload the forward LM databunch from the persistent folder, resetting the
# RNG states first.
reset_all_nondeterministic_states()
fw_lm_dbnch = load_data(DATA_DIR_P, FW_LM_DBNCH_FILE_S, bs=lm_bs, bptt=bptt, num_workers=n_dbnch_wrkrs)
# fw_lm_dbnch.path.ls()

In [0]:
# The batch should look the same if the above efforts keep the reproducibility.
# fw_lm_dbnch.show_batch()

### Find Learning Rate

In [0]:
# The loaded databunch must use the batch size we asked for.
assert fw_lm_dbnch.train_dl.batch_size == lm_bs
# ceil(len(train_ds) / lm_bs); used below to size the lr finder run.
lm_epoch_sz = math.ceil(len(fw_lm_dbnch.train_ds) / lm_bs)
lm_epoch_sz

In [0]:
# Open an ipyexperiments scope around the lr search; it is deleted again once
# the found learning rate has been extracted.
lr_find_scope = IPyExperimentsPytorch(cl_enable=False)
fw_lm_learn = init_lm_learner_with_ulmfit(fw_lm_dbnch, lm_drop_mult)
# Run the lr finder up to lr=1 for ceil(lm_epoch_sz/9) iterations.
fw_lm_learn.lr_find(end_lr=1, num_it=math.ceil(lm_epoch_sz/9), wd=lm_wd)

In [0]:
%%capture lr_find_log
# Capture both the printed suggestion and the plot into `lr_find_log`, so the
# suggested value can be parsed from stdout in the next cell.
fw_lm_learn.recorder.plot(suggestion=True)

In [0]:
# Split every non-empty captured stdout line on ': '. The unpacking expects
# exactly two such lines of the form 'name: value' and keeps only the first.
(found_lr_name,
 found_lr_val_str), _ = [line.split(': ')
                         for line in lr_find_log.stdout.split('\n') if line]

In [0]:
# Show the first captured rich output (presumably the lr finder plot) and the
# parsed suggestion.
display(lr_find_log.outputs[0])
print(found_lr_name, found_lr_val_str)

In [0]:
# Ask the scope to keep `found_lr_val_str`; it is read after the scope is deleted.
lr_find_scope.keep_var_names('found_lr_val_str')

In [0]:
# Drop the lr search scope and force a garbage collection pass.
del lr_find_scope; gc.collect()

### Init-fit

In [0]:
# Use the learning rate suggested by the lr finder (0.0251 in the recorded run).
# lm_lr = 0.0251
lm_lr = float(found_lr_val_str)
# Fresh learner for the actual training run.
fw_lm_learn = init_lm_learner_with_ulmfit(fw_lm_dbnch, lm_drop_mult)
# The history log is tagged with session start time, batch size and lr.
init_fw_lm_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-init_fw_lm-b{lm_bs}-lr{lm_lr}'  # w/o .csv
init_fw_lm_clbks = [CSVLogger(fw_lm_learn, init_fw_lm_log_p, append=True)]

In [38]:
# Initial one-cycle fit of the forward LM, then checkpoint it under a name
# tagged with batch size and lr.
fw_lm_learn = init_lm_cycles(fw_lm_learn, lm_lr, moms, lm_wd, init_fw_lm_clbks)
# fw_lm_learn.csv_logger.read_logged_file()
fw_lm_learn.save(f'init_fw_lm-b{lm_bs}-lr{lm_lr}')
# (fw_lm_learn.path/fw_lm_learn.model_dir).ls()

init lm lr: 0.0251
epoch     train_loss  valid_loss  accuracy  time    
0         4.344767    4.036683    0.291658  18:57     


### Fine-tune

In [39]:
# Continue with the learner from the init-fit step; the disabled lines below
# would restore it from the saved checkpoint instead.
# reset_all_nondeterministic_states()
# fw_lm_learn = init_lm_learner_with_ulmfit(fw_lm_dbnch, lm_drop_mult)
# fw_lm_learn = fw_lm_learn.load(f'init_fw_lm-b{lm_bs}-lr{lm_lr}')
# Fine-tune at a tenth of the init-fit learning rate, rounded to 5 decimals.
tune_lm_lr = round(lm_lr/10, 5)
tune_fw_lm_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-tune_fw_lm-b{lm_bs}-lr{tune_lm_lr}'
tune_fw_lm_clbks = [CSVLogger(fw_lm_learn, tune_fw_lm_log_p, append=True)]
fw_lm_learn = tune_lm_cycles(fw_lm_learn, tune_lm_lr, moms, lm_wd, tune_fw_lm_clbks)

tune lm lr: 0.00251
epoch     train_loss  valid_loss  accuracy  time    
0         4.054212    3.867507    0.311515  20:14     
1         4.004817    3.832057    0.315235  20:21     
2         3.987055    3.814739    0.317900  20:20     
3         3.951325    3.784540    0.321025  20:22     
4         3.930473    3.760004    0.323519  20:21     
5         3.869126    3.720796    0.327975  20:22     
6         3.844987    3.685802    0.332085  20:19     
7         3.768153    3.649244    0.336223  20:20     
8         3.716011    3.626792    0.338881  20:20     
9         3.689258    3.621286    0.339571  20:20     


In [0]:
# Checkpoint the fine-tuned forward LM, then save its encoder separately under
# FW_ENC_NAME; the forward classifier loads the encoder by that name.
fw_lm_learn.save(f'tuned_fw_lm-b{lm_bs}-lr{lm_lr}')
FW_ENC_NAME = f'fw_enc-b{lm_bs}-lr{lm_lr}'
fw_lm_learn.save_encoder(FW_ENC_NAME)
# (fw_lm_learn.path/fw_lm_learn.model_dir).ls()

## Forward CF

In [0]:
# The disabled lines reload the LM databunch, whose vocab is needed below.
# reset_all_nondeterministic_states()
# fw_lm_dbnch = load_data(DATA_DIR_P, FW_LM_DBNCH_FILE_S, bs=lm_bs, bptt=bptt, num_workers=n_dbnch_wrkrs)

# Build the classifier databunch with the language model's vocab and persist it.
fw_cf_dbnch = build_cf_databunch(IMDB_DATA_IN_COLAB_DIR_P, n_dbnch_wrkrs, cf_bs, fw_lm_dbnch.vocab)
fw_cf_dbnch.save(DATA_DIR_P / FW_CF_DBNCH_FILE_S)
# fw_cf_dbnch.show_batch()

In [0]:
# The classifier databunch must use the batch size we asked for.
assert fw_cf_dbnch.train_dl.batch_size == cf_bs
# ceil(len(train_ds) / batch size); used below to size the lr finder run.
cf_epoch_sz = math.ceil(len(fw_cf_dbnch.train_ds) / fw_cf_dbnch.train_dl.batch_size)
cf_epoch_sz

In [0]:
# Open an ipyexperiments scope around the classifier lr search.
lr_find_scope = IPyExperimentsPytorch(cl_enable=False)
fw_cf_learn = init_cf_learner_with_encoder(fw_cf_dbnch, cf_drop_mult, FW_ENC_NAME)
# Run the lr finder up to lr=10 for ceil(cf_epoch_sz/8) iterations.
fw_cf_learn.lr_find(end_lr=10, num_it=math.ceil(cf_epoch_sz/8), wd=cf_wd)

In [0]:
%%capture lr_find_log
# Capture the printed suggestion and the plot into `lr_find_log`.
fw_cf_learn.recorder.plot(suggestion=True)

In [0]:
# Display every non-empty captured stdout line, split on ': '.
[line.split(sep=': ') for line in lr_find_log.stdout.split('\n') if line]

In [0]:
# Show the first captured rich output (presumably the lr finder plot).
display(lr_find_log.outputs[0])

In [0]:
# Drop the lr search scope and force a garbage collection pass.
del lr_find_scope; gc.collect()

In [0]:
# The disabled lines reload the classifier databunch from the persistent folder.
# reset_all_nondeterministic_states()
# fw_cf_dbnch = load_data(DATA_DIR_P, FW_CF_DBNCH_FILE_S, bs=cf_bs, num_workers=n_dbnch_wrkrs)

# Fresh forward classifier for the training run, with the fine-tuned forward
# encoder loaded.
fw_cf_learn = init_cf_learner_with_encoder(fw_cf_dbnch, cf_drop_mult, FW_ENC_NAME)

In [66]:
# Classifier learning rate, set by hand here (not parsed from the lr finder output).
cf_lr = 5.18e-2
# The history log is tagged with session start time, batch size and lr.
init_fw_cf_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-init_fw_cf-b{cf_bs}-lr{cf_lr}'
init_fw_cf_clbks = [CSVLogger(fw_cf_learn, init_fw_cf_log_p, append=True)]
fw_cf_learn = init_cf_cycles(fw_cf_learn, cf_lr, moms, cf_wd, init_fw_cf_clbks)

init cf lr: 0.0518
epoch     train_loss  valid_loss  accuracy  time    
0         0.222152    0.176532    0.934160  04:57     


In [67]:
# One CSVLogger list per fine-tuning stage (p1..p3), produced lazily and
# consumed stage by stage inside tune_cf_cycles.
tune_fw_cf_clbks_tuple = (
    [CSVLogger(
        fw_cf_learn,
        LOGS_DIR_P / f'{SESSN_START_T}_history-tune_fw_cf-b{cf_bs}-p{period}',
        append=True)]
    for period in range(1,4)
)
fw_cf_learn = tune_cf_cycles(fw_cf_learn, cf_lr, moms, cf_wd, tune_fw_cf_clbks_tuple)
# The checkpoint name embeds cf_lr, so it can only be built once cf_lr is
# known. Without this definition the save below raises NameError.
FW_CF_NAME = f'fw_cf-b{cf_bs}-lr{cf_lr}'
fw_cf_learn.save(FW_CF_NAME)
# (fw_cf_learn.path/fw_cf_learn.model_dir).ls()

tune cf lr: 0.0259
epoch     train_loss  valid_loss  accuracy  time    
0         0.190170    0.153062    0.944160  05:36     
tune cf lr: 0.01295
epoch     train_loss  valid_loss  accuracy  time    
0         0.180019    0.138620    0.949240  08:08     
tune cf lr: 0.00259
epoch     train_loss  valid_loss  accuracy  time    
0         0.130078    0.143215    0.945520  09:48     
1         0.083496    0.142682    0.949720  09:03     


In [0]:
# Export the forward classifier into the models folder, tagged with batch size and lr.
fw_cf_learn.export(MDLS_DIR_P / f'export-fw_cf-b{cf_bs}-lr{cf_lr}')

In [0]:
# Release the forward classifier before training the backward models.
# After this cell the name `fw_cf_learn` no longer exists.
fw_cf_learn.destroy(); del fw_cf_learn; gc.collect()

## Backward LM

In [0]:
# The backward LM databunch is loaded from the *forward* databunch file with
# backwards=True; the same saved data is reused rather than a separate file.
reset_all_nondeterministic_states()
bw_lm_dbnch = load_data(DATA_DIR_P, FW_LM_DBNCH_FILE_S, bs=lm_bs, bptt=bptt, num_workers=n_dbnch_wrkrs, backwards=True)
# bw_lm_dbnch.show_batch()

In [0]:
# Backward language-model learner, built the same way as the forward one.
bw_lm_learn = init_lm_learner_with_ulmfit(bw_lm_dbnch, lm_drop_mult)

In [0]:
# Initial one-cycle fit of the backward LM with the same lm_lr as the forward
# one. Note the log and checkpoint names carry the batch size but no lr tag.
init_bw_lm_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-init_bw_lm-b{lm_bs}'
init_bw_lm_clbks = [CSVLogger(bw_lm_learn, init_bw_lm_log_p, append=True)]
bw_lm_learn = init_lm_cycles(bw_lm_learn, lm_lr, moms, lm_wd, init_bw_lm_clbks)
bw_lm_learn.save(f'init_bw_lm-b{lm_bs}')
# (bw_lm_learn.path/bw_lm_learn.model_dir).ls()

In [0]:
# The disabled lines restore the learner from the init-fit checkpoint instead
# of continuing with the in-memory one.
# reset_all_nondeterministic_states()
# bw_lm_learn = bw_lm_learn.load(f'init_bw_lm-b{lm_bs}')

# Fine-tune the backward LM at a tenth of the init-fit learning rate.
tune_bw_lm_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-tune_bw_lm-b{lm_bs}'
tune_bw_lm_clbks = [CSVLogger(bw_lm_learn, tune_bw_lm_log_p, append=True)]
bw_lm_learn = tune_lm_cycles(bw_lm_learn, lm_lr/10, moms, lm_wd, tune_bw_lm_clbks)
bw_lm_learn.save(f'tuned_bw_lm-b{lm_bs}')
# The encoder name follows the forward encoder's pattern (batch size and lr).
# Without this definition save_encoder raises NameError, and the backward
# classifier has no encoder name to load.
BW_ENC_NAME = f'bw_enc-b{lm_bs}-lr{lm_lr}'
bw_lm_learn.save_encoder(BW_ENC_NAME)
# (bw_lm_learn.path/bw_lm_learn.model_dir).ls()

## Backward CF

In [0]:
# The backward classifier databunch is loaded from the *forward* classifier
# databunch file with backwards=True.
reset_all_nondeterministic_states()
bw_cf_dbnch = load_data(DATA_DIR_P, FW_CF_DBNCH_FILE_S, bs=cf_bs, num_workers=n_dbnch_wrkrs, backwards=True)
# bw_cf_dbnch.show_batch()

In [0]:
# Backward classifier, initialised with the encoder saved under BW_ENC_NAME.
bw_cf_learn = init_cf_learner_with_encoder(bw_cf_dbnch, cf_drop_mult, BW_ENC_NAME)

In [0]:
# Initial one-cycle fit of the backward classifier with the same cf_lr as the
# forward one. The log name carries the batch size but no lr tag.
init_bw_cf_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-init_bw_cf-b{cf_bs}'
init_bw_cf_clbks = [CSVLogger(bw_cf_learn, init_bw_cf_log_p, append=True)]
bw_cf_learn = init_cf_cycles(bw_cf_learn, cf_lr, moms, cf_wd, init_bw_cf_clbks)

In [0]:
# One CSVLogger list per fine-tuning stage (p1..p3), produced lazily and
# consumed stage by stage inside tune_cf_cycles.
tune_bw_cf_clbks_tuple = (
    [CSVLogger(
        bw_cf_learn,
        LOGS_DIR_P / f'{SESSN_START_T}_history-tune_bw_cf-b{cf_bs}-p{period}',
        append=True)]
    for period in range(1,4)
)
bw_cf_learn = tune_cf_cycles(bw_cf_learn, cf_lr, moms, cf_wd, tune_bw_cf_clbks_tuple)

In [0]:
# The checkpoint name embeds cf_lr, so it can only be built once cf_lr is
# known. Without this definition the save below raises NameError.
BW_CF_NAME = f'bw_cf-b{cf_bs}-lr{cf_lr}'
bw_cf_learn.save(BW_CF_NAME)
# (bw_cf_learn.path/bw_cf_learn.model_dir).ls()

# Ensemble

In [0]:
# The forward classifier was destroyed and deleted after its export to free
# memory for the backward models, so `fw_cf_learn` does not exist here.
# Rebuild it on the forward classifier databunch and restore the fine-tuned
# weights. The name must match the one used when the forward classifier was saved.
fw_cf_learn = init_cf_learner_with_encoder(fw_cf_dbnch, cf_drop_mult, FW_ENC_NAME)
fw_cf_learn.load(f'fw_cf-b{cf_bs}-lr{cf_lr}')
# ordered=True is requested so the predictions can be combined with the
# backward model's predictions element by element.
pred_fw, lbl_fw = fw_cf_learn.get_preds(ordered=True)

In [0]:
# Predictions and labels of the backward classifier, requested with ordered=True.
pred_bw, lbl_bw = bw_cf_learn.get_preds(ordered=True)

In [0]:
# Ensemble: element-wise mean of the forward and backward predictions.
avg_pred = (pred_fw + pred_bw) / 2

In [0]:
# Accuracy of the averaged predictions against the forward labels.
# NOTE(review): assumes lbl_bw is in the same order as lbl_fw - confirm.
accuracy(avg_pred, lbl_fw)