<a href="https://colab.research.google.com/github/tianjianjiang/nlp_data_aug/blob/%231-control_random_factors/imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare

An example as baseline: [ULMFit](https://nbviewer.jupyter.org/github/fastai/fastai/blob/master/examples/ULMFit.ipynb) tutorial.

> Fine-tuning a forward and backward langauge model to get to 95.4% accuracy on the IMDB movie reviews dataset. This tutorial is done with fastai v1.0.53.

> The example was run on a Titan RTX (24 GB of RAM) so you will probably need to adjust the batch size accordinly. If you divide it by 2, don't forget to divide the learning rate by 2 as well in the following cells. You can also reduce a little bit the bptt to gain a bit of memory.

In [0]:
# Ensure GPU spec; T4 is for colab and one can change it for another env.
gpu_list = !nvidia-smi -L
if gpu_list[0].startswith('NVIDIA-SMI has failed'):
  print('Runtime type should be GPU.')
elif not gpu_list[0].startswith('GPU 0: Tesla T4'):
  display(gpu_list)
  print('Please reset all runtimes. We need a Tesla T4 to reproduce the experiments!')
else:
  display(gpu_list)

## Dependency

### Install

In [0]:
# Ensure no surprises from conflict packages.
!pip check

In [0]:
!pip install -U fastai==1.0.55 jupyter-console==5.2.0 coveralls coverage datascience albumentations
!pip check

### Import

In [0]:
from pathlib import Path
import random

import numpy as np
import torch
from google.colab import drive

from fastai import basic_train, basic_data, core
from fastai import *
from fastai.text import *
from fastprogress import fastprogress

### Init

In [0]:
# Not set earlier because pip may require a restart.
SESSN_START_T, = !date +%Y%m%dT%H%M

In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
# A special treatment for colab to decrease network traffic.
fastprogress.NO_BAR = True
master_bar, progress_bar = fastprogress.force_console_behavior()
basic_train.master_bar, basic_train.progress_bar = master_bar, progress_bar
basic_data.master_bar, basic_data.progress_bar = master_bar, progress_bar
dataclass.master_bar, dataclass.progress_bar = master_bar, progress_bar
text.master_bar, text.progress_bar = master_bar, progress_bar
text.data.master_bar, text.data.progress_bar = master_bar, progress_bar
core.master_bar, core.progress_bar = master_bar, progress_bar

In [0]:
GD_DIR_S = '/content/gdrive/'
drive.mount(GD_DIR_S, force_remount=True)

In [0]:
BASE_DIR_P = GD_DIR_S / Path('My Drive/imdb/')
BASE_DIR_P.mkdir(parents=True, exist_ok=True)
DATA_DIR_P = BASE_DIR_P / 'data/'
DATA_DIR_P.mkdir(parents=True, exist_ok=True)
LOGS_DIR_P = BASE_DIR_P / 'logs/'
LOGS_DIR_P.mkdir(parents=True, exist_ok=True)

FASTAI_DATA_DIR_P = Path('/root/.fastai/data/')
FASTAI_DATA_DIR_P.mkdir(parents=True, exist_ok=True)
Path('/content/data').symlink_to(FASTAI_DATA_DIR_P)

In [0]:
!set -x; rm -rf /content/sample_data/

# Assign

## Shared Hyperparams

In [0]:
lm_bs = 64
cf_bs = round(lm_bs / 2)
print(f'Our lm_bs: {lm_bs}; cf_bs: {cf_bs}')
bptt = 80  # From the example, but fastai defaults to 70.
moms = (0.8, 0.7)
wd = 0.1  # From the example, except forward classifier uses fastai default 1e-2.

## LM-specific Hyperparams

In [0]:
# Decrease the lr from the example's 2e-2 proportionally to the orig lm bs 256.
ORIG_LM_BS = 256
ORIG_LM_LR = 2e-2
lm_lr = lm_bs / ORIG_LM_BS * ORIG_LM_LR
print(f'In proportion to our lm_bs, our lm_lr : {lm_lr}')

lm_drop_mult = 1.0

## CF-specific Hyperparams

In [0]:
ORIG_CF_BS = round(ORIG_LM_BS / 2)
ORIG_CF_LR = 1e-1
cf_lr = cf_bs / ORIG_CF_BS * ORIG_CF_LR
print(f'In proportion to our cf_bs, our cf_lr: {cf_lr}')

cf_drop_mult = lm_drop_mult / 2

## Args

In [0]:
# Set num_workers to main process since the training set will be shuffled.
n_dbnch_wrkrs = 0

In [0]:
FW_LM_DBNCH_FILE_S = f'fw_lm_dbnch-b{lm_bs}.pkl'
BW_LM_DBNCH_FILE_S = f'bw_lm_dbnch-b{lm_bs}.pkl'

FW_ENC_NAME = f'fw_enc-b{lm_bs}'
BW_ENC_NAME = f'bw_enc-b{lm_bs}'

FW_CF_DBNCH_FILE_S = f'fw_cf_dbnch-b{cf_bs}.pkl'
BW_CF_DBNCH_FILE_S = f'bw_cf_dbnch-b{cf_bs}.pkl'

FW_CF_NAME = f'fw_cf-b{cf_bs}'
BW_CF_NAME = f'bw_cf-b{cf_bs}'

# Define

## Random State Fixer

In [0]:
# Set a constant seed for every random number generator.
SEED = 42

def reset_all_nondeterministic_states(seed=SEED):
  random.seed(SEED)
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
  torch.backends.cudnn.deterministic = True  # About 15% slower but...
  torch.backends.cudnn.benchmark = False

## LM-specific Helpers

In [0]:
def build_lm_databunch(data_dir_p, n_workers, bs, bptt):
  reset_all_nondeterministic_states()
  return (TextList.from_folder(data_dir_p)
          #Inputs: all the text files in path
          .filter_by_folder(include=['train', 'test', 'unsup'])
          #We may have other temp folders that contain text files so we only keep what's in train and test
          .split_by_rand_pct(
              0.1,
              seed=SEED  # Set the seed again since in theory one can call np.random before this.
          )
          #We randomly split and keep 10% (10,000 reviews) for validation
          .label_for_lm()
          #We want to do a language model so we label accordingly
          .databunch(bs=bs, bptt=bptt, num_workers=n_workers))

In [0]:
def init_lm_learner_with_ulmfit(dbnch, drop_mult, base_path=BASE_DIR_P):
  lm_learn = language_model_learner(dbnch, AWD_LSTM, drop_mult=drop_mult, path=base_path)
  lm_learn = lm_learn.to_fp16(clip=0.1)  # 2x faster
  return lm_learn

In [0]:
def init_lm_cycles(learner, lr, moms, wd, clbks=[], n_cycles=1):
  print(f'init lm lr: {lr}')
  reset_all_nondeterministic_states()
  learner.fit_one_cycle(n_cycles, lr, moms=moms, wd=wd, callbacks=clbks)
  return learner

In [0]:
def tune_lm_cycles(learner, lr, moms, wd, clbks=[], n_cycles=10):
  print(f'tune lm lr: {lr}')
  reset_all_nondeterministic_states()
  learner.unfreeze()
  learner.fit_one_cycle(n_cycles, lr, moms=moms, wd=wd, callbacks=clbks)
  return learner

## CF-specific Helpers

In [0]:
def build_cf_databunch(data_dir_p, n_workers, bs, vocab):
  return (TextList.from_folder(data_dir_p, vocab=vocab)
          #grab all the text files in path
          .split_by_folder(valid='test')
          #split by train and valid folder (that only keeps 'train' and 'test' so no need to filter)
          .label_from_folder(classes=['neg', 'pos'])
          #label them all with their folders
          .databunch(bs=bs, num_workers=n_workers))

In [0]:
def init_cf_learner_with_encoder(dbnch, drop_mult, enc_name, base_path=BASE_DIR_P):
  cf_learn = text_classifier_learner(dbnch, AWD_LSTM, drop_mult=drop_mult, path=base_path, pretrained=False)
  cf_learn.load_encoder(enc_name)
  return cf_learn

In [0]:
def init_cf_cycles(learner, lr, moms, wd, clbks, n_cycles=1):
  print(f'init cf lr: {lr}')
  reset_all_nondeterministic_states()
  learner.fit_one_cycle(n_cycles, lr, moms=moms, wd=wd, callbacks=clbks)
  return learner

In [0]:
def tune_cf_cycles(
    learner,
    lr,
    moms,
    wd,
    clbks_tuple,
    n_cycles_tuple=(1,1,2),
    freeze_steps=(-2,-3,None),
    lr_decays=(2,2,5)
):
  reset_all_nondeterministic_states()
  for n_cycles, freeze_step, lr_decay, clbks in zip(
      n_cycles_tuple, freeze_steps, lr_decays, clbks_tuple):
    if freeze_step is not None:
      learner.freeze_to(freeze_step)
    else:
      learner.unfreeze()
    lr /= lr_decay
    print(f'tune cf lr: {lr}')
    learner.fit_one_cycle(n_cycles, slice(lr/(2.6**4),lr), moms=moms, wd=wd, callbacks=clbks)
  return learner

# Fit

## Forward LM

#### Process Data Once

In [0]:
# Untar into colab disk so no latency to GDrive.
colab_dir_p = untar_data(URLs.IMDB, dest=FASTAI_DATA_DIR_P)
colab_dir_p.ls()

In [0]:
lm_dbnch = build_lm_databunch(colab_dir_p, n_dbnch_wrkrs, lm_bs, bptt)
# lm_dbnch.show_batch()

#### Use Persistent Path

In [0]:
# Save and load the databunch using a non-voatile path (e.g.: GDrive).
lm_dbnch.save(DATA_DIR_P / FW_LM_DBNCH_FILE_S)
fw_lm_dbnch = load_data(DATA_DIR_P, FW_LM_DBNCH_FILE_S, bs=lm_bs, bptt=bptt, num_workers=n_dbnch_wrkrs)
# fw_lm_dbnch.path.ls()

In [0]:
# The batch should look the same if the above efforts keep the reproducibility.
# fw_lm_dbnch.show_batch()

#### Init-fit

In [0]:
fw_lm_learn = init_lm_learner_with_ulmfit(fw_lm_dbnch, lm_drop_mult)

In [0]:
# Not sure why partial didn't work, so initialize the logger here.
init_fw_lm_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-init_fw_lm-b{lm_bs}'  # w/o .csv
init_fw_lm_clbks = [callbacks.CSVLogger(fw_lm_learn, init_fw_lm_log_p, append=True)]

In [0]:
fw_lm_learn = init_lm_cycles(fw_lm_learn, lm_lr, moms, wd, init_fw_lm_clbks)
# fw_lm_learn.csv_logger.read_logged_file()
fw_lm_learn.save(f'init_fw_lm-b{lm_bs}')

init lm lr: 0.005
epoch     train_loss  valid_loss  accuracy  time    
0         4.380906    4.081290    0.290308  17:22     


In [0]:
# (fw_lm_learn.path/fw_lm_learn.model_dir).ls()

#### Fine-tune

In [0]:
tune_fw_lm_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-tune_fw_lm-b{lm_bs}'
tune_fw_lm_clbks = [callbacks.CSVLogger(fw_lm_learn, tune_fw_lm_log_p, append=True)]
fw_lm_learn = tune_lm_cycles(fw_lm_learn, lm_lr/10, moms, wd, tune_fw_lm_clbks)

tune lm lr: 0.0005
epoch     train_loss  valid_loss  accuracy  time    
0         4.156863    3.955999    0.304184  20:33     
1         4.055799    3.887233    0.313336  20:34     
2         4.009748    3.843621    0.318697  20:32     
3         3.974771    3.816171    0.322405  20:33     
4         3.945872    3.796026    0.324830  20:31     
5         3.914127    3.773188    0.327346  20:34     
6         3.889194    3.756511    0.329368  20:34     
7         3.863370    3.741999    0.331158  20:33     
8         3.855726    3.733190    0.332307  20:34     
9         3.844754    3.730709    0.332586  20:35     


In [0]:
fw_lm_learn.save(f'tuned_fw_lm-b{lm_bs}')
fw_lm_learn.save_encoder(FW_ENC_NAME)
# (fw_lm_learn.path/fw_learn_lm.model_dir).ls()

## Forward CF

In [0]:
# fw_lm_dbnch = load_data(DATA_DIR_P, FW_LM_DBNCH_FILE_S, bs=lm_bs, bptt=bptt, num_workers=n_dbnch_wrkrs)
cf_dbnch = build_cf_databunch(colab_dir_p, n_dbnch_wrkrs, cf_bs, fw_lm_dbnch.vocab)
cf_dbnch.save(DATA_DIR_P / FW_CF_DBNCH_FILE_S)

In [0]:
fw_cf_dbnch = load_data(DATA_DIR_P, FW_CF_DBNCH_FILE_S, bs=cf_bs, num_workers=n_dbnch_wrkrs)
# fw_cf_dbnch.show_batch()

In [0]:
fw_cf_learn = init_cf_learner_with_encoder(fw_cf_dbnch, cf_drop_mult, FW_ENC_NAME)

In [0]:
init_fw_cf_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-init_fw_cf-b{cf_bs}'
init_fw_cf_clbks = [callbacks.CSVLogger(fw_cf_learn, init_fw_cf_log_p, append=True)]
fw_cf_learn = init_cf_cycles(fw_cf_learn, cf_lr, moms, wd, init_fw_cf_clbks)

init cf lr: 0.037500000000000006
epoch     train_loss  valid_loss  accuracy  time    
0         0.224170    0.185326    0.930120  05:03     


In [0]:
tune_fw_cf_clbks_tuple = (
    [callbacks.CSVLogger(
        fw_cf_learn,
        LOGS_DIR_P / f'{SESSN_START_T}_history-tune_fw_cf-b{cf_bs}-p{period}',
        append=True)]
    for period in range(1,4)
)
fw_cf_learn = tune_cf_cycles(fw_cf_learn, cf_lr, moms, wd, tune_fw_cf_clbks_tuple)
fw_cf_learn.save(FW_CF_NAME)

tune cf lr: 0.018750000000000003
epoch     train_loss  valid_loss  accuracy  time    
0         0.208068    0.156441    0.941680  06:26     
tune cf lr: 0.009375000000000001
epoch     train_loss  valid_loss  accuracy  time    
0         0.190641    0.147478    0.945200  08:04     
tune cf lr: 0.0018750000000000004
epoch     train_loss  valid_loss  accuracy  time    
0         0.153830    0.144531    0.946640  10:40     
1         0.091982    0.145425    0.947800  09:58     


In [0]:
# (fw_cf_learn.path/fw_cf_learn.model_dir).ls()

## Backward LM

In [0]:
bw_lm_dbnch = load_data(DATA_DIR_P, FW_LM_DBNCH_FILE_S, bs=lm_bs, bptt=bptt, num_workers=n_dbnch_wrkrs, backwards=True)
# bw_lm_dbnch.show_batch()

In [0]:
bw_lm_learn = init_lm_learner_with_ulmfit(bw_lm_dbnch, lm_drop_mult)

In [0]:
init_bw_lm_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-init_bw_lm'
init_bw_lm_clbks = [callbacks.CSVLogger(bw_lm_learn, init_bw_lm_log_p, append=True)]
bw_lm_learn = init_lm_cycles(bw_lm_learn, lm_lr, moms, wd, init_bw_lm_clbks)
bw_lm_learn.save('init_bw_lm')
# (bw_lm_learn.path/bw_lm_learn.model_dir).ls()

In [0]:
tune_bw_lm_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-tune_bw_lm'
tune_bw_lm_clbks = [callbacks.CSVLogger(bw_lm_learn, tune_bw_lm_log_p, append=True)]
bw_lm_learn = tune_lm_cycles(bw_lm_learn, lm_lr/10, moms, wd, tune_bw_lm_clbks)
bw_lm_learn.save('tuned_bw_lm')
bw_lm_learn.save_encoder(BW_ENC_NAME)
# (bw_lm_learn.path/bw_lm_learn.model_dir).ls()

## Backward CF

In [0]:
bw_cf_dbnch = load_data(DATA_DIR_P, FW_CF_DBNCH_FILE_S, bs=cf_bs, num_workers=n_dbnch_wrkrs, backwards=True)
# bw_cf_dbnch.show_batch()

In [0]:
bw_cf_learn = init_cf_learner_with_encoder(fw_cf_dbnch, cf_drop_mult, FW_ENC_NAME)

In [0]:
init_bw_cf_log_p = LOGS_DIR_P / f'{SESSN_START_T}_history-init_bw_cf'
init_bw_cf_clbks = [callbacks.CSVLogger(bw_cf_learn, init_bw_cf_log_p, append=True)]
bw_cf_learn = init_cf_cycles(bw_cf_learn, cf_lr, moms, wd, init_bw_cf_clbks)

init cf lr: 0.037500000000000006
epoch     train_loss  valid_loss  accuracy  time    
0         0.237566    0.184433    0.929280  05:15     


In [0]:
tune_bw_cf_clbks_tuple = (
    [callbacks.CSVLogger(
        bw_cf_learn,
        LOGS_DIR_P / f'{SESSN_START_T}_history-tune_bw_cf-p{period}',
        append=True)]
    for period in range(1,4)
)
bw_cf_learn = tune_cf_cycles(bw_cf_learn, cf_lr, moms, wd, tune_bw_cf_clbks_tuple)

tune cf lr: 0.018750000000000003
epoch     train_loss  valid_loss  accuracy  time    
0         0.200242    0.155295    0.942120  06:33     
tune cf lr: 0.009375000000000001
epoch     train_loss  valid_loss  accuracy  time    
0         0.193940    0.151276    0.944640  07:51     
tune cf lr: 0.0018750000000000004
epoch     train_loss  valid_loss  accuracy  time    
0         0.165703    0.142958    0.948120  11:49     
1         0.095749    0.145562    0.949080  11:04     


In [0]:
bw_cf_learn.save(BW_CF_NAME)
# (bw_cf_learn.path/bw_cf_learn.model_dir).ls()

# Ensemble

In [0]:
pred_fw, lbl_fw = fw_cf_learn.get_preds(ordered=True)

In [0]:
pred_bw, lbl_bw = bw_cf_learn.get_preds(ordered=True)

In [0]:
avg_pred = (pred_fw + pred_bw) / 2

In [0]:
accuracy(avg_pred, lbl_fw)