<a href="https://github.com/tianjianjiang/nlp_data_aug/blob/%232-double_redaction/DataAugmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare

In [0]:
# Ensure GPU spec; T4 is for colab and one can change it for another env.
gpu_list = !nvidia-smi -L
if gpu_list[0].startswith('NVIDIA-SMI has failed'):
  print('Runtime type should be GPU.')
elif not gpu_list[0].startswith('GPU 0: Tesla T4'):
  display(gpu_list)
  print('Please reset all runtimes. We need a Tesla T4 to reproduce!')
else:
  display(gpu_list)

In [0]:
%%capture pip_logs
# Install dependencies; the installer output is captured into `pip_logs`
# instead of being printed. Only fastai and jupyter-console are pinned; the
# other packages are upgraded to their latest release (-U), which can affect
# reproducibility.
!pip install -U fastai==1.0.55 ipyexperiments jupyter-console==5.2.0 coveralls coverage datascience albumentations

In [0]:
import gc
import math
from pathlib import Path
import random

import numpy as np
import torch
from google.colab import drive

from fastai import basic_data, basic_train, core
from fastai import *
from fastai.callbacks import CSVLogger
from fastai.core import plt
from fastai.text import *
from fastprogress import fastprogress

from ipyexperiments import *

In [0]:
# Session timestamp (date format %Y%m%dT%H%M) used as a prefix for the log
# file names created further down.
# Not set earlier because pip may require a restart.
# The trailing comma unpacks the single output line of the shell command.
SESSN_START_T, = !date +%Y%m%dT%H%M

In [0]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [0]:
# Colab workaround to decrease network traffic: switch fastprogress to its
# console behaviour and disable the bar itself.
fastprogress.NO_BAR = True
master_bar, progress_bar = fastprogress.force_console_behavior()
# Each of these objects holds its own reference to the bar classes, so the
# console versions have to be pushed into every one of them.
for _patched in (basic_train, basic_data, dataclass, text, text.data, core):
  _patched.master_bar = master_bar
  _patched.progress_bar = progress_bar
del _patched

In [0]:
# Mount Google Drive at /content/gdrive. `force_remount=True` is passed so the
# call also goes through when the cell is re-run on an already mounted drive.
COLAB_CONTENT_DIR_P = Path('/content')
GD_DIR_P = COLAB_CONTENT_DIR_P / 'gdrive'
drive.mount(str(GD_DIR_P), force_remount=True)

In [0]:
BASE_DIR_P = GD_DIR_P / 'My Drive/imdb'
BASE_DIR_P.mkdir(parents=True, exist_ok=True)
DATA_DIR_P = BASE_DIR_P / 'data'
DATA_DIR_P.mkdir(parents=True, exist_ok=True)
MDLS_DIR_P = BASE_DIR_P / 'models'
MDLS_DIR_P.mkdir(parents=True, exist_ok=True)
LOGS_DIR_P = BASE_DIR_P / 'logs'
LOGS_DIR_P.mkdir(parents=True, exist_ok=True)

FASTAI_DATA_DIR_P = Path('/root/.fastai/data')
FASTAI_DATA_DIR_P.mkdir(parents=True, exist_ok=True)

COLAB_DATA_DIR_P = COLAB_CONTENT_DIR_P / 'data'
if not COLAB_DATA_DIR_P.is_symlink():
  COLAB_DATA_DIR_P.symlink_to(FASTAI_DATA_DIR_P)
if (COLAB_CONTENT_DIR_P / 'sample_data').exists():
  !set -x; rm -rf /content/sample_data/

## Download Data Once

In [0]:
# Fetch the IMDB dataset into FASTAI_DATA_DIR_P and keep the returned path.
# NOTE(review): given the section title, untar_data presumably skips the
# download when the data is already present — confirm.
# IMDB_DATA_IN_COLAB_DIR_P = COLAB_DATA_DIR_P / 'imdb'
IMDB_DATA_IN_COLAB_DIR_P = untar_data(URLs.IMDB, dest=FASTAI_DATA_DIR_P)

# Define

In [0]:
# Passed as `num_workers` when building or loading databunches.
n_dbnch_wrkrs = 0

# LM_* / BPTT describe the language-model stage; in this notebook they only
# appear in artifact names (and in commented-out code).
# cf_* are the classifier settings used by the fitting helpers below.
BPTT = 80
LM_BS = 128
cf_bs = 64  # classifier batch size
LM_LR = 0.0251
cf_lr = 5.18e-2  # base learning rate for the classifier cycles
cf_wd = 0.1  # passed as `wd` to fit_one_cycle
cf_drop_mult = 0.5  # passed as `drop_mult` to text_classifier_learner
moms = (0.8, 0.7)  # passed as `moms` to fit_one_cycle

# Pickled vocabulary that is loaded before building the classifier databunch.
VOCAB_FILE_P = DATA_DIR_P / 'imdb_vocab.pkl'

# File names of the saved language-model / classifier databunches.
FW_LM_DBNCH_FILE_S = f'fw_lm_dbnch-b{LM_BS}.pkl'
FW_CF_DBNCH_FILE_S = f'fw_cf_dbnch-b{cf_bs}.pkl'

# Name of the saved encoder that the classifier learners load.
FW_ENC_NAME = f'fw_enc-b{LM_BS}-lr{LM_LR}'

# Class labels; these are also the sub-folder names under train/.
IMDb_CLASSES = ['neg', 'pos']

In [0]:
# Set a constant seed for every random number generator.
SEED = 42


def reset_random_states(seed=SEED):
  """Re-seed every random number generator used by the notebook.

  Seeds Python's `random`, NumPy and PyTorch (plus all CUDA devices when CUDA
  is available) and switches cuDNN to deterministic, non-benchmarking mode.

  Args:
    seed: Seed applied to all generators. Defaults to the notebook-wide SEED.
  """
  # Use the argument, not the global constant, so a caller-supplied seed is
  # actually honoured.
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True  # About 15% slower but...
  torch.backends.cudnn.benchmark = False


def build_cf_databunch(
    data_dir_p,
    bs,
    vocab,
    ag_data_dir_p_lst=[],
    tags=IMDb_CLASSES,
    n_workers=n_dbnch_wrkrs,
):
  """Build a classifier databunch, optionally extending the training set.

  The texts under `data_dir_p` are split by folder with `test` as the
  validation folder. Every directory in `ag_data_dir_p_lst` is read as an
  unsplit text list and its items are added to the training list before
  labelling.

  Args:
    data_dir_p: Root folder of the base dataset.
    bs: Batch size of the resulting databunch.
    vocab: Vocabulary used for both the base and the additional texts.
    ag_data_dir_p_lst: Folders with additional training texts (never mutated).
    tags: Class names used when labelling from folder names.
    n_workers: Passed as `num_workers` to the databunch.

  Returns:
    The databunch created from the labelled lists.
  """
  reset_random_states()
  split_lists = TextList.from_folder(data_dir_p, vocab=vocab).split_by_folder(valid='test')
  for extra_dir_p in ag_data_dir_p_lst:
    extra_train = TextList.from_folder(extra_dir_p, vocab=vocab).split_none().train
    split_lists.train.add(extra_train)
  labelled_lists = split_lists.label_from_folder(classes=tags)
  return labelled_lists.databunch(bs=bs, num_workers=n_workers)


def init_cf_learner_with_encoder(dbnch, drop_mult, enc_name, base_path=BASE_DIR_P):
  """Create an AWD-LSTM text classifier and load a saved encoder into it.

  Args:
    dbnch: Databunch the learner is built on.
    drop_mult: Passed as `drop_mult` to the learner factory.
    enc_name: Name of the saved encoder to load.
    base_path: Passed as the learner's `path`.

  Returns:
    The learner, created with `pretrained=False` and the encoder loaded.
  """
  reset_random_states()
  learner = text_classifier_learner(
      dbnch,
      AWD_LSTM,
      drop_mult=drop_mult,
      path=base_path,
      pretrained=False,
  )
  learner.load_encoder(enc_name)
  return learner


def init_cf_cycles(learner, lr, moms, wd, clbks, n_cycles=1):
  """Run the initial one-cycle fit of the classifier.

  Prints the learning rate, re-seeds the random number generators and calls
  `fit_one_cycle` once on the learner as it was handed in.

  Args:
    learner: Learner to train.
    lr: Learning rate passed to `fit_one_cycle`.
    moms: Momentum setting passed through as `moms`.
    wd: Weight decay passed through as `wd`.
    clbks: Callbacks passed through as `callbacks`.
    n_cycles: First positional argument of `fit_one_cycle`.

  Returns:
    The same learner, after fitting.
  """
  print(f'init cf lr: {lr}')
  reset_random_states()
  fit_options = dict(moms=moms, wd=wd, callbacks=clbks)
  learner.fit_one_cycle(n_cycles, lr, **fit_options)
  return learner


def tune_cf_cycles(
    learner,
    lr,
    moms,
    wd,
    clbks_tuple,
    n_cycles_tuple=(1,1,2),
    freeze_steps=(-2,-3,None),
    lr_decays=(2,2,5)
):
  """Fine-tune the classifier through a series of unfreezing stages.

  The four per-stage sequences are zipped, so the number of stages is the
  length of the shortest one. For each stage the learner is frozen up to
  `freeze_step` (or fully unfrozen when it is None), the running learning
  rate is divided by the stage's decay, and one `fit_one_cycle` call is made
  with the range `slice(lr/(2.6**4), lr)`.

  Args:
    learner: Learner to fine-tune.
    lr: Starting learning rate; the decays accumulate across stages.
    moms: Momentum setting passed through as `moms`.
    wd: Weight decay passed through as `wd`.
    clbks_tuple: One callback list per stage (any iterable works).
    n_cycles_tuple: First positional argument of `fit_one_cycle` per stage.
    freeze_steps: `freeze_to` argument per stage; None means unfreeze.
    lr_decays: Divisor applied to the running learning rate per stage.

  Returns:
    The same learner, after all stages.
  """
  reset_random_states()
  stages = zip(n_cycles_tuple, freeze_steps, lr_decays, clbks_tuple)
  for stage_cycles, stage_freeze, stage_decay, stage_clbks in stages:
    if stage_freeze is None:
      learner.unfreeze()
    else:
      learner.freeze_to(stage_freeze)
    lr /= stage_decay
    print(f'tune cf lr: {lr}')
    lr_range = slice(lr/(2.6**4),lr)
    learner.fit_one_cycle(stage_cycles, lr_range, moms=moms, wd=wd, callbacks=stage_clbks)
  return learner

In [0]:
def double_redact(txt_p, txt_id, chance, out_dir_p, noise='xxunk', seed=SEED):
    """Write redacted copies of one text file, each masking a disjoint token set.

    The text is split on whitespace and `ceil(n_tokens * chance)` token
    positions are drawn without replacement from the positions not used by an
    earlier copy, until fewer than that many positions remain. One output file
    is written per draw, with the drawn tokens replaced by `noise` and all
    tokens joined by single spaces.

    Args:
        txt_p: Path of the source text file (needs `.stem` and `.suffix`).
        txt_id: Unused; kept so the function can be called as
            `func(item, index)`.
        chance: Fraction of tokens masked in each copy. When it yields zero
            tokens to mask (empty file or `chance == 0`), or more tokens than
            the file has, nothing is written.
        out_dir_p: Existing directory that receives the output files, named
            `<stem>_dr-c<chance>-v<version><suffix>`.
        noise: Replacement token.
        seed: Seed applied to the global `random` module on every call, so
            the result depends only on the file content and the seed.

    Raises:
        ValueError: If `chance` is negative.
    """
    if chance < 0:
        raise ValueError(f'chance must be non-negative, got {chance}')
    random.seed(seed)
#     pick_some = False
#     picks = 1 if pick_some else None
    # Read the source fully and close it before any output is written.
    with open(txt_p, 'r', encoding='utf-8') as txt_f:
        tkns = txt_f.read().split()
    remaining_idx_set = set(range(len(tkns)))
    n_sample = math.ceil(len(tkns) * chance)
    chosen_idx_set_list = []
#     picked = 0
    # `0 < n_sample` is essential: with nothing to sample, the remaining set
    # never shrinks and the loop would otherwise run forever.
    while 0 < n_sample <= len(remaining_idx_set):
        # Sorting makes the draw independent of set iteration order.
        chosen_idx_set = set(random.sample(sorted(remaining_idx_set), k=n_sample))
        chosen_idx_set_list.append(chosen_idx_set)
        remaining_idx_set -= chosen_idx_set
#         picked += 1
#         if pick_some and picked == picks:
#             break
    for ver, idx_set in enumerate(chosen_idx_set_list):
        dr_tkns = tkns.copy()
        for chosen_idx in idx_set:
            dr_tkns[chosen_idx] = noise
        dr_fname = f'{txt_p.stem}_dr-c{chance}-v{ver}{txt_p.suffix}'
        with open(out_dir_p / dr_fname, 'w', encoding='utf-8') as dr_f:
            dr_f.write(' '.join(dr_tkns))

In [0]:
# Single-threaded although the func is compatible with fastai's `parallel()`
def double_redact_trn_txts(
    chance,
    parent_dir_p=IMDB_DATA_IN_COLAB_DIR_P,
    tags=IMDb_CLASSES,
):
  """Write redacted copies of every training text into a sibling dataset folder.

  For each tag, all `*.txt` files under `<parent_dir_p>/train/<tag>` are
  passed to `double_redact`, which writes into the matching
  `train/<tag>` folder of `<parent_dir_p.stem>-dr<chance>` next to
  `parent_dir_p`.

  Args:
    chance: Redaction fraction forwarded to `double_redact`.
    parent_dir_p: Root of the source dataset.
    tags: Class sub-folder names to process.

  Returns:
    Path of the root folder that received the redacted files.
  """
  out_root_p = parent_dir_p.parent / f'{parent_dir_p.stem}-dr{chance}'
  for tag in tags:
    tag_subdir = f'train/{tag}'
    out_dir_p = out_root_p / tag_subdir
    out_dir_p.mkdir(parents=True, exist_ok=True)
    for txt_idx, txt_p in enumerate((parent_dir_p / tag_subdir).glob('*.txt')):
      double_redact(txt_p, txt_idx, chance, out_dir_p)
  return out_root_p

# Fit

## Double-redacted Data in All Cycles

A.k.a. Two Act Structure

If Act-1 is the setup of the language model, this is Act-2: confronting perturbations.

In [0]:
# Fraction of tokens masked in each redacted copy; it is also embedded in the
# names of the artifacts derived from the augmented data.
redact_chance = 0.25
fw_cf_dbnch_dr_fname = f'fw_cf_dbnch-dr{redact_chance}-b{cf_bs}.pkl'

### Augment

In [0]:
# Collect the original training texts (file stems ending in a digit) and
# preview the first three in sorted order.
orig_trn_txts = list(IMDB_DATA_IN_COLAB_DIR_P.glob(f'./train/*/*[0-9].txt'))
sorted(orig_trn_txts)[:3]

In [0]:
# Write the redacted copies of the training set and remember their root folder.
dr_trn_dir_p = double_redact_trn_txts(redact_chance)

In [0]:
# List every redacted file (recursively) and preview the first twelve.
dr_trn_txts = list(dr_trn_dir_p.glob('**/*.txt'))
sorted(dr_trn_txts)[:12]

In [0]:
# Report how many original files and how many redacted copies were found.
n_orig_trn = len(orig_trn_txts)
n_dr_trn = len(dr_trn_txts)
print(f'{n_orig_trn} original\n{n_dr_trn} double-redacted')

In [0]:
# reset_random_states()
# fw_lm_dbnch = load_data(DATA_DIR_P, FW_LM_DBNCH_FILE_S, bs=LM_BS, bptt=BPTT, num_workers=n_dbnch_wrkrs)
# fw_lm_dbnch.vocab.save(VOCAB_FILE_P)
# del fw_lm_dbnch; gc.collect()

In [0]:
# Load the saved vocabulary, build the classifier databunch from the original
# training texts plus the redacted copies, and save it under DATA_DIR_P.
# NOTE(review): VOCAB_FILE_P is only written by the commented-out cell above,
# so the file has to exist already.
imdb_vocab = Vocab.load(VOCAB_FILE_P)
fw_cf_dbnch_dr = build_cf_databunch(IMDB_DATA_IN_COLAB_DIR_P, cf_bs, imdb_vocab, [dr_trn_dir_p])
fw_cf_dbnch_dr.save(DATA_DIR_P / fw_cf_dbnch_dr_fname)

In [0]:
# reset_random_states()
# fw_cf_dbnch_dr = load_data(DATA_DIR_P, fw_cf_dbnch_dr_fname, bs=cf_bs, num_workers=n_dbnch_wrkrs)

In [0]:
# Create the classifier on the augmented databunch and load the encoder saved
# as FW_ENC_NAME.
fw_cf_dr_learn = init_cf_learner_with_encoder(fw_cf_dbnch_dr, cf_drop_mult, FW_ENC_NAME)

### Init-fit

In [0]:
# Initial fit on the augmented data (init_cf_cycles runs one cycle by default).
# Metrics are appended to a CSV log whose name carries the session timestamp
# and the hyperparameters.
init_fw_cf_dr_log_p = LOGS_DIR_P / f'{SESSN_START_T}-init_fw_cf-a2-s1-dr{redact_chance}-b{cf_bs}-lr{cf_lr}'
init_fw_cf_dr_clbks = [CSVLogger(fw_cf_dr_learn, init_fw_cf_dr_log_p, append=True)]
fw_cf_dr_learn = init_cf_cycles(fw_cf_dr_learn, cf_lr, moms, cf_wd, init_fw_cf_dr_clbks)
# Checkpoint the result as Act-2, scene 1.
FW_CF_CONFRONTATION_SCENE1_NAME = f'fw_cf-a2-s1-dr{redact_chance}-b{cf_bs}-lr{cf_lr}'
fw_cf_dr_learn.save(FW_CF_CONFRONTATION_SCENE1_NAME)

init cf lr: 0.0518
epoch     train_loss  valid_loss  accuracy  time    
0         0.309854    0.179218    0.933240  08:54     


init cf lr: 0.0518
epoch     train_loss  valid_loss  accuracy  time    
0         0.268904    0.173237    0.935800  13:02     


init cf lr: 0.0518
epoch     train_loss  valid_loss  accuracy  time    
0         0.272350    0.172224    0.935720  13:56     


init cf lr: 0.0518
epoch     train_loss  valid_loss  accuracy  time    
0         0.278960    0.173263    0.933920  11:36     



#### Comparing with baselines of init. cycle
----
    Mike
    0         0.222152    0.176532    0.934160  04:57

----
    fastai example
    0         0.246949    0.180387    0.931840  01:14

### Fine-tune

In [0]:
# Fine-tuning stage 1: a single cycle with freeze_to(-2) at cf_lr / 2, logged
# as period 1.
a2_tune_fw_cf_dr_clbks_tuple = (
    [CSVLogger(
        fw_cf_dr_learn,
        LOGS_DIR_P / f'{SESSN_START_T}-tune_fw_cf-a2-s2-dr{redact_chance}-b{cf_bs}-p1',
        append=True)],
)
fw_cf_dr_learn = tune_cf_cycles(
    fw_cf_dr_learn,
    cf_lr,
    moms,
    cf_wd,
    a2_tune_fw_cf_dr_clbks_tuple,
    n_cycles_tuple=(1,),
    freeze_steps=(-2,),
    lr_decays=(2,)
)
# Checkpoint after stage 1 as Act-2, scene 2; the Act-3 section reloads it.
FW_CF_CONFRONTATION_SCENE2_NAME = f'fw_cf-a2-s2-dr{redact_chance}-b{cf_bs}-lr{cf_lr}'
fw_cf_dr_learn.save(FW_CF_CONFRONTATION_SCENE2_NAME)

# Fine-tuning stages 2 and 3, one logger per period (2 and 3). Despite its
# name this is a generator expression, consumed once by zip() inside
# tune_cf_cycles.
a2_tune_fw_cf_dr_clbks_tuple = (
    [CSVLogger(
        fw_cf_dr_learn,
        LOGS_DIR_P / f'{SESSN_START_T}-tune_fw_cf-a2-s3-dr{redact_chance}-b{cf_bs}-p{period}',
        append=True)]
    for period in range(2,4)
)
# tune_cf_cycles starts from cf_lr again, so the first decay is 2*2 to
# continue from the cf_lr / 2 used in stage 1.
fw_cf_dr_learn = tune_cf_cycles(
    fw_cf_dr_learn,
    cf_lr,
    moms,
    cf_wd,
    a2_tune_fw_cf_dr_clbks_tuple,
    n_cycles_tuple=(1,2),
    freeze_steps=(-3,None),
    lr_decays=(2*2,5)
)

tune cf lr: 0.0259
epoch     train_loss  valid_loss  accuracy  time    
0         0.253780    0.143132    0.947360  10:32     
tune cf lr: 0.01295
epoch     train_loss  valid_loss  accuracy  time    
0         0.213023    0.137004    0.950280  16:18     
tune cf lr: 0.00259
epoch     train_loss  valid_loss  accuracy  time    
0         0.150868    0.152152    0.945960  21:36     
1         0.102201    0.154852    0.947160  22:07     


tune cf lr: 0.0259
epoch     train_loss  valid_loss  accuracy  time    
0         0.199623    0.135700    0.950200  15:33     
tune cf lr: 0.01295
epoch     train_loss  valid_loss  accuracy  time    
0         0.148599    0.133771    0.950960  24:57     
tune cf lr: 0.00259
epoch     train_loss  valid_loss  accuracy  time    
0         0.090739    0.172821    0.947680  33:40     
1         0.045258    0.182786    0.950920  32:02     


tune cf lr: 0.0259
epoch     train_loss  valid_loss  accuracy  time    
0         0.189328    0.143150    0.947920  15:41     
tune cf lr: 0.01295
epoch     train_loss  valid_loss  accuracy  time    
0         0.115525    0.145131    0.947680  26:20     
tune cf lr: 0.00259
epoch     train_loss  valid_loss  accuracy  time    
0         0.072761    0.171645    0.947720  32:42     
1         0.029662    0.204256    0.947640  31:28     


tune cf lr: 0.0259
epoch     train_loss  valid_loss  accuracy  time    
0         0.206510    0.137578    0.949840  13:55     
tune cf lr: 0.01295
epoch     train_loss  valid_loss  accuracy  time    
0         0.170256    0.135338    0.950080  18:30     
tune cf lr: 0.00259
epoch     train_loss  valid_loss  accuracy  time    
0         0.118340    0.146070    0.949160  28:36     
1         0.061189    0.177552    0.949440  26:11     


#### Comparing with baselines of fine-tuning cycles
----
    Mike
    0         0.190170    0.153062    0.944160  05:36
    0         0.180019    0.138620    0.949240  08:08
    0         0.130078    0.143215    0.945520  09:48
    1         0.083496    0.142682    0.949720  09:03

----
    fastai example
    0         0.206164    0.152391    0.945360  01:28
    0         0.181309    0.141463    0.948080  02:30
    0         0.123944    0.145212    0.948840  03:18
    1         0.072845    0.155692    0.949560  03:00


In [0]:
# fw_cf_dr_learn.export(MDLS_DIR_P / f'export-fw_cf-dr{redact_chance}-b{cf_bs}-lr{cf_lr}')

In [0]:
# Release the learner and its databunch, then reclaim host and GPU memory.
fw_cf_dr_learn.destroy()
del fw_cf_dr_learn, fw_cf_dbnch_dr
gc.collect()
torch.cuda.empty_cache()

# Fit Double-redacted Data in Middle Cycles (3-Act Structure)

Act-1, a.k.a. the setup, is the language-model (transfer) learning.

## Act-2: Confrontation on Double-redacted Data

In [0]:
# reset_all_nondeterministic_states()
# fw_cf_dbnch_dr = load_data(DATA_DIR_P, fw_cf_dbnch_dr_fname, bs=cf_bs, num_workers=n_dbnch_wrkrs)

In [0]:
# a2_fw_cf_dr_learn = init_cf_learner_with_encoder(fw_cf_dbnch_dr, cf_drop_mult, FW_ENC_NAME)
# a2_fw_cf_dr_log_p = LOGS_DIR_P / f'{SESSN_START_T}-init_fw_cf-a2-dr{redact_chance}-b{cf_bs}-lr{cf_lr}'
# a2_fw_cf_dr_clbks = [CSVLogger(a2_fw_cf_dr_learn, a2_fw_cf_dr_log_p, append=True)]
# a2_fw_cf_dr_learn = init_cf_cycles(a2_fw_cf_dr_learn, cf_lr, moms, cf_wd, a2_fw_cf_dr_clbks)

In [0]:
# FW_CF_CONFRONTATION_SCENE1_NAME = f'fw_cf-a2-s1-dr{redact_chance}-b{cf_bs}-lr{cf_lr}'
# a2_fw_cf_dr_learn.save(FW_CF_CONFRONTATION_SCENE1_NAME)

In [0]:
# a2_tune_fw_cf_dr_clbks_tuple = (
#     [CSVLogger(
#         a2_fw_cf_dr_learn,
#         LOGS_DIR_P / f'{SESSN_START_T}-tune_fw_cf-a2-dr{redact_chance}-b{cf_bs}-p1',
#         append=True)],
# )
# a2_fw_cf_dr_learn = tune_cf_cycles(
#     a2_fw_cf_dr_learn,
#     cf_lr,
#     moms,
#     cf_wd,
#     a2_tune_fw_cf_dr_clbks_tuple,
#     n_cycles_tuple=(1,),
#     freeze_steps=(-2,),
#     lr_decays=(2,)
# )

In [0]:
# FW_CF_CONFRONTATION_SCENE2_NAME = f'fw_cf-a2-s2-dr{redact_chance}-b{cf_bs}-lr{cf_lr}'
# a2_fw_cf_dr_learn.save(FW_CF_CONFRONTATION_SCENE2_NAME)

In [0]:
# a2_fw_cf_dr_learn.destroy(); del a2_fw_cf_dr_learn; del fw_cf_dbnch_dr; gc.collect(); torch.cuda.empty_cache()

## Act-3: Resolution on Orig. Data

In [0]:
# Re-seed every RNG with the notebook's helper, then load the saved classifier
# databunch for this section.
# NOTE(review): FW_CF_DBNCH_FILE_S is not written anywhere in the visible part
# of this notebook, so the pickle has to exist in DATA_DIR_P already.
reset_random_states()
fw_cf_dbnch = load_data(DATA_DIR_P, FW_CF_DBNCH_FILE_S, bs=cf_bs, num_workers=n_dbnch_wrkrs)

In [0]:
# Create a fresh classifier on the loaded databunch, with the encoder saved as
# FW_ENC_NAME loaded into it.
a3_fw_cf_learn = init_cf_learner_with_encoder(fw_cf_dbnch, cf_drop_mult, FW_ENC_NAME)

In [0]:
# Load the Act-2, scene-2 checkpoint saved after the first fine-tuning stage
# on the augmented data.
a3_fw_cf_learn = a3_fw_cf_learn.load(FW_CF_CONFRONTATION_SCENE2_NAME)

In [0]:
# Act-3 fine-tuning: the remaining two stages (freeze_to(-3), then fully
# unfrozen), one logger per period (2 and 3). Despite its name this is a
# generator expression, consumed once by zip() inside tune_cf_cycles.
a3_tune_fw_cf_clbks_tuple = (
    [CSVLogger(
        a3_fw_cf_learn,
        LOGS_DIR_P / f'{SESSN_START_T}-tune_fw_cf-a3-dr{redact_chance}-b{cf_bs}-p{period}',
        append=True)]
   for period in range(2,4)
)
# tune_cf_cycles starts from cf_lr, so the first decay is 2*2 to continue from
# the cf_lr / 2 used for the loaded checkpoint's stage.
a3_fw_cf_learn = tune_cf_cycles(
    a3_fw_cf_learn,
    cf_lr,
    moms,
    cf_wd,
    a3_tune_fw_cf_clbks_tuple,
    n_cycles_tuple=(1,2),
    freeze_steps=(-3,None),
    lr_decays=(2*2,5)
)

tune cf lr: 0.01295
epoch     train_loss  valid_loss  accuracy  time    
0         0.169314    0.143823    0.946400  07:38     
tune cf lr: 0.00259
epoch     train_loss  valid_loss  accuracy  time    
0         0.122866    0.139531    0.947920  09:38     
1         0.076164    0.150975    0.948240  09:17     


In [0]:
# Checkpoint the Act-3 result; the name carries the redaction fraction, batch
# size and base learning rate.
FW_CF_RESOLUTION_NAME = f'fw_cf-a3-dr{redact_chance}-b{cf_bs}-lr{cf_lr}'
a3_fw_cf_learn.save(FW_CF_RESOLUTION_NAME)

In [0]:
# Release the Act-3 learner and its databunch, then reclaim host and GPU memory.
a3_fw_cf_learn.destroy()
del a3_fw_cf_learn, fw_cf_dbnch
gc.collect()
torch.cuda.empty_cache()