# FastHugs

In [None]:
import sys
if 'google.colab' in sys.modules:
    !pip install -Uqq fastai transformers datasets wandb tqdm
    !pip install -qq git+git://github.com/aikindergarten/fasthugs.git

[K     |████████████████████████████████| 186 kB 5.2 MB/s 
[K     |████████████████████████████████| 2.6 MB 36.4 MB/s 
[K     |████████████████████████████████| 264 kB 50.0 MB/s 
[K     |████████████████████████████████| 1.7 MB 36.8 MB/s 
[K     |████████████████████████████████| 76 kB 4.7 MB/s 
[K     |████████████████████████████████| 56 kB 3.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 35.9 MB/s 
[K     |████████████████████████████████| 895 kB 39.3 MB/s 
[K     |████████████████████████████████| 636 kB 51.5 MB/s 
[K     |████████████████████████████████| 243 kB 45.0 MB/s 
[K     |████████████████████████████████| 118 kB 53.1 MB/s 
[K     |████████████████████████████████| 97 kB 6.7 MB/s 
[K     |████████████████████████████████| 170 kB 46.0 MB/s 
[K     |████████████████████████████████| 133 kB 47.9 MB/s 
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for p

## GLUE Benchmark

In [None]:
from transformers import AutoModelForSequenceClassification
from fastai.text.all import *
from fastai.callback.wandb import WandbCallback

from fasthugs.learner import TransLearner
from fasthugs.data import TransformersTextBlock, TextGetter, get_splits, PreprocCategoryBlock

from datasets import load_dataset, concatenate_datasets

import wandb
import gc

In [None]:
# Experiment configuration shared by the cells below.
ds_name = 'glue'  # dataset name handed to `load_dataset`
model_name = "roberta-base"  # pretrained checkpoint used for both the text block and the model

n_epoch = 5  # number of epochs passed to `fit_one_cycle`

max_len = 512  # NOTE(review): not referenced by any visible cell — confirm it is still needed
bs = 32  # training batch size
val_bs = bs*2  # validation batch size is twice the training batch size

lr = 3e-5  # learning rate passed to `fit_one_cycle`; also embedded in the wandb group/notes

In [None]:
# GLUE task names. NOTE(review): 'mnli-mm' has no entry in `glue_metrics`, and this
# list is not referenced by any other visible cell.
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]

In [None]:
# Metrics reported for each GLUE task, keyed by task name.
# The first metric of the selected task is the one monitored when saving the best model.
glue_metrics = {
    'cola':[MatthewsCorrCoef()],
    'sst2':[accuracy],
    'mrpc':[F1Score(), accuracy],
    'stsb':[PearsonCorrCoef(), SpearmanCorrCoef()],
    'qqp' :[F1Score(), accuracy],
    'mnli':[accuracy],
    'qnli':[accuracy],
    'rte' :[accuracy],
    'wnli':[accuracy],
}

## MRPC - Microsoft Research Paraphrase Corpus

In [None]:
# Select the GLUE task and load its dataset; the 'train', 'validation' and 'test'
# splits are used by later cells.
task = 'mrpc'
ds = load_dataset(ds_name, task)

Downloading:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


In [None]:
# Report how many examples the train and validation splits contain.
print(f"Train set {len(ds['train'])}; Validation set {len(ds['validation'])}")

Train set 3668; Valdation set 408


In [None]:
# Presumably the returned indices refer to positions in the concatenated
# train+validation dataset built on the next line — TODO confirm against `get_splits`.
train_idx, valid_idx = get_splits(ds)
# Despite its name, `train_ds` holds both splits; `valid_idx` is later given to `IndexSplitter`.
train_ds = concatenate_datasets([ds['train'], ds['validation']])

In [None]:
# Peek at the first record to see the raw field names.
train_ds[0]

{'idx': 0,
 'label': 1,
 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'}

In [None]:
# Class names of the 'label' feature, used as the category vocabulary.
label_vocab = train_ds.features['label'].names
# Input block configured from the pretrained model name (presumably sets up the matching
# tokenizer — confirm in fasthugs); target block uses the label vocabulary.
blocks = [
    TransformersTextBlock(pretrained_model_name=model_name),
    PreprocCategoryBlock(label_vocab)
]

# x is built from the two sentence fields, y from 'label';
# the rows listed in `valid_idx` form the validation set.
dblock = DataBlock(
    blocks=blocks,
    get_x=TextGetter('sentence1', 'sentence2'),
    get_y=ItemGetter('label'),
    splitter=IndexSplitter(valid_idx)
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
%%time
# Build the DataLoaders with the configured training and validation batch sizes.
dls = dblock.dataloaders(train_ds, bs=bs, val_bs=val_bs)

CPU times: user 5.02 s, sys: 1 s, total: 6.02 s
Wall time: 12.9 s


In [None]:
# Show up to four samples from a batch as a sanity check.
dls.show_batch(max_n=4)

Unnamed: 0,text,text_,category
0,"Amrozi accused his brother, whom he called "" the witness "", of deliberately distorting his evidence.","Referring to him as only "" the witness "", Amrozi accused his brother of deliberately distorting his evidence.",equivalent
1,"Blair has said there is not'' a shred of truth'' in allegations the government manipulated evidence, and has resisted calls for a full public inquiry.",Blair has said there is not ``a shred of truth'' in allegations that the government manipulated evidence about Iraq's weapons programs.,not_equivalent
2,"Overall control will be wielded by a national security council, headed by Mr Arafat.",The other six security agencies will report to a National Security Council headed by Arafat.,not_equivalent
3,"Three-year-old Jaryd Atadero vanished on Oct. 2, 1999 while on a hiking trip with a church group.",He was on a hiking trip that day with a church group.,not_equivalent


In [None]:
# Name, group, notes and tags passed to `wandb.init` for this run.
WANDB_NAME = f'{ds_name}-{task}-{model_name}'
GROUP = f'{ds_name}-{task}-{model_name}-{lr:.0e}'
NOTES = f'finetuning {model_name} with Adam lr={lr:.0e}'
TAGS =[model_name, ds_name, 'adam']

In [None]:
# Start a wandb run for this experiment; the trailing ';' keeps the cell from echoing
# the returned object.
wandb.init(reinit=True, project="fasthugs", entity="fastai_community",
           name=WANDB_NAME, group=GROUP, notes=NOTES, tags=TAGS);

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Training

In [None]:
# Load the pretrained checkpoint as a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Metrics for the selected task.
metrics = glue_metrics[task]
# Wrap data, model and metrics in a learner and switch it to mixed-precision training.
learn = TransLearner(dls, model, metrics=metrics).to_fp16()

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
# List which callbacks run at each stage of the training loop.
learn.show_training_loop()

Start Fit
   - before_fit     : [TrainEvalCallback, MixedPrecision, Recorder, ProgressCallback]
  Start Epoch Loop
     - before_epoch   : [Recorder, ProgressCallback]
    Start Train
       - before_train   : [TrainEvalCallback, Recorder, ProgressCallback]
      Start Batch Loop
         - before_batch   : [TransCallback, MixedPrecision]
         - after_pred     : [TransCallback, MixedPrecision]
         - after_loss     : [TransCallback, MixedPrecision]
         - before_backward: [MixedPrecision]
         - before_step    : [MixedPrecision]
         - after_step     : [MixedPrecision]
         - after_cancel_batch: []
         - after_batch    : [TrainEvalCallback, Recorder, ProgressCallback]
      End Batch Loop
    End Train
     - after_cancel_train: [Recorder]
     - after_train    : [Recorder, ProgressCallback]
    Start Valid
       - before_validate: [TrainEvalCallback, Recorder, ProgressCallback]
      Start Batch Loop
         - **CBs same as train batch**: []
      End Ba

In [None]:
# Monitor the task's first metric when saving the best model. Metric objects expose
# `.name`, while plain functions such as `accuracy` only have `__name__`.
monitor = metrics[0].name if isinstance(metrics[0], Metric) else metrics[0].__name__
# Log training to wandb (without predictions or model upload) and keep the best checkpoint.
cbs = [WandbCallback(log_preds=False, log_model=False), SaveModelCallback(monitor=monitor)]
learn.fit_one_cycle(n_epoch, lr, cbs=cbs)

Could not gather input dimensions


epoch,train_loss,valid_loss,f1_score,accuracy,time
0,0.585122,0.54051,0.818182,0.696078,03:53
1,0.433094,0.335536,0.912521,0.875,03:51
2,0.264747,0.27306,0.908088,0.877451,03:51
3,0.146949,0.334426,0.917108,0.884804,03:51
4,0.085187,0.35045,0.923623,0.894608,03:51


Better model found at epoch 0 with f1_score value: 0.8181818181818181.
Better model found at epoch 1 with f1_score value: 0.9125214408233276.
Better model found at epoch 3 with f1_score value: 0.9171075837742505.
Better model found at epoch 4 with f1_score value: 0.9236234458259325.


In [None]:
# Display sample inputs next to their targets and the model's predictions.
learn.show_results()

Unnamed: 0,text,text_,category,category_
0,He said the foodservice pie business doesn 't fit the company's long-term growth strategy.,""" The foodservice pie business does not fit our long-term growth strategy.",equivalent,equivalent
1,""" Biotech products, if anything, may be safer than conventional products because of all the testing, "" Fraley said, adding that 18 countries have adopted biotechnology.",""" Biotech products, if anything, may be safer than conventional products because of all the testing, "" said Robert Fraley, Monsanto's executive vice president.",not_equivalent,not_equivalent
2,The company emphasized that McDonald's USA does not import any raw beef or hamburger patties from Canada for McDonald's use in the United States.,McDonald's said in a statement that it does not import any raw beef or hamburger patties from Canada for use in the United States.,equivalent,equivalent
3,"The chain operates more than 3,400 stores, and has annual revenue of about $ 15.8 billion.","The chain, which has been under new management since late 1999, has more than 3,400 stores and $ 15.8 billion in annual revenue.",not_equivalent,not_equivalent
4,"In February 2000, the officers — Kenneth Boss, Sean Carroll, Edward McMellon and Richard Murphy — were acquitted of all charges in the killing.","The officers -- Kenneth Boss, Sean Carroll, Edward McMellon and Richard Murphy -- were acquitted in 2000 of state murder charges.",equivalent,equivalent
5,""" The government elements who have been causing trouble are still in place.","The government elements who have been causing trouble are still in place, they are attacking us. """,not_equivalent,not_equivalent
6,"Bush wanted "" to see an aircraft landing the same way that the pilots saw an aircraft landing, "" White House press secretary Ari Fleischer said yesterday.","On Tuesday, before Byrd's speech, Fleischer said Bush wanted'' to see an aircraft landing the same way that the pilots saw an aircraft landing.",not_equivalent,equivalent
7,Cortisol levels in the saliva of day care children were highest and rose most steeply in those judged by day care center personnel to be the shyest.,Cortisol levels in the saliva of day-care children were highest and rose most steeply in those whom day-care centre staffed judged to be the shyest.,equivalent,equivalent
8,"He was arrested Friday night at an Alpharetta seafood restaurant while dining with his wife, singer Whitney Houston.",He was arrested again Friday night at an Alpharetta restaurant where he was having dinner with his wife.,equivalent,equivalent


## Inference

In [None]:
# Build a DataLoader over the test split using the learner's existing data pipeline.
test_dl = learn.dls.test_dl(ds["test"])
# Collect predictions for the test set; the second return value is discarded.
preds, _ = learn.get_preds(dl=test_dl)

In [None]:
# Look at the first five prediction rows.
preds[:5]

tensor([[0.0041, 0.9959],
        [0.0015, 0.9985],
        [0.0014, 0.9986],
        [0.0024, 0.9976],
        [0.8266, 0.1734]])

## Optimieren der Performance

### Andere Modellarchitekturen

* RoBERTa - more training data and no NSP task
* ALBERT - parameter sharing
* ELECTRA - discriminator pretraining objective
* DeBERTa - disentangled attention

In [None]:
# Drop the references to the first learner and model, then run garbage collection and
# release PyTorch's cached GPU memory before the next model is loaded.
del learn, model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Switch to the DeBERTa checkpoint for the second experiment.
model_name = "microsoft/deberta-base"

In [None]:
# Rebuild the blocks with the new `model_name`; everything else matches the RoBERTa setup.
blocks = [
    TransformersTextBlock(pretrained_model_name=model_name),
    PreprocCategoryBlock(label_vocab)
]

# Same inputs, labels and validation indices as before.
dblock = DataBlock(
    blocks=blocks,
    get_x=TextGetter('sentence1', 'sentence2'),
    get_y=ItemGetter('label'),
    splitter=IndexSplitter(valid_idx)
)

# Recreate the DataLoaders from the rebuilt DataBlock.
dls = dblock.dataloaders(train_ds, bs=bs, val_bs=val_bs)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
# Name, group, notes and tags passed to `wandb.init` for the DeBERTa run.
# The learner for this run is created without an explicit `opt_func`, exactly like the
# first run, so it carries the same optimizer label (Adam) rather than RAdam.
WANDB_NAME = f'{ds_name}-{task}-{model_name}'
GROUP = f'{ds_name}-{task}-{model_name}-{lr:.0e}'
NOTES = f'finetuning {model_name} with Adam lr={lr:.0e}'
TAGS =[model_name, ds_name, 'adam']

In [None]:
# Start a new wandb run for the DeBERTa experiment; the trailing ';' keeps the cell
# from echoing the returned object.
wandb.init(reinit=True, project="fasthugs", entity="fastai_community",
           name=WANDB_NAME, group=GROUP, notes=NOTES, tags=TAGS);

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,5.0
train_loss,0.08519
raw_loss,0.16201
wd_0,0.01
sqr_mom_0,0.99
lr_0,0.0
mom_0,0.95
eps_0,1e-05
wd_1,0.01
sqr_mom_1,0.99


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_loss,█████▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁
raw_loss,██▇█▆▇▇█▆▅▅▄▄▄▃▅▂▃▄▂▅▅▃▂▃▂▁▃▁▃▁▃▁▂▂▁▁▁▂▂
wd_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
sqr_mom_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr_0,▁▁▂▃▄▅▆▇████████▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁▁▁
mom_0,██▇▆▅▅▃▂▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇▇█████
eps_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
wd_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
sqr_mom_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


In [None]:
# Load the pretrained checkpoint as a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
metrics = glue_metrics[task]
# Wrap data, model and metrics in a learner and switch it to mixed-precision training.
learn = TransLearner(dls, model, metrics=metrics).to_fp16()

# Monitor the task's first metric when saving the best model. Metric objects expose
# `.name`, while plain functions such as `accuracy` only have `__name__`.
monitor = metrics[0].name if isinstance(metrics[0], Metric) else metrics[0].__name__
cbs = [WandbCallback(log_preds=False, log_model=False), SaveModelCallback(monitor=monitor)]
learn.fit_one_cycle(n_epoch, lr, cbs=cbs)

Downloading:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['config', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.weight

Could not gather input dimensions


epoch,train_loss,valid_loss,f1_score,accuracy,time
0,0.5606,0.491765,0.853583,0.769608,06:04
1,0.410831,0.28838,0.918728,0.887255,06:03
2,0.212002,0.28523,0.919105,0.884804,06:04
3,0.096798,0.347049,0.927586,0.897059,06:05
4,0.03818,0.39633,0.929674,0.89951,06:04


Better model found at epoch 0 with f1_score value: 0.8535825545171339.
Better model found at epoch 1 with f1_score value: 0.9187279151943463.
Better model found at epoch 2 with f1_score value: 0.919104991394148.
Better model found at epoch 3 with f1_score value: 0.9275862068965517.
Better model found at epoch 4 with f1_score value: 0.9296740994854202.


### Hyperparameter-Tuning mit wandb sweeps

In [None]:
def layerwise_splitter(model):
    """Split `model`'s parameters into ordered groups.

    The groups are, in order: the base model's embeddings, one group per child of
    `base_model.encoder.layer`, and one group per child of `model` after the first
    that has any parameters. Returns the parameter list of each group.
    """
    backbone = model.base_model
    # Skip the first child of `model` and keep only the remaining children that own parameters.
    head_modules = [child for child in list(model.children())[1:] if params(child)]
    modules = L(backbone.embeddings) + L(backbone.encoder.layer.children()) + L(head_modules)
    return modules.map(params)

In [None]:
def train():
    """Run a single wandb sweep trial.

    Reads `lr`, `wd` and `diff_lr_decay_factor` from the run config, fine-tunes a fresh
    copy of `model_name` on `dls` for `n_epoch` epochs, and logs to wandb.

    When `diff_lr_decay_factor` is non-zero the learning rate is passed as a slice from
    `lr * factor**k` to `lr`, where `k` is the number of parameter groups produced by
    `layerwise_splitter`. Otherwise the plain `lr` is used for every group.
    """
    with wandb.init() as run:
        cfg = run.config
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
        metrics = glue_metrics[task]
        k = len(layerwise_splitter(model))
        if cfg.diff_lr_decay_factor:
            # Discriminative learning rates: lowest for the first group, `cfg.lr` for the last.
            lr = slice(cfg.lr*cfg.diff_lr_decay_factor**k, cfg.lr)
        else:
            lr = cfg.lr
        learn = TransLearner(dls, model, metrics=metrics, opt_func=Adam, splitter=layerwise_splitter).to_fp16()
        # Pass the computed `lr` (not `cfg.lr`) so the decay factor actually takes effect.
        learn.fit_one_cycle(n_epoch, lr, wd=cfg.wd, cbs=[WandbCallback(log_preds=False, log_model=False)])
        # Drop both references before collecting so the model's memory can be released.
        del learn, model
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

In [None]:
# Sweep definition for DeBERTa on the currently selected task.
model_name = "microsoft/deberta-base"
metrics = glue_metrics[task]
# Metric objects expose `.name`, while plain functions such as `accuracy` only have `__name__`.
metric_to_monitor = metrics[0].name if isinstance(metrics[0], Metric) else metrics[0].__name__
sweep_name = f"glue-{task}-deberta-base-sweep"
sweep_config = {
    "project":"glue-benchmark",
    "entity": "fastai_community",
    "name": sweep_name,
    "method": "grid",
    # Tell the sweep which logged metric to rank runs by; all GLUE metrics here are maximized.
    "metric": {"name": metric_to_monitor, "goal": "maximize"},
    "parameters": {
        "lr": {"values":[2e-5,3e-5,5e-5,1e-4]},
        "wd": {"values":[0.,1e-2,5e-2]},
        # 0. disables discriminative learning rates in `train`.
        "diff_lr_decay_factor":{"values":[0., 0.9, 0.8, 0.7, 0.6]}
    }
}

In [None]:
# Register the sweep with wandb and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project='glue-benchmark', entity="fastai_community")



Create sweep with ID: q7vdm8r2
Sweep URL: https://wandb.ai/fastai_community/glue-benchmark/sweeps/q7vdm8r2


In [None]:
# Start an agent that calls `train` for the configurations of this sweep.
wandb.agent(sweep_id, function=train)

[34m[1mwandb[0m: Agent Starting Run: uerz6p9j with config:
[34m[1mwandb[0m: 	diff_lr_decay_factor: 0
[34m[1mwandb[0m: 	lr: 2e-05
[34m[1mwandb[0m: 	wd: 0


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['config', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.weight

Could not gather input dimensions


epoch,train_loss,valid_loss,f1_score,accuracy,time
0,0.583198,0.525642,0.820789,0.754902,06:05
1,0.395967,0.284594,0.920354,0.889706,06:05
2,0.237165,0.268252,0.92559,0.89951,06:05
3,0.105514,0.329102,0.923894,0.894608,06:04


In [None]:
# Close the active wandb run.
wandb.finish()