# Reload edited project modules automatically, so changes to local .py files
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [57]:
! which jupyter

/home/abaumann/anaconda3/envs/cuda/bin/jupyter


In [58]:
import logging
import random
from functools import partial
from pathlib import Path

import numpy as np

import fire
import torch
from fastai.basic_data import DataBunch
from fastai.basic_train import Learner
from fastai.metrics import fbeta
from fastai.train import to_fp16
from learner import (OneHotCallBack, conll_f1, create_fp16_cb,
                     ner_loss_func, Conll_F1)
from ner_data import NerDataset, pad
from optimizer import BertAdam
from pytorch_pretrained_bert import BertForTokenClassification
from torch.utils.data import DataLoader
from fastai.torch_core import flatten_model

In [59]:
# Language code of the CoNLL-2003 split to use. It selects the BERT checkpoint
# ('eng' -> cased English model, anything else -> multilingual model) and the
# sub-directory that the dataset paths are built from.
lang='deu'

In [60]:
# Run configuration. These mirror command-line style options; several of them
# (e.g. log_dir, do_lower_case, rand_seed, freez, one_cycle, discr,
# tuned_learner, do_train, do_eval, save) are not read by any cell visible in
# this notebook.
log_dir: str = 'logs'
batch_size: int = 32            # later divided by grad_acc_steps
lr: float = 5e-5
epochs: int = 1
# Dataset roots; "<lang>/<split>.txt" is appended to each in a later cell.
trainset: str = 'data/conll-2003/'
devset: str = 'data/conll-2003/'
testset: str = 'data/conll-2003/'
max_seq_len: int = 128
do_lower_case: bool = False
warmup_proportion: float = 0.1
grad_acc_steps: int = 1         # must be >= 1 (validated in a later cell)
rand_seed: int = None           # None = no seed configured
fp16: bool = False              # enables the apex mixed-precision path
loss_scale: float = None        # falsy = dynamic loss scaling when fp16 is on
ds_size: int = None             # forwarded to NerDataset as ds_size
data_bunch_path: str = 'data/conll-2003/db'
freez: bool = False
one_cycle: bool = False
discr: bool = False
tuned_learner: str = None
# These two are boolean flags; they were previously annotated as `str`.
do_train: bool = False
do_eval: bool = False
save: bool = False

In [61]:
# Pick the pretrained checkpoint: the cased English model for 'eng', the
# multilingual cased model for every other language code.
if lang == 'eng':
    bert_model = 'bert-base-cased'
else:
    bert_model = 'bert-base-multilingual-cased'


In [62]:
# Append "<lang>/<split>.txt" to each dataset root.
# The endswith() guards make this cell idempotent: without them, re-running the
# cell would append the suffix a second time and produce a path such as
# ".../deu/train.txtdeu/train.txt".
if not trainset.endswith('/train.txt'):
    trainset += lang + '/train.txt'
if not devset.endswith('/dev.txt'):
    devset += lang + '/dev.txt'
if not testset.endswith('/test.txt'):
    testset += lang + '/test.txt'

In [63]:
# Reject non-positive accumulation counts before they are used as a divisor.
if grad_acc_steps < 1:
    err_msg = f"""Invalid grad_acc_steps parameter:
                     {grad_acc_steps}, should be >= 1"""
    raise ValueError(err_msg)

# TODO proper training with grad accum step??
# Each forward/backward pass sees only a 1/grad_acc_steps share of the
# configured batch (floor division).
batch_size = batch_size // grad_acc_steps

In [64]:
def _ner_loader(path, shuffle):
    """Build a DataLoader over the NER file at `path`.

    Uses the notebook-level `bert_model`, `max_seq_len`, `ds_size` and
    `batch_size` settings and collates batches with `pad`.
    """
    ner_dataset = NerDataset(path, bert_model, max_seq_len=max_seq_len, ds_size=ds_size)
    return DataLoader(dataset=ner_dataset, batch_size=batch_size,
                      shuffle=shuffle, collate_fn=pad)


# Only the training split is shuffled; dev and test keep file order.
train_dl = _ner_loader(trainset, shuffle=True)
dev_dl = _ner_loader(devset, shuffle=False)
test_dl = _ner_loader(testset, shuffle=False)

# Bundle the three loaders for fastai; the dev split serves as validation set.
data = DataBunch(train_dl=train_dl, valid_dl=dev_dl, test_dl=test_dl,
                 collate_fn=pad, path=Path(data_bunch_path))


data/conll-2003/deu/train.txt
lines 12705 sents 12705
Truncated examples: 0.0% => 0/12705 

data/conll-2003/deu/dev.txt
lines 3068 sents 3068
Truncated examples: 0.0% => 0/3068 

data/conll-2003/deu/test.txt
lines 3160 sents 3160
Truncated examples: 0.0% => 0/3160 


In [65]:
# Display the per-step batch size left after the division by grad_acc_steps.
batch_size

32

In [66]:
def bert_layer_list(model):
    """Split a flattened BERT token classifier into layer groups.

    The model is flattened with fastai's `flatten_model` and the resulting
    flat list is cut at hard-coded positions:

    * group 0       -> flat[0:5]       (embedding)
    * groups 1..12  -> flat[5:16], flat[16:27], ..., flat[126:137]
                       (12 encoder layers, 11 entries each)
    * group 13      -> flat[-4:-2]     (pooling layer)
    * group 14      -> flat[-2:]       (head)

    NOTE(review): the offsets are fixed, so this presumably only matches a
    12-layer BERT-base architecture - confirm before using another model.

    Returns a `torch.nn.ModuleList` holding one `torch.nn.ModuleList` per group.
    """
    flat = flatten_model(model)

    embedding_end = 5    # first encoder entry
    encoder_end = 137    # one past the last encoder entry
    block_len = 11       # flattened entries per encoder layer

    slices = [flat[:embedding_end]]
    slices += [flat[start:start + block_len]
               for start in range(embedding_end, encoder_end, block_len)]
    slices.append(flat[-4:-2])
    slices.append(flat[-2:])

    return torch.nn.ModuleList([torch.nn.ModuleList(chunk) for chunk in slices])

In [67]:

# Pretrained token-classification BERT wrapped in DataParallel.
# NOTE(review): num_labels=10 is hard-coded; presumably the size of the NER
# tag set produced by NerDataset - confirm.
model = torch.nn.DataParallel(
    BertForTokenClassification.from_pretrained(bert_model, num_labels=10, cache_dir='bertm')
)
optim = BertAdam

# Optimizer steps over the whole run: training examples divided by the
# per-step batch size and the accumulation factor (truncated), times epochs.
n_train_examples = len(train_dl.dataset)
steps_per_epoch = int(n_train_examples / batch_size / grad_acc_steps)
train_opt_steps = steps_per_epoch * epochs

f1 = partial(fbeta, beta=1, sigmoid=False)
loss_fun = ner_loss_func

# Pre-bound factory for the fp16 / warm-up callback.
fp16_cb_fns = partial(
    create_fp16_cb,
    train_opt_steps=train_opt_steps,
    gradient_accumulation_steps=grad_acc_steps,
    warmup_proportion=warmup_proportion,
    fp16=fp16,
)

In [68]:
# Metrics reported after each epoch: the function-style `conll_f1` and an
# instance of the class-based `Conll_F1` (both imported from the local
# `learner` module).
metrics = [conll_f1, Conll_F1()]

In [69]:
if fp16:
    # apex is an optional dependency, so it is imported only when
    # mixed-precision training is requested.
    try:
        from apex.optimizers import FP16_Optimizer
        from apex.optimizers import FusedAdam
    except ImportError as err:
        # The trailing space after the URL matters: the two literals are
        # concatenated and previously ran together as "...apexto use...".
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                          "to use distributed and fp16 training.") from err
    # No loss scale configured -> FusedAdam with dynamic loss scaling;
    # otherwise FP16_Optimizer with the fixed scale.
    optim, dynamic=(FusedAdam, True) if not loss_scale else (FP16_Optimizer,False)

# Layer groups come from bert_layer_list so that freeze()/freeze_to() can act
# on embedding / encoder layers / pooling / head separately.
learn = Learner(data, model, optim,
                    loss_func=loss_fun,
                    metrics=metrics,
                    #true_wd=False,
                    #callback_fns=fp16_cb_fns,
                    layer_groups= bert_layer_list(model),
                    path='learn',
                    )

# `dynamic` only exists when fp16 is set, which is the only case it is read.
if fp16: learn.to_fp16(loss_scale=loss_scale, dynamic=dynamic)


In [70]:
# Re-display the per-step batch size before training starts.
batch_size

32

In [71]:
# Re-display the active language code before training starts.
lang

'deu'

In [72]:
# Freeze the learner before the first training pass.
# NOTE(review): fastai's freeze() presumably leaves only the last layer group
# (the head, per bert_layer_list) trainable - confirm.
learn.freeze()
# Learning-rate finder, currently disabled:
##learn.lr_find()
##learn.recorder.plot()


In [74]:
# One one-cycle epoch on the frozen learner.
# NOTE(review): the literals 1 and 5e-5 equal the `epochs` and `lr` config
# values but are not tied to them - confirm whether that is intended.
learn.fit_one_cycle(1, 5e-5, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,conll_f1,Total F1,time
0,0.311295,0.448011,0.001429,0.001502,04:48


In [None]:
# Change the freeze boundary, then run the learning-rate finder and plot it.
# NOTE(review): freeze_to(-5) presumably makes the last five layer groups
# trainable - confirm against the fastai version in use.
learn.freeze_to(-5)
learn.lr_find()
learn.recorder.plot()

In [None]:
# NOTE(review): `stop` is not defined in any cell visible here, so evaluating
# it raises NameError - presumably a deliberate barrier that halts "Run All"
# at this point. Confirm, and prefer an explicit marker if so.
stop

In [None]:
# One one-cycle epoch after the freeze boundary was moved.
# Fixed: the method was misspelled (`fit_one_cylce`) and the momentum keyword
# is `moms`, as in the earlier fit_one_cycle call.
# NOTE(review): `lrs` is not defined in any cell visible here; assign it
# (e.g. from the learning-rate finder plot) before running this cell.
learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))

In [28]:
# Checkpoint the learner under a fixed name; `name` is reused by the later
# cells that reload this checkpoint.
name = "1_model"
m_path = learn.save(name, return_path=True)
# Display where the checkpoint was written.
m_path

PosixPath('data/conll-2003/db/models/1_model.pth')

In [None]:
# One further epoch with plain fit() at learning rate 1e-05.
learn.fit(1, 1e-05)

epoch,train_loss,valid_loss,conll_f1,time


In [None]:
# Checkpoint the learner again under a second name, keeping "1_model" intact.
learn.save("2_model")

In [None]:
# Restore the checkpoint saved under `name` ("1_model"), then train one epoch
# at learning rate 3e-05 from that starting point.
learn.load(name)
learn.fit(1, 3e-05)

In [None]:
# Restore the same checkpoint (`name`, "1_model") and train one epoch at the
# larger learning rate 3e-04 for comparison.
learn.load(name)
learn.fit(1, 3e-04)