In [1]:
!pip install --upgrade fast-bert

Requirement already up-to-date: fast-bert in /usr/local/lib/python3.6/dist-packages (1.6.4)


In [2]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import apex
from box import Box

from fast_bert.data_cls import BertDataBunch
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy #, roc_auc, fbeta

In [3]:
# Release cached, currently unused GPU memory held by PyTorch before the model is loaded.
torch.cuda.empty_cache()

### Model

In [4]:
# Model family and pretrained checkpoint name. Both are stored in `args`
# ('model_type' / 'model_name'); MODEL_NAME is also used to name the output directory.
MODEL_TYPE = 'distilbert'
MODEL_NAME = 'distilbert-base-uncased'

### Path

In [5]:
# Input locations: the CSV splits and the label list. This notebook only reads from
# them and never creates them.
DATA_PATH = Path('data/2019-03-12-Ebbu/3_csv_bert/splitted')
LABEL_PATH = Path('label/')

# Output locations: model artifacts, logs, and a per-checkpoint output directory.
MODEL_PATH = Path('bert_models/')
LOG_PATH = Path('bert_logs/')
OUTPUT_DIR = MODEL_PATH / ('output.%s' % MODEL_NAME)

# parents=True keeps this cell working if any output path is later pointed at a nested
# location; exist_ok=True makes the cell safe to re-run.
MODEL_PATH.mkdir(parents=True, exist_ok=True)
LOG_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [6]:
# File names of the training split, validation split and label list. They are passed to
# BertDataBunch as train_file / val_file / label_file; VAL_FILE is also read back from
# DATA_PATH later as the evaluation set.
TRAIN_FILE = 'train_900000.csv'
VAL_FILE = 'val_100000.csv'
LABEL_FILE = 'labels.csv'

### GPU & device

In [7]:
# Training is pinned to CUDA.
# NOTE(review): there is no CPU fallback here and `args` enables fp16 — confirm a GPU
# is available before running.
device = torch.device('cuda')
# True when more than one CUDA device is visible; forwarded to the databunch and learner.
multi_gpu = torch.cuda.device_count() > 1

### Arguments

In [8]:
# Single attribute-accessible bag of locations and hyper-parameters used by the cells below.
args = Box(dict(
    # --- filesystem locations ---
    data_path=DATA_PATH,
    label_path=LABEL_PATH,
    model_path=MODEL_PATH,
    log_path=LOG_PATH,
    output_dir=OUTPUT_DIR,
    finetuned_path=None,

    # --- model / task definition ---
    model_type=MODEL_TYPE,
    model_name=MODEL_NAME,
    do_lower_case=True,
    multi_label=False,

    # --- core training hyper-parameters ---
    num_train_epochs=12,
    learning_rate=3e-5,
    max_seq_length=256,
    train_batch_size=64,

    # --- hardware ---
    device=device,
    multi_gpu=multi_gpu,

    # --- learner options ---
    warmup_steps=500,
    fp16=True,
    logging_steps=0,

    # --- optimisation schedule ---
    schedule_type='warmup_cosine',
    optimizer_type='lamb',
    # NOTE(review): warmup_proportion, local_rank, gradient_accumulation_steps and
    # loss_scale are not read by any cell visible in this notebook — confirm whether
    # they are still needed.
    warmup_proportion=0.002,
    local_rank=-1,
    gradient_accumulation_steps=1,
    loss_scale=128,
))

### 1. Create a DataBunch object

In [9]:
# Build the data bunch from the CSV splits and the label list, using the settings in `args`.
databunch = BertDataBunch(
    args.data_path,
    args.label_path,
    tokenizer=args.model_name,      # the checkpoint name is passed as the tokenizer argument
    train_file=TRAIN_FILE,
    val_file=VAL_FILE,
    label_file=LABEL_FILE,
    text_col='text',                # CSV column holding the input text
    label_col='label',              # CSV column holding the class label
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    multi_label=args.multi_label,
    model_type=args.model_type,
)

### 2. Create a Learner Object

In [10]:
# Configure logging before handing the logger to the learner. An unconfigured root
# logger sits at WARNING, so INFO-level records sent to it would be silently dropped.
# Records go both to the notebook output and to a file under LOG_PATH (which the path
# cell creates).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    handlers=[
        logging.FileHandler(str(LOG_PATH / 'train.log')),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger()
# basicConfig is a no-op when the root logger already has handlers (e.g. on re-run),
# so set the level explicitly as well.
logger.setLevel(logging.INFO)

# Metrics evaluated during validation. Only accuracy is enabled; the other fast_bert
# metrics failed on this single-label setup when they were tried:
#   roc_auc -> ValueError: Found input variables with inconsistent numbers of samples: [300000, 600000]
#   fbeta   -> RuntimeError: The size of tensor a (2) must match the size of tensor b (300000) at non-singleton dimension 1
metrics = [
    {'name': 'accuracy', 'function': accuracy},
]

# Instantiate the learner from the pretrained checkpoint named in `args`.
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path=args.model_name,
    metrics=metrics,
    device=args.device,
    logger=logger,
    output_dir=args.output_dir,
    finetuned_wgts_path=args.finetuned_path,
    warmup_steps=args.warmup_steps,
    multi_gpu=args.multi_gpu,
    is_fp16=args.fp16,
    multi_label=args.multi_label,
    logging_steps=args.logging_steps,
)

### 3. Train the model

In [None]:
# Fine-tune for the configured number of epochs with the schedule and optimizer from `args`.
learner.fit(
    epochs=args.num_train_epochs,
    lr=args.learning_rate,
    validate=True,  # Evaluate the model after each epoch
    schedule_type=args.schedule_type,
    optimizer_type=args.optimizer_type,
)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
add_(Tensor other, Number alpha)
addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
addcmul_(Tensor tensor1, Tensor tensor2, Number value)


In [None]:
# Run a validation pass with the trained learner; the returned value is shown as the cell output.
learner.validate()

### 4. Save trained model artifacts

In [None]:
# Persist the trained model artifacts.
# NOTE(review): presumably written under OUTPUT_DIR, since the inference cell loads
# from OUTPUT_DIR / 'model_out' — confirm.
learner.save_model()

### 5. Model Inference

In [None]:
from fast_bert.prediction import BertClassificationPredictor

# Directory the fine-tuned model is loaded from. It gets its own name rather than
# rebinding MODEL_PATH: that constant is the root model directory stored in `args`,
# and overwriting it would make a later re-run of the `args` cell silently store the
# wrong location.
TRAINED_MODEL_PATH = OUTPUT_DIR / 'model_out'

# Predictor configured consistently with training (same label set, model type and casing).
predictor = BertClassificationPredictor(
    model_path=str(TRAINED_MODEL_PATH),
    label_path=str(LABEL_PATH),
    multi_label=args.multi_label,
    model_type=args.model_type,
    do_lower_case=args.do_lower_case,
)

In [None]:
# Destination for the raw prediction scores. DataFrame.to_csv does not create missing
# directories, so create the parent folder here; otherwise saving fails after the
# (expensive) prediction step has already run.
OUTPUT_CSV = Path('bert_scores') / ('output.%s.csv' % args.model_name)
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)

# The validation split doubles as the evaluation set for the metrics computed below.
test_df = pd.read_csv(DATA_PATH / VAL_FILE)
test_df.tail()

In [None]:
# Score every text in the evaluation set with the fine-tuned predictor.
# NOTE(review): later cells index each element of `output` as a sequence of
# (label, probability) pairs — confirm against the fast-bert predictor.
output = predictor.predict_batch(test_df.text.tolist())
# Persist the raw predictions to OUTPUT_CSV.
pd.DataFrame(output).to_csv(OUTPUT_CSV)

### TRUES, PREDS, PROBS

In [None]:
# Ground truth as a 0/1 vector: 1 where the evaluation label is 'phishing', else 0.
trues = np.asarray([1 if label == 'phishing' else 0 for label in test_df['label'].values])
trues

In [None]:
# Hard predictions as a 0/1 vector: 1 where the first (label, probability) pair of a
# prediction carries the label 'phishing', else 0.
preds = np.asarray([1 if ranked[0][0] == 'phishing' else 0 for ranked in output])
preds

In [None]:
# Predicted probability of the 'phishing' class for every sample. The probability is
# looked up by label rather than by position, so it stays correct whatever the order of
# the (label, probability) pairs and however many labels there are. The previous
# positional fallback was only right for exactly two labels.
probs = np.asarray([dict(pred)['phishing'] for pred in output])
probs

### Sklearn Metrics

In [None]:
import sklearn.metrics

In [None]:
# Per-class precision / recall / F1 summary (class 1 = 'phishing', class 0 = everything else).
# print() is required here because the report is a pre-formatted multi-line string.
print(sklearn.metrics.classification_report(trues, preds))

In [None]:
# Confusion-matrix cells for the binary task (1 = 'phishing'). labels=[0, 1] keeps the
# matrix 2x2 — and the four-way unpacking valid — even if only one class occurs.
tn, fp, fn, tp = sklearn.metrics.confusion_matrix(trues, preds, labels=[0, 1]).ravel()

# --- scores (higher is better) ---
# Stored as `accuracy_score`, not `accuracy`: the latter is the metric function imported
# from fast_bert.metrics, and overwriting it with a float would break the learner cell
# on any re-run within the same kernel.
accuracy_score = sklearn.metrics.accuracy_score(trues, preds)
balance_accuracy = sklearn.metrics.balanced_accuracy_score(trues, preds)
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(trues, preds, beta=1.0, average='binary')
jaccard = sklearn.metrics.jaccard_score(trues, preds)
matthews_corrcoef = sklearn.metrics.matthews_corrcoef(trues, preds)

# --- losses (lower is better) ---
hamming_loss = sklearn.metrics.hamming_loss(trues, preds)
# Log loss is defined on predicted probabilities, so it is computed from `probs`
# (P(phishing)) rather than the hard 0/1 predictions. labels=[0, 1] keeps it defined
# when `trues` contains a single class.
log_loss = sklearn.metrics.log_loss(trues, probs, labels=[0, 1])
zero_one_loss = sklearn.metrics.zero_one_loss(trues, preds)
brier_score_loss = sklearn.metrics.brier_score_loss(trues, probs)

print('score')
print('accuracy:', accuracy_score)
print('balance_accuracy:', balance_accuracy)
print('precision:', precision)
print('recall:', recall)
print('f1:', f1)
print('jaccard:', jaccard)
print('matthews_corrcoef:', matthews_corrcoef)

print('\nloss')
print('hamming_loss:', hamming_loss)
print('log_loss:', log_loss)
print('zero_one_loss:', zero_one_loss)
print('brier_score_loss:', brier_score_loss)