# Overview
- This notebook shows how to use the Hugging Face libraries to fine-tune a text classifier on top of eBERT embeddings
- Based on https://pages.github.corp.ebay.com/PyBay/core/dev/explore/bert/tutorials/finetune.html#finetuning
- Other reference: https://pages.github.corp.ebay.com/PyBay/core/dev/explore/bert/introduction.html#getting-started

# Step 1: Prepare the dataset

In [1]:
import os
import datasets
# Directory that holds the search-relevance CSV splits
DATA_DIR = '/data/ebay/data/ppetrushkov/relevance'

# One CSV file per split
train_csv = os.path.join(DATA_DIR, 'search_relevance.us.train.csv')
dev_csv = os.path.join(DATA_DIR, 'search_relevance.us.dev.csv')

# Load both files with the generic 'csv' loader; the result is indexed by split name
data = datasets.load_dataset('csv', data_files={'train': train_csv, 'dev': dev_csv})
print('DONE')

Using custom data configuration default-8a524d7fd41cc35d


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/thchang/.cache/huggingface/datasets/csv/default-8a524d7fd41cc35d/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /home/thchang/.cache/huggingface/datasets/csv/default-8a524d7fd41cc35d/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0. Subsequent calls will reuse this data.


In [3]:
# Show the loaded splits (column names and row count of each)
data

DatasetDict({
    train: Dataset({
        features: ['query', 'title', 'relevance'],
        num_rows: 864378
    })
    dev: Dataset({
        features: ['query', 'relevance', 'title'],
        num_rows: 45970
    })
})

In [4]:
# Show the training split on its own
data['train']

Dataset({
    features: ['query', 'title', 'relevance'],
    num_rows: 864378
})

# Step 2: Preprocess text data

In [5]:
import pybay.bert

# Name of the pretrained eBERT checkpoint whose tokenizer we load.
# Learn more about tokenizers https://huggingface.co/docs/tokenizers/python/latest/
tokenizer_checkpoint = 'eBERT-multilingual-base-2020Q3-cased'
tokenizer = pybay.bert.AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Preprocessing function that will be applied to all dataset entries
def preprocess(features):
    """Tokenize a batch of query/title pairs and attach integer labels.

    Args:
        features: A batch of dataset entries (as passed by ``map`` with
            ``batched=True``). The ``'query'``, ``'title'`` and
            ``'relevance'`` columns are read; each is a sequence with one
            entry per example.

    Returns:
        The tokenizer output for the batch, with an added ``'label'`` entry
        containing ``int(relevance)`` for every example.
    """
    # We combine a query and a title into one sentence, separated by a special token
    texts = [' [SEP] '.join([query, title]) for query, title in zip(features['query'], features['title'])]
    # truncation=True asks the tokenizer to cut inputs that exceed its configured
    # maximum length instead of passing them through at full length.
    result = tokenizer(texts, truncation=True)
    # We convert the relevance score into an integer label
    result['label'] = [int(x) for x in features['relevance']]
    return result

# Apply preprocessing to all entries in the dataset.
# batched=True makes `preprocess` receive a batch of entries per call rather than a single entry.
# Note: `data` is rebound to the mapped result, so later cells see the tokenized dataset.
data = data.map(preprocess, batched=True)

HBox(children=(FloatProgress(value=0.0, description='downloading', max=5.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='downloading', max=5.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, max=865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))




# Step 3: Define Evaluation Metrics

In [6]:
from transformers import EvalPrediction
import numpy as np
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, average_precision_score,
                            roc_auc_score, mean_squared_error)

# This function will be called to evaluate a prediction, which contains model output and a label
def compute_metrics(eval_predictions: EvalPrediction):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_predictions: Model output and labels. ``predictions`` holds
            logits of shape [n_samples, n_classes]; ``label_ids`` holds the
            true class index of every sample. Column 1 of the logits is used
            as the score of the positive class, so this assumes a two-class
            problem.

    Returns:
        A dict with the keys ``"Accuracy"``, ``"BalancedAccuracy"``,
        ``"AveragePrecision"``, ``"ROCAUCScore"`` and ``"MSE"``.
        Despite its name, ``"MSE"`` holds the *root* mean squared error; the
        key is kept unchanged so existing logs and dashboards stay comparable.
    """
    # model returns logits of shape [n_samples, n_classes]
    logits = eval_predictions.predictions
    # logit with the largest value is our class prediction
    predictions = np.argmax(logits, axis=1)
    # true label
    trues = eval_predictions.label_ids
    # score of class index 1, used by the ranking-based metrics
    positive_scores = logits[:, 1]

    # We use metrics provided by scikit-learn
    results = {
        "Accuracy": accuracy_score(y_true=trues, y_pred=predictions),
        "BalancedAccuracy": balanced_accuracy_score(y_true=trues, y_pred=predictions),
        "AveragePrecision": average_precision_score(y_true=trues, y_score=positive_scores, average='weighted', pos_label=1),
        "ROCAUCScore": roc_auc_score(y_true=trues, y_score=positive_scores, average='weighted'),
        # Take the square root explicitly rather than passing squared=False:
        # that keyword was deprecated and later removed from scikit-learn,
        # while this form yields the same value on every version.
        "MSE": np.sqrt(mean_squared_error(y_true=trues, y_pred=predictions)),
    }
    return results

In [8]:
### Sample metric with mock data
# Three made-up examples: one row of two-class logits each, plus the true class index
mock_logits = np.array([[0.0, 1.0], [1.0, 0.0], [-3.0, 5.0]])
mock_labels = np.array([0, 0, 1])
prediction = EvalPrediction(predictions=mock_logits, label_ids=mock_labels)
print(compute_metrics(prediction))


{'Accuracy': 0.6666666666666666, 'BalancedAccuracy': 0.75, 'AveragePrecision': 1.0, 'ROCAUCScore': 1.0, 'MSE': 0.5773502691896257}


# Step 4: Prepare HuggingFace Model and Trainer
- Next, we will load our eBERT model and set up a `transformers.Trainer` instance to fine-tune it.



In [9]:
# Load a pytorch eBERT model initialized from the pre-trained weights.
# On top of the pre-trained BERT output the model carries extra classification weights.
# Those extra weights start out random, so fine-tuning is required before the output means anything.
# Read more https://huggingface.co/transformers/model_doc/auto.html#automodelforsequenceclassification
label_names = {0: 'irrelevant', 1: 'relevant'}  # class index -> readable class name
model = pybay.bert.AutoModelForSequenceClassification.from_pretrained(
    'eBERT-multilingual-base-2020Q3-cased',
    id2label=label_names,
)

import os
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Define an output directory: <KRYLOV_DATA_DIR>/<KRYLOV_WS_PRINCIPAL>/bert-finetuning.
# An environment variable that is not set contributes an empty path component.
krylov_data_dir = os.environ.get('KRYLOV_DATA_DIR', '')
krylov_principal = os.environ.get('KRYLOV_WS_PRINCIPAL', '')
output_dir = os.path.join(krylov_data_dir, krylov_principal, 'bert-finetuning')
checkpoint_dir = os.path.join(output_dir, 'model_checkpoints')
log_dir = os.path.join(output_dir, 'model_logs')

# Learn more about various training arguments https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
training_args = TrainingArguments(
    # Output locations
    output_dir=checkpoint_dir,           # Output model checkpoints
    logging_dir=log_dir,                 # Directory for logs
    # Optimization schedule
    num_train_epochs=2,                  # Number of training epochs
    learning_rate=5e-6,                  # Learning rate
    warmup_steps=100,                    # Warmup for learning rate schedule
    # Batch sizes
    per_device_train_batch_size=64,      # Batch size for training
    per_device_eval_batch_size=64,       # Batch size for evaluation
    # Reporting and checkpointing
    logging_steps=5000,                  # Logging frequency
    save_steps=0,                        # Checkpoint saving frequency
    # Precision
    fp16=True,                           # Mixed precision training on V100 GPUs
)

# Learn more about Trainer https://huggingface.co/transformers/main_classes/trainer.html#id1
# The collator turns several input entries into one mini-batch. Input sentences
# differ in length, so it adds the padding and masking needed for the model to
# process the batch efficiently and correctly.
collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    compute_metrics=compute_metrics,
    train_dataset=data['train'],
    eval_dataset=data['dev'],
)

HBox(children=(FloatProgress(value=0.0, description='downloading', max=5.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='downloading', max=5.0, style=ProgressStyle(description_wi…




Some weights of the model checkpoint at /data/ebay/data/thchang/.pybay_cache/e6f108c7-44e2-49cc-9809-78dfdfa0eb1f were not used when initializing EBertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing EBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some

### Start training and evaluation

In [10]:

# Run the fine-tuning loop
trainer.train()

# Evaluate the fine-tuned model once more on the dev set and show the metrics
results = trainer.evaluate(data['dev'])
print(results)

# Write the final model and its tokenizer into the same directory on disk
final_model_dir = os.path.join(output_dir, 'my_pretrained_model')
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

Step,Training Loss
5000,0.3847
10000,0.3381
15000,0.3242
20000,0.3047
25000,0.3018


{'eval_loss': 0.34215739369392395, 'eval_Accuracy': 0.8670002175331738, 'eval_BalancedAccuracy': 0.7625003942253656, 'eval_AveragePrecision': 0.9540019136251331, 'eval_ROCAUCScore': 0.8750029086247337, 'eval_MSE': 0.36469135233348515, 'eval_runtime': 37.2114, 'eval_samples_per_second': 1235.373, 'eval_steps_per_second': 19.322, 'epoch': 2.0}


('/data/ebay/data/thchang/bert-finetuning/my_pretrained_model/tokenizer_config.json',
 '/data/ebay/data/thchang/bert-finetuning/my_pretrained_model/special_tokens_map.json',
 '/data/ebay/data/thchang/bert-finetuning/my_pretrained_model/vocab.txt',
 '/data/ebay/data/thchang/bert-finetuning/my_pretrained_model/added_tokens.json')

# Step 5: Wrap in a Service/Inference Wrapper

### Sample inferences

In [11]:
# Build a classifier from the directory the model and tokenizer were saved to.
# NOTE(review): the readable class names are presumably taken from the label
# mapping stored with the saved model - confirm against pybay.bert.
saved_model_dir = os.path.join(output_dir, 'my_pretrained_model')
classifier = pybay.bert.EBertTextClassifier(saved_model_dir)

# Try the model on a few hand-written "query [SEP] title" samples
sample_inputs = (
    "headphones [SEP] JBL - LIVE 500BT",
    "headphones [SEP] JBL - LIVE 500BT headphones stand",
    "headphones [SEP] quietcomfort 35",
)
for sample in sample_inputs:
    print(classifier.classify(sample))

["relevant"]
["irrelevant"]
["relevant"]


### Start uvicorn service 
- Save the following as a shell script and run it from a terminal (it is not meant to be executed as a notebook cell)

In [None]:
#!/bin/bash
# Environment for the serving app.
# NOTE(review): presumably MODEL_CLASS names the wrapper class and MODEL_PATH the
# saved model directory - confirm against pybay.bert.app. The notebook itself saved
# its model under <output_dir>/my_pretrained_model, which is not the path below.
export MODEL_CLASS=EBertTextClassifier
export MODEL_PATH=/data/ebay/data/ppetrushkov/bert-finetuning

# Serve the app on all interfaces, port 5000
uvicorn pybay.bert.app:app --host 0.0.0.0 --port 5000