# Setup
- This notebook uses a Hugging Face (HF) tokenizer and model to score query-to-item-title similarity: each query/title pair is fed to the model as a single sequence-pair input.

- Reference: https://pages.github.corp.ebay.com/PyBay/core/dev/explore/bert/tutorials/finetune.html

In [1]:
import pprint


In [2]:
!pwd
!ls data

/Users/thchang/Documents/dev/git/nlp/pytorch-hf
[34msearch_relevance[m[m


# Step1: Prepare DataSets
- Load into HF dataset from local directory

In [3]:
import os
import datasets
# Folder that holds the search-relevance CSV splits (machine-specific absolute path).
DATA_DIR = '/Users/thchang/Documents/dev/git/nlp/pytorch-hf/data/pair_classification/search_relevance/'

# Read both CSV files into one DatasetDict, keyed by split name ('train' / 'dev').
data = datasets.load_dataset(
    'csv',
    data_files=dict(
        train=os.path.join(DATA_DIR, 'search_relevance.us.train.csv'),
        dev=os.path.join(DATA_DIR, 'search_relevance.us.dev.csv'),
    ),
)
data

Using custom data configuration default-48b1a6a309eb5012
Reusing dataset csv (/Users/thchang/.cache/huggingface/datasets/csv/default-48b1a6a309eb5012/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'title', 'relevance'],
        num_rows: 864378
    })
    dev: Dataset({
        features: ['query', 'title', 'relevance'],
        num_rows: 45970
    })
})

In [13]:
# Peek at the first raw training row (query, title and relevance fields).
data['train'][0]

{'title': 'Apple iPhone 7 Plus - 128GB - Gold A1661 BLACKLISTED!!! for parts',
 'query': 'blacklisted iphone 7 plus',
 'relevance': 1.0}

# Step2: Preprocess data 
- Apply the tokenizer
- A [CLS] token is inserted at the beginning of the first sentence and a [SEP] token is inserted at the end of each sentence.

In [34]:
from transformers import BertTokenizer, BertModel
# Load the tokenizer from a local copy of bert-base-uncased (machine-specific absolute path).
tokenizer = BertTokenizer.from_pretrained('/Users/thchang/Documents/dev/git/nlp/pytorch-hf/huggingface/bert-base-uncased')

def preprocess(features):
    """Tokenize a batch of query/title pairs and attach integer labels.

    Args:
        features: batch mapping with parallel 'query', 'title' and
            'relevance' sequences (the layout passed by a batched map).

    Returns:
        The tokenizer output for the batch, extended with a 'label' entry
        holding one int per row.
    """
    # One text per row: the query, a literal ' [SEP] ' marker, then the title.
    # NOTE(review): a single-text call presumably leaves token_type_ids all zero,
    # unlike tokenizer(query, title) pair encoding -- confirm this is intended.
    paired_texts = []
    for query, title in zip(features['query'], features['title']):
        paired_texts.append(query + ' [SEP] ' + title)
    result = tokenizer(paired_texts)

    # int() drops any fractional part of the relevance score (1.0 -> 1).
    result['label'] = list(map(int, features['relevance']))

    return result

# Tokenize every split in batches. This step must run: the row inspection and the
# Trainer further down expect the tokenizer fields (input_ids, attention_mask, ...)
# on the dataset, so leaving it commented out breaks Restart & Run All.
data = data.map(preprocess, batched=True)


In [35]:
# Show the first training row again; after the map step it also carries the
# tokenizer fields such as attention_mask and input_ids.
pprint.pprint(data['train'][0])

{'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1],
 'input_ids': [101,
               2304,
               9863,
               2098,
               18059,
               1021,
               4606,
               102,
               6207,
               18059,
               1021,
               4606,
               1011,
               11899,
               18259,
               

# Step 3:  Evaluation Metrics

In [22]:
from transformers import EvalPrediction
import numpy as np
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, average_precision_score,
                            roc_auc_score, mean_squared_error)

# This function will be called to evaluate a prediction, which contains model output and a label
def compute_metrics(eval_predictions: EvalPrediction):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_predictions: carries `predictions` (logits of shape
            [n_samples, n_classes]) and `label_ids` (the true labels).

    Returns:
        dict with the keys "Accuracy", "BalancedAccuracy", "AveragePrecision",
        "ROCAUCScore" and "MSE". The "MSE" entry is the *root* mean squared
        error; the key name is kept so existing logs stay comparable.
    """
    # model returns logits of shape [n_samples, n_classes]
    logits = eval_predictions.predictions
    # logit with the largest value is our class prediction
    predictions = np.argmax(logits, axis=1)
    # true label
    trues = eval_predictions.label_ids
    # Column 1 is used as the ranking score for the positive class (label 1).
    positive_scores = logits[:, 1]

    # We use metrics provided by scikit-learn
    results = {
        "Accuracy": accuracy_score(y_true=trues, y_pred=predictions),
        "BalancedAccuracy": balanced_accuracy_score(y_true=trues, y_pred=predictions),
        "AveragePrecision": average_precision_score(y_true=trues, y_score=positive_scores, average='weighted', pos_label=1),
        "ROCAUCScore": roc_auc_score(y_true=trues, y_score=positive_scores, average='weighted'),
        # The square root is taken explicitly instead of passing `squared=False`,
        # a keyword that newer scikit-learn releases deprecate and then remove.
        "MSE": np.sqrt(mean_squared_error(y_true=trues, y_pred=predictions)),
    }
    return results


In [27]:
# Smoke-test compute_metrics on three hand-made rows. Every row has tied logits,
# so argmax picks class 0 each time and only the two label-0 rows count as correct.
prediction = EvalPrediction(
    label_ids=np.array([0, 0, 1]),
    predictions=np.array([[0, 0], [0.0, 0.0], [1.0, 1.0]]),
)
print(compute_metrics(prediction))


{'Accuracy': 0.6666666666666666, 'BalancedAccuracy': 0.5, 'AveragePrecision': 1.0, 'ROCAUCScore': 1.0, 'MSE': 0.5773502691896257}


# Step 4:  Prepare Model and Trainer

### Model

In [39]:
# This loads a pytorch eBERT model, which is initialized with pre-trained weights.
# This model also contains some additional weights, which are used for classification on top of pre-trained BERT output.
# Those weights are randomly initialized, so the model needs to be fine-tuned to produce any meaningful output.
# Read more https://huggingface.co/transformers/model_doc/auto.html#automodelforsequenceclassification
from transformers import AutoConfig, AutoModelForSequenceClassification
# Download configuration from huggingface.co and cache.
config = AutoConfig.from_pretrained('bert-base-uncased')
# from_config() would only build the architecture with randomly initialised weights.
# from_pretrained() is what actually loads the pre-trained encoder weights; the
# classification head on top still starts out random and has to be fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model
                                                                      

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Define Trainer

In [50]:
import os
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Root folder for every artifact of this run. An unset environment variable
# contributes an empty path component, so with neither variable set this is
# simply the relative path 'bert-finetuning'.
output_dir = os.path.join(
    os.environ.get('KRYLOV_DATA_DIR', ''),
    os.environ.get('KRYLOV_WS_PRINCIPAL', ''),
    'bert-finetuning',
)

# Learn more about various training arguments https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=os.path.join(output_dir, 'model_checkpoints'),
    logging_dir=os.path.join(output_dir, 'model_logs'),
    # A small fraction of one epoch: a quick demo run, not a full training
    # (68 optimization steps in the logged run).
    num_train_epochs=0.01,
    # Batch sizes per device for training and for evaluation.
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Optimizer schedule.
    # NOTE(review): 100 warmup steps is more than the 68 total steps of the
    # logged run, so the peak learning rate is presumably never reached -- confirm.
    learning_rate=5e-5,
    warmup_steps=100,
    # Logging interval in steps; larger than the 68 steps of the logged run,
    # which is why its training-loss table is empty.
    logging_steps=5000,
    # Mixed-precision training is switched off.
    fp16=False,
    # Write a checkpoint at the end of each epoch.
    save_strategy="epoch",
)

# Learn more about Trainer https://huggingface.co/transformers/main_classes/trainer.html#id1
# The data collator assembles individual examples into a mini-batch. Inputs differ
# in length, so it pads them (here to a multiple of 8) and sets the matching
# masking so the model can process the batch correctly.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8),
    train_dataset=data['train'],
    eval_dataset=data['dev'],
    compute_metrics=compute_metrics,
)

print('DONE')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


DONE


### Start Training
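- The training log reports that the `title`, `query` and `relevance` columns "have been ignored". They are not lost: the Trainer only hides columns that the model's `forward` does not accept (see the link below for how to recover them).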
- https://lewtun.github.io/blog/til/nlp/huggingface/transformers/2021/01/15/til-recovering-hidden-trainer-columns.html

In [51]:
# Run the training loop configured above; checkpoints are written under
# the model_checkpoints folder passed to TrainingArguments.
trainer.train()
print('DONE')

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: title, query, relevance.
***** Running training *****
  Num examples = 864378
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 68


Step,Training Loss


Saving model checkpoint to bert-finetuning/model_checkpoints/checkpoint-68
Configuration saved in bert-finetuning/model_checkpoints/checkpoint-68/config.json
Model weights saved in bert-finetuning/model_checkpoints/checkpoint-68/pytorch_model.bin
tokenizer config file saved in bert-finetuning/model_checkpoints/checkpoint-68/tokenizer_config.json
Special tokens file saved in bert-finetuning/model_checkpoints/checkpoint-68/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




DONE


### Evaluate

In [None]:
# Final evaluation on the dev split; the Trainer was built with compute_metrics,
# so the returned dict is expected to include those metrics.
results = trainer.evaluate(data['dev'])
print(results)

### Save

In [None]:
# Save the final (fine-tuned) model and tokenizer to disk, under
# <output_dir>/my_pretrained_model.
trainer.save_model(os.path.join(output_dir, 'my_pretrained_model'))


### Eval (single-example inference)

In [None]:
import torch

model.eval()  # inference mode; make sure to call model.train() again before training further

# The tokenizer inserts [CLS] and the trailing [SEP] on its own, so writing them
# by hand would duplicate them. Only the separator between query and title is
# spelled out, the same 'query [SEP] title' layout that preprocess() builds.
text = 'iphone 6 [SEP] iphone 12'
# Move the inputs to the model's device; the Trainer may have placed the model on
# an accelerator, while freshly tokenized tensors start on the CPU.
encoded_input = tokenizer(text, return_tensors='pt').to(model.device)

# Gradients are not needed for a forward-only prediction.
with torch.no_grad():
    output = model(**encoded_input)

print('DONE', output)
