# Prerequisites

- Host OS: Ubuntu 20.04 LTS
- Docker image: `mltooling/ml-workspace-gpu` (`docker pull mltooling/ml-workspace-gpu`)
- Single Nvidia GPU (RTX 3080)

# 0. GPU check

In [20]:
import torch

# Pick the compute device once; `device` stays available to later cells.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("no cuda device")
else:
    device = torch.device("cuda:0")
    device_count = torch.cuda.device_count()
    print("device_count: {}".format(device_count))
    # One capability line and one name line per visible CUDA device.
    for device_num in range(device_count):
        print("device {} capability {}".format(device_num, torch.cuda.get_device_capability(device_num)))
        print("device {} name {}".format(device_num, torch.cuda.get_device_name(device_num)))

device_count: 1
device 0 capability (8, 6)
device 0 name NVIDIA GeForce RTX 3080


# 1. Import packages

In [2]:
## Need to check if packages are compatible ##

# !pip install accelerate nvidia-ml-py3
# !pip install datasets==2.4.0
# !pip install huggingface_hub==0.9.1
# !pip install transformers==4.22.1 
# !pip install pyarrow==9.0.0

In [3]:
import transformers
import datasets
import huggingface_hub
import pyarrow

# Print the installed version of each package, one per line, in the same
# order as the reference versions listed below.
print(
    *(module.__version__ for module in (transformers, datasets, huggingface_hub, pyarrow)),
    sep="\n",
)

# Versions this notebook was run with:
# transformers     4.22.1
# datasets         2.4.0
# huggingface_hub  0.9.1
# pyarrow          9.0.0

4.22.1
2.4.0
0.9.1
9.0.0


In [4]:
import os
import re
import math
import random
import numpy as np
import pandas as pd

# TF32 can be used when running on Ampere hardware (original author's note);
# the flag below turns it on for CUDA matmuls.
import torch
torch.backends.cuda.matmul.allow_tf32 = True

from datasets import load_dataset, load_metric, ClassLabel
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

from functools import partial

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import (
    download_data,
    build_compute_metrics_fn,
)
from ray.tune.schedulers import PopulationBasedTraining, ASHAScheduler
from transformers import (
    glue_tasks_num_labels,
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    GlueDataset,
    GlueDataTrainingArguments,
    TrainingArguments,
    EarlyStoppingCallback
)

# 2. Import Data

* Example rows from `xxx_train.csv` / `xxx_test.csv`:


<table class="features-table">
  <tr>
    <th class="mdc-text-light-green-600" style="text-align:center">
    text
    </th>
    <th class="mdc-text-purple-600" style="text-align:center">
    label
    </th>
  </tr>
  <tr>
    <td class="mdc-bg-light-green-50" style="text-align:left">
      Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
    </td>
    <td class="mdc-bg-purple-50">
      0
    </td>
  </tr>
  <tr>
    <td class="mdc-bg-light-green-50" style="text-align:left">
      Ok lar... Joking wif u oni...
    </td>
    <td class="mdc-bg-purple-50">
      0
    </td>
  </tr>
  <tr>
    <td class="mdc-bg-light-green-50" style="text-align:left">
      Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)
    </td>
    <td class="mdc-bg-purple-50">
      1
    </td>
  </tr>
  <tr>
    <td class="mdc-bg-light-green-50" style="text-align:left">
      U dun say so early hor... U c already then say...
    </td>
    <td class="mdc-bg-purple-50">
      0
    </td>
  </tr>
  <tr>
    <td class="mdc-bg-light-green-50" style="text-align:left">
      Nah I don't think he goes to usf, he lives around here though
    </td>
    <td class="mdc-bg-purple-50">
      0
    </td>
  </tr>
</table>

In [5]:
# Prefix of the CSV pair to load, e.g. IMDB / naver_movie_review / spam.
data_name = "phi"

# The train and test splits are read from <data_name>_train.csv and <data_name>_test.csv.
dataset = load_dataset(
    'csv',
    data_files=dict(
        train=f'../data_splited/{data_name}_train.csv',
        test=f'../data_splited/{data_name}_test.csv',
    ),
)
dataset

Using custom data configuration default-30b40e9c9e0e6156
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-30b40e9c9e0e6156/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 400
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})

# 3. Data Preprocessing

In [6]:
## remove special characters

def remove_sp(example):
    """Strip every character that is not a Latin letter, digit, Hangul or space.

    Args:
        example: One dataset row (a mapping with a "text" entry).

    Returns:
        The same mapping, with "text" replaced by its cleaned string form.
        A missing value (None) becomes the empty string.
    """
    text = example["text"]
    # str(None) would inject the literal word "None" into the corpus.
    text = "" if text is None else str(text)
    # Inside a character class "|" is a literal pipe, not alternation, so it
    # must not be listed there or pipe characters would survive the cleaning.
    example["text"] = re.sub(r'[^a-zA-Z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]+', '', text)
    # Variant for PLMs trained on Korean only (also drops Latin letters):
    # example["text"] = re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]+', '', text)
    return example

# Run remove_sp over the loaded dataset (both the train and the test split).
dataset = dataset.map(remove_sp)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-30b40e9c9e0e6156/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-3a717fb8aa2e2e7a.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-30b40e9c9e0e6156/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-776c2cd46ee7289f.arrow


In [7]:
## label encoding

# Distinct labels across both splits. Sorted so that the label -> id mapping is
# the same on every run (the iteration order of a set of strings is not stable
# between Python processes); key=str keeps the sort from raising on None.
labels = sorted(set(dataset["train"]["label"] + dataset["test"]["label"]), key=str)
num_labels = len(labels)

def encoding_label(example):
    """Replace the string label of one row by its integer class id.

    Uses the module-level `label_encoder`, which is created just before this
    function is mapped over the dataset.
    """
    example["label"] = label_encoder.str2int(example["label"])
    return example

# Only string labels are encoded; anything else is left untouched.
if labels and isinstance(labels[0], str):
    # Built once here instead of once per row inside `encoding_label`.
    label_encoder = ClassLabel(num_classes=num_labels, names=labels)
    dataset = dataset.map(encoding_label)

num_labels

2

In [8]:
# Make imbalanced data to test model performance (label 0:label 1 = 8:2)
# https://discuss.huggingface.co/t/huggingface-datasets-convert-a-dataset-to-pandas-and-then-convert-it-back/14708/3

# df_train = pd.DataFrame(dataset['train'])
# df_train_0 = df_train[df_train["label"]==0]
# df_train_1 = df_train[df_train["label"]==1].sample(frac=1)[0:math.floor(len(df_train[df_train['label']==0])*0.2)]
# dataset["train"] = datasets.Dataset.from_pandas(pd.concat([df_train_0,df_train_1]), preserve_index=False)
# dataset

# 4. Load PLM & Tokenizing

In [9]:
# Resources handed to ray.init(), and the seed reused for shuffling and TrainingArguments.
num_cpus = 16
num_gpus = 1
seed = 1234

# Checkpoint name used for both the tokenizer and the classification model.
model_name = "xlm-roberta-base" # bert-base-multilingual-cased ; klue/roberta-base ; bert-base-cased ...

## Customize training strategy

# NOTE(review): task_data_dir is not referenced anywhere else in the visible notebook.
task_data_dir = "test-model"
# Per-trial resources and number of trials passed to trainer.hyperparameter_search().
gpus_per_trial = 1
cpus_per_trial = 16
n_trials = 10

In [10]:
# Initialize ray

# Shut down first so that re-running this cell starts from a fresh Ray instance.
ray.shutdown()
ray.init(
    num_cpus=num_cpus,
    num_gpus=num_gpus,
    include_dashboard=False,
    ignore_reinit_error=True,
    log_to_driver=False,
)

2022-10-21 08:18:34,870	INFO worker.py:1518 -- Started a local Ray instance.


0,1
Python version:,3.8.10
Ray version:,2.0.0


In [11]:
# Download cache tokenizer

# truncation_side='left' cuts over-long texts from the left, so the last tokens are kept.
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation_side='left')

def tokenize_function(examples):
    """Tokenize a batch of rows, truncating and padding each text to the maximum length."""
    # Available padding strategies: 'longest', 'max_length', 'do_not_pad'.
    return tokenizer(examples["text"], truncation=True, padding="max_length")

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-30b40e9c9e0e6156/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-2a6d558e3cdf2f0f.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-30b40e9c9e0e6156/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-e5a8a54d2ba50d34.arrow


In [12]:
# Shuffle the training split once with the fixed seed, then cut it 70/30 into train/eval rows.
train_rows_total = len(tokenized_datasets["train"])
split_at = math.floor(train_rows_total * 0.7)
shuffled_train = tokenized_datasets["train"].shuffle(seed=seed)

train_dataset = shuffled_train.select(range(0, split_at))
eval_dataset = shuffled_train.select(range(split_at, train_rows_total))
# The test split is used exactly as tokenized.
test_dataset = tokenized_datasets["test"]

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/csv/default-30b40e9c9e0e6156/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-94343dfd2a188952.arrow


In [13]:
## sampling 1000 rows for test

# train_dataset = tokenized_datasets["train"].shuffle(seed=seed).select(range(1000))
# eval_dataset = tokenized_datasets["train"].shuffle(seed=seed).select(range(1000))
# test_dataset = tokenized_datasets["test"]

# 5. Check class weights

In [14]:
def class_weight(train_dataset):
    """Return 'balanced' per-class loss weights for the given training set.

    Args:
        train_dataset: Dataset (or mapping) whose "label" column holds the class labels.

    Returns:
        A float torch tensor with one weight per distinct label found in
        `train_dataset`, ordered like `np.unique` of those labels.
    """
    y = np.array(train_dataset["label"])
    balanced = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
    return torch.tensor(balanced, dtype=torch.float)

In [15]:
# Module-level `weights` is read later by CustomTrainer.compute_loss.
weights = class_weight(train_dataset)
print(weights)

tensor([0.6335, 2.3729])


# 6. Modeling

In [16]:
# Download model and features

def model_init():
    """Build a fresh sequence-classification model from the pretrained checkpoint.

    The classification head is sized from `num_labels` (the number of distinct
    labels found in the data) instead of a hard-coded 2, so the model always
    agrees with the label set computed earlier in the notebook.

    Returns:
        The model returned by `AutoModelForSequenceClassification.from_pretrained`
        for `model_name`.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )

In [17]:
def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: Pair of (logits, labels).

    Returns:
        The result of `metric.compute` for the arg-max predictions.
    """
    scores, references = eval_preds
    metric = load_metric("glue", "mrpc")  # Accuracy/F1
    # Alternative metric: load_metric("accuracy")
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [18]:
# Baseline training arguments for the trainer created below.
# Several of the values tagged `# config` (learning rate, epochs, train batch
# size, warmup ratio, weight decay) are also mutated by the PBT scheduler in the
# tuning cell, and the values found by the search are later written back onto
# trainer.args.
training_args = TrainingArguments(
    output_dir=".",
    learning_rate=2e-5, # config
    do_train=True,
    do_eval=True,
    no_cuda=gpus_per_trial <= 0,  # True only when no GPU is assigned per trial
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps = 50,
    # NOTE(review): the PBT scheduler below optimizes "eval_f1" while the best
    # checkpoint is selected on "accuracy" — confirm this is intended.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    load_best_model_at_end=True,
    num_train_epochs=2,  # config
    max_steps=-1,
    per_device_train_batch_size=8,  # config
    per_device_eval_batch_size=8,
    # NOTE(review): both warmup_steps and warmup_ratio are given; presumably
    # warmup_ratio is the one that takes effect since warmup_steps is 0 — confirm.
    warmup_steps=0,
    warmup_ratio=0.1,  # config
    weight_decay=0.1,  # config
    logging_dir="./logs",
    skip_memory_metrics=True,
    report_to="none",
    fp16=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    seed=seed  # config
    )
    
# trainer = Trainer(
#     model_init=model_init,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     compute_metrics=compute_metrics,
#     )

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level class `weights`."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping; its "labels" entry is used as the target.
            return_outputs: When true, return `(loss, outputs)` instead of the loss alone.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple when `return_outputs` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Move the class weights to the device the logits live on rather than to
        # the notebook-global `device`, so the loss does not depend on a variable
        # from another cell and works wherever the model is placed.
        weight = weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    
# Trainer with the class-weighted loss; early stopping uses a patience of 3.
trainer = CustomTrainer(
    args=training_args,
    model_init=model_init,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/f6d161e8f5f6f2ed433fb4023d6cb34146506b3f/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/f6d

In [None]:
# Hyperparameter tuning with ray tune

# Fixed (non-searched) values; the `hp_space` lambda passed to
# trainer.hyperparameter_search() below returns this dict unchanged.
tune_config = {
    "per_device_eval_batch_size": 8,
    "max_steps": -1
}

# PopulationBasedTraining
# A worker might copy the model parameters from a better-performing worker, or
# explore new hyperparameters by randomly changing its current values.
# cf. ASHAScheduler

# NOTE(review): the adam_beta1 / adam_beta2 ranges below reach down to 1e-2 and
# 1e-3 — confirm these ranges are intended.
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_f1",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "num_train_epochs": tune.randint(1, 20),
        "per_device_train_batch_size": tune.choice([4, 8]),
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "warmup_ratio": tune.uniform(0.0, 0.3),
        "adam_beta1": tune.loguniform(1e-2, 1),
        "adam_beta2": tune.loguniform(1e-3, 1),
        "adam_epsilon": tune.loguniform(1e-8, 1e-5),
    }, # TODO: how should correct_bias = True be set..?
)

# Console progress table: hyperparameter names mapped to short column headers,
# plus the metric columns to show for each trial.
reporter = CLIReporter(
    parameter_columns={
        "weight_decay": "w_decay",
        "learning_rate": "lr",
        "per_device_train_batch_size": "train_bs/gpu",
        "num_train_epochs": "num_epochs",
    },
    metric_columns=["eval_f1", "eval_accuracy", "eval_loss", "epoch", "training_iteration"]
)

# NOTE(review): `model` is not referenced again in the visible notebook (the
# trainer builds its models through model_init) — confirm it is still needed.
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels = 2,
                                                           output_attentions = False,
                                                           output_hidden_states = False)


# Run the search on the Ray backend with the PBT scheduler and reporter defined
# above; each trial requests cpus_per_trial CPUs and gpus_per_trial GPUs.
result = trainer.hyperparameter_search(
    hp_space = lambda _: tune_config,
    direction = "maximize",
    backend="ray",
    reuse_actors = True,
    n_trials=n_trials,
    resources_per_trial={"cpu": cpus_per_trial, "gpu": gpus_per_trial},
    scheduler=scheduler,
    keep_checkpoints_num=1,
    checkpoint_score_attr="training_iteration",
    stop=None,
    progress_reporter=reporter,
    local_dir="./test-results",
    name="tune_transformer_pbt",
    log_to_file=True,
)

# local_dir = os.path.join("./", model_name)

# tune.run(trainer,
#         local_dir = local_dir,
#         resources_per_trial={"cpu" : 16, "gpu" : 1},
#         config = config,
#         num_samples = 1,
#         scheduler = scheduler,
#         progress_reporter = reporter)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/f6d161e8f5f6f2ed433fb4023d6cb34146506b3f/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/f6d

== Status ==
Current time: 2022-10-21 08:20:33 (running for 00:00:00.16)
Memory usage on this node: 16.1/31.1 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/17.26 GiB heap, 0.0/8.63 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /workspace/syc/BERT_classification_binary/test-results/tune_transformer_pbt
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------+----------+--------------------+-----------+-------------+----------------+--------------+
| Trial name             | status   | loc                |   w_decay |          lr |   train_bs/gpu |   num_epochs |
|------------------------+----------+--------------------+-----------+-------------+----------------+--------------|
| _objective_3b454_00000 | RUNNING  | 172.17.0.3:4175666 | 0.0622085 | 1.04653e-05 |              4 |           18 |
| _objective_3b454_00001 | PENDING  |                    | 0.131068  | 3.4486e-05  |              8 |           

== Status ==
Current time: 2022-10-21 08:20:55 (running for 00:00:22.58)
Memory usage on this node: 16.0/31.1 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/17.26 GiB heap, 0.0/8.63 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /workspace/syc/BERT_classification_binary/test-results/tune_transformer_pbt
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------+----------+--------------------+-----------+-------------+----------------+--------------+
| Trial name             | status   | loc                |   w_decay |          lr |   train_bs/gpu |   num_epochs |
|------------------------+----------+--------------------+-----------+-------------+----------------+--------------|
| _objective_3b454_00000 | RUNNING  | 172.17.0.3:4175666 | 0.0622085 | 1.04653e-05 |              4 |           18 |
| _objective_3b454_00001 | PENDING  |                    | 0.131068  | 3.4486e-05  |              8 |           

== Status ==
Current time: 2022-10-21 08:21:19 (running for 00:00:46.46)
Memory usage on this node: 15.9/31.1 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/17.26 GiB heap, 0.0/8.63 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /workspace/syc/BERT_classification_binary/test-results/tune_transformer_pbt
Number of trials: 10/10 (1 PAUSED, 8 PENDING, 1 RUNNING)
+------------------------+----------+--------------------+-----------+-------------+----------------+--------------+-----------+-----------------+-------------+---------+----------------------+
| Trial name             | status   | loc                |   w_decay |          lr |   train_bs/gpu |   num_epochs |   eval_f1 |   eval_accuracy |   eval_loss |   epoch |   training_iteration |
|------------------------+----------+--------------------+-----------+-------------+----------------+--------------+-----------+-----------------+-------------+---------+------------

In [None]:
# Display what trainer.hyperparameter_search() returned.
result

In [None]:
# Copy every hyperparameter of the search result onto the trainer's arguments,
# so the following trainer.train() call uses them.
for n, v in result.hyperparameters.items():
    setattr(trainer.args, n, v)

In [None]:
# Display the trainer arguments to check the values set from the search result.
trainer.args

In [None]:
# Train with the arguments currently set on trainer.args.
trainer.train()

In [None]:
# Evaluate the trained model (presumably on the eval_dataset given to the trainer — confirm).
trainer.evaluate()

In [None]:
# Predict on the test split; `pred.predictions` and `pred.label_ids` are used in the next cell.
pred = trainer.predict(test_dataset=test_dataset)
pred

In [None]:
# Ground-truth label ids of the test split.
label_test = list(pred.label_ids)
# Predicted class id per row: arg-max over the last axis of the predictions,
# the same rule compute_metrics applies, instead of a hand-rolled
# index-of-max over Python lists.
pred_test = np.argmax(pred.predictions, axis=-1).tolist()

In [None]:
# Confusion matrix for the test split (true labels passed first, predictions second).
print(confusion_matrix(label_test, pred_test))

In [None]:
# Test-set scores, computed and printed in the order: accuracy, F1, recall, precision.
accuracy, f1, recall, precision = (
    score_fn(label_test, pred_test)
    for score_fn in (accuracy_score, f1_score, recall_score, precision_score)
)

print(accuracy, f1, recall, precision, sep="\n")

In [None]:
# model_path = "test-model"
# trainer.model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)

# Reference

https://bo-10000.tistory.com/154  
https://huggingface.co/blog/ray-tune  
https://docs.ray.io/en/latest/tune/examples/pbt_transformers.html  
https://wood-b.github.io/post/a-novices-guide-to-hyperparameter-optimization-at-scale/#schedulers-vs-search-algorithms  
https://docs.ray.io/en/latest/tune/api_docs/search_space.html  
https://docs.ray.io/en/latest/tune/tutorials/tune-advanced-tutorial.html  
https://docs.ray.io/en/latest/tune/api_docs/schedulers.html  
https://blog.ml.cmu.edu/2018/12/12/massively-parallel-hyperparameter-optimization/  
https://docs.ray.io/en/latest/tune/faq.html  
https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#population-based-training-tune-schedulers-populationbasedtraining  
https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.hyperparameter_search  
https://docs.ray.io/en/latest/tune/api_docs/suggestion.html#optuna-tune-search-optuna-optunasearch  
https://kyunghyunlim.github.io/nlp/ml_ai/2021/09/22/hugging_face_5.html  

# Future Challenges
2. 훈련 셋이 늘어나면서 성능이 어떻게 좋아지는지, hp조합에 따라 어떻게 좋아지는지 시각화