# Setup

In [None]:
!pip install transformers==4.17 datasets accelerate evaluate

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, accelerate, evaluate
Successfully installed accelerate-0.26.1 evaluate-0.4.1 responses-0.18.0


In [None]:
from datasets import load_dataset

# Load the 'nli' configuration of the 'klue' dataset.
dataset = load_dataset(path='klue', name='nli')
dataset

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label'],
    num_rows: 3000
})

# Fine-tune a pretrained model

## Prepare a dataset

In [None]:
# Label encoding: 0 = entailment, 1 = neutral, 2 = contradiction.
# Show the first training example to see the record layout.
dataset['train'][0]

{'guid': 'klue-nli-v1_train_00000',
 'source': 'NSMC',
 'premise': '힛걸 진심 최고다 그 어떤 히어로보다 멋지다',
 'hypothesis': '힛걸 진심 최고로 멋지다.',
 'label': 0}

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the same checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")


def tokenize_function(examples):
    """Tokenize a batch of premise/hypothesis pairs.

    Each premise is encoded together with its hypothesis as a sentence pair,
    padded to the maximum length and truncated where necessary.
    """
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    return tokenizer(premises, hypotheses, padding="max_length", truncation=True)


# batched=True hands the function many examples per call instead of one.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized train split (column names and row count).
tokenized_datasets['train']

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 24998
})

In [None]:
# Take a fixed-seed random sample of 500 rows from each split to keep the
# demo fast; the train split is processed first, then validation.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42).select(range(500))
    for split_name in ("train", "validation")
)

## Train

## Train with PyTorch Trainer

In [None]:
from transformers import AutoModelForSequenceClassification

# num_labels=3: one output per NLI class (entailment, neutral, contradiction).
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=3)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

### Training hyperparameters

### Evaluate

In [None]:
# Example TrainingArguments configuration (for reference only; not executed)
# training_args = TrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=1,              # total number of training epochs
#     per_device_train_batch_size=1,   # batch size per device during training
#     per_device_eval_batch_size=10,   # batch size for evaluation
#     warmup_steps=1000,               # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
#     logging_steps=200,               # How often to print logs
#     do_train=True,                   # Perform training
#     do_eval=True,                    # Perform evaluation
#     evaluation_strategy="epoch",     # evaluate after each epoch
#     gradient_accumulation_steps=64,  # total number of steps before back propagation
#     fp16=True,                       # Use mixed precision
#     fp16_opt_level="O2",             # mixed precision mode (Apex AMP level; letter "O", not zero)
#     run_name="ProBert-BFD-MS",       # experiment name
#     seed=3                           # Seed for experiment reproducibility 3x3
# )

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="test_trainer",    # directory passed as the Trainer's output location
    evaluation_strategy="epoch",  # evaluate once per epoch
    num_train_epochs=5,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### Trainer

Create a [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) object with your model, training arguments, training and test datasets, and evaluation function:

In [None]:
import numpy as np
import evaluate

# Accuracy metric object used by compute_metrics below.
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    example is the index of its largest logit along the last axis.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

In [None]:
# Bundle the model, its hyperparameters, both 500-row subsets and the
# accuracy callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

Then fine-tune your model by calling [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train):

In [None]:
# Confirm the columns and size of the subset handed to the Trainer.
small_train_dataset

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 500
})

In [None]:
# Fine-tune on small_train_dataset using the settings in training_args
# (5 epochs, evaluated once per epoch).
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: guid, hypothesis, source, premise. If guid, hypothesis, source, premise are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 500
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 315


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.820071,0.506
2,No log,2.909926,0.536
3,No log,2.681127,0.582
4,No log,3.401096,0.536
5,No log,3.413868,0.538


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: guid, hypothesis, source, premise. If guid, hypothesis, source, premise are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: guid, hypothesis, source, premise. If guid, hypothesis, source, premise are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: guid, hypothesis, source, premise. If guid, hypothesis, source, premise are not e

TrainOutput(global_step=315, training_loss=0.06772032843695747, metrics={'train_runtime': 321.3245, 'train_samples_per_second': 7.78, 'train_steps_per_second': 0.98, 'total_flos': 657783544320000.0, 'train_loss': 0.06772032843695747, 'epoch': 5.0})

In [None]:
# Display the training subset again before looking at an individual example.
small_train_dataset

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 500
})

In [None]:
# Print the premise, hypothesis and label of the training example at index 1,
# one value per line.
print(
    *(small_train_dataset[column][1] for column in ("premise", "hypothesis", "label")),
    sep="\n",
)

오랜만에 가슴 벅찬 감동을 느꼈습니다
가슴 벅찬 감동은 오랜만입니다.
0


In [None]:
# Evaluate the fine-tuned model on the Trainer's eval_dataset
# (small_eval_dataset); metrics come from compute_metrics.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: guid, hypothesis, source, premise. If guid, hypothesis, source, premise are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'eval_loss': 3.4138681888580322,
 'eval_accuracy': 0.538,
 'eval_runtime': 17.2114,
 'eval_samples_per_second': 29.051,
 'eval_steps_per_second': 3.66,
 'epoch': 5.0}

In [None]:
from datasets import Dataset

# Maps a predicted class index to its NLI label name.
label_dictionary = {0 : 'entailment', 1: 'neutral', 2: 'contradiction'}


def text_classification(premise, hypothesis):
    """Predict the NLI relation between a premise and a hypothesis.

    Args:
        premise: Premise sentence.
        hypothesis: Hypothesis sentence to judge against the premise.

    Returns:
        The label name from `label_dictionary` for the highest-scoring class:
        'entailment', 'neutral' or 'contradiction'.
    """
    # Encode the pair with the same tokenizer settings used for the training data.
    tokenized_text = tokenizer(premise, hypothesis, padding="max_length", truncation=True)

    # A one-element list serves as the dataset passed to trainer.predict.
    output = trainer.predict([tokenized_text])
    # Index of the largest logit along the last axis, as in compute_metrics.
    pred = np.argmax(output.predictions, axis=-1)
    # Cast the NumPy integer to a plain int before the dictionary lookup.
    return label_dictionary[int(pred[0])]

# Try the classifier on a hand-written sentence pair.
# Premise, roughly: "I'm going to have kimchi stew for dinner tonight."
premise = "오늘 저녁은 김치찌개 먹어야지"
# Hypothesis, roughly: "Ah, I want to eat kimchi stew."
hypothesis = "아 김치찌개 먹고 싶다"
text_classification(premise, hypothesis)

***** Running Prediction *****
  Num examples = 1
  Batch size = 8


'entailment'