# Custom PJT in KOR

- NSMC / HuggingFace dataset prep
- klue/bert-base model & tokenizer
- preprocess ds
- train model
- fine-tune training_arg
- apply Bucketing

In [2]:
import tensorflow
import numpy
import transformers
import datasets

print(tensorflow.__version__)
print(numpy.__version__)
print(transformers.__version__)
print(datasets.__version__)

2.6.0
1.21.4
4.11.3
1.14.0


In [3]:
# load data from HF

from datasets import load_dataset

hf_nsmc_dataset = load_dataset('nsmc')
hf_nsmc_dataset

Using custom data configuration default
Reusing dataset nsmc (/aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [4]:
train = hf_nsmc_dataset['train']
train

Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 150000
})

In [5]:
# check dataset

cols = train.column_names

for i in range(5):
    for col in cols:
        print(col, ':', train[col][i])
    print('\n')
    

id : 9976970
document : 아 더빙.. 진짜 짜증나네요 목소리
label : 0


id : 3819312
document : 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
label : 1


id : 10265843
document : 너무재밓었다그래서보는것을추천한다
label : 0


id : 9045019
document : 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
label : 0


id : 6483659
document : 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
label : 1




## Model & Tokenizer

In [6]:
# get model & tokenizer
# HF AutoTokenizer, Automodel

from transformers import AutoTokenizer, AutoModelForSequenceClassification

hf_tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
hf_model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base')



Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [7]:
def transform_nsmc(data):
    return hf_tokenizer(
        text=data['document'],  # Use 'document' instead of 'sentence1' and 'sentence2'
        truncation=True,
        padding='max_length',
        max_length=512,  # Assuming you want to pad/trim to a maximum length of 512 tokens
        return_attention_mask=False,  # Assuming you want the attention mask for BERT model
    )


In [20]:
# tokenize

nsmc_ds = hf_nsmc_dataset.map(transform_nsmc, batched=True)

Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-b065440ce617ef2b.arrow
Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-2d4965d41d50f69b.arrow


In [24]:
nsmc_ds['train'][0]

{'label': 0,
 'document': '아 더빙.. 진짜 짜증나네요 목소리',
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [None]:
# Assuming nsmc_ds['train'] and nsmc_ds['test'] are your original datasets

# Calculate the number of samples for each dataset based on the desired percentages
num_train_samples = len(nsmc_ds['train'])
num_test_samples = len(nsmc_ds['test'])

# Sizes based on specified percentages
new_train_size = int(num_train_samples * 0.01)
new_val_size = int(num_train_samples * 0.003)  # 0.3% of the original training dataset for validation
new_test_size = int(num_test_samples * 0.01)

# Select the samples based on calculated sizes
nsmc_train = nsmc_ds['train'].select(range(new_train_size))
nsmc_val = nsmc_ds['train'].select(range(new_train_size, new_train_size + new_val_size))
nsmc_test = nsmc_ds['test'].select(range(new_test_size))

# Now nsmc_train, nsmc_val, and nsmc_test are your new datasets


In [17]:
# train model

import os
import numpy as np
from transformers import Trainer, TrainingArguments

output_dir = os.getenv('HOME') + '/aiffel/NLP/transformers'

training_arg = TrainingArguments(
    output_dir,
    evaluation_strategy = 'epoch',
    learning_rate= 2e-5,
    per_device_train_batch_size= 8,
    per_device_eval_batch_size= 8,
    num_train_epochs= 3,
    weight_decay= 0.01,
    fp16=True,
    gradient_accumulation_steps=4,  # Accumulate gradients over 2 steps
    group_by_length=True
    
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [18]:
# Load a relevant metric for binary classification, for example, accuracy

from datasets import load_metric

metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Convert logits to class indices
    predictions = np.argmax(predictions, axis=1)
    # Compute and return the metric (e.g., accuracy)
    return metric.compute(predictions=predictions, references=labels)


In [27]:
# train!

trainer = Trainer(
    model = hf_model,
    args = training_arg,
    train_dataset=nsmc_train,
    eval_dataset=nsmc_val,
    compute_metrics=compute_metrics
)

trainer.train()

Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running training *****
  Num examples = 1500
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 141


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.405431,0.802222
2,No log,0.376697,0.815556
3,No log,0.388374,0.8


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 450
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 450
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 450
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=141, training_loss=0.3958468606286015, metrics={'train_runtime': 237.8498, 'train_samples_per_second': 18.92, 'train_steps_per_second': 0.593, 'total_flos': 1183999749120000.0, 'train_loss': 0.3958468606286015, 'epoch': 3.0})

In [33]:
import torch

torch.cuda.empty_cache()

In [34]:
trainer.evaluate(nsmc_test)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8


{'eval_loss': 0.42627090215682983,
 'eval_accuracy': 0.8154,
 'eval_runtime': 178.4332,
 'eval_samples_per_second': 28.022,
 'eval_steps_per_second': 3.503,
 'epoch': 3.0}

## Accuracy 90% 를 위한 Fine Tuning
- preprocessing korean text
- bucketing
- increase train data


In [45]:
# preprocessing KOR text

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', } 


def clean(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text.strip()


import re


def clean_str(text):
    pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)' # E-mail제거
    text = re.sub(pattern=pattern, repl='', string=text)
    pattern = '(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+' # URL제거
    text = re.sub(pattern=pattern, repl='', string=text)
    pattern = '([ㄱ-ㅎㅏ-ㅣ]+)'  # 한글 자음, 모음 제거
    text = re.sub(pattern=pattern, repl='', string=text)
    pattern = '<[^>]*>'         # HTML 태그 제거
    text = re.sub(pattern=pattern, repl='', string=text)
    pattern = '[^\w\s\n]'         # 특수기호제거
    text = re.sub(pattern=pattern, repl='', string=text)
    text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]','', string=text)
    text = re.sub('\n', '.', string=text)
    return text 



In [47]:
# 파라미터 하나만 받게 해서 두 펑션 하나로 합치기

def preprocess_function(examples):
    # Apply the cleaning and string cleaning functions
    cleaned_examples = [clean(text, punct, punct_mapping) for text in examples["document"]]
    cleaned_str_examples = [clean_str(text) for text in cleaned_examples]
    return {"document": cleaned_str_examples}

# Applying the preprocessing function in a batched manner
nsmc_ds = hf_nsmc_dataset.map(preprocess_function, batched=True, load_from_cache_file=False)



  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [48]:
# tokenize

nsmc_ds = nsmc_ds.map(transform_nsmc, batched=True)

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [49]:
# 학습 데이터 크기 키우기

# Assuming nsmc_ds['train'] and nsmc_ds['test'] are your original datasets

# Calculate the number of samples for each dataset based on the desired percentages
num_train_samples = len(nsmc_ds['train'])
num_test_samples = len(nsmc_ds['test'])

# Sizes based on specified percentages
new_train_size = int(num_train_samples * 0.03)
new_val_size = int(num_train_samples * 0.003)  # 0.3% of the original training dataset for validation
new_test_size = int(num_test_samples * 0.01)

# Select the samples based on calculated sizes
nsmc_train = nsmc_ds['train'].select(range(new_train_size))
nsmc_val = nsmc_ds['train'].select(range(new_train_size, new_train_size + new_val_size))
nsmc_test = nsmc_ds['test'].select(range(new_test_size))

# Now nsmc_train, nsmc_val, and nsmc_test are your new datasets


In [50]:
# bucketing

from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from datasets import load_dataset

# Initialize the data collator
data_collator = DataCollatorWithPadding(tokenizer=hf_tokenizer, pad_to_multiple_of=8)  
    # Example padding to a multiple of 8

# Create a DataLoader for the training set with dynamic padding

train_dataloader = DataLoader(
    nsmc_ds["train"],
    shuffle=True,
    batch_size=8,  # Adjust the batch size as needed
    collate_fn=data_collator,
)


In [51]:
nsmc_train.column_names

['document', 'id', 'input_ids', 'label', 'token_type_ids']

In [52]:
# Initialize the Trainer
trainer = Trainer(
    model=hf_model,
    args=training_arg,
    train_dataset=nsmc_train,
    eval_dataset=nsmc_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)



Using amp fp16 backend


In [53]:
training_arg = TrainingArguments(
    output_dir,
    evaluation_strategy = 'epoch',
    learning_rate= 2e-5,
    per_device_train_batch_size= 8,
    per_device_eval_batch_size= 8,
    num_train_epochs= 3,
    weight_decay= 0.01,
    fp16=True,
    gradient_accumulation_steps=4,  # Accumulate gradients over 2 steps
    group_by_length=True
    
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [54]:
# Train the model
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running training *****
  Num examples = 4500
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 420


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.444935,0.822222
1,No log,0.419358,0.828889
2,No log,0.42551,0.826667


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 450
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 450
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 450
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=420, training_loss=0.3611492701939174, metrics={'train_runtime': 617.0875, 'train_samples_per_second': 21.877, 'train_steps_per_second': 0.681, 'total_flos': 3546737026252800.0, 'train_loss': 0.3611492701939174, 'epoch': 2.99})

In [55]:
trainer.evaluate(nsmc_test)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'eval_loss': 0.43322306871414185,
 'eval_accuracy': 0.832,
 'eval_runtime': 17.6607,
 'eval_samples_per_second': 28.311,
 'eval_steps_per_second': 3.567,
 'epoch': 2.99}

# 회고
- 노드에 나와있던 세팅대로 트레이닝 시켰더니 10시간이 걸린다고 했다..
    - 전체 데이터의 1%만 사용하는 것으로 줄였다. 5분 컷으로 정확도 80%
- bucketing : 길이 비슷한 아이들끼리 묶어서 배치화 하고, 버킷별로 패딩 사이즈 통일해서 학습하는 것
- batch 크기 키워서 학습 시간을 짧게 해보려고 했으나, 쿠다 메모리가 부족하다고 해서 8로 그대로 뒀다
- bucketing 을 했더니 데이터셋 3% 쓰는데도 10분이면 학습이 된다고 한다!
- 한국어 전처리 코드를 복붙해서 사용했다
- 데이터를 더 준다고 정확도가 놀랍도록 올라가지는 않았다. 맞춤법 검사기도 돌려볼까 했으나 시간 관계상 다음 기회에..!