# STEP 0. 필요한 라이브러리 import

In [1]:
import datasets
from datasets import load_dataset
from datasets import load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TFBertForSequenceClassification
import os
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from datasets import Dataset, DatasetDict
import tensorflow as tf

# STEP 1. NSMC 데이터 분석 및 Huggingface dataset 구성

데이터셋은 깃허브에서 다운

In [3]:
path = os.getenv('HOME')+'/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/'

train = pd.read_table(path + 'ratings_train.txt')
test = pd.read_table(path + 'ratings_test.txt')

# STEP 2. klue/bert-base model 및 tokenizer 불러오기

In [4]:
huggingface_model_path = "klue/bert-base"
huggingface_tokenizer = AutoTokenizer.from_pretrained(huggingface_model_path)
huggingface_model = AutoModelForSequenceClassification.from_pretrained(huggingface_model_path, num_labels = 2)
# 가중치 고정
huggingface_model.bert.trainable = False

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

# STEP 3. 위에서 불러온 tokenizer으로 데이터셋을 전처리하고, model 학습 진행해 보기

결측치 확인

In [5]:
train.isnull().sum()

id          0
document    5
label       0
dtype: int64

In [6]:
test.isnull().sum()

id          0
document    3
label       0
dtype: int64

결측치 제거

In [7]:
train.dropna(inplace=True)
test.dropna(inplace=True)

In [8]:
print(train.duplicated().sum())
print(test.duplicated().sum())

0
0


max_len을 정하기 위한 길이 분포 확인

In [9]:
# 길이 분포 출력
train_len = [len(huggingface_tokenizer.tokenize(s)) for s in tqdm(train['document'])]
test_len = [len(huggingface_tokenizer.tokenize(s)) for s in tqdm(test['document'])]

print('train의 최소 길이 : {}'.format(np.min(train_len)))
print('train의 최대 길이 : {}'.format(np.max(train_len)))
print('train의 평균 길이 : {}'.format(np.mean(train_len)))
print('test의 최소 길이 : {}'.format(np.min(test_len)))
print('test의 최대 길이 : {}'.format(np.max(test_len)))
print('test의 평균 길이 : {}'.format(np.mean(test_len)))

  0%|          | 0/149995 [00:00<?, ?it/s]

  0%|          | 0/49997 [00:00<?, ?it/s]

train의 최소 길이 : 1
train의 최대 길이 : 140
train의 평균 길이 : 20.276189206306878
test의 최소 길이 : 1
test의 최대 길이 : 120
test의 평균 길이 : 20.360981658899533


In [10]:
# max_len 길이를 선택했을 때, 얼마나 많은 샘플들을 자르지 않고
# 포함할 수 있는지 통계로 확인하는 함수
def below_threshold_len(max_len, nested_list):
  cnt = 0
  for s in tqdm(nested_list):
    if(len(s.split()) <= max_len):
        cnt = cnt + 1
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, (cnt / len(nested_list))))
            
below_threshold_len(20, train['document'])
below_threshold_len(20, test['document'])

  0%|          | 0/149995 [00:00<?, ?it/s]

전체 샘플 중 길이가 20 이하인 샘플의 비율: 0.9365645521517384


  0%|          | 0/49997 [00:00<?, ?it/s]

전체 샘플 중 길이가 20 이하인 샘플의 비율: 0.935416124967498


max_len이하 데이터만 선별

In [11]:
max_len = 20
train = train[train['document'].apply(lambda x: len(huggingface_tokenizer.tokenize(x)) <= max_len)]
test = test[test['document'].apply(lambda x: len(huggingface_tokenizer.tokenize(x)) <= max_len)]

In [12]:
print(len(train))
print(len(test))

98243
32675


데이터를 DatasetDict으로 변환

In [13]:
# Hugging Face의 Dataset으로 변환
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

# DatasetDict로 결합
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

dataset = dataset.remove_columns(["__index_level_0__"])

# 결과 확인
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 98243
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 32675
    })
})


토큰화

In [14]:
def transform(data):
    return huggingface_tokenizer(
        data['document'],
        truncation = True,
        padding = 'max_length',
        return_token_type_ids = False,
        )

dataset = dataset.map(transform, batched=True)

  0%|          | 0/99 [00:00<?, ?ba/s]

  0%|          | 0/33 [00:00<?, ?ba/s]

데이터셋 분리

In [15]:
train_dataset = dataset['train']
# dataset['test'] 내부를 다시 train과 test셋으로 나눔
val_test_split = dataset['test'].train_test_split(test_size=0.2)

val_dataset = val_test_split['train']
test_dataset = val_test_split['test'] 

train argument 정의

In [16]:
output_dir = os.getenv('HOME')+'/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08'

training_arguments = TrainingArguments(
    output_dir,                                         # output이 저장될 경로
    evaluation_strategy="epoch",           #evaluation하는 빈도
    learning_rate = 2e-5,                         #learning_rate
    per_device_train_batch_size = 14,   # 각 device 당 batch size
    per_device_eval_batch_size = 14,    # evaluation 시에 batch size
    num_train_epochs = 3,                     # train 시킬 총 epochs
    weight_decay = 0.01,                        # weight decay
)

compute_metrics 정의

In [17]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):    
    predictions,labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references = labels)

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Trainer 정의후 훈련

In [18]:
trainer = Trainer(
    model=huggingface_model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=train_dataset,    # training dataset
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running training *****
  Num examples = 98243
  Num Epochs = 3
  Instantaneous batch size per device = 14
  Total train batch size (w. parallel, distributed & accumulation) = 14
  Gradient Accumulation steps = 1
  Total optimization steps = 21054


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2769,0.280597,0.887911
2,0.2108,0.310845,0.891584
3,0.1488,0.411153,0.893152


Saving model checkpoint to /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-500
Configuration saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-500/config.json
Model weights saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-1000
Configuration saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-1000/config.json
Model weights saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-1500
Configuration saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-1500/config.json
Model weights saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/aiffel/AIFFEL

Configuration saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-13000/config.json
Model weights saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-13000/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-13500
Configuration saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-13500/config.json
Model weights saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-13500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-14000
Configuration saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-14000/config.json
Model weights saved in /aiffel/aiffel/aiffel/AIFFEL_quest_rs/GoingDeeper/Gd08/checkpoint-14000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been igno

TrainOutput(global_step=21054, training_loss=0.22621684107468362, metrics={'train_runtime': 29303.7154, 'train_samples_per_second': 10.058, 'train_steps_per_second': 0.718, 'total_flos': 7.754645823519744e+16, 'train_loss': 0.22621684107468362, 'epoch': 3.0})

모델 평가

In [19]:
trainer.evaluate(test_dataset)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 6535
  Batch size = 14


{'eval_loss': 0.41729044914245605,
 'eval_accuracy': 0.8915072685539404,
 'eval_runtime': 219.1384,
 'eval_samples_per_second': 29.821,
 'eval_steps_per_second': 2.131,
 'epoch': 3.0}

# STEP 4. Bucketing을 적용하여 학습시키고, STEP 3의 결과와의 비교

데이터 load

In [2]:
dataset = load_dataset('e9t/nsmc')

Using custom data configuration default
Reusing dataset nsmc (/aiffel/.cache/huggingface/datasets/e9t___nsmc)/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

데이터셋 분리

In [3]:
train_dataset = dataset['train']
val_test_split = dataset['test'].train_test_split(test_size=0.2)

val_dataset = val_test_split['train']
test_dataset = val_test_split['test'] 

Loading cached split indices for dataset at /aiffel/.cache/huggingface/datasets/e9t___nsmc)/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-d09880cac8a4b036.arrow and /aiffel/.cache/huggingface/datasets/e9t___nsmc)/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-603f7bbd9708353c.arrow


input, target 데이터 분리

In [4]:
train_x = train_dataset['document']
val_x = val_dataset['document']
test_x = test_dataset['document']
train_y = train_dataset['label']
val_y = val_dataset['label']
test_y = test_dataset['label']

토큰화

In [6]:
def transform(data):
    return huggingface_tokenizer(
        data,
        truncation=True,
    )

train_encodings = transform(train_x)

In [7]:
val_encodings = transform(val_x)

In [8]:
test_encodings = transform(test_x)

bucketing 적용

In [53]:
def create_bucketed_dataset(encodings, labels):
    # RaggedTensor로 생성하여 가변 길이 허용
    input_ids = tf.ragged.constant(encodings['input_ids'])
    attention_mask = tf.ragged.constant(encodings['attention_mask'])
    
    # 레이블을 Tensor로 변환
    labels = tf.constant(labels)
    
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        },
        labels
    ))
    
    # 배치 처리 및 RaggedTensor를 텐서로 변환
    def map_to_tensor(inputs, label):
        return (
            {
                'input_ids': tf.convert_to_tensor(inputs['input_ids']),
                'attention_mask': tf.convert_to_tensor(inputs['attention_mask']),
            },
            label
        )

    # 각 요소를 텐서로 변환
    dataset = dataset.map(map_to_tensor)

    # 패딩 및 버킷 적용
    def element_length_func(inputs, label):
        return tf.shape(inputs['input_ids'])[0]  # 길이 반환
    
    padding_values = (
        {
            'input_ids': tf.constant(0, dtype=tf.int32),
            'attention_mask': tf.constant(0, dtype=tf.int32),
        },
        tf.constant(0, dtype=tf.int32)  # 레이블 패딩
    )

    bucketed_dataset = dataset.bucket_by_sequence_length(
        element_length_func=element_length_func,
        bucket_boundaries=[20, 50, 80, 110, 140],
        bucket_batch_sizes=[14, 11, 8, 5, 2, 1],
        padding_values=padding_values,
        drop_remainder=True
    )
    
    return bucketed_dataset

In [54]:
bucketed_train_dataset = create_bucketed_dataset(train_encodings, train_y)

In [56]:
bucketed_val_dataset = create_bucketed_dataset(val_encodings, val_y)

In [57]:
bucketed_test_dataset = create_bucketed_dataset(test_encodings, test_y)

klue-bert를 FT모델로 불러옴

In [58]:
huggingface_model = TFBertForSequenceClassification.from_pretrained(huggingface_model_path, num_labels = 2, from_pt=True)
huggingface_model.bert.trainable = False

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


모델 컴파일 및 훈련

In [59]:
# 모델 컴파일
huggingface_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),  # learning_rate
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),  # loss function
    metrics=['accuracy']  # 평가 지표
)

# 모델 훈련
history = huggingface_model.fit(
    bucketed_train_dataset,
    validation_data=bucketed_val_dataset,  # 검증 데이터셋
    epochs=3,  # 총 epochs
    verbose=1,  # 훈련 로그 출력
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),  # 조기 종료 설정
    ]
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


모델 평가

In [60]:
huggingface_model.evaluate(bucketed_test_dataset)



[0.40109875798225403, 0.8200220465660095]

## 다른 전처리 O / Bucketing X  vs. 다른 전처리 X / Bucketing O

- 다른 전처리 O / Bucketing X -> A
    - 훈련 시간: 8:08:21
    - Training Loss: 0.148800
    - Validation Loss: 0.411153
    - Accuracy: 0.893152
    - eval_loss: 0.31855401396751404
    - eval_accuracy: 0.891354246365723
    
---
    
- 다른 전처리 X / Bucketing O -> B
    - 훈련 시간: 00:33:08
    - Training Loss: 0.4269
    - Validation Loss: 0.4108
    - Training Accuracy: 0.8083
    - Validation Accuracy: 0.8145
    - eval_loss: 0.40109875798225403
    - eval_accuracy: 0.8200220465660095
  
---  
---

- B보단 A가 성능면에선 우수함
    - 하지만 시간이 너무 오래 걸림
- B는 시간 대비 성능이 우수하다고 볼 수 있음
    - epoch를 더 늘리면 성능이 좋아질 가능성이..?

# 회고

- DLthon때 똑같이 klue-bert를 사용
    - 그 당시에는 Trainer를 어떻게 사용하는지 몰라 ft모델로 불러와 사용 했었음
    - 이번 프로젝트를 통해 Trainer의 사용법을 제대로 익힌거 같아 좋음
    
    
- Bucketing이라는 기법을 프로젝트 하면서 처음 접했음
    - 훈련 시간이 획기적으로 줄어들어 맘에 듬
    - 성능은 epoch라던가 하이퍼파라미터 튜닝, 데이터셋 전처리 나 추가를 통해 향상 시킬 수 있지 않을까 생각이 듬