In [1]:
import os

import torch
import numpy as np

from datasets import load_dataset
from transformers import BertForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding

import tqdm
from sklearn.metrics import label_ranking_average_precision_score
from transformers.pipelines.base import KeyDataset
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


## KcBert

In [2]:
# KcBERT: KoBERT trained from scratch using comments data of naver news
model_name = 'beomi/kcbert-base'

## Dataset

In [3]:
# 1. load dataset which is a bit processed from local 
base_dir = os.getcwd()
train_data = base_dir + '/dataset/cleaned_unsmile_train_v1.0.csv'
test_data = base_dir + '/dataset/cleaned_unsmile_test_v1.0.csv'

dataset = load_dataset('csv', data_files={'train': train_data, 'test': test_data})

Using custom data configuration default-82194d63485bfcca
Reusing dataset csv (/home/kdy20401/.cache/huggingface/datasets/csv/default-82194d63485bfcca/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)
100%|██████████| 2/2 [00:00<00:00, 331.87it/s]


In [4]:
# 2. load dataset from huggingface.co
# hf_dataset = load_dataset('smilegate-ai/kor_unsmile')

In [5]:
# '개인지칭' label is excluded
unsmile_columns = ["여성/가족","남성","성소수자","인종/국적","연령","지역","종교","기타 혐오","악플/욕설","clean"]

In [6]:
def preprocess_function(examples):
    tokenized_examples = tokenizer(str(examples["문장"]))
    tokenized_examples['labels'] = torch.tensor([examples[col] for col in unsmile_columns], dtype=torch.float)
    # multi label classification 학습을 위해선 label이 float 형태로 변형되어야 합니다.
    # huggingface datasets 최신 버전에는 'map' 함수에 버그가 있어서 변형이 올바르게 되지 않습니다.
    
    return tokenized_examples

In [7]:
# tokenize data
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_dataset = dataset.map(preprocess_function, batched=False)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels'])

100%|██████████| 15003/15003 [00:03<00:00, 3826.24ex/s]
100%|██████████| 3737/3737 [00:00<00:00, 4056.34ex/s]


In [8]:
tokenized_dataset['train'][0]

{'input_ids': tensor([    2,  2458, 15751, 24930, 24351, 29278, 17038, 11631,     3]),
 'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])}

In [9]:
num_labels = len(unsmile_columns)

model = BertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=num_labels,
    problem_type="multi_label_classification"
)
model.config.id2label = {i: label for i, label in zip(range(num_labels), unsmile_columns)}
model.config.label2id = {label: i for i, label in zip(range(num_labels), unsmile_columns)}

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [10]:
model.base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30000, 768, padding_idx=0)
    (position_embeddings): Embedding(300, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [11]:
model.config.label2id

{'여성/가족': 0,
 '남성': 1,
 '성소수자': 2,
 '인종/국적': 3,
 '연령': 4,
 '지역': 5,
 '종교': 6,
 '기타 혐오': 7,
 '악플/욕설': 8,
 'clean': 9}

## Train model

In [12]:
def compute_metrics(x):
    return {
        'lrap': label_ranking_average_precision_score(x.label_ids, x.predictions),
    }

In [13]:
batch_size = 64
num_epoch = 15

# before feeding batch data to the model, apply padding to each sequences in the batch to fit the tensor size
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(
    output_dir="model_output/KcBert",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epoch,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='lrap',
    greater_is_better=True,
    optim='adamw_torch'
)

trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=tokenized_dataset["train"], 
    eval_dataset=tokenized_dataset["test"], 
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [14]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: 악플/욕설, clean, 개인지칭, 문장, 종교, 인종/국적, 기타 혐오, 연령, 지역, 여성/가족, 남성, 성소수자. If 악플/욕설, clean, 개인지칭, 문장, 종교, 인종/국적, 기타 혐오, 연령, 지역, 여성/가족, 남성, 성소수자 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15003
  Num Epochs = 15
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 885


Epoch,Training Loss,Validation Loss,Lrap
1,No log,0.228506,0.781661
2,No log,0.157334,0.86058
3,No log,0.138762,0.874631
4,No log,0.13191,0.877017
5,No log,0.131161,0.877105
6,No log,0.135107,0.875694
7,No log,0.135361,0.877933
8,No log,0.137416,0.875916
9,0.129200,0.140853,0.875959
10,0.129200,0.144887,0.875291


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: 악플/욕설, clean, 개인지칭, 문장, 종교, 인종/국적, 기타 혐오, 연령, 지역, 여성/가족, 남성, 성소수자. If 악플/욕설, clean, 개인지칭, 문장, 종교, 인종/국적, 기타 혐오, 연령, 지역, 여성/가족, 남성, 성소수자 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3737
  Batch size = 64
Saving model checkpoint to model_output/KcBert/checkpoint-59
Configuration saved in model_output/KcBert/checkpoint-59/config.json
Model weights saved in model_output/KcBert/checkpoint-59/pytorch_model.bin
tokenizer config file saved in model_output/KcBert/checkpoint-59/tokenizer_config.json
Special tokens file saved in model_output/KcBert/checkpoint-59/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: 악플/욕설, clean, 개인지칭, 문장

TrainOutput(global_step=885, training_loss=0.08943302995067531, metrics={'train_runtime': 697.7093, 'train_samples_per_second': 322.548, 'train_steps_per_second': 1.268, 'total_flos': 8370692162021616.0, 'train_loss': 0.08943302995067531, 'epoch': 15.0})

In [15]:
trainer.save_model()

Saving model checkpoint to model_output/KcBert
Configuration saved in model_output/KcBert/config.json
Model weights saved in model_output/KcBert/pytorch_model.bin
tokenizer config file saved in model_output/KcBert/tokenizer_config.json
Special tokens file saved in model_output/KcBert/special_tokens_map.json


## Test model

In [16]:
from transformers import TextClassificationPipeline

pipe = TextClassificationPipeline(
    model = model,
    tokenizer = tokenizer,
    device=0,
    return_all_scores=True,
    function_to_apply='sigmoid'
    )

In [17]:
for result in pipe("포괄적차별금지법을 반대합니다.")[0]:
    print(result)

{'label': '여성/가족', 'score': 0.027701519429683685}
{'label': '남성', 'score': 0.026212995871901512}
{'label': '성소수자', 'score': 0.21012935042381287}
{'label': '인종/국적', 'score': 0.02318049781024456}
{'label': '연령', 'score': 0.023564638569951057}
{'label': '지역', 'score': 0.058266717940568924}
{'label': '종교', 'score': 0.027729108929634094}
{'label': '기타 혐오', 'score': 0.7373677492141724}
{'label': '악플/욕설', 'score': 0.053706057369709015}
{'label': 'clean', 'score': 0.024462051689624786}


## Evaluate model

In [18]:
def get_predicated_label(output_labels, min_score):
    labels = []
    for label in output_labels:
        if label['score'] > min_score:
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [19]:
predicated_labels = []

for out in tqdm.tqdm(pipe(KeyDataset(dataset['test'], '문장'))):
    predicated_labels.append(get_predicated_label(out, 0.5))

Disabling tokenizer parallelism, we're using DataLoader multithreading already
100%|██████████| 3737/3737 [00:36<00:00, 101.63it/s]


In [20]:

print(classification_report(tokenized_dataset['test']['labels'], predicated_labels))

              precision    recall  f1-score   support

           0       0.84      0.76      0.80       393
           1       0.90      0.84      0.87       340
           2       0.86      0.79      0.83       281
           3       0.84      0.82      0.83       422
           4       0.91      0.82      0.86       146
           5       0.91      0.90      0.91       261
           6       0.89      0.87      0.88       294
           7       0.71      0.35      0.47       134
           8       0.68      0.66      0.67       770
           9       0.76      0.75      0.76       944

   micro avg       0.80      0.76      0.78      3985
   macro avg       0.83      0.76      0.79      3985
weighted avg       0.80      0.76      0.78      3985
 samples avg       0.78      0.77      0.77      3985



  _warn_prf(average, modifier, msg_start, len(result))
