In [93]:
import os

import torch
import numpy as np

from datasets import load_dataset
from transformers import BertForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding

import tqdm
from sklearn.metrics import label_ranking_average_precision_score
from transformers.pipelines.base import KeyDataset
from sklearn.metrics import classification_report

In [94]:
# KcBERT: KoBERT trained from scratch using comments data of naver news
model_name = 'beomi/kcbert-base'

## Dataset

In [95]:
# option 1. cleaned dataset
base_dir = os.getcwd()
train_data = base_dir + '/dataset/processed_unsmile_train_v1.0.csv'
test_data = base_dir + '/dataset/processed_unsmile_test_v1.0.csv'

dataset = load_dataset('csv', data_files={'train': train_data, 'test': test_data})

Using custom data configuration default-7381e1b9c9e59c28
Reusing dataset csv (/home/kdy20401/.cache/huggingface/datasets/csv/default-7381e1b9c9e59c28/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)
100%|██████████| 2/2 [00:00<00:00, 262.50it/s]


In [96]:
# option 2. original dataset from huggingface.co
hf_dataset = load_dataset('smilegate-ai/kor_unsmile')

Using custom data configuration smilegate-ai--kor_unsmile-e0f75c6e3be1af78
Reusing dataset parquet (/home/kdy20401/.cache/huggingface/datasets/parquet/smilegate-ai--kor_unsmile-e0f75c6e3be1af78/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121)
100%|██████████| 2/2 [00:00<00:00, 290.70it/s]


In [97]:
# '개인지칭' label is excluded
unsmile_columns = ["여성/가족","남성","성소수자","인종/국적","연령","지역","종교","기타 혐오","악플/욕설","clean"]

In [98]:
def preprocess_function(examples):
    tokenized_examples = tokenizer(str(examples["문장"]))
    tokenized_examples['labels'] = torch.tensor([examples[col] for col in unsmile_columns], dtype=torch.float)
    # multi label classification 학습을 위해선 label이 float 형태로 변형되어야 합니다.
    # huggingface datasets 최신 버전에는 'map' 함수에 버그가 있어서 변형이 올바르게 되지 않습니다.
    
    return tokenized_examples

In [99]:
# tokenize data
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_dataset = dataset.map(preprocess_function, batched=False)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels'])

loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /home/kdy20401/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779
Model config BertConfig {
  "_name_or_path": "beomi/kcbert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 300,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "tr

In [100]:
tokenized_dataset['train'][0]

{'input_ids': tensor([    2,  2458, 15751, 24930, 24351, 29278, 17038, 11631,     3]),
 'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])}

## Model

In [101]:
num_labels = len(unsmile_columns)

model = BertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=num_labels, # original number of labels of BERT is two(0, 1)
    problem_type="multi_label_classification"
)
model.config.id2label = {i: label for i, label in zip(range(num_labels), unsmile_columns)}
model.config.label2id = {label: i for i, label in zip(range(num_labels), unsmile_columns)}

loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /home/kdy20401/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9"

In [102]:
model.config.label2id

{'여성/가족': 0,
 '남성': 1,
 '성소수자': 2,
 '인종/국적': 3,
 '연령': 4,
 '지역': 5,
 '종교': 6,
 '기타 혐오': 7,
 '악플/욕설': 8,
 'clean': 9}

## Train model

In [103]:
def compute_metrics(x):
    return {
        'lrap': label_ranking_average_precision_score(x.label_ids, x.predictions),
    }

In [104]:
batch_size = 64

# before feeding batch data to the model, apply padding to each sequences in the batch to fit the tensor size
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(
    output_dir="model_output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='lrap',
    greater_is_better=True,
    optim='adamw_torch'
)

trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=tokenized_dataset["train"], 
    eval_dataset=tokenized_dataset["test"], 
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [105]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: clean, 여성/가족, 개인지칭, 악플/욕설, 인종/국적, 성소수자, 문장, 연령, 기타 혐오, 지역, 종교, 남성. If clean, 여성/가족, 개인지칭, 악플/욕설, 인종/국적, 성소수자, 문장, 연령, 기타 혐오, 지역, 종교, 남성 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15003
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 295


Epoch,Training Loss,Validation Loss,Lrap
1,No log,0.241682,0.768286
2,No log,0.169205,0.855768
3,No log,0.15137,0.866659
4,No log,0.144302,0.870486
5,No log,0.142209,0.873362


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: clean, 여성/가족, 개인지칭, 악플/욕설, 인종/국적, 성소수자, 문장, 연령, 기타 혐오, 지역, 종교, 남성. If clean, 여성/가족, 개인지칭, 악플/욕설, 인종/국적, 성소수자, 문장, 연령, 기타 혐오, 지역, 종교, 남성 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3737
  Batch size = 64
Saving model checkpoint to model_output/checkpoint-59
Configuration saved in model_output/checkpoint-59/config.json
Model weights saved in model_output/checkpoint-59/pytorch_model.bin
tokenizer config file saved in model_output/checkpoint-59/tokenizer_config.json
Special tokens file saved in model_output/checkpoint-59/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: clean, 여성/가족, 개인지칭, 악플/욕설, 인종/국적, 성소수자, 문장, 연령, 기타 혐오, 지역

TrainOutput(global_step=295, training_loss=0.1932741456112619, metrics={'train_runtime': 131.7882, 'train_samples_per_second': 569.209, 'train_steps_per_second': 2.238, 'total_flos': 2787445072114668.0, 'train_loss': 0.1932741456112619, 'epoch': 5.0})

## Save model

In [111]:
trainer.save_model()

Saving model checkpoint to model_output
Configuration saved in model_output/config.json
Model weights saved in model_output/pytorch_model.bin
tokenizer config file saved in model_output/tokenizer_config.json
Special tokens file saved in model_output/special_tokens_map.json


## Test model

In [106]:
from transformers import TextClassificationPipeline

pipe = TextClassificationPipeline(
    model = model,
    tokenizer = tokenizer,
    device=0,
    return_all_scores=True,
    function_to_apply='sigmoid'
)

In [107]:
for result in pipe("포괄적차별금지법을 반대합니다.")[0]:
    print(result)

{'label': '여성/가족', 'score': 0.0744129866361618}
{'label': '남성', 'score': 0.018040314316749573}
{'label': '성소수자', 'score': 0.31230801343917847}
{'label': '인종/국적', 'score': 0.023305071517825127}
{'label': '연령', 'score': 0.050473958253860474}
{'label': '지역', 'score': 0.06035453826189041}
{'label': '종교', 'score': 0.01604899950325489}
{'label': '기타 혐오', 'score': 0.1687774360179901}
{'label': '악플/욕설', 'score': 0.15697471797466278}
{'label': 'clean', 'score': 0.0743395984172821}


## Model performance

In [108]:
def get_predicated_label(output_labels, min_score):
    labels = []
    for label in output_labels:
        if label['score'] > min_score:
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [109]:
predicated_labels = []

for out in tqdm.tqdm(pipe(KeyDataset(dataset['test'], '문장'))):
    predicated_labels.append(get_predicated_label(out, 0.5))

100%|██████████| 3737/3737 [00:34<00:00, 108.38it/s]


In [110]:

print(classification_report(tokenized_dataset['test']['labels'], predicated_labels))

              precision    recall  f1-score   support

           0       0.82      0.75      0.78       393
           1       0.90      0.78      0.84       340
           2       0.88      0.77      0.82       281
           3       0.86      0.75      0.80       422
           4       0.90      0.77      0.83       146
           5       0.88      0.85      0.87       261
           6       0.88      0.89      0.88       294
           7       0.00      0.00      0.00       134
           8       0.76      0.63      0.69       770
           9       0.78      0.72      0.75       944

   micro avg       0.83      0.71      0.77      3985
   macro avg       0.77      0.69      0.73      3985
weighted avg       0.80      0.71      0.75      3985
 samples avg       0.75      0.73      0.73      3985



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
