In [1]:
import os

import torch
import numpy as np

from datasets import load_dataset
from transformers import BertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

import tqdm
from sklearn.metrics import label_ranking_average_precision_score
from transformers.pipelines.base import KeyDataset
from sklearn.metrics import classification_report

from kobert_tokenizer import KoBERTTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## KoBert

In [2]:
model_name = 'skt/kobert-base-v1'

## Dataset

In [3]:
# 1. load dataset which is a bit processed from local 
base_dir = os.getcwd()
train_data = base_dir + '/dataset/cleaned_unsmile_train_v1.0.csv'
test_data = base_dir + '/dataset/cleaned_unsmile_test_v1.0.csv'

dataset = load_dataset('csv', data_files={'train': train_data, 'test': test_data})

Using custom data configuration default-82194d63485bfcca
Reusing dataset csv (/home/kdy20401/.cache/huggingface/datasets/csv/default-82194d63485bfcca/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)
100%|██████████| 2/2 [00:00<00:00, 300.38it/s]


In [4]:
# 2. load dataset from huggingface.co
# hf_dataset = load_dataset('smilegate-ai/kor_unsmile')

In [5]:
# '개인지칭' label is excluded
unsmile_columns = ["여성/가족","남성","성소수자","인종/국적","연령","지역","종교","기타 혐오","악플/욕설","clean"]

In [6]:
def preprocess_function(examples):
    tokenized_examples = tokenizer(str(examples["문장"]))
    tokenized_examples['labels'] = torch.tensor([examples[col] for col in unsmile_columns], dtype=torch.float)
    # multi label classification 학습을 위해선 label이 float 형태로 변형되어야 합니다.
    # huggingface datasets 최신 버전에는 'map' 함수에 버그가 있어서 변형이 올바르게 되지 않습니다.
    
    return tokenized_examples

In [7]:
tokenizer = KoBERTTokenizer.from_pretrained(model_name)
tokenized_dataset = dataset.map(preprocess_function, batched=False)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
100%|██████████| 15003/15003 [00:04<00:00, 3045.00ex/s]
100%|██████████| 3737/3737 [00:01<00:00, 3180.87ex/s]


In [8]:
tokenized_dataset['train'][0]

{'input_ids': tensor([   2, 3803, 6812, 7794, 2962, 7086, 2923, 5439, 6751, 6855, 6553, 1201,
         5400, 3093, 5777, 5591,    3]),
 'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])}

In [9]:
num_labels = len(unsmile_columns)

model = BertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=num_labels, # original number of labels of BERT is two(0, 1)
    problem_type="multi_label_classification"
)
model.config.id2label = {i: label for i, label in zip(range(num_labels), unsmile_columns)}
model.config.label2id = {label: i for i, label in zip(range(num_labels), unsmile_columns)}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )

In [11]:
model.config.label2id

{'여성/가족': 0,
 '남성': 1,
 '성소수자': 2,
 '인종/국적': 3,
 '연령': 4,
 '지역': 5,
 '종교': 6,
 '기타 혐오': 7,
 '악플/욕설': 8,
 'clean': 9}

## Train model

In [12]:
def compute_metrics(x):
    return {
        'lrap': label_ranking_average_precision_score(x.label_ids, x.predictions),
    }

In [13]:
batch_size = 64
num_epoch = 15

# before feeding batch data to the model, apply padding to each sequences in the batch to fit the tensor size
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(
    output_dir="model_output/KoBert",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epoch,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='lrap',
    greater_is_better=True,
    optim='adamw_torch'
)

trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=tokenized_dataset["train"], 
    eval_dataset=tokenized_dataset["test"], 
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [14]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: clean, 인종/국적, 악플/욕설, 남성, 문장, 여성/가족, 성소수자, 종교, 개인지칭, 연령, 기타 혐오, 지역. If clean, 인종/국적, 악플/욕설, 남성, 문장, 여성/가족, 성소수자, 종교, 개인지칭, 연령, 기타 혐오, 지역 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15003
  Num Epochs = 15
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 885


Epoch,Training Loss,Validation Loss,Lrap
1,No log,0.359525,0.467643
2,No log,0.310708,0.516799
3,No log,0.282674,0.608801
4,No log,0.255755,0.703944
5,No log,0.230431,0.778412
6,No log,0.212253,0.802376
7,No log,0.198704,0.812751
8,No log,0.193038,0.814126
9,0.267800,0.187677,0.818008
10,0.267800,0.183271,0.822842


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: clean, 인종/국적, 악플/욕설, 남성, 문장, 여성/가족, 성소수자, 종교, 개인지칭, 연령, 기타 혐오, 지역. If clean, 인종/국적, 악플/욕설, 남성, 문장, 여성/가족, 성소수자, 종교, 개인지칭, 연령, 기타 혐오, 지역 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3737
  Batch size = 64
Saving model checkpoint to model_output/KoBert/checkpoint-59
Configuration saved in model_output/KoBert/checkpoint-59/config.json
Model weights saved in model_output/KoBert/checkpoint-59/pytorch_model.bin
tokenizer config file saved in model_output/KoBert/checkpoint-59/tokenizer_config.json
Special tokens file saved in model_output/KoBert/checkpoint-59/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: clean, 인종/국적, 악플/욕설, 남

TrainOutput(global_step=885, training_loss=0.2099381656970008, metrics={'train_runtime': 712.8164, 'train_samples_per_second': 315.712, 'train_steps_per_second': 1.242, 'total_flos': 1.1557313597626524e+16, 'train_loss': 0.2099381656970008, 'epoch': 15.0})

In [15]:
trainer.save_model()

Saving model checkpoint to model_output/KoBert
Configuration saved in model_output/KoBert/config.json
Model weights saved in model_output/KoBert/pytorch_model.bin
tokenizer config file saved in model_output/KoBert/tokenizer_config.json
Special tokens file saved in model_output/KoBert/special_tokens_map.json


## Test model

In [16]:
from transformers import TextClassificationPipeline

pipe = TextClassificationPipeline(
    model = model,
    tokenizer = tokenizer,
    device=0,
    return_all_scores=True,
    function_to_apply='sigmoid'
    )

In [17]:
for result in pipe("포괄적차별금지법을 반대합니다.")[0]:
    print(result)

{'label': '여성/가족', 'score': 0.5228493809700012}
{'label': '남성', 'score': 0.041127901524305344}
{'label': '성소수자', 'score': 0.19210051000118256}
{'label': '인종/국적', 'score': 0.04396301135420799}
{'label': '연령', 'score': 0.030368909239768982}
{'label': '지역', 'score': 0.02861735038459301}
{'label': '종교', 'score': 0.03193731978535652}
{'label': '기타 혐오', 'score': 0.06751430779695511}
{'label': '악플/욕설', 'score': 0.06330510228872299}
{'label': 'clean', 'score': 0.07052849978208542}


## Evaluate model

In [18]:
def get_predicated_label(output_labels, min_score):
    labels = []
    for label in output_labels:
        if label['score'] > min_score:
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [19]:
predicated_labels = []

for out in tqdm.tqdm(pipe(KeyDataset(dataset['test'], '문장'))):
    predicated_labels.append(get_predicated_label(out, 0.5))

Disabling tokenizer parallelism, we're using DataLoader multithreading already
100%|██████████| 3737/3737 [00:37<00:00, 100.18it/s]


In [20]:

print(classification_report(tokenized_dataset['test']['labels'], predicated_labels))

              precision    recall  f1-score   support

           0       0.77      0.64      0.70       393
           1       0.85      0.75      0.80       340
           2       0.90      0.74      0.82       281
           3       0.84      0.70      0.76       422
           4       0.92      0.40      0.56       146
           5       0.92      0.82      0.87       261
           6       0.90      0.81      0.85       294
           7       0.00      0.00      0.00       134
           8       0.60      0.56      0.58       770
           9       0.67      0.72      0.69       944

   micro avg       0.75      0.66      0.70      3985
   macro avg       0.74      0.61      0.66      3985
weighted avg       0.74      0.66      0.69      3985
 samples avg       0.69      0.67      0.68      3985



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
