In [1]:
# pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel,BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
train = pd.read_csv("./Dataset/unsmile_train_v1.0.tsv", sep="\t")
valid = pd.read_csv("./Dataset/unsmile_valid_v1.0.tsv", sep="\t")

In [5]:
train.head()

Unnamed: 0,문장,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean,개인지칭
0,일안하는 시간은 쉬고싶어서 그런게 아닐까,0,0,0,0,0,0,0,0,0,1,0
1,아동성범죄와 페도버는 기록바 끊어져 영원히 고통 받는다. 무슬림 50퍼 근친이다. ...,0,0,0,0,0,0,1,0,0,0,0
2,루나 솔로앨범 나왔을 때부터 머모 기운 있었음 ㅇㅇ Keep o doin 진짜 띵...,0,0,0,0,0,0,0,0,0,1,0
3,홍팍에도 어버이연합인가 보내요 뭐 이런뎃글 있는데 이거 어버이연합측에 신고하면 그쪽...,0,0,0,0,0,0,0,0,0,1,0
4,아놔 왜 여기 댓들은 다 여자들이 김치녀라고 먼저 불렸다! 여자들은 더 심하게 그런...,1,0,0,0,0,0,0,0,0,0,0


In [6]:
train.shape, valid.shape

((15005, 12), (3737, 12))

In [7]:
MODEL_NAME = "skt/kobert-base-v1"
tokenizer = KoBERTTokenizer.from_pretrained(MODEL_NAME)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [8]:
tokenized_train = tokenizer(
                    list(train["문장"]), 
                    return_tensors="pt", 
                    padding=True, 
                    truncation=True, 
                    max_length=512, 
                    add_special_tokens=True
                )

In [9]:
tokenized_train.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [10]:
# print(tokenized_train)
# print(tokenized_train[0].tokens)
# print(tokenized_train[0].ids)
# print(tokenized_train[0].attention_mask)

In [11]:
tokenized_valid = tokenizer(
                    list(valid["문장"]), 
                    return_tensors="pt", 
                    padding=True, 
                    truncation=True, 
                    max_length=512, 
                    add_special_tokens=True
                )

In [12]:
class myDataset(torch.utils.data.Dataset):
    def __init__(self, encoding, label):
        self.encoding = encoding
        self.label = label
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}
        item["label"] = torch.tensor(self.label[idx])
        return item

    def __len__(self):
        return len(self.label)

In [13]:
train_label = train['clean'].values
valid_label = valid['clean'].values

train_dataset = myDataset(tokenized_train, train_label)
valid_dataset = myDataset(tokenized_valid, valid_label)

In [14]:
train_dataset.__getitem__(1)

  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}


{'input_ids': tensor([   2, 3106, 6573, 6334, 6983, 4829, 5859, 6323, 5760, 1273, 6273,  517,
         5643, 6855, 7245, 3376, 7020, 7996, 1012, 2225,   54, 2095, 6697, 6136,
          612, 7706, 1221, 7489, 7100,   54,  533,  545, 5592,  655,  341, 1859,
         5439, 4575, 3571, 7086,  606,   61, 1562,   54,    3,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       

In [15]:
model = BertForSequenceClassification.from_pretrained('skt/kobert-base-v1')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [16]:
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=10,
    per_device_eval_batch_size=64,
    per_device_train_batch_size=8,
    logging_dir="./log",
    save_steps=1000,
    save_total_limit=2,
)

In [17]:
def compute_metrics(preds):
    labels = preds.label_ids
    preds = preds.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "acc": acc, 
        "precision": precision, 
        "recall": recall,
        "f1": f1,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 15005
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18760
  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}


Step,Training Loss
500,0.5663
1000,0.5643
1500,0.5743
2000,0.5685
2500,0.5669
3000,0.5593
3500,0.569
4000,0.5715
4500,0.57
5000,0.5554


Saving model checkpoint to ./output/checkpoint-1000
Configuration saved in ./output/checkpoint-1000/config.json
Model weights saved in ./output/checkpoint-1000/pytorch_model.bin
  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}
Saving model checkpoint to ./output/checkpoint-2000
Configuration saved in ./output/checkpoint-2000/config.json
Model weights saved in ./output/checkpoint-2000/pytorch_model.bin
  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}
Saving model checkpoint to ./output/checkpoint-3000
Configuration saved in ./output/checkpoint-3000/config.json
Model weights saved in ./output/checkpoint-3000/pytorch_model.bin
Deleting older checkpoint [output/checkpoint-1000] due to args.save_total_limit
  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}
Saving model checkpoint to ./output/checkpoint-4000
Configuration saved in ./output/checkpoint-4000/config.json
Model weights saved in ./output

Saving model checkpoint to ./output/checkpoint-16000
Configuration saved in ./output/checkpoint-16000/config.json
Model weights saved in ./output/checkpoint-16000/pytorch_model.bin
Deleting older checkpoint [output/checkpoint-14000] due to args.save_total_limit
  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}
Saving model checkpoint to ./output/checkpoint-17000
Configuration saved in ./output/checkpoint-17000/config.json
Model weights saved in ./output/checkpoint-17000/pytorch_model.bin
Deleting older checkpoint [output/checkpoint-15000] due to args.save_total_limit
  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}
Saving model checkpoint to ./output/checkpoint-18000
Configuration saved in ./output/checkpoint-18000/config.json
Model weights saved in ./output/checkpoint-18000/pytorch_model.bin
Deleting older checkpoint [output/checkpoint-16000] due to args.save_total_limit
  item = {key: torch.tensor(value[idx]) for key, val

TrainOutput(global_step=18760, training_loss=0.5646822410860042, metrics={'train_runtime': 1559.7709, 'train_samples_per_second': 96.2, 'train_steps_per_second': 12.027, 'total_flos': 1.002417148707e+16, 'train_loss': 0.5646822410860042, 'epoch': 10.0})

In [19]:
trainer.evaluate(eval_dataset=valid_dataset)

***** Running Evaluation *****
  Num examples = 3737
  Batch size = 64
  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5636418461799622,
 'eval_acc': 0.7497993042547498,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_runtime': 7.2911,
 'eval_samples_per_second': 512.543,
 'eval_steps_per_second': 8.092,
 'epoch': 10.0}

In [None]:
def sentence_predict(sent):
    model.eval()

    tokenized_sent = tokenizer(
        sent,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )

    tokenized_sent.to(device)

    with torch.no_grad():
        output = model(**tokenized_sent)

    logits = output[0].detach().cpu().numpy()
    result = np.argmax(logits, axis=-1)

    if result == 0:
        return "악성댓글"
    else:
        return "정상댓글"

while True:
    sent = input("문장을 입력하세요: ")
    if sent == "0":
        break
    print(sentence_predict(sent))
    print("-"*50)

문장을 입력하세요: 나는 중국인이 싫어
악성댓글
--------------------------------------------------
