# KorNLI 분류

In [154]:
# !pip install transformers
# !pip install datasets

## 데이터셋 로드 및 구조 확인

In [155]:
from datasets import load_dataset

cs = load_dataset("klue", "nli", split='train')
cs = cs.train_test_split(test_size=0.1)
train_cs = cs['train']
valid_cs = cs['test']

test_cs = load_dataset("klue", 'nli', split='validation')

In [156]:
train_cs

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label'],
    num_rows: 22498
})

In [157]:
valid_cs

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label'],
    num_rows: 2500
})

In [158]:
test_cs

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label'],
    num_rows: 3000
})

## 데이터전처리

In [159]:
import pandas as pd
import numpy as np
import random
import time
import datetime
from tqdm import tqdm

import csv
import os

import torch

# BERT 사용을 위함
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.optim import  AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# for padding
from torch.nn.utils.rnn import pad_sequence

# 전처리 및 평가 지표
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, hamming_loss

In [160]:
train_sentences = list(map(lambda train_cs: '[CLS] ' + str(train_cs['premise']) + ' [SEP] ' + str(train_cs['hypothesis']) + ' [SEP]', train_cs))
valid_sentences = list(map(lambda valid_cs: '[CLS] ' + str(valid_cs['premise']) + ' [SEP] ' + str(valid_cs['hypothesis']) + ' [SEP]', valid_cs))
test_sentences = list(map(lambda test_cs: '[CLS] ' + str(test_cs['premise']) + ' [SEP] ' + str(test_cs['hypothesis']) + ' [SEP]', test_cs))

In [161]:
train_labels= train_cs['label']
valid_labels = valid_cs['label']
test_labels = test_cs['label']

In [162]:
test_sentences[:5]

['[CLS] 흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다. [SEP] 어떤 방에서도 흡연은 금지됩니다. [SEP]',
 '[CLS] 10명이 함께 사용하기 불편함없이 만족했다. [SEP] 10명이 함께 사용하기 불편함이 많았다. [SEP]',
 '[CLS] 10명이 함께 사용하기 불편함없이 만족했다. [SEP] 성인 10명이 함께 사용하기 불편함없이 없었다. [SEP]',
 '[CLS] 10명이 함께 사용하기 불편함없이 만족했다. [SEP] 10명이 함께 사용하기에 만족스러웠다. [SEP]',
 '[CLS] 10층에 건물사람들만 이용하는 수영장과 썬베드들이 있구요. [SEP] 건물사람들은 수영장과 썬베드를 이용할 수 있습니다. [SEP]']

In [163]:
test_labels[:5]

[2, 2, 1, 0, 0]

얽힘(entailment) : 0  
중립(neutral) : 1  
모순(contradiction) : 2

## BERT 토크나이저를 이용한 전처리

In [164]:
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/tokenizer_config.json
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/tokenizer.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/config.json
Model config BertConfig {
  "architectures": [
    "BertForMask

In [165]:
tokenized_text = tokenizer.tokenize('안녕하세요')
input_id = tokenizer.convert_tokens_to_ids(tokenized_text)

print(tokenized_text)
print(input_id)

['안녕', '##하', '##세요']
[5891, 2205, 5971]


In [166]:
max_len = 128

def data_to_tensor(sentences, labels):
  tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]
  input_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_text]

  input_ids = pad_sequence([torch.tensor(i) for i in input_ids], batch_first=True,padding_value=0)

  attention_masks = []
  for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

  tensor_inputs = torch.tensor(input_ids)
  tensor_labels = torch.tensor(labels)
  tensor_masks = torch.tensor(attention_masks)

  return tensor_inputs, tensor_labels, tensor_masks

In [167]:
train_input, train_labels, train_masks = data_to_tensor(train_sentences, train_labels)
valid_input, valid_labels, valid_masks = data_to_tensor(valid_sentences, valid_labels)
test_input, test_labels, test_masks = data_to_tensor(test_sentences, test_labels)

  tensor_inputs = torch.tensor(input_ids)


In [168]:
tokenizer.decode([2])

'[CLS]'

In [169]:
tokenizer.decode([3])

'[SEP]'

In [170]:
test_input[0]

tensor([    2, 25313,  2377,  2031,  2073, 20812,  2116,  1513,  2259,  1129,
        24094, 20812, 27135,  9753,  2052,  3662, 11800,    18,     3,  3711,
         1129, 27135,  2119,  9753,  2073,  5040,  3598,  3606,    18,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])

In [171]:
tokenizer.decode(test_input[0])

'[CLS] 흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다. [SEP] 어떤 방에서도 흡연은 금지됩니다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [172]:
batch_size = 32

train_data = TensorDataset(train_input, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(valid_input, valid_masks, valid_labels)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [173]:
print('훈련 데이터의 크기:', len(train_labels))
print('검증 데이터의 크기:', len(valid_labels))
print('테스트 데이터의 크기:', len(test_labels))

훈련 데이터의 크기: 22498
검증 데이터의 크기: 2500
테스트 데이터의 크기: 3000


## GPU

In [174]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## 모델 로드하기

In [175]:
num_labels = 3

model = BertForSequenceClassification.from_pretrained('klue/bert-base',num_labels = num_labels)
model.cuda()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.50.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--k

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [176]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps=1e-8)

In [177]:
epoch=2
total_steps=len(train_dataloader)*epoch
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=0,
                                           num_training_steps=total_steps)

In [178]:
def format_time(elapsed):
  elapsed_rounded = int(round(elapsed))
  return str(datetime.timedelta(seconds=elapsed_rounded))

In [179]:
def metrics(predictions, labels):
  y_pred = predictions
  y_true = labels

  accuracy = accuracy_score(y_true, y_pred)
  f1_macro_average = f1_score(y_true, y_pred, average='macro',zero_division=0)
  f1_micro_average = f1_score(y_true, y_pred, average='micro',zero_division=0)
  f1_weighted_average = f1_score(y_true, y_pred, average='weighted', zero_division=0)

  metrics={'accuracy': accuracy,
           'f1_macro': f1_macro_average,
           'f1_micro': f1_micro_average,
           'f1_weighted': f1_weighted_average}

  return metrics

# 모델학습

In [180]:
seed_val = 810
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

model.zero_grad()
for epoch_i in range(0, epoch):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epoch))
    t0 = time.time()
    total_loss=0

    model.train()

    for step, batch in enumerate(train_dataloader):
      if step % 500 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      outputs = model(b_input_ids,
                      token_type_ids = None,
                      attention_mask=b_input_mask,
                      labels=b_labels)

      loss = outputs[0]
      total_loss += loss.item()
      loss.backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1, 0)
      optimizer.step()
      scheduler.step()

      model.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.4f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

  Batch   500  of    704.    Elapsed: 0:04:15.

  Average training loss: 0.6315
  Training epcoh took: 0:05:59
  Batch   500  of    704.    Elapsed: 0:04:14.

  Average training loss: 0.3474
  Training epcoh took: 0:05:58


## 검증데이터 평가

In [181]:
t0 = time.time()
model.eval()
accum_logits, accum_label_ids = [], []

for step, batch in tqdm(enumerate(valid_dataloader)):
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
    outputs = model(b_input_ids,
                    token_type_ids =None,
                    attention_mask=b_input_mask)

    logits = outputs[0]
    # logits = logits.detach().cpu().numpy()
    pred = np.argmax(logits.detach().to('cpu').numpy(), axis=1)
    label_ids = b_labels.to('cpu').numpy()

    # for b in logits:
    #   accum_logits.append(np.argmax(b))
    # for b in label_ids:
    #   accum_label_ids.append(b)

    accum_logits.extend(pred)
    accum_label_ids.extend(label_ids)

accum_logits = np.array(accum_logits)
accum_label_ids = np.array(accum_label_ids)
results = metrics(accum_logits, accum_label_ids)

print("Accuracy: {0:.4f}".format(results['accuracy']))
print("F1 (Macro) Score: {0:.4f}".format(results['f1_macro']))
print("F1 (Micro) Score: {0:.4f}".format(results['f1_micro']))
print("F1 (Weighted) Score: {0:.4f}".format(results['f1_weighted']))

79it [00:12,  6.13it/s]

Accuracy: 0.8292
F1 (Macro) Score: 0.8283
F1 (Micro) Score: 0.8292
F1 (Weighted) Score: 0.8289





## 모델 저정과 로드

In [182]:
%pwd

'/content'

In [183]:
%mkdir model

mkdir: cannot create directory ‘model’: File exists


In [184]:
path = '/content/model/'

In [185]:
torch.save(model.state_dict(), path+"BERT_kornli.pt")

In [186]:
model.load_state_dict(torch.load(path+"BERT_kornli.pt"))

<All keys matched successfully>

## 테스트 데이터 평가

In [187]:
t0 = time.time()
model.eval()
accum_logits, accum_label_ids = [], []

for step, batch in tqdm(enumerate(valid_dataloader)):
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
    outputs = model(b_input_ids,
                    token_type_ids =None,
                    attention_mask=b_input_mask)

    logits = outputs[0]
    pred = np.argmax(logits.detach().to('cpu').numpy(), axis=1)
    label_ids = b_labels.to('cpu').numpy()

    accum_logits.extend(pred)
    accum_label_ids.extend(label_ids)

accum_logits = np.array(accum_logits)
accum_label_ids = np.array(accum_label_ids)
results = metrics(accum_logits, accum_label_ids)

print("Accuracy: {0:.4f}".format(results['accuracy']))
print("F1 (Macro) Score: {0:.4f}".format(results['f1_macro']))
print("F1 (Micro) Score: {0:.4f}".format(results['f1_micro']))
print("F1 (Weighted) Score: {0:.4f}".format(results['f1_weighted']))

79it [00:12,  6.21it/s]

Accuracy: 0.8292
F1 (Macro) Score: 0.8283
F1 (Micro) Score: 0.8292
F1 (Weighted) Score: 0.8289





## 예츢

In [188]:
from transformers import pipeline

In [226]:
pipe = pipeline("text-classification",
                model=model.cuda(),
                tokenizer=tokenizer,
                device=0,
                max_length=512,
                top_k=None,
                function_to_apply='softmax')

Device set to use cuda:0


In [227]:
inputs = {"text" : "흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다.", "text_pair" : "어떤 방에서도 흡연은 금지됩니다."}

In [228]:
result = pipe(inputs)
print(result)

[{'label': 'LABEL_2', 'score': 0.7422130703926086}, {'label': 'LABEL_1', 'score': 0.2403743863105774}, {'label': 'LABEL_0', 'score': 0.017412522807717323}]


In [229]:
pipe = pipeline("text-classification",
                model=model.cuda(),
                tokenizer=tokenizer,
                device=0,
                max_length=512,
                function_to_apply='softmax')

Device set to use cuda:0


In [230]:
result = pipe(inputs)
print(result)

{'label': 'LABEL_2', 'score': 0.7422130703926086}


In [231]:
label_dict = {'LABEL_0' : '얽힘', 'LABEL_1' : '중립', 'LABEL_2' : '모순'}

In [232]:
def prediction(sent1, sent2):
  text = {"text" : sent1, "text_pair" : sent2}
  result = pipe(text)
  return [label_dict[result['label']]]

In [233]:
sent1 = "흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다."
sent2 = "어떤 방에서도 흡연은 금지됩니다."

In [234]:
prediction(sent1, sent2)

['모순']

In [235]:
sent1 = "저는, 그냥 알아내려고 거기 있었어요."
sent2 = "나는 돈이 어디로 갔는지 이해하려고 했어요."

In [236]:
prediction(sent1, sent2)

['중립']

In [237]:
sent1 = "저는 그것을 이해하려고 거기 있었어요."
sent2 = "저는 이해하려고 노력하고 있었어요."

In [238]:
prediction(sent1, sent2)

['얽힘']