# Load dataset

In [1]:
from datasets import load_dataset

# naver 영화 감성분석
nsmc_dataset = load_dataset('nsmc')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nsmc_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [3]:
nsmc_dataset['train'][0]

{'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}

In [4]:
# 각 feature에 대한 설명 보기
nsmc_dataset['train'].features

{'id': Value(dtype='string', id=None),
 'document': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None)}

In [5]:
# label 값에 대한 class 보기
nsmc_dataset['train'].features['label'].str2int('negative')

0

In [6]:
nsmc_dataset['train'].features['label'].str2int('positive')

1

In [7]:
labels = nsmc_dataset['train'].features['label'].names

for n, i in enumerate(labels):
    print(n, i)
    

0 negative
1 positive


In [8]:
# move to dataframe
nsmc_df = nsmc_dataset['train'].to_pandas()
nsmc_df

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [9]:
# label간에 분포도 살펴보기
nsmc_df.groupby('label').count()

Unnamed: 0_level_0,id,document
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,75173,75173
1,74827,74827


In [10]:
# 각 문장 길이를 구하고 통계값 살펴보기
nsmc_df['review_length'] = nsmc_df['document'].str.len()
nsmc_df['review_length'].describe()

count    150000.000000
mean         35.203353
std          29.532097
min           0.000000
25%          16.000000
50%          27.000000
75%          42.000000
max         146.000000
Name: review_length, dtype: float64

In [11]:
nsmc_df[nsmc_df['review_length'] < 2 ]

Unnamed: 0,id,document,label,review_length
151,7348295,아,0,1
384,7679615,잼,1,1
584,7117896,1,0,1
593,6478189,4,0,1
1058,5890638,4,0,1
...,...,...,...,...
149247,3747149,굿,1,1
149718,7690797,.,1,1
149862,7175749,.,0,1
149919,6502490,짱,1,1


# Preprocess

In [12]:
# Tokenize
from transformers import AutoTokenizer
tokenize = AutoTokenizer.from_pretrained('beomi/kcbert-base')

In [13]:
# tokenizer 살펴보기
tokenize

BertTokenizerFast(name_or_path='beomi/kcbert-base', vocab_size=30000, model_max_length=300, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [14]:
tokenize.tokenize(nsmc_df['document'][100])

['신',
 '##카',
 '##이',
 '마',
 '##코',
 '##토',
 '##의',
 '작',
 '##화',
 '##와',
 ',',
 '미',
 '##유',
 '##와',
 '하나',
 '##카',
 '##나가',
 '연기',
 '##를',
 '잘해',
 '##줘서',
 '더',
 '##대',
 '##박이',
 '##였다',
 '.']

In [15]:
tokenize(nsmc_df['document'][100])

{'input_ids': [2, 2005, 4024, 4017, 1293, 4599, 4775, 4042, 2478, 4075, 4196, 15, 1463, 4207, 4196, 8080, 4024, 10314, 11219, 4180, 12610, 10579, 832, 4140, 11414, 9827, 17, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
# collator : 한 토큰의 길이에 맞추는 작업
tokenize(['청춘 최고의 영화', '영화'], padding=True)

{'input_ids': [[2, 23061, 11446, 9376, 3], [2, 9376, 3, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]]}

In [17]:
# tokenization
def tokenizer(data):
    return tokenize(data['document'], max_length=32, padding='max_length', truncation=True)

In [18]:
# do toeknization
nsmc_dataset_tokenized = nsmc_dataset.map(tokenizer)

In [19]:
# 결과 보기
print(nsmc_dataset_tokenized['train'][0])

{'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0, 'input_ids': [2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [20]:
nsmc_dataset_tokenized['train'].features

{'id': Value(dtype='string', id=None),
 'document': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

# Load model

In [21]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [22]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('beomi/kcbert-base', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Set hyper-parameters

epochs = 3
learning_rate = 0.001
batch_size=256

# train model: Pytorch

In [34]:
from torch.utils.data import DataLoader

In [35]:
# 학습을 위해 DataLoader에 먹이기
train_ds = nsmc_dataset_tokenized['train'].remove_columns(['id', 'document'])
train_ds.set_format(type='torch')

In [36]:
train_ds

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 150000
})

In [37]:
train_ds = DataLoader(train_ds, batch_size=batch_size)

In [38]:
train_ds

<torch.utils.data.dataloader.DataLoader at 0x28e903e9d90>

In [39]:
valid_ds = nsmc_dataset_tokenized['test'].remove_columns(['id', 'document'])
valid_ds.set_format(type='torch')
valid_ds = DataLoader(valid_ds, batch_size=batch_size)

In [40]:
next(iter(train_ds))

{'label': tensor([0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
         0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
         0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
         0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
         1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
         0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
         0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
         0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
         0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
         1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1]),
 'input_ids': tensor([[    2,  2170,   832,  ...,     0,     0,     0],
         [    2,  3521,    17,  ...,     0,     0,

In [41]:
next(iter([0,2,3,4,5,5,5,5,5,6]))

0

In [45]:
import numpy as np
from tqdm import tqdm
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

# set metric: accuracy
def acc(pred, label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()   

In [46]:
# torch style로 학습하기

model.to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):

    # train zone
    train_losses = []
    train_acc = .0
    model.train()

    for step, batch in enumerate(tqdm(train_ds)):
        label = batch['label'].to(device)
        input_id, token_type_ids, attention_mask = batch['input_ids'].to(device), batch['token_type_ids'].to(device), batch['attention_mask'].to(device)

        model.zero_grad() # gradient 초기화
        pred = model(input_id, token_type_ids, attention_mask)
        loss = criterion(torch.sigmoid(pred.logits.t()[1]), label.float()) #?

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        train_acc += acc(pred.logits.argmax(dim=1), label)

    print("train loss: ", np.mean(train_losses))
    print("train acc: ", train_acc/len(train_ds.dataset))

    # validation zone
    val_losses = []
    val_acc = 0
    model.eval()

    for step, batch in enumerate(tqdm(valid_ds)):
        label = batch['label'].to(device)
        input_id, token_type_ids, attention_mask = batch['input_ids'].to(device), batch['token_type_ids'].to(device), batch['attention_mask'].to(device)

        pred = model(input_id, token_type_ids, attention_mask) # feed-forward
        loss = criterion(torch.sigmoid(pred.logits.t()[1]), label.float())

        val_losses.append(loss.item())
        val_acc += acc(pred.logits.argmax(dim=1), label)

    print("Valid loss: ", np.mean(val_losses))
    print("Valid acc: ", val_acc/len(valid_ds.dataset))

100%|████████████████████████████████████████████████████████████████████████████████| 586/586 [02:50<00:00,  3.44it/s]


train loss:  708.0613669958537
train acc:  0.5018666666666667


100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [00:20<00:00,  9.43it/s]


Valid loss:  711.9501339659399
Valid acc:  0.49654


100%|████████████████████████████████████████████████████████████████████████████████| 586/586 [02:51<00:00,  3.41it/s]


train loss:  708.0561823405503
train acc:  0.5011533333333333


100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [00:20<00:00,  9.40it/s]


Valid loss:  711.9501339659399
Valid acc:  0.49654


100%|████████████████████████████████████████████████████████████████████████████████| 586/586 [02:50<00:00,  3.43it/s]


train loss:  708.0561832779504
train acc:  0.5011533333333333


100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [00:20<00:00,  9.42it/s]

Valid loss:  711.9501339659399
Valid acc:  0.49654





# HuggingFace's Trainsformers Trainer

In [None]:
from transformers import TrainingArguments

logging_steps = int(len(nsmc_dataset['train']) // batch_size)
output_dir = 'trainer_test'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    logging_steps=logging_steps, #
    fp16=True, #
    push_to_hub=False)


In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
def compute_metrics(pred):
    labels=pred.label_ids
    preds=pred.predictions.argmax(-1)
    # # # #
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {
    'accuracy':acc,
    'f1': f1,
    'precision':precision,
    'recall':recall}
    

In [None]:
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('beomi/kcbert-base', num_labels=2)

trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=nsmc_dataset_tokenized['train'],
    eval_dataset=nsmc_dataset_tokenized['test'],
    tokenizer=tokenize)

In [None]:
# Let's train
trainer.train()