# 자연어 처리 감정분석 전이학습

- GPU로 런타임 연결하기

In [3]:
!pip install -qq torch transformers datasets numpy evaluate pandas

In [4]:
!pip install -qq accelerate -U

In [5]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
import pandas as pd
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the "sepidmnorozy/Korean_sentiment" dataset by its Hugging Face Hub id.
dataset = load_dataset("sepidmnorozy/Korean_sentiment")
# Bare last expression: display the available splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 36000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1333
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 2667
    })
})

In [7]:
# Peek at two raw training rows (one with label 1, one with label 0) to see
# what the review text looks like before tokenization.
for example_idx in (3118, 14310):
    print(dataset['train'][example_idx])

{'label': 1, 'text': '졸잼!!!성아가나중에억울한일이잇어서좀슬펏는데마지막은기쁘게끝나서다행이에여'}
{'label': 0, 'text': '진짜 어떻게 된놈의 영화가 여고괴담 1보다도 못함? 신기하다 그것도 2012년작이 1998년보다 못함 솔까 여고괴담1은 반전은 최고지 뭐 이놈의 영화는 여고괴담 시리즈보다도 못하는거같다'}


## 토큰화 Tokenize

https://huggingface.co/kykim/bert-kor-base

In [8]:
# Hub id of the pretrained Korean BERT checkpoint; reused below for both the
# tokenizer and the classification model so the two always match.
model_name = "kykim/bert-kor-base"

In [9]:
# Load the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Display its configuration (vocabulary size, max length, special tokens).
tokenizer



BertTokenizerFast(name_or_path='kykim/bert-kor-base', vocab_size=42000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [10]:
def tokenizer_func(x):
    """Tokenize the ``'text'`` field of a dataset batch.

    Intended to be passed to ``Dataset.map``. Every example is truncated and
    padded to exactly 256 tokens, so all encoded rows have the same length.

    Args:
        x: A mapping with a ``'text'`` entry holding the raw review text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    # Fixed-length encoding: cut longer inputs and pad shorter ones to 256.
    encode_options = dict(
        padding="max_length",
        max_length=256,
        truncation=True,
    )
    return tokenizer(x['text'], **encode_options)

In [11]:
# Apply `tokenizer_func` to every split; batched=True hands the function
# batches of rows instead of one row at a time.
tokenized_datasets = dataset.map(tokenizer_func, batched=True)

Map: 100%|██████████| 1333/1333 [00:00<00:00, 4672.60 examples/s]


In [12]:
# Number of training rows to keep; a subset of the 36,000-row train split
# keeps the fine-tuning run short.
train_num_samples = 10000

train_split = tokenized_datasets['train']
validation_split = tokenized_datasets['validation']

# Shuffle with a fixed seed so the same subset is drawn on every run, then
# keep the first `train_num_samples` rows. The validation split is used whole.
train_ds = train_split.shuffle(seed=42).select(range(train_num_samples))
eval_ds = validation_split.shuffle(seed=42)

## 전이학습 Transfer Learning

In [13]:
# Load the pretrained checkpoint with a 2-class sequence-classification head.
# The "newly initialized" warning printed below is expected: classifier.weight
# and classifier.bias are not in the checkpoint and are learned by fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at kykim/bert-kor-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Hyperparameters

In [14]:
# Hyperparameters consumed by the TrainingArguments cell below.
bs = 16  # per-device batch size, used for both training and evaluation
epochs = 4  # number of passes over the training subset
lr = 1e-5  # learning rate handed to the scheduler (warmup + cosine decay)

https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments

In [15]:
# Mixed-precision selection. Hard-coding bf16=True makes TrainingArguments
# raise on CUDA GPUs without bfloat16 support, so detect support at runtime:
# bf16 where available, fp16 on other CUDA GPUs, full precision on CPU.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if the installed version rejects it.
args = TrainingArguments(
    'outputs',  # output_dir: checkpoints are written here
    learning_rate=lr,
    warmup_ratio=0.1,  # linear warmup over the first 10% of steps
    lr_scheduler_type='cosine',
    bf16=use_bf16,
    fp16=use_fp16,
    evaluation_strategy='epoch',  # evaluate on eval_dataset after every epoch
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    gradient_accumulation_steps=4,  # effective train batch = bs * 4 = 64 per device
    eval_accumulation_steps=4,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none'  # no external experiment tracker
)



### Metrics

In [16]:
# Load the accuracy metric from the `evaluate` library; used by compute_metrics.
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into an accuracy score.

    Transformers models return raw logits, so the predicted class is the
    index of the largest logit along the last axis.

    Args:
        eval_pred: A pair ``(logits, labels)`` as supplied by the Trainer.

    Returns:
        The result of ``metric.compute`` for the predicted vs. true labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

### Trainer

In [17]:
# Force a garbage-collection pass before building the Trainer so that objects
# left over from earlier cells are released first.
import gc
# gc.collect() returns the number of unreachable objects it found; as the
# last expression of the cell, that number is what the notebook displays.
gc.collect()


11

In [18]:
# Assemble the Trainer from the pieces prepared above: the model to fine-tune,
# the training configuration, the train/eval subsets, the tokenizer and the
# accuracy callback that is run at every evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [19]:
# Run fine-tuning. The progress bar total of 624 is the number of optimizer
# steps: 10,000 samples / (16 * 4 accumulated) = 156 steps per epoch, 4 epochs.
trainer.train()

  0%|          | 0/624 [00:00<?, ?it/s]

In [None]:
# Persist the fine-tuned model to ./mymodel; the inference cells below load
# it back from this directory.
trainer.save_model("./mymodel")

## 추론 Inference

In [None]:
# Build a text-classification pipeline from the directory saved above.
pipe = pipeline('text-classification', model="./mymodel")

### 테스트셋 사용

In [None]:
# Draw 100 random rows from the held-out *test* split. The validation split
# was already used as the Trainer's eval_dataset, so sampling from it here
# would not be an independent check of the fine-tuned model.
test_data = dataset['test'].shuffle(seed=424)[:100]
# Slicing a Dataset yields a dict of columns, which maps directly to a frame
# with 'label' and 'text' columns.
td = pd.DataFrame(test_data)
td

In [None]:
# Classify all sampled texts in one pipeline call.
preds = pipe(td['text'].tolist())

# One row per input text; the cell below reads the 'label' column, which holds
# strings such as 'LABEL_0' / 'LABEL_1'.
preds_df = pd.DataFrame(preds)
preds_df

In [None]:
# Rebuild the comparison frame from the raw pipeline output (`preds`) instead
# of mutating `preds_df` in place. The in-place version is not re-runnable: on
# a second execution the true 'label' column would be renamed to 'pred' and
# the ground-truth columns appended again.
preds_df = pd.concat(
    [
        pd.DataFrame(preds)
        # The pipeline's 'label' is the prediction; rename it so it does not
        # clash with the ground-truth 'label' column coming from `td`.
        .rename(columns={'label': 'pred'})
        # Convert 'LABEL_0' / 'LABEL_1' strings to the dataset's integer labels.
        .assign(pred=lambda frame: frame['pred'].map({'LABEL_1': 1, 'LABEL_0': 0})),
        td,
    ],
    axis=1,
)
preds_df

In [None]:
# Row-wise flag: True where the predicted class equals the true label.
mask = preds_df['pred'].eq(preds_df['label'])

# Number of correctly classified rows in the sample.
int(mask.sum())

### 내 데이터셋

In [None]:
# Hand-written reviews as (label, text) pairs; 1 = positive, 0 = negative.
# Several contain negation or mixed sentiment to probe the model.
labeled_examples = [
    (0, "절대로 강추할 수 없는 영화"),
    (0, "절대로 추천할 수 없는 영화"),
    (1, "또 보고 싶다."),
    (0, "이걸 보면서 웃을 수는 없다."),
    (0, "처음에는 재미있었는데 갈수록 산으로 가는 내용."),
    (1, "요즘 재미없는 영화만 나오는데 신선한 충격을 준 영화."),
    (1, "유명한 감독이나 배우가 나오지는 않지만 스토리가 감동"),
]

# Same record layout as the dataset rows: {'label': ..., 'text': ...}.
txts = [{'label': label, 'text': text} for label, text in labeled_examples]

txts_td = pd.DataFrame(txts)
txts_td

In [None]:
# Run the fine-tuned pipeline on the hand-written examples.
preds_txts = pipe(txts_td['text'].tolist())

In [None]:
# Put predictions and ground truth for the hand-written examples side by side.
preds_txts_df = pd.concat(
    [
        pd.DataFrame(preds_txts)
        # The pipeline's 'label' is the prediction; rename it so it does not
        # clash with the true 'label' column from `txts_td`.
        .rename(columns={'label': 'pred'})
        # Convert 'LABEL_0' / 'LABEL_1' strings to integer class ids.
        .assign(pred=lambda frame: frame['pred'].map({'LABEL_1': 1, 'LABEL_0': 0})),
        txts_td,
    ],
    axis=1,
)
preds_txts_df

### 파이프라인 사용하지 않고 모델 로딩

In [None]:
# Same inference as the pipeline, done by hand: load the saved model and
# encode the texts ourselves.
model_inf = AutoModelForSequenceClassification.from_pretrained("./mymodel")
# Uses the in-memory `tokenizer` created earlier (not one reloaded from
# ./mymodel). padding=True lets the texts be returned as PyTorch tensors
# ("pt") in a single batch.
inputs = tokenizer(txts_td['text'].tolist(), padding=True, return_tensors="pt")

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    logits = model_inf(**inputs).logits

In [None]:
# Predicted class index per input text: position of the largest logit along
# axis 1 (presumably the class axis, given num_labels=2 at training time).
logits.argmax(axis=1)