# 네이버 영화 리뷰 감성분석

최소한의 성능이라도 작동하는 코드를 작성하자.

In [36]:
# import modules

import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torch_optimizer as custom_optim

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


## 데이터셋 불러오기

In [1]:
from datasets import load_dataset

nsmc_dataset = load_dataset('nsmc')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nsmc_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

간단한 EDA를 통해 데이터 분포와 특징을 살펴보자.

In [3]:
# data move to dataFrame
nsmc_df = nsmc_dataset['train'].to_pandas()

In [4]:
nsmc_df.groupby('label').count()

Unnamed: 0_level_0,id,document
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,75173,75173
1,74827,74827


label 분포는 적당하다

In [5]:
nsmc_df['length'] = nsmc_df['document'].str.len()
nsmc_df['length'].describe()

count    150000.000000
mean         35.203353
std          29.532097
min           0.000000
25%          16.000000
50%          27.000000
75%          42.000000
max         146.000000
Name: length, dtype: float64

0-0 base line  
리뷰인데, 최솟값이 0인 것이 보인다.   
추후에 데이터를 자세히 살펴보자.

In [39]:
# Set Aurguement
batch_size = 256
max_length = 146
warmup_ratio = 0.2
pretrained_model = "klue/roberta-base"

## 전처리
1. train 데이터를 random하게 shuffleing한다.
2. train 데이터를 train과 valid셋으로 나눈다.
3. DataLoader에 주입하고 batch 별 데이터에 맞게 collate를 수행함

In [20]:
def shuffle_and_split(data, valid_ratio=.2):
    data, labels = data['document'], data['label']

    # label-index map을 만듬(해당 task에서는 label이 이미 숫자이므로 수행할 필요는 없지만 
    # 범용성을 위해 수행)
    unique_labels = list(set(labels))
    label_to_index = {}
    index_to_label = {}
    for i, label in enumerate(unique_labels):
        label_to_index[label] = i
        index_to_label[i] = label

    # label value를 integer value로 convert 수행
    labels = list(map(label_to_index.get, labels))

    # Shuffle before split into train and validation set
    shuffled = list(zip(data, labels))
    random.shuffle(shuffled)
    data = [element[0] for element in shuffled]
    labels = [element[1] for element in shuffled]
    idx = int(len(data) * (1 - valid_ratio)) # split할 경계의 index값

    data = {

        'train':{
            'document':data[:idx],
            'label':labels[:idx]
        },
        'validation':{
            'document':data[idx:],
            'label':labels[:idx]
        }
    }

    return data, index_to_label
                    

In [21]:
data, index_to_label = shuffle_and_split(nsmc_dataset['train'])

In [14]:
class TextClassificationCollator():
    def __init__(self, tokenizer, max_length, with_text=True):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.with_text = with_text # tokenization을 통과하면 text는 반환하지 않기 때문에 필요에 
                                   # 따라 원본 텍스르도 함께 반환하도록 한다. 

    def __call__(self, samples):
        texts, labels = [], []
        for text, label in samples:
            texts += [text]
            labels += [label]

        encoding = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length
        )

        return_value = {
            'input_ids':encoding['input_ids'],
            'attention_mask':encoding['attention_mask'],
            'labels':torch.tensor(labels, dtype=torch.long),    
        }
        if self.with_text:
            return_value['text'] = texts 

        return return_value

class TextClassificationDataset(Dataset):

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        return text, label

In [19]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

Downloading (…)okenizer_config.json: 100%|████████████████████████████████████████████████████| 375/375 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████| 248k/248k [00:00<00:00, 721kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████████████████████████████████████| 752k/752k [00:00<00:00, 29.7MB/s]
Downloading (…)cial_tokens_map.json: 100%|████████████████████████████████████████████████████| 173/173 [00:00<?, ?B/s]


In [24]:
train_loader = DataLoader(
    TextClassificationDataset(data['train']['document'], data['train']['label']),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=TextClassificationCollator(tokenizer, max_length)
)
valid_loader = DataLoader(
    TextClassificationDataset(data['validation']['document'], data['validation']['label']),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=TextClassificationCollator(tokenizer, max_length)
)

In [27]:
print(len(train_loader)*batch_size)
print(len(valid_loader)*batch_size)

120064
30208


## 학습 준비

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [30]:
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# set hyper-parameters
epochs = 5
learning_rate = 5e-4
n_total_iterations = len(train_loader)*batch_size
n_warmup_steps = int(n_total_iterations * warmup_ratio)

In [37]:
# set AdamW
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    },
    {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]

optimizer = optim.AdamW(
    optimizer_grouped_parameters,
    lr=learning_rate,
    eps=1e-8
)

In [38]:
# set criterion
crit = nn.BCELoss()

In [41]:
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    n_warmup_steps,
    n_total_iterations
)

In [42]:
model.cuda()
crit.cuda()

BCELoss()

In [None]:
# trainer 