# 네이버 영화 리뷰 감성분석

최소한의 성능이라도 작동하는 코드를 작성하자.

In [1]:
# import modules

import wandb
import random
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torch_optimizer as custom_optim

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgyul611[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
id = wandb.util.generate_id()
id

'xeke1rp8'

In [4]:

wandb.init(project="Binary_Classification_nsmc",
           entity="gyul611",
           id=id,
           name="binary_nsmc")

## 데이터셋 불러오기

In [5]:
from datasets import load_dataset

nsmc_dataset = load_dataset('nsmc')

In [6]:
nsmc_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

간단한 EDA를 통해 데이터 분포와 특징을 살펴보자.

In [7]:
# data move to dataFrame
nsmc_df = nsmc_dataset['train'].to_pandas()

In [8]:
nsmc_df.groupby('label').count()

Unnamed: 0_level_0,id,document
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,75173,75173
1,74827,74827


label 분포는 적당하다

In [9]:
nsmc_df['length'] = nsmc_df['document'].str.len()
nsmc_df['length'].describe()

count    150000.000000
mean         35.203353
std          29.532097
min           0.000000
25%          16.000000
50%          27.000000
75%          42.000000
max         146.000000
Name: length, dtype: float64

0-0 base line  
리뷰인데, 최솟값이 0인 것이 보인다.   
추후에 데이터를 자세히 살펴보자.

In [10]:
# Set Aurguement
batch_size = 256
max_length = 146
warmup_ratio = 0.05
pretrained_model = "klue/roberta-base"

## 전처리
1. train 데이터를 random하게 shuffleing한다.
2. train 데이터를 train과 valid셋으로 나눈다.
3. DataLoader에 주입하고 batch 별 데이터에 맞게 collate를 수행함

In [11]:
def shuffle_and_split(data, valid_ratio=.2):
    data, labels = data['document'], data['label']

    # label-index map을 만듬(해당 task에서는 label이 이미 숫자이므로 수행할 필요는 없지만 
    # 범용성을 위해 수행)
    unique_labels = list(set(labels))
    label_to_index = {}
    index_to_label = {}
    for i, label in enumerate(unique_labels):
        label_to_index[label] = i
        index_to_label[i] = label

    # label value를 integer value로 convert 수행
    labels = list(map(label_to_index.get, labels))

    # Shuffle before split into train and validation set
    shuffled = list(zip(data, labels))
    random.shuffle(shuffled)
    data = [element[0] for element in shuffled]
    labels = [element[1] for element in shuffled]
    idx = int(len(data) * (1 - valid_ratio)) # split할 경계의 index값

    data = {

        'train':{
            'document':data[:idx],
            'label':labels[:idx]
        },
        'validation':{
            'document':data[idx:],
            'label':labels[idx:]
        }
    }

    return data, index_to_label
                    

In [12]:
data, index_to_label = shuffle_and_split(nsmc_dataset['train'])
print(index_to_label)

{0: 0, 1: 1}


In [13]:
class TextClassificationCollator():
    def __init__(self, tokenizer, max_length, with_text=True):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.with_text = with_text # tokenization을 통과하면 text는 반환하지 않기 때문에 필요에 
                                   # 따라 원본 텍스르도 함께 반환하도록 한다. 

    def __call__(self, samples):
        texts, labels = [], []
        for text, label in samples:
            texts += [text]
            labels += [label]

        encoding = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length
        )

        return_value = {
            'input_ids':encoding['input_ids'],
            'token_type_ids':encoding['token_type_ids'],
            'attention_mask':encoding['attention_mask'],
            'labels':torch.tensor(labels, dtype=torch.long),    
        }
        if self.with_text:
            return_value['text'] = texts 

        return return_value

class TextClassificationDataset(Dataset):

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        return text, label

In [14]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

In [15]:
train_loader = DataLoader(
    TextClassificationDataset(data['train']['document'], data['train']['label']),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=TextClassificationCollator(tokenizer, max_length)
)
valid_loader = DataLoader(
    TextClassificationDataset(data['validation']['document'], data['validation']['label']),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=TextClassificationCollator(tokenizer, max_length)
)

In [16]:
print(len(train_loader)*batch_size)
print(len(valid_loader)*batch_size)

120064
30208


## 학습 준비

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [18]:
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# set hyper-parameters
epochs = 10
learning_rate = 5e-3 # 0.0005
n_total_iterations = len(train_loader)*batch_size
n_warmup_steps = int(n_total_iterations * warmup_ratio)

print("total_iteration: ", n_total_iterations)
print("number of warmup steps: ", n_warmup_steps)

total_iteration:  120064
number of warmup steps:  6003


In [20]:
# set AdamW
optimizer = optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-8
)

In [21]:
# set criterion
crit = nn.BCELoss()

In [22]:
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    n_warmup_steps,
    n_total_iterations
)

In [23]:
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [24]:
def get_accuracy(pred, label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [25]:
model.init_weights()

In [None]:

for epoch in range(epochs):
    train_losses = []
    model.train()

    for step, batch in enumerate(train_loader):
        
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        y_i = batch['labels'].to(device)

        model.zero_grad() #??
        y_hat_i = model(input_ids=input_ids, 
                        token_type_ids=token_type_ids, 
                        attention_mask=attention_mask).logits
        # print(y_i.float())
        # print(y_hat_i.t()[1])
        # print(torch.sigmoid(y_hat_i.t()[1]))
        # print(y_hat_i)
       
        loss = crit(torch.sigmoid(y_hat_i.t()[1]), y_i.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # if step % 100 == 0:
        #     print(f"batch loss: {loss:.5f}")
        print(f"batch loss: {loss:.5f}")
        # train_losses.append(loss.item())
        # train_acc += get_accuracy(y_hat_i.argmax(dim=1), label)
    

    # print("Train loss: ", np.mean(train_losses))
    # print("Train acc: ", train_acc/len(train_loader.dataset))

batch loss: 0.69270
batch loss: 0.69363
batch loss: 0.69512
batch loss: 0.68934
batch loss: 0.69292
batch loss: 0.69289
batch loss: 0.69397
batch loss: 0.69471
batch loss: 0.69520
batch loss: 0.69348
batch loss: 0.69244
batch loss: 0.69405
batch loss: 0.69775
batch loss: 0.69238
batch loss: 0.69226
batch loss: 0.69177
batch loss: 0.68861
batch loss: 0.69395
batch loss: 0.69408
batch loss: 0.69526
batch loss: 0.69370
batch loss: 0.69510
batch loss: 0.69365


왜 Loss가 정상적으로 줄어들지 않을까?  
loss function을 잘 못 설정했나?  
optimizer는 제대로 작동하고 있나?  
model의 output에 문제는 없는가?  