In [1]:
import numpy as np
import csv
import random
import torch
import torch.nn as nn

In [2]:
# Load the crawled Naver news headlines (column 0: headline text, column 1: integer label).
# NOTE(review): the file is opened with the platform default encoding, as before —
# confirm the CSV's actual encoding and pass `encoding=` explicitly if it differs.
# `newline=''` is what the csv module expects so quoted fields containing line
# breaks are parsed correctly; the `with` block closes the file even on error.
with open('섹션별_뉴스제목_데이터.csv', 'r', newline='') as f:
    # csv.reader yields [] for blank lines; drop them so every row has the
    # same number of columns and np.array() below builds a regular 2-D array.
    tmp_data = [row for row in csv.reader(f) if row]

# Shuffle with a fixed seed so the train/valid/test split is identical on every
# run (a private Random instance leaves the global `random` state untouched),
# then convert the list of rows to a NumPy array.
random.Random(42).shuffle(tmp_data)
data = np.array(tmp_data)

# Split the dataset 6:2:2 — `sep` is one fifth of the rows, so the slices are
# 3/5 train, 1/5 validation and the remainder test.
sep = int(len(data)/5)

train = data[:(3*sep)]
valid = data[(3*sep):(4*sep)]
test = data[(4*sep):]

# Give names to the text and label columns of each split;
# labels are converted from strings to Python ints.
train_text = train[:,0]
train_label = [int(value) for value in train[:,1]]

valid_text = valid[:,0]
valid_label = [int(value) for value in valid[:,1]]

test_text = test[:,0]
test_label = [int(value) for value in test[:,1]]

In [3]:
# Start of referenced code (the matching end marker is inside fit() further down)

import transformers
from transformers import BertTokenizer

# Tokenizer for the cased multilingual BERT checkpoint; lower-casing is disabled
# to match the cased vocabulary. Used as a module-level global by text_processing().
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=False)


# Convert raw text into the input format BERT expects
def text_processing(text, Max_len):
  """Encode each string in `text` into fixed-length BERT inputs.

  Uses the module-level `tokenizer`. Every line is encoded with special
  tokens added and truncated to `Max_len`, then right-padded with 0 up to
  `Max_len`.

  Args:
    text: iterable of strings to encode.
    Max_len: target sequence length after truncation/padding.

  Returns:
    Three parallel lists of 1-D tensors, one entry per input line:
    (input_ids, token_type_ids, mask_ids). Token type ids are all zeros;
    the mask is 1 on encoded tokens and 0 on padding.
  """
  input_ids, mask_ids, token_type_ids = [], [], []

  for sentence in text:
    # Special tokens are added by the tokenizer; truncation caps the length at Max_len.
    token_ids = tokenizer.encode(sentence, add_special_tokens = True, max_length=Max_len, truncation=True)
    pad_count = Max_len - len(token_ids)

    padded_ids = token_ids + [0]*pad_count
    segment_ids = [0]*len(padded_ids)
    attention = [1]*len(token_ids) + [0]*pad_count

    input_ids.append(torch.tensor(padded_ids))
    token_type_ids.append(torch.tensor(segment_ids))
    mask_ids.append(torch.tensor(attention))

  return input_ids, token_type_ids, mask_ids

# Encode each split with text_processing, using a fixed sequence length of 128.
train_input_ids, train_token_type_ids, train_mask_ids =text_processing(train_text, 128)
valid_input_ids, valid_token_type_ids, valid_mask_ids =text_processing(valid_text, 128)
test_input_ids, test_token_type_ids, test_mask_ids =text_processing(test_text, 128)

In [4]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class BertCls_dataset(Dataset):
  """Dataset pairing pre-encoded BERT inputs with their labels.

  Holds four parallel, index-aligned sequences: input ids, token type ids,
  attention masks and labels. The dataset length is the number of input-id
  entries.
  """

  def __init__(self, input_ids, token_type_ids, mask_ids, label):
    self.input, self.token_type = input_ids, token_type_ids
    self.attn_mask, self.label = mask_ids, label

  def __len__(self):
    return len(self.input)

  def __getitem__(self, idx):
    """Return (input_ids, token_type_ids, attention_mask, label) for `idx`.

    The label is wrapped in a one-element list before being converted, so
    it comes back as a tensor of shape (1,).
    """
    features = (self.input[idx], self.token_type[idx], self.attn_mask[idx])
    return (*features, torch.tensor([self.label[idx]]))

Batch_size = 16

# Build the datasets and their DataLoaders
train_dataset=BertCls_dataset(train_input_ids, train_token_type_ids, train_mask_ids, train_label)
valid_dataset=BertCls_dataset(valid_input_ids, valid_token_type_ids, valid_mask_ids, valid_label)
test_dataset=BertCls_dataset(test_input_ids, test_token_type_ids, test_mask_ids, test_label)

# Only the training loader is shuffled. Evaluation results do not depend on
# batch order, so the validation/test loaders iterate in dataset order, which
# keeps the collected predictions aligned with the dataset and reproducible.
train_dataloader = DataLoader(train_dataset, batch_size=Batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=Batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=Batch_size, shuffle=False)

In [5]:
from transformers import BertModel


# Pretrained multilingual BERT encoder, loaded once at module level.
# BertCls.__init__ stores this same object, so every BertCls instance created
# in this session shares (and keeps fine-tuning) these weights.
bertcls = BertModel.from_pretrained("bert-base-multilingual-cased")

class BertCls(nn.Module):
  """BERT encoder followed by a mean+max pooled linear classification head.

  Wraps the module-level `bertcls` encoder. The encoder's per-token hidden
  states are mean-pooled and max-pooled over the sequence dimension, the two
  results are concatenated (hence the 768*2 input width), passed through
  ReLU and dropout, and projected to `num_category` logits.
  """

  def __init__(self):
    super().__init__()
    self.bert_cls = bertcls
    self.dropout = nn.Dropout(0.3)
    self.relu = nn.ReLU()
    self.out = nn.Linear(768*2, num_category)

  def forward(self, input_ids, token_type_ids, mask_ids):
    """Return classification logits of shape (batch, num_category).

    Args:
      input_ids: token id tensor passed to the encoder.
      token_type_ids: segment id tensor passed to the encoder.
      mask_ids: attention mask tensor passed to the encoder.
    """
    encoder_outputs = self.bert_cls(input_ids, attention_mask=mask_ids, token_type_ids=token_type_ids)
    # Index instead of tuple-unpacking: the encoder may return either a plain
    # tuple or a dict-like ModelOutput. Unpacking a ModelOutput iterates its
    # keys (strings), whereas [0] yields the hidden states in both cases.
    output = encoder_outputs[0]
    # Instead of using the [CLS] vector, mean- and max-pool over the sequence
    # dimension and concatenate the two results.
    # NOTE(review): pooling runs over every sequence position, padded ones
    # included (mask_ids is not applied here) — confirm this is intended.
    apool = torch.mean(output, 1)
    mpool, _ = torch.max(output, 1)
    x = torch.cat((apool, mpool), 1)
    x = self.relu(x)
    x = self.dropout(x)
    x = self.out(x)

    return x

In [6]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import time

# Training function
def train_one_epoch(data_loader, model, optimizer, device, loss_fn):
  """Run one optimisation pass over `data_loader`.

  Puts the model in training mode, performs one optimizer step per batch,
  prints the mean batch loss and returns it.

  Args:
    data_loader: sized iterable of (input_ids, token_type_ids, attn_mask_ids, label) batches.
    model: module called as model(input_ids, token_type_ids, attn_mask_ids).
    optimizer: optimizer over the model's parameters.
    device: device the batch tensors are moved to.
    loss_fn: callable taking (logits, flattened labels) and returning a scalar loss.

  Returns:
    The average loss over all batches as a float, or NaN when the loader
    yields no batches.
  """
  model.train()
  num_batches = len(data_loader)
  total_loss = 0.0

  # Clear gradients left over from an earlier (e.g. interrupted) run so they
  # cannot leak into the first optimizer step of this epoch.
  optimizer.zero_grad()

  for input_ids, token_type_ids, attn_mask_ids, label in tqdm(data_loader, total=num_batches):
    input_ids = input_ids.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    attn_mask_ids = attn_mask_ids.to(device, dtype=torch.long)
    label = label.to(device, dtype=torch.long)

    output = model(input_ids, token_type_ids, attn_mask_ids)
    # Labels are flattened to 1-D before being handed to the loss.
    loss = loss_fn(output, label.view(-1))
    total_loss += loss.item()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

  # Guard against an empty loader instead of dividing by zero.
  avg_train_loss = total_loss / num_batches if num_batches else float("nan")
  print(" Average training loss: {0:.2f}".format(avg_train_loss))
  return avg_train_loss

# Evaluation function (used for both validation and test)
def eval_one_epoch(data_loader, model,  device, loss_fn):
  """Run the model over `data_loader` without gradients and collect predictions.

  Args:
    data_loader: sized iterable of (input_ids, token_type_ids, attn_mask_ids, label) batches.
    model: module called as model(input_ids, token_type_ids, attn_mask_ids).
    device: device the input tensors are moved to.
    loss_fn: kept for interface compatibility; it is not used, because the
      loss was never part of this function's result.

  Returns:
    (fin_outputs, fin_targets): the argmax class index of every sample as a
    flat list, and the labels converted with tolist() in the batch layout
    they arrived in (the notebook's dataset yields shape-(1,) labels, giving
    one single-element list per sample).
  """
  model.eval()
  fin_targets = []
  fin_outputs = []

  with torch.no_grad():
    for input_ids, token_type_ids, attn_mask_ids, label in tqdm(data_loader, total=len(data_loader)):
      input_ids = input_ids.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      attn_mask_ids = attn_mask_ids.to(device, dtype=torch.long)

      logits = model(input_ids, token_type_ids, attn_mask_ids)
      # Predicted class is the index of the largest logit in each row.
      pred = np.argmax(logits.cpu().numpy(), axis=1).flatten()

      # Labels are only collected, so they do not need to visit the device.
      fin_targets.extend(label.to(dtype=torch.long).tolist())
      fin_outputs.extend(pred.tolist())

  return fin_outputs, fin_targets
  
def fit(train_dataloader, valid_dataloader, EPOCHS=3):
  """Fine-tune a fresh BertCls and report validation accuracy after each epoch.

  Reads the module-level `device` and `lr`. Each epoch runs
  train_one_epoch on `train_dataloader`, then eval_one_epoch on
  `valid_dataloader`, and prints the accuracy of the predictions.

  Args:
    train_dataloader: loader passed to train_one_epoch.
    valid_dataloader: loader passed to eval_one_epoch.
    EPOCHS: number of train/eval rounds.

  Returns:
    The trained BertCls model.
  """
  model = BertCls().to(device)
  criterion = nn.CrossEntropyLoss()
  adamw = torch.optim.AdamW(model.parameters(),lr=lr)

  for epoch_no in range(1, EPOCHS + 1):
    print(f"EPOCHS:{epoch_no}")

    print('TRAIN')
    train_one_epoch(train_dataloader, model, adamw, device, criterion)

    print('EVAL')
    predictions, labels = eval_one_epoch(valid_dataloader, model,  device, criterion)
    # The printed label reads "auc", but the value is accuracy_score's accuracy.
    accuracy = accuracy_score(np.array(labels), predictions)
    print(f"auc;{accuracy}")

  # End of referenced code
  return model
    
    
# Run configuration.
# Max_len is not read by any call in this cell; the text_processing calls
# earlier in the notebook pass the literal 128 themselves.
Max_len = 128
num_category = 6
lr = 3e-5
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook no longer fails on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
EPOCHS=4

# Train the model (fine-tune on the training data)
bert_model = fit(train_dataloader, valid_dataloader, EPOCHS)

  0%|                                                                       | 0/1224 [00:00<?, ?it/s]

EPOCHS:1
TRAIN


100%|████████████████████████████████████████████████████████████| 1224/1224 [10:13<00:00,  1.99it/s]
  0%|                                                                        | 0/408 [00:00<?, ?it/s]

 Average training loss: 0.93
EVAL


100%|██████████████████████████████████████████████████████████████| 408/408 [00:54<00:00,  7.50it/s]

auc;0.7380113375210663





In [7]:
# Apply the trained model to the test data

loss_fn = nn.CrossEntropyLoss() # loss; passed because eval_one_epoch takes a loss_fn argument

print('TEST')
outputs, targets = eval_one_epoch(test_dataloader, bert_model,  device, loss_fn)
targets = np.array(targets)
# NOTE: the variable and the printed label say "auc", but accuracy_score returns accuracy.
auc = accuracy_score(targets, outputs)
print(f"auc;{auc}")

  0%|▏                                                               | 1/409 [00:00<00:54,  7.52it/s]

TEST


100%|██████████████████████████████████████████████████████████████| 409/409 [00:53<00:00,  7.61it/s]

auc;0.7446775922805943



