<a href="https://colab.research.google.com/github/yeonok93/CP2/blob/main/1_%EB%89%B4%EC%8A%A4%EC%B9%B4%ED%85%8C%EA%B3%A0%EB%A6%AC%EB%B6%84%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 모듈 import, 환경 설정

In [None]:
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
!pip install kobert_transformers

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
import json
from sklearn.model_selection import train_test_split
import os

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW    #모델의 초기값 함수를 아담으로 지정
from transformers.optimization import get_cosine_schedule_with_warmup

In [3]:
device = torch.device("cuda:0")   #gpu 사용
# device = torch.device("cpu")
if torch.cuda.is_available():
    print('GPU running')
else:
    print('GPU not running')

GPU running


## 데이터 준비

In [4]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# text의 길이가 80이상 256이하의 문장 만들기
def text_slicing(text, max_seq_len) :
  text = text.split('\n')
  saved_text, s, i, data = '', 0, 0, []
  for t in text :
    if len(t) < 8 or t[0] in ['o','『', '(', '┌','│', '└', 'ㄴ', '┌','├','◎', '[', '■', 'ㄱ', '-', '.', '<'] : 
      continue
    if t[-1] in ['쪽', '-', '>', ']'] or t[1] in [')'] or t[0:2] in ['vs', '만,', '해!'] :
      continue
    if len(t) > max_seq_len :
        data.append(saved_text)
        data.append(t)
        saved_text, s = '', 0
        i += 1
    elif s + len(t) > max_seq_len :
        data.append(saved_text)
        saved_text, s = t, len(t)
        i += 1
    else :
        saved_text += t
        s += len(t)
  data.append(saved_text)
  
  ret_data = []
  for d in data :
    if len(d) >= 80 :
      ret_data.append(d)
  return ret_data

In [8]:
data_path = '/content/drive/MyDrive/news_class9x1400'
max_seq_len = 256
train_dataset = []
train_labels = []

for (path, dir, files) in os.walk(data_path):
    for filename in files:
        ext = os.path.splitext(filename)[-1]
        if ext == '.txt':
            with open("%s/%s" % (path, filename), encoding="utf-8") as f:
                label = path[path.rindex('/')+1:]
                text = f.read()
                data = text_slicing(text, max_seq_len)
                train_dataset.extend(data)
                train_labels.extend([label]*len(data))

print('뉴스 카테고리 :', set(train_labels))
len(train_dataset), len(train_labels)

뉴스 카테고리 : {'ITscience', 'politic', 'economy', 'health', 'sport', 'life', 'culture', 'social', 'entertainment'}


(137023, 137023)

In [9]:
train_dataset[0]

'자꾸 읽다가 정들어버린 〈탈북 여대생〉책을 읽는 일, 책을 만드는 일. 오랫동안 꿈꿔왔던 일을 하기 위해 출판사에 발을 내딛었습니다.그리고 첫 출근날, 처음으로 맞이했던 원고는 교정이 거의 완성된 원고였기에 심하게(?) 교정교열을 볼 필요는 없었지만 사소한 맞춤법, 띄어쓰기 등을 교정하였지요.의문나는 점은 저자와 상의하면서 원고를 다듬어 나갔고, 원고를 다 살펴보고 나서는 예쁘게 포장해 줄 카피를 연구하고 보도 자료를 작성했습니다.'

## 전처리

In [10]:
#bert 모델, vocab 불러오기
bertmodel, vocab = get_pytorch_kobert_model()

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


In [11]:
df = pd.DataFrame({'content':train_dataset,
                   'label':train_labels})

# dataset의 balance 여부 확인
df.groupby(by=['label']).count()

Unnamed: 0_level_0,content
label,Unnamed: 1_level_1
ITscience,16362
culture,15139
economy,16302
entertainment,15334
health,16004
life,14764
politic,13766
social,14396
sport,14956


In [12]:
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
num_labels = len(label_encoder.classes_)

df['encoded_label'] = np.asarray(label_encoder.transform(df['label']), dtype=np.int32)
df.tail(3)

Unnamed: 0,content,label,encoded_label
137020,"궁금해 물으니 꼼수라도 부리는 양 우물쭈물하지 않고 찝찝함 없이 대답하고, 무례하지...",economy,2
137021,공사중인 현장이기에 제한된 공간에 머물 수밖에 없었지만 가족의 보금자리가 시공되는 ...,economy,2
137022,요즘 같은 불경기에 이런저런 정책이나 금융지원도 도움이 되겠지만 건설사들이 자구책으...,economy,2


In [13]:
# Split Train and Validation data
train_texts, test_texts, train_labels, test_labels = train_test_split(df.content, df.encoded_label, test_size=0.2, random_state=123, shuffle=True, stratify=df.encoded_label)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=123, shuffle=True, stratify=train_labels)

print(len(train_texts), len(train_labels))
print(len(val_texts), len(val_labels))
print(len(test_texts), len(test_labels))

87694 87694
21924 21924
27405 27405


In [14]:
train_dataset, val_dataset, test_dataset = [], [], []

for text, label in zip(train_texts, train_labels) :
  train_dataset.append([text, label])

for text, label in zip(val_texts, val_labels) :
  val_dataset.append([text, label])

for text, label in zip(test_texts, test_labels) :
  test_dataset.append([text, label])

len(train_dataset), len(val_dataset), len(test_dataset)

(87694, 21924, 27405)

In [15]:
train_dataset[0]

['납득할 수 없는 이유로 작별을 고하게 됩니다라며 아쉬움을 전했다.마지막으로 허 아나운서는 5천만 소비자의 불만이 사라지는 그 날까지 함께하자던 약속. 그 약속을 결정권자는 기억하지 못하시나봅니다.잊지 말아주세요.수없이 많아진 소비자 고발 프로그램 가운데 그 시작은 MBC가 만들어 낸 불만제로라는 작품이었음을이라며 장문의 글을 끝맺었다.MBC 노동조합 부당하게 폐지당한 <불만제로> 후속 <블랙박스>는 졸속프로그램',
 3]

In [16]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [17]:
# Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [34]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [19]:
#BERTDataset 클래스 이용, TensorDataset으로 만들어주기
train_data = BERTDataset(train_dataset, 0, 1, tok, max_len, True, False)
val_data = BERTDataset(val_dataset, 0, 1, tok, max_len, True, False)
test_data = BERTDataset(test_dataset, 0, 1, tok, max_len, True, False)

In [20]:
#배치 및 데이터로더 설정
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=1)

In [21]:
del df
del train_texts
del train_labels
del val_texts
del val_labels
del test_texts
del test_labels

del train_data
del val_data
del test_data
del tok
del tokenizer
del train_dataset
del val_dataset
del test_dataset

In [22]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=9,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        #valid_length 까지만 1, 나머지는 0으로 mask를 생성
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [23]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [24]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [25]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()



In [26]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [27]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [28]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [29]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        # if batch_id % log_interval == 0:
        #     print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 1 train acc 0.6422352037094926


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 1 test acc 0.7586571793468345


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 2 train acc 0.7834544649369594


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 2 test acc 0.7713555920452472


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 3 train acc 0.8513516593727206


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 3 test acc 0.7789819376026272


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 4 train acc 0.8960042851932896


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 4 test acc 0.7808064221857325


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 5 train acc 0.9299895149525893


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 5 test acc 0.7790549169859514


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 6 train acc 0.9536036652078774


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 6 test acc 0.7893450100346652


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 7 train acc 0.971473832968636


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 7 test acc 0.7948549534756432


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 8 train acc 0.9834518599562363


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 8 test acc 0.7965699689837621


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 9 train acc 0.9900847921225383


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 9 test acc 0.7987228607918263


  0%|          | 0/1371 [00:00<?, ?it/s]

epoch 10 train acc 0.9934810357403355


  0%|          | 0/27405 [00:00<?, ?it/s]

epoch 10 test acc 0.8025542784163474


In [30]:
def softmax(vals, idx):
    valscpu = vals.cpu().detach().squeeze(0)
    a = 0
    for i in valscpu:
        a += np.exp(i)
    return ((np.exp(valscpu[idx]))/a).item() * 100

In [31]:
torch.save(model.state_dict(), "news_category_classification.pt")
modelload = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
modelload.load_state_dict(torch.load("news_category_classification.pt", device))

<All keys matched successfully>

In [None]:
modelload.eval()

In [32]:
def testModel(model, seq):
    cate = ["정치","경제","사회", "생활/문화","세계","기술/IT", "연예", "스포츠"]
    tmp = [seq]
    transform = nlp.data.BERTSentenceTransform(tok, max_len, pad=True, pair=False)
    tokenized = transform(tmp)

    modelload.eval()
    result = model(torch.tensor([tokenized[0]]).to(device), [tokenized[1]], torch.tensor(tokenized[2]).to(device))
    idx = result.argmax().cpu().item()
    print("뉴스의 카테고리는:", cate[idx])
    print("신뢰도:", "{:.2f}%".format(softmax(result,idx)))

In [35]:
testModel(model, "신형 아이패드 프로에 m1칩 탑재 예정")

뉴스의 카테고리는: 정치
신뢰도: 99.99%


  


In [36]:
testModel(model, "해당 녹취록에는 대장동 핵심 인물인 김만배 씨가 윤석열 국민의힘 대통령후보가 과거 서울 중앙지검에 근무했던 시절 부산저축은행 대출비리 의혹을 봐주기 수사했다는 취지의 내용이 담겼다.")

뉴스의 카테고리는: 사회
신뢰도: 99.97%
