In [29]:
!pip install tensorflow
!pip install torch
from tensorflow.python.client import device_lib 
# device_lib.list_local_devices()



In [2]:
# Hugging Face의 트랜스포머 모델을 설치
!pip install transformers
# !pip install pytorch-transformers



In [3]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [4]:
# 디렉토리의 파일 목록
!ls nsmc -la

total 38636
drwxrwxr-x  5 team2 team2     4096 Mar 15 16:16 .
drwxr-xr-x 16 team2 team2     4096 Mar 22 09:18 ..
drwxrwxr-x  2 team2 team2     4096 Mar 15 16:16 code
drwxrwxr-x  8 team2 team2     4096 Mar 15 16:16 .git
-rw-rw-r--  1 team2 team2  4893335 Mar 15 16:16 ratings_test.txt
-rw-rw-r--  1 team2 team2 14628807 Mar 15 16:16 ratings_train.txt
-rw-rw-r--  1 team2 team2 19515078 Mar 15 16:16 ratings.txt
drwxrwxr-x  2 team2 team2   462848 Mar 15 16:16 raw
-rw-rw-r--  1 team2 team2     2596 Mar 15 16:16 README.md
-rw-rw-r--  1 team2 team2    36746 Mar 15 16:16 synopses.json


## 모델로드

In [19]:
# Ask TensorFlow for the name of the GPU device it can see
device_name = tf.test.gpu_device_name()

# Stop right here unless TensorFlow reports the first GPU; otherwise announce it
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [20]:
# Pick the torch device: CUDA when PyTorch can use it, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was chosen
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-PCIE-32GB


In [25]:
# Load the tokenizer, the BERT classifier and the optimizer, then restore the
# saved model/optimizer state dicts from disk.

# Directory that holds the saved state dicts
model_PATH = "/home/team2/model/"

# Tokenizer used by convert_input_data(). It is not created in any other visible
# cell, so it is instantiated here to keep the notebook runnable on a fresh kernel.
# NOTE(review): assumes the tokenizer of the same pretrained checkpoint as the
# model was used for fine-tuning — confirm.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=False)

# BERT model with a sequence-classification head for 3 labels
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)

# Move the model to the device selected in the device-setup cell. Unlike an
# unconditional model.cuda(), this also works when that cell fell back to the CPU.
model.to(device)

# Optimizer settings
optimizer = AdamW(model.parameters(),
                  lr=2e-5,   # learning rate
                  eps=1e-8   # epsilon that guards against division by zero
                  )

# Restore the saved weights and optimizer state.
# map_location=device lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load(model_PATH + "model_state_dict.pt", map_location=device))
optimizer.load_state_dict(torch.load(model_PATH + "optimizer_state_dict.pt", map_location=device))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

## 라벨링

In [22]:
def convert_input_data(sentences, max_len=128):
    """Turn raw sentences into padded token-id and attention-mask tensors.

    Each sentence is tokenized with the module-level ``tokenizer``, mapped to
    vocabulary ids, and then truncated / zero-padded at the end to ``max_len``.

    Args:
        sentences: iterable of strings to encode.
        max_len: length every sequence is cut or padded to (default 128, the
            value that used to be hard-coded in this function).

    Returns:
        A tuple ``(inputs, masks)`` of torch tensors, one row per sentence and
        ``max_len`` columns. ``inputs`` holds the token ids; ``masks`` holds
        1.0 where the id is greater than 0 and 0.0 elsewhere (padding).

    NOTE(review): no [CLS]/[SEP] special tokens are added here — confirm this
    matches the preprocessing used when the model was fine-tuned.
    """
    # Tokenize each sentence and convert its tokens to vocabulary ids
    token_ids = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))
        for sent in sentences
    ]

    # Cut long sequences and fill short ones with 0, both at the end
    input_ids = pad_sequences(token_ids, maxlen=max_len, dtype="long",
                              truncating="post", padding="post")

    # Attention mask: 1.0 for real tokens (id > 0), 0.0 for padding.
    # Computed in one vectorised step instead of a Python loop over every id.
    attention_masks = (input_ids > 0).astype("float32")

    # Convert both arrays to PyTorch tensors
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [23]:
# 문장 테스트
def test_sentences(sentences):

    # 평가모드로 변경
    model.eval()

    # 문장을 입력 데이터로 변환
    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

In [24]:
def softmax(a, axis=-1):
    """Numerically stable softmax.

    The previous version normalised over the whole array, so a batch of logits
    with shape (n, k) and n > 1 produced rows that did not sum to 1. The
    normalisation now runs along ``axis`` (last axis by default); for 1-D
    input and for a single row of shape (1, k) the result is unchanged.

    Args:
        a: array-like of scores.
        axis: axis to normalise over. Pass ``None`` to normalise over the
            whole array (the old behaviour).

    Returns:
        numpy array of the same shape as ``a`` whose entries sum to 1 along
        ``axis``.
    """
    a = np.asarray(a)
    if a.ndim == 0:
        # A scalar has no axis to reduce over; treat it as a whole-array softmax
        axis = None
    # Subtract the maximum before exponentiating to avoid overflow
    e_a = np.exp(a - np.max(a, axis=axis, keepdims=True))
    return e_a / e_a.sum(axis=axis, keepdims=True)

In [27]:
# Print numpy arrays (such as the softmax outputs) with 6 decimals and without
# scientific notation
np.set_printoptions(suppress=True, precision=6)

# NOTE(review): the pandas float format is set and then immediately reset to its
# default, so the net effect on pandas display is none — confirm which was intended.
pd.set_option('display.float_format', '{:.5f}'.format)
pd.reset_option('display.float_format')

In [28]:
# Smoke test on a single sentence: raw logits, predicted class index, softmax scores.
# The calls are left commented out.
# NOTE(review): the output shown below this cell presumably comes from an earlier run — confirm.
# logits = test_sentences(['주연배우가 아깝다. 총체적 난국...'])

# print(logits)
# print(np.argmax(logits))
# print(softmax(logits))

[[ 5.113339 -3.301582 -2.395385]]
0
[[0.999231 0.000221 0.000548]]


In [78]:
# Read the tab-separated news file. header=None makes pandas treat the first
# line as data, and the two columns are named explicitly.
real = pd.read_csv(
    './data/union_news.csv',
    sep="\t",
    header=None,
    names=['document', 'label'],
)
# print(real.shape)
# real.head(10)

# Only the text column is fed to the classifier
news = real['document']

In [79]:
from tqdm import tqdm

# Parallel lists, one entry per news text: the text itself, the index of the
# highest-scoring class, and the softmax of the logits
document, label, result = [], [], []

for sentence in tqdm(news):
    # Classify one text at a time (a batch containing a single sentence)
    logits = test_sentences([sentence])
    document.append(sentence)
    label.append(np.argmax(logits))
    result.append(softmax(logits))

# Collect the three lists into one table
dff = pd.DataFrame({"document": document, "label": label, "result": result})

100%|██████████| 1350/1350 [00:11<00:00, 112.69it/s]


In [80]:
# Write the labelled table to a UTF-8 encoded CSV file in the working directory
dff.to_csv(path_or_buf='union_news_label.csv', encoding='utf-8')