In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [3]:
# NOTE: this `logging` is the Transformers logging helper module, not the
# standard-library `logging` package.
from transformers import logging

# Only emit Transformers log records at ERROR level or above.
logging.set_verbosity(logging.ERROR)

In [4]:
# Load the tokenizer published with the "klue/bert-base" checkpoint so that
# token ids match the vocabulary the model was trained with.
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

In [5]:
# Load the KLUE BERT encoder as a TensorFlow model.
#   from_pt=True              -> convert the weights from the PyTorch checkpoint.
#   output_hidden_states=True -> the forward pass also returns `hidden_states`
#                                in addition to the last layer and pooler output.
model = TFBertModel.from_pretrained(
    "klue/bert-base",
    from_pt=True,
    output_hidden_states=True,
)

In [4]:
# Read the review corpus. Each usable line has the form
# "<review text>\t<integer label>"; lines that do not split into exactly two
# tab-separated fields are skipped.
with open('Korean_movie_reviews_2016.txt', encoding='utf-8') as f:
    docs = [
        (fields[0], int(fields[1]))
        for fields in (line.strip().split('\t') for line in f)
        if len(fields) == 2
    ]

# zip(*[]) yields nothing, so the unpacking below would fail with an opaque
# "not enough values to unpack" error. Fail early with a clear message instead.
if not docs:
    raise ValueError(
        "No '<text>\\t<label>' rows found in Korean_movie_reviews_2016.txt"
    )

# Transpose the (text, label) pairs into two parallel tuples.
texts, labels = zip(*docs)

In [5]:
# Keep only the first 2000 reviews (presumably to keep the BERT forward pass
# fast). The same cut-off must be applied to `labels` so the two stay aligned.
texts = texts[:2000]

In [6]:
# Keep the labels for the same first 2000 reviews that were kept in `texts`.
labels = labels[:2000]

In [7]:
# Encode all reviews as NumPy arrays of a fixed width: sequences longer than
# 30 tokens are truncated and shorter ones are padded up to 30.
tokenized_data = tokenizer(
    texts,
    truncation=True,
    padding='max_length',
    max_length=30,
    return_tensors="np",
)

In [8]:
# Inspect the encoded batch: input_ids, token_type_ids and attention_mask.
tokenized_data

{'input_ids': array([[   2, 3902, 1903, ...,    0,    0,    0],
       [   2, 3629, 1556, ...,    0,    0,    0],
       [   2, 4027, 1537, ...,    0,    0,    0],
       ...,
       [   2, 1041, 3677, ...,    0,    0,    0],
       [   2, 3771, 3614, ...,    0,    0,    0],
       [   2, 1472, 4200, ...,    0,    0,    0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}

In [9]:
# Run the whole encoded batch through BERT in a single forward call.
# NOTE(review): every review is processed at once and hidden states are kept,
# so this may need a lot of memory - consider batching if it fails.
outputs = model(tokenized_data)

In [10]:
# List the fields available on the model output.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

### 마지막 인코더 블록의 결과물 사용하기

In [11]:
# Sentence feature = last-layer vector at sequence position 0, i.e. the
# [CLS] slot that the BERT tokenizer prepends to every sequence.
features1 = outputs.last_hidden_state[:, 0, :].numpy()

In [12]:
# Sanity check: one feature row per review.
features1.shape

(2000, 768)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the last-layer [CLS] features for evaluation; the fixed
# seed keeps the split repeatable.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features1,
    labels,
    test_size=0.2,
    random_state=0,
)

In [14]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression on the last-layer [CLS] features.
# The 'saga' solver shuffles the data internally, so random_state is pinned
# to make the fitted model (and the metrics derived from it) reproducible
# across kernel restarts.
# NOTE: the name `lr2` is reused by the pooler-output classifier later on.
lr2 = LogisticRegression(
    C=1,
    penalty='l2',
    solver='saga',
    max_iter=10000,
    random_state=0,
)
lr2.fit(train_features, train_labels)

LogisticRegression(C=1, max_iter=10000, solver='saga')

In [15]:
from sklearn.metrics import classification_report

# Predict the held-out reviews and print per-class precision / recall / F1.
pred_labels = lr2.predict(test_features)
print(classification_report(y_true=test_labels, y_pred=pred_labels))

              precision    recall  f1-score   support

           0       0.77      0.80      0.79       192
           1       0.81      0.78      0.80       208

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400



### 풀러 층의 결과물 사용해 보기

In [16]:
# Alternative sentence feature: the pooler output, i.e. the [CLS] hidden state
# after BERT's extra dense + tanh pooling layer.
features2 = outputs.pooler_output.numpy()

In [17]:
# Same 80/20 split and seed as for the last-layer features, so both feature
# sets are evaluated on the same held-out reviews.
(train_features2, test_features2,
 train_labels2, test_labels2) = train_test_split(
    features2,
    labels,
    test_size=0.2,
    random_state=0,
)

In [18]:
# L2-regularised logistic regression on the pooler-output features.
# The 'saga' solver shuffles the data internally, so random_state is pinned
# to make the fitted model (and the metrics derived from it) reproducible
# across kernel restarts.
# NOTE: this rebinds `lr2`, replacing the classifier fitted on the
# last-layer features.
lr2 = LogisticRegression(
    C=1,
    penalty='l2',
    solver='saga',
    max_iter=10000,
    random_state=0,
)
lr2.fit(train_features2, train_labels2)

LogisticRegression(C=1, max_iter=10000, solver='saga')

In [19]:
from sklearn.metrics import classification_report

# Predict the held-out reviews from their pooler-output features and print
# per-class precision / recall / F1.
pred_labels2 = lr2.predict(test_features2)
print(classification_report(y_true=test_labels2, y_pred=pred_labels2))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79       192
           1       0.81      0.81      0.81       208

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400

