In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel #, AutoTokenizer

In [2]:
# Tokenizer for the multilingual uncased BERT checkpoint (the same checkpoint
# name is used when the encoder weights are loaded).
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-uncased'
)

In [3]:
# Pretrained encoder. output_hidden_states=True makes the forward pass return
# a `hidden_states` field in addition to last_hidden_state and pooler_output.
model = TFBertModel.from_pretrained(
    "bert-base-multilingual-uncased",
    output_hidden_states=True,
)

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
# Load the review corpus. Each usable line has exactly two tab-separated
# fields: the review text and an integer label.
with open('Korean_movie_reviews_2016.txt', encoding='utf-8') as f:
    # Stream the file once; the generator is fully consumed inside the `with`.
    split_lines = (line.strip().split('\t') for line in f)
    # Lines that do not split into exactly two fields are skipped.
    # int() raises ValueError if a label field is not an integer.
    docs = [(fields[0], int(fields[1])) for fields in split_lines if len(fields) == 2]

# zip(*docs) on an empty list fails with an opaque unpacking error, so report
# the real problem explicitly (same exception type as before).
if not docs:
    raise ValueError(
        "No '<text>\\t<label>' rows found in Korean_movie_reviews_2016.txt"
    )

# Transpose the (text, label) pairs into two parallel tuples.
texts, labels = zip(*docs)

In [6]:
# Keep only a prefix of the corpus (presumably to keep the BERT forward pass
# affordable). A single constant drives both slices so the texts and their
# labels cannot drift out of step.
N_SAMPLES = 2000
texts = texts[:N_SAMPLES]
labels = labels[:N_SAMPLES]
assert len(texts) == len(labels), "texts and labels must stay aligned"

In [8]:
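# Optional: freeze every layer of the BERT model. Left disabled because the
# cells below only run a forward pass to extract features and never train it.
# for layer in model.layers:
#     layer.trainable=False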

In [8]:
# Encode all reviews as fixed-width NumPy arrays: sequences longer than 20
# tokens are truncated and shorter ones are padded up to 20.
tokenized_data = tokenizer(
    texts,
    max_length=20,
    truncation=True,
    padding='max_length',
    return_tensors="np",
)

In [9]:
# Inspect the encoded batch: input_ids, token_type_ids and attention_mask.
tokenized_data

{'input_ids': array([[  101,  1170, 46188, ..., 97104,   102,     0],
       [  101, 47529,  1175, ..., 40049, 97104,   102],
       [  101,  1175, 29347, ..., 97107, 16801,   102],
       ...,
       [  101, 96314,  1180, ...,     0,     0,     0],
       [  101, 41912,  1174, ..., 35945, 18501,   102],
       [  101,  1174, 65633, ..., 63277, 40815,   102]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])}

In [10]:
# One forward pass over the whole encoded batch (all rows at once, no
# mini-batching); the result is reused by both feature-extraction sections.
outputs = model(tokenized_data)

In [11]:
# List the fields available on the model output.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

### 마지막 인코더 블록의 [CLS] 토큰 결과물 사용하기

In [11]:
# From the final encoder layer keep only position 0 of every sequence (the
# leading special token, BERT's [CLS]) and convert it to a NumPy matrix.
last_layer = outputs.last_hidden_state
features1 = last_layer[:, 0, :].numpy()

In [12]:
# Sanity check: one row per review, one column per hidden unit.
features1.shape

(2000, 768)

In [13]:
# Hold out 20% of the [CLS] features for evaluation; the fixed seed keeps the
# split repeatable. train_test_split comes from the notebook's import cell.
train_features, test_features, train_labels, test_labels = train_test_split(
    features1,
    labels,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Fit and evaluate an L2-regularised logistic regression on the [CLS] features.
#
# The classifier is named `lr1` so it is not clobbered by the `lr2` model
# fitted in the pooler-output section: both feature sets are 768-wide, so a
# stale shared name would score the wrong model without raising any error.
# Fitting and scoring live in one cell so the report always reflects this fit.
lr1 = LogisticRegression(
    C=1,
    penalty='l2',
    solver='saga',
    max_iter=10000,
    random_state=0,  # saga shuffles the data; seed it like the train/test split
)
lr1.fit(train_features, train_labels)

# Per-class precision / recall / F1 on the held-out 20%.
pred_labels = lr1.predict(test_features)
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.61      0.66      0.64       192
           1       0.66      0.62      0.64       208

    accuracy                           0.64       400
   macro avg       0.64      0.64      0.64       400
weighted avg       0.64      0.64      0.64       400



### 풀러 층의 결과물 사용해 보기

In [15]:
# Pooled sentence vector from the same forward pass, converted to NumPy.
# Per the transformers docs this is the first-token hidden state passed
# through BERT's pooler (dense layer + tanh).
features2 = outputs.pooler_output.numpy()

In [16]:
# Same 80/20 split and seed as the [CLS] experiment, applied to the pooler
# features.
train_features2, test_features2, train_labels2, test_labels2 = train_test_split(
    features2,
    labels,
    test_size=0.2,
    random_state=0,
)

In [17]:
# L2-regularised logistic regression on the pooler features.
lr2 = LogisticRegression(
    C=1,
    penalty='l2',
    solver='saga',
    max_iter=10000,
    random_state=0,  # saga shuffles the data; seed it like the train/test split
)
lr2.fit(train_features2, train_labels2)

LogisticRegression(C=1, max_iter=10000, solver='saga')

In [18]:
# Score the pooler-feature classifier on its held-out split.
# classification_report is already imported earlier in the notebook, so the
# redundant re-import was dropped.
pred_labels2 = lr2.predict(test_features2)
print(classification_report(test_labels2, pred_labels2))

              precision    recall  f1-score   support

           0       0.59      0.68      0.63       192
           1       0.65      0.56      0.60       208

    accuracy                           0.62       400
   macro avg       0.62      0.62      0.62       400
weighted avg       0.62      0.62      0.62       400

