In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel #, AutoTokenizer

In [2]:
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

In [3]:
model = TFBertModel.from_pretrained("klue/bert-base", output_hidden_states = True, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'bert.embeddings.position_ids', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [5]:
with open('../data/Korean_movie_reviews_2016.txt', encoding='utf-8') as f:
    docs = [doc.strip().split('\t') for doc in f ]
    docs = [(doc[0], int(doc[1])) for doc in docs if len(doc) == 2]
    texts, labels = zip(*docs)

In [6]:
texts=texts[:2000]

In [7]:
labels=labels[:2000]

In [8]:
# for layer in model.layers:
#     layer.trainable=False

In [9]:
tokenized_data = tokenizer(texts, return_tensors="np", max_length=30, padding='max_length', truncation=True)

In [10]:
tokenized_data

{'input_ids': array([[   2, 3902, 1903, ...,    0,    0,    0],
       [   2, 3629, 1556, ...,    0,    0,    0],
       [   2, 4027, 1537, ...,    0,    0,    0],
       ...,
       [   2, 1041, 3677, ...,    0,    0,    0],
       [   2, 3771, 3614, ...,    0,    0,    0],
       [   2, 1472, 4200, ...,    0,    0,    0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}

In [11]:
outputs = model(tokenized_data)

In [12]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

### 마지막 인코더 블록의 결과물 사용하기

In [13]:
features1 = outputs.last_hidden_state[:,0,:].numpy()

In [14]:
features1.shape

(2000, 768)

In [15]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features1, labels, test_size=0.2, random_state=0)

In [16]:
from sklearn.linear_model import LogisticRegression
lr2 = LogisticRegression(C=1, penalty='l2', solver='saga', max_iter=10000)
lr2.fit(train_features, train_labels)

In [17]:
pred_labels = lr2.predict(test_features)
from sklearn.metrics import classification_report
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.77      0.80      0.79       192
           1       0.81      0.78      0.80       208

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400



### 풀러 층의 결과물 사용해 보기

In [18]:
features2 = outputs.pooler_output.numpy()

In [19]:
train_features2, test_features2, train_labels2, test_labels2 = train_test_split(features2, labels, test_size=0.2, random_state=0)

In [20]:
lr2 = LogisticRegression(C=1, penalty='l2', solver='saga', max_iter=10000)
lr2.fit(train_features2, train_labels2)

In [21]:
pred_labels2 = lr2.predict(test_features2)
from sklearn.metrics import classification_report
print(classification_report(test_labels2, pred_labels2))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79       192
           1       0.81      0.81      0.81       208

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400



### FNN 사용해 보기

In [22]:
from tensorflow.keras.utils import to_categorical
y_one_hot = to_categorical(labels)

In [23]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(features, y_one_hot, test_size=0.2)

NameError: name 'features' is not defined

In [None]:
from tensorflow.keras import models
model = models.Sequential()

In [None]:
from tensorflow.keras import layers
model.add(layers.Dense(32, activation = 'tanh', input_shape=(X_train.shape[1],)))
# model.add(layers.Dropout(0.5))
# model.add(layers.Dense(64, activation = 'tanh'))
model.add(layers.Dense(2, activation='softmax'))

In [None]:
from tensorflow.keras.optimizers import RMSprop
model.compile(optimizer=RMSprop(0.0001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

In [None]:
model.evaluate(X_test,y_test)

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train','val'])
plt.show()

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.xlabel('epoch')
plt.ylabel('acc')
plt.legend(['train','val'])
plt.show()

## BoW 모형 사용해보기

In [None]:
selected_texts = texts[:2000]

In [None]:
# To split the data into training and test datasets
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(selected_texts, selected_labels, test_size=0.2, random_state=0)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(min_df=1, ngram_range=(1,1)) 
train_tf_features = tf_vectorizer.fit_transform(train_texts)
test_tf_features = tf_vectorizer.transform(test_texts)

In [None]:
from sklearn.linear_model import LogisticRegression
lr2 = LogisticRegression(C=10, penalty='l2', solver='sag', max_iter=2000) # Ridge regression
lr2.fit(train_tf_features, train_labels) # 학습
pred_labels = lr2.predict(test_tf_features)

In [None]:
from sklearn.metrics import accuracy_score
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))