In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
from transformers import TFBertModel, BertTokenizer

In [3]:
# SST-2 training file: tab-separated with no header row, so columns are numbered 0 and 1.
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)

In [4]:
# Keep only the first 2000 rows as the working subset (positional slice).
df1 = df.iloc[:2000]

In [5]:
# Show how many rows of the subset carry each value of column 1 (the label column).
df1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [6]:
# Column 1 holds the 0/1 labels; extract it as a NumPy array for scikit-learn.
labels = df1[1].to_numpy()

In [7]:
# Column 0 holds the raw sentences; the tokenizer is given a plain Python list.
texts = df1[0].tolist()

In [8]:
# Preview the first five sentences.
texts[:5]

['a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films',
 'apparently reassembled from the cutting room floor of any given daytime soap',
 "they presume their audience wo n't sit still for a sociology lesson , however entertainingly presented , so they trot out the conventional science fiction elements of bug eyed monsters and futuristic women in skimpy clothes",
 'this is a visually stunning rumination on love , memory , history and the war between art and commerce',
 "jonathan parker 's bartleby should have been the be all end all of the modern office anomie films"]

In [9]:
# Load the pretrained tokenizer for the bert-base-uncased checkpoint.
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [10]:
# Pad or truncate every sentence to a fixed length of 30 tokens and
# return the encodings as NumPy arrays.
encode_options = dict(
    max_length=30,
    truncation=True,
    padding='max_length',
    return_tensors="np",
)
tokenized_data = tokenizer(texts, **encode_options)

In [11]:
# List the fields produced by the tokenizer.
tokenized_data.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [12]:
# Load the pretrained BERT encoder and ask it to also return the hidden
# states alongside its regular outputs.
model = TFBertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
# Encode every tokenized sentence with BERT in one forward pass.
# NOTE(review): all 2000 sentences (30 tokens each) go through the model as a
# single batch, with hidden states requested as well; consider batching the
# call if memory becomes a problem on larger subsets.
outputs = model(tokenized_data)

In [14]:
# Number of fields in the model output. Two of them (last_hidden_state and
# pooler_output) are used below; the third is presumably the hidden states
# requested via output_hidden_states=True -- TODO confirm.
len(outputs)

3

### 마지막 인코더 블록의 [CLS] 토큰 결과물 사용하기

In [15]:
# For every sentence, take the vector at sequence position 0 of the last
# hidden state and convert it to a NumPy array.
first_token_states = outputs.last_hidden_state[:, 0, :]
features1 = first_token_states.numpy()

In [16]:
# Sanity check: one feature vector per sentence.
features1.shape

(2000, 768)

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(features1, labels, test_size=0.2, random_state=0)
train_features, test_features, train_labels, test_labels = split_parts

In [18]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression trained on the position-0 features
# taken from the last hidden state.
# The 'saga' solver shuffles the data, so random_state is pinned to make the
# fit reproducible across reruns.
# The name `lr2` is kept because the next cell calls `lr2.predict`.
lr2 = LogisticRegression(C=1, penalty='l2', solver='saga', max_iter=10000, random_state=0)
lr2.fit(train_features, train_labels)

In [19]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print per-class precision/recall/F1.
pred_labels = lr2.predict(test_features)
report_text = classification_report(test_labels, pred_labels)
print(report_text)

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       190
           1       0.84      0.82      0.83       210

    accuracy                           0.82       400
   macro avg       0.82      0.83      0.82       400
weighted avg       0.83      0.82      0.83       400



### 풀러 층의 결과물 사용해 보기

In [20]:
# Use the model's pooler output as an alternative sentence-level feature.
pooled_states = outputs.pooler_output
features2 = pooled_states.numpy()

In [21]:
# Sanity check: one pooled feature vector per sentence.
features2.shape

(2000, 768)

In [22]:
# Same 80/20 split and seed as for the first feature set, so both
# classifiers are evaluated on the same held-out rows.
split_parts2 = train_test_split(features2, labels, test_size=0.2, random_state=0)
train_features2, test_features2, train_labels2, test_labels2 = split_parts2

In [23]:
# Same logistic-regression setup, now trained on the pooler-output features.
# The 'saga' solver shuffles the data, so random_state is pinned to make the
# fit reproducible across reruns.
# NOTE: this rebinds `lr2`, replacing the classifier fitted on the first
# feature set; the name is kept because the next cell calls `lr2.predict`.
lr2 = LogisticRegression(C=1, penalty='l2', solver='saga', max_iter=10000, random_state=0)
lr2.fit(train_features2, train_labels2)

In [24]:
# Predict on the held-out split of the pooler-output features and print
# per-class precision/recall/F1. `classification_report` is already imported
# by the earlier evaluation cell, so it is not imported again here.
pred_labels2 = lr2.predict(test_features2)
print(classification_report(test_labels2, pred_labels2))

              precision    recall  f1-score   support

           0       0.79      0.82      0.80       190
           1       0.83      0.80      0.81       210

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

