In [1]:
import sys
# Put the parent directory on the import path; the `src.*` imports in the next
# cell are presumably resolved from there (assumes the notebook is run from its
# own directory -- TODO confirm).
sys.path.append("../")

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from src.datasets.kmreview import load_korean_moview_review
from src.utils.logger import get_logger
from src.utils.session import reset_session

2024-12-21 11:44:50.030366: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Set up the project logger. The return value is deliberately discarded (`_`);
# presumably the call is made only for its configuration side effect -- verify
# against src.utils.logger.
_ = get_logger()

## Load Data

In [4]:
# Load a 10,000-review sample; with val_split=True the loader hands back three
# (texts, labels) pairs: train, validation and test.
train_split, valid_split, test_split = load_korean_moview_review(num_sample=10000, val_split=True)
X_train_raw, y_train_raw = train_split
X_valid_raw, y_valid_raw = valid_split
X_test_raw, y_test_raw = test_split

# Raw texts keyed by split name so later cells can process all splits in one loop.
X_raw = dict(X_train=X_train_raw, X_valid=X_valid_raw, X_test=X_test_raw)

[INFO] num train: 6400 (data.py:33)
[INFO] num valid: 1600 (data.py:34)
[INFO] num test: 2000 (data.py:35)


## Preprocess
<!-- - we will use last hidden states to  -->

### Tokenize input sentences

In [5]:
# Load the pretrained tokenizer for the "bert-base-multilingual-uncased"
# checkpoint; the model cell further down loads the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")



In [6]:
"""
Tokenize every split with identical settings.

Tokenizer arguments:
 - return_tensors="np": return NumPy arrays
 - max_length=30: every sentence is represented by exactly 30 token positions
 - padding="max_length": sentences shorter than max_length are padded with [PAD] (id 0)
 - truncation=True: sentences longer than max_length are cut off

Tokenizer outputs (per split):
 - input_ids: token ids of the sentence
 - token_type_ids: 0 for the first sentence, 1 for the second (BERT accepts sentence pairs)
 - attention_mask: 0 where input_ids[i] is [PAD], 1 otherwise
"""

tokenize_kwargs = dict(return_tensors="np", max_length=30, padding="max_length", truncation=True)

X_tokenized = {}
for split_name, sentences in X_raw.items():
    X_tokenized[split_name] = tokenizer(sentences.tolist(), **tokenize_kwargs)

In [7]:
# Spot-check a handful of training sentences: show the raw text next to its
# token ids, the decoded word pieces and the attention mask.
train_tokens = X_tokenized["X_train"]  # loop-invariant, so looked up once
for i in [0, 10, 13, 20]:
    raw_sentence = X_raw["X_train"][i]
    token_ids = train_tokens["input_ids"][i]
    attention_mask = train_tokens["attention_mask"][i]

    print("==>")
    print(f"Raw: {raw_sentence}")
    print(f"Token ids: {token_ids}")
    print(f"Ids to token: {' '.join(tokenizer.convert_ids_to_tokens(token_ids))}")
    print(f"Attention Mask: {attention_mask}")
    print()

==>
Raw:  고산 귀신이산다 이장 군수 연기하냐
Token ids: [  101 47468 22214  1163 38365 23918 11112 22214 12261 12398 14509 53543
 15783  1174 43107 13130 35132 97071 85091   102     0     0     0     0
     0     0     0     0     0     0]
Ids to token: [CLS] 고 ##산 ᄀ ##ᅱ ##신 ##이 ##산 ##다 이 ##장 군 ##수 ᄋ ##ᅧᆫ ##기 ##하 ##ᄂ ##ᅣ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]

==>
Raw:  방광 싸움 정도 한장 면도 놓치 싫었
Token ids: [  101  1170 47328 12211 13045  1173 25539 27235 13503 81463 12265 17463
 14509  1169 43107 12265  1165 29347 97109 18893 33401 97102 13413 97104
   102     0     0     0     0     0]
Ids to token: [CLS] ᄇ ##ᅡᆼ ##과 ##ᆼ ᄊ ##ᅡ ##우 ##ᆷ 정 ##도 한 ##장 ᄆ ##ᅧᆫ ##도 ᄂ ##ᅩ ##ᇂ ##치 시 ##ᆶ ##어 ##ᆻ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]

==>
Raw:  저 저런 다른 볼걸
Token ids: [  101  1175 33645  1175 33645 60532 20066 5

### Retrieve Last Hidden States / Pooler Output

In [8]:
# Reset session state before the model is loaded in the next cell.
# NOTE(review): presumably this clears the TensorFlow/Keras session -- confirm
# in src.utils.session.
reset_session()

In [9]:
# Load the pretrained encoder (same checkpoint name as the tokenizer).
# NOTE(review): output_hidden_states=True asks the model to also return the
# per-layer hidden states, but the feature-extraction cell below only keeps
# "last_hidden_state" and "pooler_output" -- the flag can probably be dropped.
model = TFBertModel.from_pretrained("bert-base-multilingual-uncased", output_hidden_states=True)

2024-12-21 11:44:55.790331: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 11:44:55.792143: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 11:44:55.792260: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 11:44:55.792538: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [10]:
# Number of sentences sent through the model per forward pass in the
# feature-extraction loop below.
batch_size = 256

In [11]:
"""
Run every tokenized split through BERT in mini-batches and keep the sentence features.

Model outputs:
  - last_hidden_state: hidden states of the last encoder block
    - shape = (batch_size, max_length, 768)
  - pooler_output: dense layer + tanh applied to the [CLS] position of last_hidden_state
    - shape = (batch_size, 768)
  - hidden_states: embedding output followed by the output of every encoder block
    - length = 13 (note: BERT-base has 12 encoder blocks; the extra entry is the embedding output)
    - shape of each hidden states = (batch_size, max_length, 768)

Result:
  - X_features[split]["last_hidden_state"] and X_features[split]["pooler_output"],
    each concatenated over all batches of that split (axis 0).
"""

X_features = {}
for data_type, inputs in X_tokenized.items():
    num_samples = inputs["input_ids"].shape[0]
    collected = {"last_hidden_state": [], "pooler_output": []}

    # Walk the split in strides of `batch_size`. Deriving the batch starts from
    # range() avoids the extra, empty forward pass that
    # `num_samples // batch_size + 1` produced whenever num_samples was an exact
    # multiple of batch_size. max(..., 1) keeps the previous behaviour for an
    # empty split (a single empty batch) so the concat below always has input.
    for s in range(0, max(num_samples, 1), batch_size):
        e = s + batch_size
        cur_inputs = {k: v[s:e] for k, v in inputs.items()}

        # batch inference
        cur_outputs = model(cur_inputs)

        # keep only the outputs that are used downstream
        for k in collected:
            collected[k].append(cur_outputs[k])

    # Concatenate once per output instead of re-copying the growing tensor on
    # every iteration.
    X_features[data_type] = {k: tf.concat(chunks, axis=0) for k, chunks in collected.items()}

## Train model

### Use Last Hidden States

In [12]:
"""
Build classifier inputs from the last encoder layer.

  - last_hidden_state.shape = (batch_size, tokens, 768)
  - position 0 on the token axis is [CLS], which carries overall information
    about the input sentence
  - so only that position is kept: one 768-dimensional vector per sentence
"""

train_hidden = X_features["X_train"]["last_hidden_state"]
test_hidden = X_features["X_test"]["last_hidden_state"]

X_train, y_train = train_hidden[:, 0, :], y_train_raw
X_test, y_test = test_hidden[:, 0, :], y_test_raw

# Sanity-check that features and labels line up.
for label, array in (("X_train", X_train), ("y_train", y_train), ("X_test", X_test), ("y_test", y_test)):
    print(f"{label}: {array.shape}")

X_train: (6400, 768)
y_train: (6400,)
X_test: (2000, 768)
y_test: (2000,)


In [13]:
# Logistic regression on the [CLS] features. The "saga" solver shuffles the
# data while optimising, so the seed is pinned to make the fitted model (and
# the reports below) reproducible across runs.
clf = LogisticRegression(max_iter=10000, solver="saga", random_state=42)
clf.fit(X_train, y_train)

In [14]:
# Predict labels for both splits so the train and test reports can be compared.
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

In [15]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.72      0.71      0.71      3051
           1       0.74      0.75      0.74      3349

    accuracy                           0.73      6400
   macro avg       0.73      0.73      0.73      6400
weighted avg       0.73      0.73      0.73      6400



In [16]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.67      0.68      0.68       935
           1       0.72      0.71      0.71      1065

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000



### Use Pooler Output

In [17]:
"""
NOTE:
  - pooler_output.shape = (batch_size, 768)
  - it is already one vector per sentence (the model derives it from the [CLS]
    position of the last hidden state), so no token slicing is needed here
  - therefore, we use pooler_output directly as the input features
"""

X_train = X_features["X_train"]["pooler_output"]
y_train = y_train_raw

X_test = X_features["X_test"]["pooler_output"]
y_test = y_test_raw

print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")

print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

X_train: (6400, 768)
y_train: (6400,)
X_test: (2000, 768)
y_test: (2000,)


In [18]:
# Logistic regression on the pooler-output features. The "saga" solver shuffles
# the data while optimising, so the seed is pinned to make the fitted model
# (and the reports below) reproducible across runs.
clf = LogisticRegression(max_iter=10000, solver="saga", random_state=42)
clf.fit(X_train, y_train)

In [19]:
# Predict labels for both splits so the train and test reports can be compared.
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

In [20]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.67      0.65      0.66      3051
           1       0.69      0.70      0.70      3349

    accuracy                           0.68      6400
   macro avg       0.68      0.68      0.68      6400
weighted avg       0.68      0.68      0.68      6400



In [21]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.63      0.62      0.63       935
           1       0.67      0.68      0.68      1065

    accuracy                           0.65      2000
   macro avg       0.65      0.65      0.65      2000
weighted avg       0.65      0.65      0.65      2000

