In [1]:
import sys
# Put the parent directory on the import path so the `src.*` imports in the
# next cell can resolve (presumably the parent is the repository root — confirm).
sys.path.append("../")

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from src.datasets.sst2 import load_sst2
from src.utils.logger import get_logger
from src.utils.session import reset_session

2024-12-21 11:43:39.076918: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Call the project's logger factory and discard the returned object.
# NOTE(review): presumably this configures log output as a side effect (the
# "[INFO] ..." lines printed by the data-loading cell) — confirm in src.utils.logger.
_ = get_logger()

## Load Data
- `datasets.sst2.load_sst2`
    - use `bert-base-uncased`
- `datasets.kmreview.load_korean_moview_review`
    - use `bert-base-multilingual-uncased`

In [4]:
# Load a 5000-sample subset of SST-2 with a validation split carved out, then
# unpack each (sentences, labels) pair into its own pair of names.
train_split, valid_split, test_split = load_sst2(num_sample=5000, val_split=True)

X_train_raw, y_train_raw = train_split
X_valid_raw, y_valid_raw = valid_split
X_test_raw, y_test_raw = test_split

# Raw sentences keyed by split name; later cells iterate over this mapping.
X_raw = {"X_train": X_train_raw, "X_valid": X_valid_raw, "X_test": X_test_raw}

[INFO] num train: 3200 (data.py:33)
[INFO] num valid: 800 (data.py:34)
[INFO] num test: 1000 (data.py:35)


## Preprocess
<!-- - we will use last hidden states to  -->

### Tokenize input sentences

In [5]:
# Load the pretrained tokenizer for `bert-base-uncased`, the same checkpoint
# name that the model cell further down loads.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")



In [6]:
"""
Args:
 - return_tensors: "np" makes the tokenizer return NumPy arrays
 - max_length: limit the maximum length of a sentence to 30 tokens (including [CLS] and [SEP])
 - padding: "max_length" pads with [PAD] (id 0) whenever len(sentence) < max_length
 - truncation: truncate if len(sentence) > max_length

Outputs:
 - input_ids: tokenized input ids
 - token_type_ids: 0 if first sentence, 1 if second sentence (note: BERT takes 2 sentences as input)
 - attention_mask: 0 if input_ids[i] = [PAD], 1 otherwise
"""

# Tokenize every split with identical settings so all arrays share the same
# fixed sequence length of 30.
X_tokenized = {}
for split_name, sentences in X_raw.items():
    X_tokenized[split_name] = tokenizer(
        sentences.tolist(),
        return_tensors="np",
        max_length=30,
        padding="max_length",
        truncation=True,
    )

In [7]:
# Spot-check a few training sentences: raw text, token ids, the ids decoded
# back to tokens, and the attention mask.
train_tokenized = X_tokenized["X_train"]  # same for every sample, so look it up once
for i in [0, 10, 13, 20]:
    raw_sentence = X_raw["X_train"][i]
    input_ids = train_tokenized["input_ids"][i]
    attention_mask = train_tokenized["attention_mask"][i]

    print("==>")
    print(f"Raw: {raw_sentence}")
    print(f"Token ids: {input_ids}")
    print(f"Ids to token: {' '.join(tokenizer.convert_ids_to_tokens(input_ids))}")
    print(f"Attention Mask: {attention_mask}")
    print()

==>
Raw: a meditation on faith and madness , frailty is blood curdling stuff
Token ids: [  101  1037 13804  2006  4752  1998 12013  1010 25737  3723  2003  2668
 12731  4103  2989  4933   102     0     0     0     0     0     0     0
     0     0     0     0     0     0]
Ids to token: [CLS] a meditation on faith and madness , frail ##ty is blood cu ##rd ##ling stuff [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]

==>
Raw: has all the hallmarks of a movie designed strictly for children 's home video , a market so insatiable it absorbs all manner of lame entertainment , as long as 3 year olds find it diverting
Token ids: [  101  2038  2035  1996 25812  2015  1997  1037  3185  2881  9975  2005
  2336  1005  1055  2188  2678  1010  1037  3006  2061 16021 10450  3085
  2009 16888  2015  2035  5450   102]
Ids to token: [CLS] has all the hallmark ##s of a movie designed strictly 

### Retrieve Last Hidden States / Pooler Output

In [8]:
# Project helper from src.utils.session, called before the model is built.
# NOTE(review): presumably clears TensorFlow/Keras session state — confirm.
reset_session()

In [9]:
# Load the pretrained TensorFlow BERT encoder. `output_hidden_states=True` makes
# each forward pass also return the per-layer hidden states; the cells shown in
# this notebook only read `last_hidden_state` and `pooler_output`.
model = TFBertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

2024-12-21 11:43:44.330470: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 11:43:44.332194: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 11:43:44.332299: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 11:43:44.332564: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [10]:
# Number of sentences per forward pass in the feature-extraction loop below.
batch_size = 256

In [11]:
"""
Outputs:
  - last_hidden_state: hidden states of the last encoder block
    - shape = (batch_size, max_length, 768)
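  - pooler_output: [CLS] hidden state of the last block passed through BERT's pooler (dense + tanh)
    - shape = (batch_size, 768)
  - hidden_states: embedding output followed by the output of every encoder block
    - length = 13 (note: bert-base has 12 encoder blocks; 13 = 1 embedding output + 12 block outputs)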
    - shape of each hidden states = (batch_size, max_length, 768)
"""

# Run BERT over every tokenized split in mini-batches and keep, per split, the
# concatenated `last_hidden_state` and `pooler_output` of all batches.
output_keys = ("last_hidden_state", "pooler_output")

X_features = {}
for data_type, inputs in X_tokenized.items():
    num_samples = inputs["input_ids"].shape[0]
    # Ceil division, so a split whose size is an exact multiple of batch_size
    # does not get a trailing empty batch. max(1, ...) keeps one pass for an
    # empty split so there is always something to concatenate.
    num_iter = max(1, (num_samples + batch_size - 1) // batch_size)

    # Collect per-batch outputs and concatenate once at the end, rather than
    # re-copying the accumulated tensor on every iteration.
    chunks = {k: [] for k in output_keys}
    for i in range(num_iter):
        s, e = i * batch_size, (i + 1) * batch_size
        cur_inputs = {k: v[s:e] for k, v in inputs.items()}

        # batch inference
        cur_outputs = model(cur_inputs)

        for k in output_keys:
            chunks[k].append(cur_outputs[k])

    # concat outputs along the batch axis
    outputs = {k: tf.concat(v, axis=0) for k, v in chunks.items()}
    X_features[data_type] = outputs

## Train model

### Use Last Hidden States

In [12]:
"""
NOTE:
  - last_hidden_state.shape = (batch_size, tokens, 768)
  - first token is [CLS], which contains overall information about the input sentence
  - therefore, we only use the first token (=[CLS]) as the input features
"""

# Features: the vector at token position 0 ([CLS]) of the last hidden states.
# Labels: the raw labels returned by the data loader, unchanged.
train_hidden = X_features["X_train"]["last_hidden_state"]
test_hidden = X_features["X_test"]["last_hidden_state"]

X_train, y_train = train_hidden[:, 0, :], y_train_raw
X_test, y_test = test_hidden[:, 0, :], y_test_raw

# Sanity-check that features and labels line up per split.
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")

print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

X_train: (3200, 768)
y_train: (3200,)
X_test: (1000, 768)
y_test: (1000,)


In [13]:
# Fit a logistic-regression classifier on the BERT features prepared in the
# previous cell (BERT itself is only used for inference in this notebook).
# The "saga" solver shuffles the data, so pin `random_state` to make the fit
# reproducible across runs.
clf = LogisticRegression(max_iter=10000, solver="saga", random_state=42)
clf.fit(X_train, y_train)

In [14]:
# Class predictions for the training and test features; the validation split
# is not used by this classifier.
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

In [15]:
# Metrics on the training split, i.e. the data the classifier was fit on.
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1544
           1       0.93      0.92      0.92      1656

    accuracy                           0.92      3200
   macro avg       0.92      0.92      0.92      3200
weighted avg       0.92      0.92      0.92      3200



In [16]:
# Metrics on the test split, which was not used for fitting.
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       469
           1       0.84      0.82      0.83       531

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000



### Use Pooler Output

In [17]:
"""
NOTE:
  - pooler_output.shape = (batch_size, 768)
  - it is computed from the [CLS] token's last hidden state by BERT's pooler (dense + tanh)
  - therefore, it is already one vector per sentence and is used directly as the input features
"""

# Features: the pooler output, taken as-is for each split.
# Labels: the raw labels returned by the data loader, unchanged.
train_features, test_features = X_features["X_train"], X_features["X_test"]

X_train, X_test = train_features["pooler_output"], test_features["pooler_output"]
y_train, y_test = y_train_raw, y_test_raw

# Sanity-check that features and labels line up per split.
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")

print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

X_train: (3200, 768)
y_train: (3200,)
X_test: (1000, 768)
y_test: (1000,)


In [18]:
# Fit a fresh logistic-regression classifier, this time on the pooler-output
# features prepared in the previous cell.
# The "saga" solver shuffles the data, so pin `random_state` to make the fit
# reproducible across runs.
clf = LogisticRegression(max_iter=10000, solver="saga", random_state=42)
clf.fit(X_train, y_train)

In [19]:
# Class predictions for the training and test features; the validation split
# is not used by this classifier.
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

In [20]:
# Metrics on the training split, i.e. the data the classifier was fit on.
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1544
           1       0.89      0.86      0.87      1656

    accuracy                           0.87      3200
   macro avg       0.87      0.87      0.87      3200
weighted avg       0.87      0.87      0.87      3200



In [21]:
# Metrics on the test split, which was not used for fitting.
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       469
           1       0.86      0.82      0.84       531

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000

