<a href="https://colab.research.google.com/github/xcellentbird/playground/blob/main/bert_imdb_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Experiment configuration: everything a reader might want to tune lives here.

# Label for this run in Weights & Biases.
WANDB_RUN_NAME = 'bert-imdb-0917-onlybatchsize64'

# Fraction of the data held out for testing. At exactly 0.5 the dataset's
# original train/test split is kept; any other value triggers a re-split.
TEST_SIZE = 0.5

# Per-device batch sizes for training and evaluation.
TRAIN_BATCH_SIZE = EVAL_BATCH_SIZE = 64

In [13]:
!pip install datasets evaluate wandb



# Load Dataset

In [14]:
from datasets import load_dataset

# Fetch the IMDB dataset; later cells index it by split name
# ('train', 'test', 'unsupervised').
dataset = load_dataset(path='imdb')

In [15]:
from datasets import concatenate_datasets

# Drop the 'unsupervised' split, which this notebook never uses.
# The `None` default keeps the cell re-runnable: on a second execution the key
# is already gone and a bare pop() would raise KeyError.
dataset.pop('unsupervised', None)

# At TEST_SIZE == 0.5 the dataset's own train/test split is used as-is.
# For any other ratio, pool both splits and draw a new stratified split so the
# label balance is preserved in train and test.
if TEST_SIZE != 0.5:
    merged_dataset = concatenate_datasets([dataset['train'], dataset['test']])
    dataset = merged_dataset.train_test_split(
        test_size=TEST_SIZE,
        seed=42,  # fixed seed so the split is reproducible
        shuffle=True,
        stratify_by_column='label'
    )

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [16]:
import pandas as pd

# Review length distribution per split. Lengths are measured in characters
# (len of the raw string), not in tokens.
train_text_len = pd.Series(list(map(len, dataset['train']['text'])))
test_text_len = pd.Series(list(map(len, dataset['test']['text'])))

(train_text_len.describe(), test_text_len.describe())

(count    25000.00000
 mean      1325.06964
 std       1003.13367
 min         52.00000
 25%        702.00000
 50%        979.00000
 75%       1614.00000
 max      13704.00000
 dtype: float64,
 count    25000.00000
 mean      1293.79240
 std        975.90776
 min         32.00000
 25%        696.00000
 50%        962.00000
 75%       1572.00000
 max      12988.00000
 dtype: float64)

In [17]:
import numpy as np

# Class balance check: for each split, the distinct label values and how many
# examples carry each one.
train_label_unique, test_label_unique = (
    np.unique(dataset[split_name]['label'], return_counts=True)
    for split_name in ('train', 'test')
)

(train_label_unique, test_label_unique)

((array([0, 1]), array([12500, 12500])),
 (array([0, 1]), array([12500, 12500])))

# Load Model

In [18]:
from transformers import BertForSequenceClassification, BertTokenizer

model_id = 'bert-base-uncased'

# Tokenizer and model are loaded from the same checkpoint id so the vocabulary
# matches the embedding table.
tokenizer = BertTokenizer.from_pretrained(model_id)

# Two output labels for binary sentiment; dtype and device placement are left
# to the library ('auto').
model = BertForSequenceClassification.from_pretrained(
    model_id,
    device_map='auto',
    dtype='auto',
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Display the module tree of the loaded classifier.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [20]:
# Display the tokenizer configuration (vocabulary size, max length, special tokens).
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [21]:
# Inspect the loss function attached to the model.
model.loss_function

# Train

In [22]:
def tokenize_function(examples):
    """Tokenize a batch of examples for the BERT model.

    Args:
        examples: A batch mapping with a 'text' entry holding the raw review
            strings (as supplied by `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the batch. Every sequence is truncated and
        padded to the tokenizer's maximum length so all rows have equal size.
    """
    # No `.to(device)` here: without `return_tensors` the tokenizer yields plain
    # Python lists, so there is nothing to move, and the mapped dataset should
    # not depend on where the model lives. Device placement happens at batch
    # time inside the Trainer.
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [23]:
import evaluate
import numpy as np

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Score model outputs with the accuracy metric.

    `eval_pred` unpacks into a pair (per-class scores, reference labels). The
    predicted class of each row is the index of its highest score along axis 1.
    Returns whatever `accuracy_metric.compute` produces for those predictions.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

Downloading builder script: 0.00B [00:00, ?B/s]

In [24]:
from google.colab import userdata
import wandb

# The API key is read from Colab's secret store rather than being written
# into the notebook, then used to authenticate the wandb client.
WANDB_API_KEY = userdata.get('WANDB_API_KEY')
wandb.login(key=WANDB_API_KEY)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mxcellentbird[0m ([33mxcellentbird-private[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [25]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same 20-step cadence so every evaluated
    # step has a matching checkpoint that can be restored as "best".
    eval_strategy='steps',
    eval_steps=20,
    save_steps=20,
    # A checkpoint every 20 steps adds up quickly over a full run; keep only
    # the most recent ones on disk. With load_best_model_at_end enabled the
    # best checkpoint is retained in addition to these.
    save_total_limit=2,
    logging_steps=10,
    # Restore the checkpoint with the highest eval accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='wandb',
    run_name=WANDB_RUN_NAME
)

In [26]:
# from transformers import EarlyStoppingCallback

# # The callback is intentionally not applied so the training curves can be observed for longer.
# callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

In [27]:
from transformers import Trainer

# NOTE(review): the 'test' split doubles as the evaluation set here, and the
# training arguments select the best checkpoint by eval accuracy. The reported
# test accuracy is therefore not from a fully held-out set. Consider carving
# out a separate validation split.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets['test'],
    train_dataset=tokenized_datasets['train'],
    args=training_args,
    model=model,
    # callbacks=callbacks
)
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
20,0.6618,0.630495,0.68656
40,0.4152,0.348589,0.86132
60,0.3195,0.25756,0.90416
80,0.2999,0.271934,0.88776
100,0.219,0.241705,0.907
120,0.2543,0.303179,0.87808
140,0.2429,0.20615,0.9218
160,0.2513,0.200036,0.92292
180,0.2358,0.208804,0.92168
200,0.2076,0.199903,0.92568


TrainOutput(global_step=1173, training_loss=0.16698309460945446, metrics={'train_runtime': 10706.7464, 'train_samples_per_second': 7.005, 'train_steps_per_second': 0.11, 'total_flos': 1.9733329152e+16, 'train_loss': 0.16698309460945446, 'epoch': 3.0})

In [28]:
# Run an evaluation pass on the Trainer's eval dataset (the 'test' split passed at construction).
trainer.evaluate()

{'eval_loss': 0.19704486429691315,
 'eval_accuracy': 0.93784,
 'eval_runtime': 160.1789,
 'eval_samples_per_second': 156.076,
 'eval_steps_per_second': 2.441,
 'epoch': 3.0}

In [29]:
from datasets import Dataset
from torch.nn.functional import softmax
from torch import tensor

# Human-readable class names taken from the dataset's 'label' feature.
label_names = dataset['train'].features['label'].names

def predict(text: str) -> str:
    """Classify a single piece of text with the fine-tuned model.

    Args:
        text: The raw text to classify.

    Returns:
        The name of the predicted class, looked up in `label_names`.
    """
    # Wrap the string in a one-row dataset so it goes through the same
    # tokenization and prediction path as the training data.
    ds = Dataset.from_dict({"text": [text]})
    tokenized_ds = ds.map(tokenize_function, batched=True)

    pred = trainer.predict(tokenized_ds).predictions
    probs = softmax(tensor(pred), dim=-1)

    # There is exactly one input row. Convert its predicted class to a plain
    # int before indexing the Python list, rather than relying on implicit
    # tensor-to-index conversion.
    pred_id = int(probs.argmax(dim=-1)[0])

    return label_names[pred_id]

In [30]:
# Smoke-test the prediction helper on a short review.
predict("it's good")

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

'pos'