This code come from [here](https://teddylee777.github.io/huggingface/hugginface-bbc-bert/).

additional steps  
1. analysis code 
2. add wandb code
3. convert into vscode.

In [2]:
import random
import numpy as np
import os

import torch

## SEED 설정

In [3]:
SEED = 123

In [4]:
def seed_everything(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True #?
    torch.backends.cudnn.benchmark = True #?

seed_everything(SEED)

## 샘플 예제 다운로드

In [5]:
import json
from tqdm import tqdm
import numpy as np
import pandas as pd

In [6]:
df = pd.read_csv('bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


# Set HF

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

PLM = "abhishek/autonlp-bbc-news-classification-37229289"

tokenizer = AutoTokenizer.from_pretrained(PLM)
model = AutoModelForSequenceClassification.from_pretrained(PLM)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
tokenized = tokenizer(df['text'].iloc[0], padding=True, truncation=True)
tokenized

{'input_ids': [101, 2694, 2925, 1999, 1996, 2398, 1997, 7193, 2007, 2188, 3004, 3001, 12123, 2152, 1011, 6210, 2694, 2015, 1998, 3617, 2678, 14520, 2015, 3048, 2046, 1996, 2542, 2282, 1996, 2126, 2111, 3422, 2694, 2097, 2022, 25796, 2367, 1999, 2274, 2086, 2051, 1012, 2008, 2003, 2429, 2000, 2019, 6739, 5997, 2029, 5935, 2012, 1996, 3296, 7325, 8139, 2265, 1999, 5869, 7136, 2000, 6848, 2129, 2122, 2047, 6786, 2097, 4254, 2028, 1997, 2256, 8837, 2627, 14428, 2015, 1012, 2007, 1996, 2149, 2877, 1996, 9874, 8497, 1998, 2060, 4180, 2097, 2022, 5359, 2000, 7193, 3081, 2188, 6125, 2083, 5830, 5871, 18126, 2015, 3316, 1998, 19595, 2326, 11670, 2000, 2392, 4734, 1998, 12109, 5733, 1012, 2028, 1997, 1996, 2087, 5720, 1011, 2055, 6786, 1997, 8292, 2015, 2038, 2042, 3617, 1998, 3167, 2678, 14520, 2015, 1006, 1040, 19716, 1998, 26189, 2099, 1007, 1012, 2122, 2275, 1011, 2327, 8378, 2066, 1996, 2149, 1055, 14841, 6767, 1998, 1996, 2866, 1055, 3712, 1009, 2291, 3499, 2111, 2000, 2501, 3573, 2377, 87

In [9]:
input_ids = tokenized['input_ids']
print(input_ids)

[101, 2694, 2925, 1999, 1996, 2398, 1997, 7193, 2007, 2188, 3004, 3001, 12123, 2152, 1011, 6210, 2694, 2015, 1998, 3617, 2678, 14520, 2015, 3048, 2046, 1996, 2542, 2282, 1996, 2126, 2111, 3422, 2694, 2097, 2022, 25796, 2367, 1999, 2274, 2086, 2051, 1012, 2008, 2003, 2429, 2000, 2019, 6739, 5997, 2029, 5935, 2012, 1996, 3296, 7325, 8139, 2265, 1999, 5869, 7136, 2000, 6848, 2129, 2122, 2047, 6786, 2097, 4254, 2028, 1997, 2256, 8837, 2627, 14428, 2015, 1012, 2007, 1996, 2149, 2877, 1996, 9874, 8497, 1998, 2060, 4180, 2097, 2022, 5359, 2000, 7193, 3081, 2188, 6125, 2083, 5830, 5871, 18126, 2015, 3316, 1998, 19595, 2326, 11670, 2000, 2392, 4734, 1998, 12109, 5733, 1012, 2028, 1997, 1996, 2087, 5720, 1011, 2055, 6786, 1997, 8292, 2015, 2038, 2042, 3617, 1998, 3167, 2678, 14520, 2015, 1006, 1040, 19716, 1998, 26189, 2099, 1007, 1012, 2122, 2275, 1011, 2327, 8378, 2066, 1996, 2149, 1055, 14841, 6767, 1998, 1996, 2866, 1055, 3712, 1009, 2291, 3499, 2111, 2000, 2501, 3573, 2377, 8724, 1998, 2830

In [10]:
print(tokenizer.convert_ids_to_tokens(input_ids))

['[CLS]', 'tv', 'future', 'in', 'the', 'hands', 'of', 'viewers', 'with', 'home', 'theatre', 'systems', 'plasma', 'high', '-', 'definition', 'tv', '##s', 'and', 'digital', 'video', 'recorder', '##s', 'moving', 'into', 'the', 'living', 'room', 'the', 'way', 'people', 'watch', 'tv', 'will', 'be', 'radically', 'different', 'in', 'five', 'years', 'time', '.', 'that', 'is', 'according', 'to', 'an', 'expert', 'panel', 'which', 'gathered', 'at', 'the', 'annual', 'consumer', 'electronics', 'show', 'in', 'las', 'vegas', 'to', 'discuss', 'how', 'these', 'new', 'technologies', 'will', 'impact', 'one', 'of', 'our', 'favourite', 'past', '##ime', '##s', '.', 'with', 'the', 'us', 'leading', 'the', 'trend', 'programmes', 'and', 'other', 'content', 'will', 'be', 'delivered', 'to', 'viewers', 'via', 'home', 'networks', 'through', 'cable', 'satellite', 'telecom', '##s', 'companies', 'and', 'broadband', 'service', 'providers', 'to', 'front', 'rooms', 'and', 'portable', 'devices', '.', 'one', 'of', 'the', '

## Label Map 생성

In [11]:
label_map = {
    'sport':0,
    'business': 1,
    'politics': 2,
    'tech':3,
    'entertainment': 4
}

df['category_num'] = df['category'].map(label_map)

In [12]:
df

Unnamed: 0,category,text,category_num
0,tech,tv future in the hands of viewers with home th...,3
1,business,worldcom boss left books alone former worldc...,1
2,sport,tigers wary of farrell gamble leicester say ...,0
3,sport,yeading face newcastle in fa cup premiership s...,0
4,entertainment,ocean s twelve raids box office ocean s twelve...,4
...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,1
2221,politics,kilroy unveils immigration policy ex-chatshow ...,2
2222,entertainment,rem announce new glasgow concert us band rem h...,4
2223,politics,how political squabbles snowball it s become c...,2


## Split data into train_set and valid_set

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['category_num'],
                                                    stratify=df['category_num'],
                                                    test_size=0.2,
                                                    random_state=SEED)

## Batch Tokenization

In [15]:
tokenizer.model_max_length

512

In [16]:
batch_tokenized = tokenizer(df['text'].iloc[:10].tolist(), padding=True, truncation=True)
batch_tokenized[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [17]:
np.array(batch_tokenized['input_ids']).shape

(10, 512)

## Dataset 생성

In [18]:
from torch.utils.data import DataLoader, Dataset
from torchtext.vocab import build_vocab_from_iterator

In [19]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        super().__init__()
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)


    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        return text, label

In [20]:
# Dataset 생성
train_ds = CustomDataset(x_train, y_train)
valid_ds = CustomDataset(x_test, y_test)

## DataLoader 생성

In [21]:
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

# torch 디바이스 지정
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [22]:
def collate_batch(batch, tokenizer):
    text_list, label_list = [], []

    for text, label in batch:
        text_list.append(text)
        label_list.append(label)

    label_list = torch.tensor(label_list, dtype=torch.int64) # tensor 정수

    # padding으로 길이를 맞춤
    text_tokenized = tokenizer(text_list, padding=True, truncation=True, return_tensors='pt') # pytorch

    return text_tokenized, label_list

In [23]:
train_loader = DataLoader(train_ds,
                          batch_size=32,
                          shuffle=True,
                          collate_fn=lambda x: collate_batch(x, tokenizer))
valid_loader = DataLoader(valid_ds,
                          batch_size=32,
                          shuffle=False,
                          collate_fn=lambda x: collate_batch(x, tokenizer))

In [24]:
x, y = next(iter(train_loader)) #?
print(x[0])
print(y[0])

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
tensor(3)


In [25]:
x, y = x.to(device), y.to(device)

# 모델 세팅

In [26]:
from tqdm import tqdm
import torch.optim as optim

In [27]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [28]:
# freeze weight
# PLM의 weight는 업데이트 하지 않고
for param in model.parameters():
    param.requires_grad=False #??


In [29]:
model.classifier = nn.Sequential(
    nn.Linear(1024, 256),
    nn.BatchNorm1d(256),
    nn.ReLU(),
    nn.Linear(256, 32),
    nn.BatchNorm1d(32),
    nn.ReLU(),
    nn.Linear(32, 5)
)

In [30]:
# 변경된 classifier 가중치 업데이트 가능 여부 확인?
# 분류기만 전이학습함
for param in model.classifier.parameters():
    print(param.requires_grad)

True
True
True
True
True
True
True
True
True
True


In [31]:
# 입력의 각 키별(input_ids, token_type_ids, attention_mask)를 device에 로드
inputs = {k: v.to(device) for k, v in x.items()}
inputs

{'input_ids': tensor([[  101,  7513,  3084,  ...,  4358,  2012,   102],
         [  101,  4977,  1011,  ...,     0,     0,     0],
         [  101, 19267,  7016,  ...,     0,     0,     0],
         ...,
         [  101,  2399,  4607,  ...,  2000,  1037,   102],
         [  101, 21291,  3089,  ...,     0,     0,     0],
         [  101,  5924,  7065,  ...,  2709,  2009,   102]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')}

In [32]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

## define criterion and optimizer

In [33]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [34]:
def model_train(model, data_loader, criterion, optimizer, device):
    model.train()

    # loss와 accuracy 계산을 위한 임시 변수
    running_loss = 0
    corr = 0
    counts = 0

    # Prograss Bar 출력을 위해 tqdm으로 매핑
    prograss_bar = tqdm(data_loader, unit='batch', total=len(data_loader), mininterval=1)


    # mini-batch 학습
    for idx, (txt, lbl) in enumerate(prograss_bar):
        # txt, label 데이터를 device에 올림
        inputs = {k:v.to(device) for k, v in txt.items()}
        lbl = lbl.to(device)

        # 누적 Gradient를 초기화 함
        optimizer.zero_grad()

        # Forward
        output = model(**inputs)
        # 예측값인 logits만 추출함
        output = output.logits

        # 손실 함수에 output, label을 대입하여 구함
        loss = criterion(output, lbl)
        # Backward
        loss.backward()

        # update Gradient
        optimizer.step()
        # 확률값이 최대인 index를 구함 get Probability max index
        output = output.argmax(dim=1)

        # 정답 갯수를 구함
        corr += (output == lbl).sum().item()
        counts += len(lbl)

        # batch별 loss 계산하여 누적합을 구함
        running_loss += loss.item()

        # 프로그래스바에 학습 상황 업데이트
        prograss_bar.set_description(f"training loss: {running_loss/(idx+1):.5f}, training accuracy: {corr / counts:.5f}")

    # 누적된 정답수를 전체 개수로 나누어 정확도 계산
    acc = corr / len(data_loader.dataset)


    # 평균 손실과 정확도를 반환함
    # train_loss, train_acc
    return running_loss / len(data_loader), acc

### Evaluation code

In [35]:
def model_evaluate(model, data_loader, criterion, device):
    # model.eval()은 모델 평가모드
    # evaluation 진행시 반드시 작성해줘야 함
    model.eval()

    # torch.no_grad()로 Gradient가 업데이트 되는 것을 방지함
    with torch.no_grad():
        corr = 0
        running_loss = 0

        # evaluate each batch data.
        for txt, label in data_loader:
            # upload txt and label into device.
            inputs = {k:v.to(device) for k, v in txt.items()}
            label = label.to(device)

            output = model(**inputs)

            #예측 값인 logits를 가져옴
            output = output.logits
            loss = criterion(output, label)

            # 출력의 확률값이 가장 높은 index를 가져옴
            output = output.argmax(dim=1)

            # 맞은 갯수를 구함
            corr += (output == label).sum().item()

            # batch 별 loss를 계산
            running_loss += loss.item()

    # culculate validation's accuracy
    acc = corr / len(data_loader.dataset)

    return running_loss / len(data_loader), acc

In [36]:
num_epochs = 5
model_name = 'BBC-Text-CLF-BERT'
min_loss = np.inf

In [38]:
for epoch in range(num_epochs):
    train_loss, train_acc = model_train(model, train_loader, criterion, optimizer, device)

    val_loss, val_acc = model_evaluate(model, valid_loader, criterion, device)

    if val_loss < min_loss:
        print(f"[Info] val_loss has been improved from {min_loss:.5f} to {val_loss:.5f}. Saving Model.")
        min_loss = val_loss
        torch.save(model.state_dict(), f'{model_name}.pth')

    print(f'epoch {epoch+1:02d}, loss: {train_loss:.5f}, acc: {train_acc:.5f}, val_loss: {val_loss:.5f}, val_accuracy:{val_acc:.5f}')

training loss: 0.42567, training accuracy: 0.99213: 100%|███████████████████████████| 56/56 [09:26<00:00, 10.12s/batch]


[Info] val_loss has been improved from 0.39143 to 0.31969. Saving Model.
epoch 01, loss: 0.42567, acc: 0.99213, val_loss: 0.31969, val_accuracy:0.99326


training loss: 0.36583, training accuracy: 0.99213: 100%|███████████████████████████| 56/56 [08:32<00:00,  9.15s/batch]


[Info] val_loss has been improved from 0.31969 to 0.28635. Saving Model.
epoch 02, loss: 0.36583, acc: 0.99213, val_loss: 0.28635, val_accuracy:0.99326


training loss: 0.33264, training accuracy: 0.99270: 100%|███████████████████████████| 56/56 [02:55<00:00,  3.14s/batch]


[Info] val_loss has been improved from 0.28635 to 0.25931. Saving Model.
epoch 03, loss: 0.33264, acc: 0.99270, val_loss: 0.25931, val_accuracy:0.99326


training loss: 0.30376, training accuracy: 0.99213: 100%|███████████████████████████| 56/56 [05:32<00:00,  5.94s/batch]


[Info] val_loss has been improved from 0.25931 to 0.23335. Saving Model.
epoch 04, loss: 0.30376, acc: 0.99213, val_loss: 0.23335, val_accuracy:0.99326


training loss: 0.27993, training accuracy: 0.99213: 100%|███████████████████████████| 56/56 [02:55<00:00,  3.14s/batch]


[Info] val_loss has been improved from 0.23335 to 0.21102. Saving Model.
epoch 05, loss: 0.27993, acc: 0.99213, val_loss: 0.21102, val_accuracy:0.99551


# 최종 평가

In [39]:
# load best weight
model.load_state_dict(torch.load(f'{model_name}.pth'))

<All keys matched successfully>

In [41]:
with torch.no_grad():
    val_loss, val_acc = model_evaluate(model, valid_loader, criterion, device)
    print(f"loss: {val_loss:.5f}, accuracy: {val_acc:.5f}")

loss: 0.21102, accuracy: 0.99551
