# 專題（二）：訓練 Bert 新聞分類器並提升精準度

## 專案目標
- 目標：請試著建製 BertForSequenceClassification 看得懂的資料集 NewsDataset
- news_clustering_train.tsv 中有 1800 篇新聞，六種類別的新聞各 300 篇
- news_clustering_test.tsv 中有 600 篇新聞，六種類別的新聞各 100 篇
- 六種類別：體育、財經、科技、旅遊、農業、遊戲

## 實作提示
- STEP1 - STEP4：資料處理
- STEP5：創造 train_batch 函數
- STEP6：創造 evaluate 函數
- STEP7：組合以上元素開始訓練，如果正確 validation accuracy 應該可以超過 90% 以上

## 重要知識點：專題結束後你可以學會
- 了解 BERT 的 Sequence Classification 任務如何進行
- 使用 TRAIN / VALID DATA 來了解深度學習模型的訓練情形
- 了解預訓練模型在 NLP 上的威力

In [1]:
!python --version

Python 3.7.9


In [2]:
!pip install -q transformers

In [3]:
import pandas as pd
import numpy as np

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import BertTokenizer, BertForSequenceClassification

In [4]:
df_train = pd.read_csv('news_clustering_train.tsv', sep='\t')
df_test = pd.read_csv('news_clustering_test.tsv', sep='\t')

In [5]:
train_titles = {row['index']: row['title'] for _, row in df_train.iterrows()}
train_classes = {row['index']: row['class'] for _, row in df_train.iterrows()}

valid_titles = {row['index']: row['title'] for _, row in df_test.iterrows()}
valid_classes = {row['index']: row['class'] for _, row in df_test.iterrows()}

In [6]:
ALL_NEWS_CLASSES = ['體育', '財經', '科技', '旅遊', '農業', '遊戲']

In [7]:
MODEL_NAME = 'bert-base-chinese'

In [8]:
# 建立數據集
class NewsDataset(Dataset):
    def __init__(self, tokenizer, titles, classes):
        self.tokenizer = tokenizer
        self.indexes = []
        self.texts = []
        self.labels = []
        for index in titles:
            self.indexes.append(index)
            self.texts.append(titles[index])
            self.labels.append(classes[index])

    def __getitem__(self, idx):
        text = self.texts[idx]

        input = self.tokenizer(text, return_tensors='pt')
        label = torch.tensor(ALL_NEWS_CLASSES.index(self.labels[idx]))

        return input, label

    def __len__(self):
        return len(self.indexes)


def create_mini_batch(samples):
    input_ids = []
    token_type_ids = []
    attention_mask = []
    labels = []
    for s in samples:
        input_ids.append(s[0]['input_ids'].squeeze(0))
        token_type_ids.append(s[0]['token_type_ids'].squeeze(0))
        attention_mask.append(s[0]['attention_mask'].squeeze(0))
        labels.append(s[1])

    # zero pad 到同一序列長度
    input_ids = pad_sequence(input_ids, batch_first=True)
    token_type_ids = pad_sequence(token_type_ids, batch_first=True)
    attention_mask = pad_sequence(attention_mask, batch_first=True)
 
    labels = torch.stack(labels)

    return input_ids, token_type_ids, attention_mask, labels

In [9]:
batch_size = 32

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

train_dataset = NewsDataset(tokenizer, train_titles, train_classes)
valid_dataset = NewsDataset(tokenizer, valid_titles, valid_classes)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    collate_fn=create_mini_batch,
    shuffle=True)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    collate_fn=create_mini_batch)

In [10]:
def train_batch(model, data, optimizer, device):
    model.train()
    input_ids, token_type_ids, attention_mask, labels = [d.to(device) for d in data]

    # Code Here
    outputs = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        labels=labels
    )
    loss = outputs.loss
    # End

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

In [11]:
def evaluate(model, valid_loader):
    model.eval()
    device = 'cuda' if next(model.parameters()).is_cuda else 'cpu'

    tot_count = 0
    tot_loss = 0
    tot_correct = 0

    with torch.no_grad():
        for data in valid_loader:
            input_ids, token_type_ids, attention_mask, labels = [d.to(device) for d in data]

            # Code Here
            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            tot_count += input_ids.size(0)
            tot_loss += outputs.loss.item()
            tot_correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
            # End
    
    evaluation = {
        'loss': tot_loss / tot_count,
        'acc': tot_correct / tot_count
    }
    return evaluation

In [12]:
# 訓練模型
epochs = 5
lr = 0.0001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels = 6, 
    return_dict=True
)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

for epoch in range(1, epochs + 1):
    print(f'epoch: {epoch}')

    for i, train_data in enumerate(train_loader):
        loss = train_batch(model, train_data, optimizer, device)
        train_size = train_data[0].size(0)

        if i % 10 == 0:
            print('train_loss: ', loss / train_size)

    evaluation = evaluate(model, valid_loader)
    print('valid_evaluation: loss={loss}, acc={acc}'.format(**evaluation))

    scheduler.step()    

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

epoch: 1
train_loss:  0.06043971702456474
train_loss:  0.03436967730522156
train_loss:  0.010848069563508034
train_loss:  0.01923786848783493
train_loss:  0.011836141347885132
train_loss:  0.011756924912333488
valid_evaluation: loss=0.012588730951150259, acc=0.88
epoch: 2
train_loss:  0.018130671232938766
train_loss:  0.008498404175043106
train_loss:  0.013512895442545414
train_loss:  0.017176590859889984
train_loss:  0.024306263774633408
train_loss:  0.007632988039404154
valid_evaluation: loss=0.014633582290261984, acc=0.8616666666666667
epoch: 3
train_loss:  0.008043169975280762
train_loss:  0.011690777726471424
train_loss:  0.004498790018260479
train_loss:  0.011295170523226261
train_loss:  0.00943633820861578
train_loss:  0.008732910268008709
valid_evaluation: loss=0.010663453393305341, acc=0.9066666666666666
epoch: 4
train_loss:  0.0033593850675970316
train_loss:  0.00438004219904542
train_loss:  0.0031367167830467224
train_loss:  0.012005225755274296
train_loss:  0.00322889327071