# News Classification

In [1]:
import pprint
import torch
import torch.utils.data as data
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from data import VNTCDataset, label2id, id2label

## Load data

In [2]:
# Display the category-name -> integer class-id mapping imported from the local `data` module.
label2id

{'Van hoa': 0,
 'The gioi': 1,
 'Khoa hoc': 2,
 'Suc khoe': 3,
 'Chinh tri Xa hoi': 4,
 'Vi tinh': 5,
 'Kinh doanh': 6,
 'The thao': 7,
 'Phap luat': 8,
 'Doi song': 9}

In [3]:
# Build the two VNTCDataset instances, selected by the `train` flag.
# NOTE(review): presumably train=True is the training split and train=False the
# held-out test split — confirm against the VNTCDataset implementation in `data`.
train_data = VNTCDataset(train=True)
test_data = VNTCDataset(train=False)

In [4]:
# Hold out 10% of the training examples as a dev set for per-epoch evaluation.
# A fixed random_state makes the split (and therefore every reported dev metric)
# reproducible across kernel restarts.
# NOTE: this cell rebinds `train_data`; re-running it without first re-running
# the dataset-loading cell splits the already-reduced set again.
train_data, dev_data = train_test_split(train_data, test_size=0.1, random_state=42)

In [5]:
# Report how many examples ended up in each split (one line per split).
print(
    f"Train data data: {len(train_data)}",
    f"Dev data data: {len(dev_data)}",
    f"Test data data: {len(test_data)}",
    sep="\n",
)

Train data data: 30383
Dev data data: 3376
Test data data: 50373


## Load tokenizer

In [6]:
# Load the tokenizer for the same "vinai/phobert-base" checkpoint that the
# classification model is created from further down.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Display the tokenizer's settings (vocabulary size, maximum length, special tokens).
tokenizer

PhobertTokenizer(name_or_path='vinai/phobert-base', vocab_size=64000, model_max_length=256, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True)

In [8]:
# Tokenize one example sentence to inspect what the tokenizer returns
# (input_ids, attention_mask, token_type_ids).
# NOTE(review): PhoBERT's documentation asks for word-segmented input, while
# this sample is raw text — confirm whether VNTCDataset yields segmented text.
sample_sentence = "Tôi là sinh viên trường đại học bách khoa hà nội"
sample_token = tokenizer(sample_sentence)
pprint.pprint(sample_token)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [0,
               218,
               8,
               418,
               1430,
               212,
               2919,
               222,
               22313,
               2054,
               14385,
               2151,
               2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [9]:
# Decode the ids back to text to check the round trip, including the special
# tokens the tokenizer wrapped around the sentence.
tokenizer.decode(sample_token["input_ids"])

'<s> Tôi là sinh viên trường đại học bách khoa hà nội </s>'

## Prepare DataLoader

In [10]:
def collate_fn(data):
    """Turn a batch of ``(content, label)`` samples into model-ready tensors.

    The texts are tokenized together by the notebook-level ``tokenizer`` with
    padding and truncation enabled, and the labels are packed into a single
    ``torch.LongTensor``.

    Args:
        data: Iterable of ``(content, label)`` pairs for one batch.

    Returns:
        A ``(tokens, labels)`` tuple: the tokenizer output requested as
        PyTorch tensors, and the labels as a ``torch.LongTensor``.
    """
    samples = list(data)
    texts = [content for content, _ in samples]
    labels = [label for _, label in samples]

    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    return tokens, torch.LongTensor(labels)

In [11]:
# Number of examples per batch; shared by the train, dev and test DataLoaders.
batch_size = 32

In [12]:
# Training batches: shuffle=True reshuffles the examples on every pass, and
# collate_fn tokenizes/pads each batch on the fly.
train_dataloader = data.DataLoader(
    train_data, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=collate_fn
)

In [13]:
# Dev batches for per-epoch evaluation; no shuffling is requested here.
dev_dataloader = data.DataLoader(
    dev_data, 
    batch_size=batch_size, 
    collate_fn=collate_fn
)

In [14]:
# Test batches for the final evaluation; no shuffling is requested here.
test_dataloader = data.DataLoader(
    test_data,
    batch_size=batch_size,
    collate_fn=collate_fn
)

In [15]:
# Sanity check: pull a single batch and show the tensor shapes the model will see.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    inputs, labels = first_batch
    print(f"Shape of input_ids: {inputs.input_ids.shape}")
    print(f"Shape of labels: {labels.shape}")

Shape of input_ids: torch.Size([32, 256])
Shape of labels: torch.Size([32])


## Create model

In [16]:
# Create a sequence-classification model from the PhoBERT checkpoint.
# The number of output classes comes from label2id, and both label mappings are
# handed to the model so they end up in its config (see `model.config` below).
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/phobert-base", 
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

In [17]:
# Display the module tree of the loaded model.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [18]:
# Display the model configuration, including the id2label / label2id mappings.
model.config

RobertaConfig {
  "_name_or_path": "vinai/phobert-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Van hoa",
    "1": "The gioi",
    "2": "Khoa hoc",
    "3": "Suc khoe",
    "4": "Chinh tri Xa hoi",
    "5": "Vi tinh",
    "6": "Kinh doanh",
    "7": "The thao",
    "8": "Phap luat",
    "9": "Doi song"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Chinh tri Xa hoi": 4,
    "Doi song": 9,
    "Khoa hoc": 2,
    "Kinh doanh": 6,
    "Phap luat": 8,
    "Suc khoe": 3,
    "The gioi": 1,
    "The thao": 7,
    "Van hoa": 0,
    "Vi tinh": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_tok

In [19]:
# Freeze every parameter under `model.roberta`; only the parameters outside it
# stay trainable. The trailing ';' keeps the cell from echoing the module repr.
model.roberta.requires_grad_(False);

In [20]:
# Tally parameter counts by whether they will receive gradient updates.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
fixed = sum(p.numel() for p in model.parameters() if not p.requires_grad)

print(f"Trainable parameters: {trainable}")
print(f"Fixed parameters: {fixed}")
print(f"Total parameters: {trainable + fixed}")

Trainable parameters: 598282
Fixed parameters: 134407680
Total parameters: 135005962


## Train model

In [21]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [22]:
# Move the model to the selected device; train() and test() move each batch to
# the same device before the forward pass.
model = model.to(device)

In [23]:
# Training hyperparameters: optimizer step size and number of passes over the
# training set.
learning_rate = 1e-4
epochs = 3

In [24]:
# Adam over all model parameters. The parameters frozen above are passed in as
# well; with requires_grad=False they get no gradient, so presumably only the
# still-trainable ones are updated.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [25]:
def train(model, dataloader):
    """Run one training epoch of ``model`` over ``dataloader``.

    Uses the notebook-level ``device`` and ``optimizer``. Every 100 batches the
    current batch loss and progress are printed.

    Args:
        model: Model that accepts the tokenized inputs plus ``labels`` and
            returns an output exposing ``.loss``.
        dataloader: Iterable of ``(inputs, labels)`` batches, where ``inputs``
            is a mapping of tensors and ``labels`` is a tensor.
    """
    num_batches = len(dataloader)
    model.train()

    # Iterate over the loader that was passed in. The previous version looped
    # over the global `train_dataloader`, silently ignoring the argument.
    for batch, (inputs, labels) in tqdm(enumerate(dataloader), total=num_batches):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)

        # Forward pass; the model computes the loss itself when given labels.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # Read the scalar without rebinding `loss`, so the tensor is not
            # shadowed by a float.
            print(f"loss: {loss.item():>7f}  [{batch + 1:>4d}/{num_batches:>4d}]")

In [26]:
def test(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print a classification report.

    Uses the notebook-level ``device`` and ``label2id``. Gradients are disabled
    during evaluation. Prints the per-class report followed by overall accuracy
    and the loss averaged over batches.

    Args:
        model: Model that accepts the tokenized inputs plus ``labels`` and
            returns an output exposing ``.loss`` and ``.logits``.
        dataloader: Iterable of ``(inputs, labels)`` batches; its ``dataset``
            attribute is used for the total example count.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, total=num_batches):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)

            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            preds = outputs.logits.argmax(-1)

            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
            test_loss += loss.item()
            correct += (preds == labels).sum().item()

    # Pass class ids and names explicitly, both ordered by id. This keeps each
    # report row attached to the right class regardless of dict ordering, and
    # avoids a length-mismatch error when a class is absent from this split.
    class_ids = sorted(label2id.values())
    class_names = sorted(label2id, key=label2id.get)
    report = classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
    test_loss /= num_batches
    correct /= size
    print(report)
    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [27]:
# Train for `epochs` passes, evaluating on the dev split after each one.
# The "Test Error" header printed by test() refers to the dev split here.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}\n-------------------------------")
    train(model, train_dataloader)
    test(model, dev_dataloader)
print("Done!")

Epoch 1
-------------------------------


  0%|          | 0/950 [00:00<?, ?it/s]

loss: 2.312908  [   1/ 950]
loss: 1.683005  [ 101/ 950]
loss: 0.897754  [ 201/ 950]
loss: 0.738735  [ 301/ 950]
loss: 0.543578  [ 401/ 950]
loss: 0.607079  [ 501/ 950]
loss: 0.544631  [ 601/ 950]
loss: 0.501711  [ 701/ 950]
loss: 0.817580  [ 801/ 950]
loss: 0.396132  [ 901/ 950]


  0%|          | 0/106 [00:00<?, ?it/s]

                  precision    recall  f1-score   support

         Van hoa       0.89      0.91      0.90       326
        The gioi       0.87      0.88      0.87       267
        Khoa hoc       0.81      0.76      0.78       182
        Suc khoe       0.88      0.91      0.90       345
Chinh tri Xa hoi       0.75      0.79      0.77       542
         Vi tinh       0.96      0.80      0.88       249
      Kinh doanh       0.81      0.79      0.80       224
        The thao       0.98      0.97      0.98       520
       Phap luat       0.88      0.89      0.88       409
        Doi song       0.81      0.84      0.83       312

        accuracy                           0.87      3376
       macro avg       0.87      0.85      0.86      3376
    weighted avg       0.87      0.87      0.87      3376

Test Error: 
 Accuracy: 86.5%, Avg loss: 0.405479 

Epoch 2
-------------------------------


  0%|          | 0/950 [00:00<?, ?it/s]

loss: 0.327408  [   1/ 950]
loss: 0.552953  [ 101/ 950]
loss: 0.593122  [ 201/ 950]
loss: 0.443892  [ 301/ 950]
loss: 0.489611  [ 401/ 950]
loss: 0.405083  [ 501/ 950]
loss: 0.604255  [ 601/ 950]
loss: 0.225154  [ 701/ 950]
loss: 0.330164  [ 801/ 950]
loss: 0.360475  [ 901/ 950]


  0%|          | 0/106 [00:00<?, ?it/s]

                  precision    recall  f1-score   support

         Van hoa       0.93      0.91      0.92       326
        The gioi       0.95      0.83      0.89       267
        Khoa hoc       0.72      0.84      0.77       182
        Suc khoe       0.89      0.91      0.90       345
Chinh tri Xa hoi       0.77      0.80      0.79       542
         Vi tinh       0.92      0.89      0.90       249
      Kinh doanh       0.81      0.82      0.82       224
        The thao       0.98      0.97      0.98       520
       Phap luat       0.88      0.89      0.89       409
        Doi song       0.85      0.84      0.84       312

        accuracy                           0.87      3376
       macro avg       0.87      0.87      0.87      3376
    weighted avg       0.88      0.87      0.88      3376

Test Error: 
 Accuracy: 87.5%, Avg loss: 0.357670 

Epoch 3
-------------------------------


  0%|          | 0/950 [00:00<?, ?it/s]

loss: 0.638484  [   1/ 950]
loss: 0.494748  [ 101/ 950]
loss: 0.527917  [ 201/ 950]
loss: 0.571325  [ 301/ 950]
loss: 0.471474  [ 401/ 950]
loss: 0.388036  [ 501/ 950]
loss: 0.394080  [ 601/ 950]
loss: 0.278738  [ 701/ 950]
loss: 0.439574  [ 801/ 950]
loss: 0.238287  [ 901/ 950]


  0%|          | 0/106 [00:00<?, ?it/s]

                  precision    recall  f1-score   support

         Van hoa       0.91      0.94      0.92       326
        The gioi       0.92      0.88      0.90       267
        Khoa hoc       0.74      0.86      0.80       182
        Suc khoe       0.89      0.92      0.90       345
Chinh tri Xa hoi       0.82      0.78      0.80       542
         Vi tinh       0.92      0.90      0.91       249
      Kinh doanh       0.83      0.83      0.83       224
        The thao       0.98      0.97      0.98       520
       Phap luat       0.88      0.91      0.90       409
        Doi song       0.87      0.83      0.85       312

        accuracy                           0.88      3376
       macro avg       0.88      0.88      0.88      3376
    weighted avg       0.88      0.88      0.88      3376

Test Error: 
 Accuracy: 88.4%, Avg loss: 0.344145 

Done!


## Test model

In [29]:
# Final evaluation of the trained model on the test split.
test(model, test_dataloader)

  0%|          | 0/1575 [00:00<?, ?it/s]

                  precision    recall  f1-score   support

         Van hoa       0.89      0.93      0.91      6250
        The gioi       0.91      0.92      0.92      6716
        Khoa hoc       0.62      0.81      0.70      2096
        Suc khoe       0.90      0.91      0.90      5417
Chinh tri Xa hoi       0.84      0.78      0.81      7567
         Vi tinh       0.91      0.90      0.91      4560
      Kinh doanh       0.86      0.84      0.85      5276
        The thao       0.98      0.97      0.98      6667
       Phap luat       0.85      0.90      0.87      3788
        Doi song       0.69      0.48      0.57      2036

        accuracy                           0.87     50373
       macro avg       0.84      0.85      0.84     50373
    weighted avg       0.87      0.87      0.87     50373

Test Error: 
 Accuracy: 87.3%, Avg loss: 0.373198 



## Save model

In [30]:
# Write the model and the tokenizer into the same directory so the two are
# kept together. The last call's return value (the tokenizer file paths) is
# what the cell displays.
save_dir = "./save/phobert_news_classification"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./save/phobert_news_classification/tokenizer_config.json',
 './save/phobert_news_classification/special_tokens_map.json',
 './save/phobert_news_classification/vocab.txt',
 './save/phobert_news_classification/bpe.codes',
 './save/phobert_news_classification/added_tokens.json')