In [47]:
!pip install seacrowd nusacrowd

Collecting nusacrowd
  Downloading nusacrowd-0.1.4-py3-none-any.whl.metadata (1.1 kB)
Downloading nusacrowd-0.1.4-py3-none-any.whl (384 kB)
Installing collected packages: nusacrowd
Successfully installed nusacrowd-0.1.4


# Train Pos Tagger Model

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
from collections import defaultdict, namedtuple

import torch
from torch import optim
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertTokenizer
from transformers import BertPreTrainedModel, BertModel, BertConfig
from transformers import AutoTokenizer, AutoConfig
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from datasets import load_dataset

2024-12-07 03:50:28.937416: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-07 03:50:28.937478: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-07 03:50:28.937499: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-07 03:50:28.946418: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [27]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="7"

## Code for Training and Evaluation Pos Tagger Model

In [28]:
Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')

class EvalCounts(object):
    def __init__(self):
        self.correct_chunk = 0    # number of correctly identified chunks
        self.correct_tags = 0     # number of correct chunk tags
        self.found_correct = 0    # number of chunks in corpus
        self.found_guessed = 0    # number of identified chunks
        self.token_counter = 0    # token counter (ignores sentence breaks)

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_found_correct = defaultdict(int)
        self.t_found_guessed = defaultdict(int)

###
# Evaluate Function
###        
def parse_tag(t):
    m = re.match(r'^([^-]*)-(.*)$', t)
    return m.groups() if m else (t, '')

def start_of_chunk(prev_tag, tag, prev_type, type_):
    # check if a chunk started between the previous and current word
    # arguments: previous and current chunk tags, previous and current types
    chunk_start = False

    if tag == 'B': chunk_start = True
    if tag == 'S': chunk_start = True

    if prev_tag == 'E' and tag == 'E': chunk_start = True
    if prev_tag == 'E' and tag == 'I': chunk_start = True
    if prev_tag == 'S' and tag == 'E': chunk_start = True
    if prev_tag == 'S' and tag == 'I': chunk_start = True
    if prev_tag == 'O' and tag == 'E': chunk_start = True
    if prev_tag == 'O' and tag == 'I': chunk_start = True

    if tag != 'O' and tag != '.' and prev_type != type_:
        chunk_start = True

    # these chunks are assumed to have length 1
    if tag == '[': chunk_start = True
    if tag == ']': chunk_start = True

    return chunk_start

def end_of_chunk(prev_tag, tag, prev_type, type_):
    # check if a chunk ended between the previous and current word
    # arguments: previous and current chunk tags, previous and current types
    chunk_end = False

    if prev_tag == 'E': chunk_end = True
    if prev_tag == 'S': chunk_end = True

    if prev_tag == 'B' and tag == 'B': chunk_end = True
    if prev_tag == 'B' and tag == 'S': chunk_end = True
    if prev_tag == 'B' and tag == 'O': chunk_end = True
    if prev_tag == 'I' and tag == 'B': chunk_end = True
    if prev_tag == 'I' and tag == 'S': chunk_end = True
    if prev_tag == 'I' and tag == 'O': chunk_end = True

    if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
        chunk_end = True

    # these chunks are assumed to have length 1
    if prev_tag == ']': chunk_end = True
    if prev_tag == '[': chunk_end = True

    return chunk_end

def evaluate_fn(guessed, correct, last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts):
    guessed, guessed_type = parse_tag(guessed)
    correct, correct_type = parse_tag(correct)

    end_correct = end_of_chunk(last_correct, correct,
                               last_correct_type, correct_type)
    end_guessed = end_of_chunk(last_guessed, guessed,
                               last_guessed_type, guessed_type)
    start_correct = start_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
    start_guessed = start_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)

    if in_correct:
        if (end_correct and end_guessed and
            last_guessed_type == last_correct_type):
            in_correct = False
            counts.correct_chunk += 1
            counts.t_correct_chunk[last_correct_type] += 1
        elif (end_correct != end_guessed or guessed_type != correct_type):
            in_correct = False

    if start_correct and start_guessed and guessed_type == correct_type:
        in_correct = True

    if start_correct:
        counts.found_correct += 1
        counts.t_found_correct[correct_type] += 1
    if start_guessed:
        counts.found_guessed += 1
        counts.t_found_guessed[guessed_type] += 1
    if correct == guessed and guessed_type == correct_type:
        counts.correct_tags += 1
    counts.token_counter += 1

    last_guessed = guessed
    last_correct = correct
    last_guessed_type = guessed_type
    last_correct_type = correct_type
    
    return last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts
    
def evaluate(hyps_list, labels_list):
    counts = EvalCounts()
    num_features = None       # number of features per line
    in_correct = False        # currently processed chunks is correct until now
    last_correct = 'O'        # previous chunk tag in corpus
    last_correct_type = ''    # type of previously identified chunk tag
    last_guessed = 'O'        # previously identified chunk tag
    last_guessed_type = ''    # type of previous chunk tag in corpus

    for hyps, labels in zip(hyps_list, labels_list):
        for hyp, label in zip(hyps, labels):
            step_result = evaluate_fn(hyp, label, last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts)
            last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts = step_result
        # Boundary between sentence
        step_result = evaluate_fn('O', 'O', last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts)
        last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts = step_result
        
    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts

###
# Calculate Metrics Function
###
def uniq(iterable):
    seen = set()
    return [i for i in iterable if not (i in seen or seen.add(i))]

def calculate_metrics(correct, guessed, total):
    tp, fp, fn = correct, guessed-correct, total-correct
    p = 0 if tp + fp == 0 else 1.*tp / (tp + fp)
    r = 0 if tp + fn == 0 else 1.*tp / (tp + fn)
    f = 0 if p + r == 0 else (2 * p * r) / (p + r)
    return Metrics(tp, fp, fn, p, r, f)

def metric(counts):
    c = counts
    overall = calculate_metrics(
        c.correct_chunk, c.found_guessed, c.found_correct
    )
    by_type = {}
    for t in uniq(list(c.t_found_correct.keys()) + list(c.t_found_guessed.keys())):
        by_type[t] = calculate_metrics(
            c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t]
        )
    return overall, by_type

###
# Main Function
###
def conll_evaluation(hyps_list, labels_list):
    counts = evaluate(hyps_list, labels_list)
    overall, by_type = metric(counts)

    c = counts
    acc = c.correct_tags / c.token_counter
    pre = overall.prec
    rec = overall.rec
    f1 = overall.fscore
    
    type_macro_pre = 0.0
    type_macro_rec = 0.0
    type_macro_f1 = 0.0
    for k in by_type.keys():
        type_macro_pre += by_type[k].prec
        type_macro_rec += by_type[k].rec
        type_macro_f1 += by_type[k].fscore
        
    type_macro_pre = type_macro_pre / float(len(by_type))
    type_macro_rec = type_macro_rec / float(len(by_type))
    type_macro_f1 = type_macro_f1 / float(len(by_type))
    
    return (acc, pre, rec, f1, type_macro_pre, type_macro_rec, type_macro_f1)

In [29]:
def forward_word_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    # Unpack batch data
    if len(batch_data) == 4:
        (subword_batch, mask_batch, subword_to_word_indices_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 5:
        (subword_batch, mask_batch, token_type_batch, subword_to_word_indices_batch, label_batch) = batch_data
    
    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    subword_to_word_indices_batch = torch.LongTensor(subword_to_word_indices_batch)
    label_batch = torch.LongTensor(label_batch)

    if device == "cuda":
        subword_batch = subword_batch.cuda()
        mask_batch = mask_batch.cuda()
        token_type_batch = token_type_batch.cuda() if token_type_batch is not None else None
        subword_to_word_indices_batch = subword_to_word_indices_batch.cuda()
        label_batch = label_batch.cuda()

    # Forward model
    outputs = model(subword_batch, subword_to_word_indices_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]
    
    # generate prediction & label list
    list_hyps = []
    list_labels = []
    hyps_list = torch.topk(logits, k=1, dim=-1)[1].squeeze(dim=-1)
    for i in range(len(hyps_list)):
        hyps, labels = hyps_list[i].tolist(), label_batch[i].tolist()        
        list_hyp, list_label = [], []
        for j in range(len(hyps)):
            if labels[j] == -100:
                break
            else:
                list_hyp.append(i2w[hyps[j]])
                list_label.append(i2w[labels[j]])
        list_hyps.append(list_hyp)
        list_labels.append(list_label)
        
    return loss, list_hyps, list_labels

In [30]:
def pos_tag_metrics_fn(list_hyp, list_label):
    metrics = {}
    acc, pre, rec, f1, tm_pre, tm_rec, tm_f1 = conll_evaluation(list_hyp, list_label)
    metrics["ACC"] = acc
    metrics["F1"] = tm_f1
    metrics["REC"] = tm_rec
    metrics["PRE"] = tm_pre
    return metrics

In [31]:
class BertForWordClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        subword_to_word_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        # average the token-level outputs to compute word-level representations
        max_seq_len = subword_to_word_ids.max() + 1
        word_latents = []
        for i in range(max_seq_len):
            mask = (subword_to_word_ids == i).unsqueeze(dim=-1)
            word_latents.append((sequence_output * mask).sum(dim=1) / mask.sum())
        word_batch = torch.stack(word_latents, dim=1)

        sequence_output = self.dropout(word_batch)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


In [32]:
class PosTagProsaDataset(Dataset):
    # Static constant variable
    LABEL2INDEX = {'B-PPO': 0, 'B-KUA': 1, 'B-ADV': 2, 'B-PRN': 3, 'B-VBI': 4, 'B-PAR': 5, 'B-VBP': 6, 'B-NNP': 7, 'B-UNS': 8, 'B-VBT': 9, 'B-VBL': 10, 'B-NNO': 11, 'B-ADJ': 12, 'B-PRR': 13, 'B-PRK': 14, 'B-CCN': 15, 'B-$$$': 16, 'B-ADK': 17, 'B-ART': 18, 'B-CSN': 19, 'B-NUM': 20, 'B-SYM': 21, 'B-INT': 22, 'B-NEG': 23, 'B-PRI': 24, 'B-VBE': 25}
    INDEX2LABEL = {0: 'B-PPO', 1: 'B-KUA', 2: 'B-ADV', 3: 'B-PRN', 4: 'B-VBI', 5: 'B-PAR', 6: 'B-VBP', 7: 'B-NNP', 8: 'B-UNS', 9: 'B-VBT', 10: 'B-VBL', 11: 'B-NNO', 12: 'B-ADJ', 13: 'B-PRR', 14: 'B-PRK', 15: 'B-CCN', 16: 'B-$$$', 17: 'B-ADK', 18: 'B-ART', 19: 'B-CSN', 20: 'B-NUM', 21: 'B-SYM', 22: 'B-INT', 23: 'B-NEG', 24: 'B-PRI', 25: 'B-VBE'}
    NUM_LABELS = 26
    
    def load_dataset(self, data):
        # Prepare buffer
        dataset = []
        sentence = []
        seq_label = []
        for i in range (len(data)):
            for j in range (len(data[i]['tokens'])):
                sentence.append(data[i]['tokens'][j])
                seq_label.append(self.LABEL2INDEX[data[i]['pos_tags'][j]])
            dataset.append({
                    'sentence': sentence,
                    'seq_label': seq_label
                })
            sentence = []
            seq_label = []
        return dataset
    
    def __init__(self, dataset_path, tokenizer, *args, **kwargs):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):
        data = self.data[index]
        sentence, seq_label = data['sentence'], data['seq_label']
        
        # Add CLS token
        subwords = [self.tokenizer.cls_token_id]
        subword_to_word_indices = [-1] # For CLS
        
        # Add subwords
        for word_idx, word in enumerate(sentence):
            subword_list = self.tokenizer.encode(word, add_special_tokens=False)
            subword_to_word_indices += [word_idx for i in range(len(subword_list))]
            subwords += subword_list
            
        # Add last SEP token
        subwords += [self.tokenizer.sep_token_id]
        subword_to_word_indices += [-1]
        
        return np.array(subwords), np.array(subword_to_word_indices), np.array(seq_label), data['sentence']
    
    def __len__(self):
        return len(self.data)

class PosTagDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(PosTagDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len
        
    def _collate_fn(self, batch):
        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)
        max_tgt_len = max(map(lambda x: len(x[2]), batch))
        
        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        subword_to_word_indices_batch = np.full((batch_size, max_seq_len), -1, dtype=np.int64)
        seq_label_batch = np.full((batch_size, max_tgt_len), -100, dtype=np.int64)

        seq_list = []
        for i, (subwords, subword_to_word_indices, seq_label, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_to_word_indices = subword_to_word_indices[:max_seq_len]

            subword_batch[i,:len(subwords)] = subwords
            mask_batch[i,:len(subwords)] = 1
            subword_to_word_indices_batch[i,:len(subwords)] = subword_to_word_indices
            seq_label_batch[i,:len(seq_label)] = seq_label

            seq_list.append(raw_seq)
            
        return subword_batch, mask_batch, subword_to_word_indices_batch, seq_label_batch, seq_list


## Utils Function

In [33]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

In [34]:
set_seed(42)

## Load Model and Dataset

In [5]:
data = load_dataset("SEACrowd/posp", trust_remote_code=True)

In [None]:
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-large-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-large-p1')
config.num_labels = PosTagProsaDataset.NUM_LABELS
w2i, i2w = PosTagProsaDataset.LABEL2INDEX, PosTagProsaDataset.INDEX2LABEL

model = BertForWordClassification.from_pretrained('indobenchmark/indobert-large-p1', config=config)

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
train_dataset = PosTagProsaDataset(data['train'], tokenizer, lowercase=True)
valid_dataset = PosTagProsaDataset(data['validation'], tokenizer, lowercase=True)
test_dataset = PosTagProsaDataset(data['test'], tokenizer, lowercase=True)

train_loader = PosTagDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=True)  
valid_loader = PosTagDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=False)  
test_loader = PosTagDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=False)

In [44]:
optimizer = optim.Adam(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
model = model.cuda()

## Train and Evaluate Model

In [38]:
max_norm = 10
n_epochs = 10

for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = pos_tag_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))
    
    scheduler.step()

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = pos_tag_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = pos_tag_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.8431 LR:0.00004000: 100%|██████████| 840/840 [01:19<00:00, 10.62it/s]


(Epoch 1) TRAIN LOSS:0.8431 ACC:0.90 F1:0.78 REC:0.74 PRE:0.85 LR:0.00004000


VALID LOSS:0.4065 ACC:0.95 F1:0.91 REC:0.91 PRE:0.90: 100%|██████████| 105/105 [00:07<00:00, 14.19it/s]


(Epoch 1) VALID LOSS:0.4065 ACC:0.95 F1:0.91 REC:0.91 PRE:0.90


(Epoch 2) TRAIN LOSS:0.2981 LR:0.00003600: 100%|██████████| 840/840 [01:19<00:00, 10.55it/s]


(Epoch 2) TRAIN LOSS:0.2981 ACC:0.96 F1:0.93 REC:0.91 PRE:0.96 LR:0.00003600


VALID LOSS:0.2442 ACC:0.96 F1:0.95 REC:0.96 PRE:0.94: 100%|██████████| 105/105 [00:07<00:00, 14.32it/s]


(Epoch 2) VALID LOSS:0.2442 ACC:0.96 F1:0.95 REC:0.96 PRE:0.94


(Epoch 3) TRAIN LOSS:0.1670 LR:0.00003240: 100%|██████████| 840/840 [01:19<00:00, 10.54it/s]


(Epoch 3) TRAIN LOSS:0.1670 ACC:0.98 F1:0.96 REC:0.96 PRE:0.97 LR:0.00003240


VALID LOSS:0.1951 ACC:0.96 F1:0.94 REC:0.95 PRE:0.96: 100%|██████████| 105/105 [00:07<00:00, 14.23it/s]


(Epoch 3) VALID LOSS:0.1951 ACC:0.96 F1:0.94 REC:0.95 PRE:0.96


(Epoch 4) TRAIN LOSS:0.1094 LR:0.00002916: 100%|██████████| 840/840 [01:19<00:00, 10.59it/s]


(Epoch 4) TRAIN LOSS:0.1094 ACC:0.98 F1:0.97 REC:0.97 PRE:0.98 LR:0.00002916


VALID LOSS:0.1667 ACC:0.97 F1:0.97 REC:0.96 PRE:0.97: 100%|██████████| 105/105 [00:07<00:00, 14.30it/s]


(Epoch 4) VALID LOSS:0.1667 ACC:0.97 F1:0.97 REC:0.96 PRE:0.97


(Epoch 5) TRAIN LOSS:0.0759 LR:0.00002624: 100%|██████████| 840/840 [01:19<00:00, 10.56it/s]


(Epoch 5) TRAIN LOSS:0.0759 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98 LR:0.00002624


VALID LOSS:0.1639 ACC:0.97 F1:0.96 REC:0.97 PRE:0.96: 100%|██████████| 105/105 [00:07<00:00, 14.20it/s]


(Epoch 5) VALID LOSS:0.1639 ACC:0.97 F1:0.96 REC:0.97 PRE:0.96


(Epoch 6) TRAIN LOSS:0.0532 LR:0.00002362: 100%|██████████| 840/840 [01:19<00:00, 10.55it/s]


(Epoch 6) TRAIN LOSS:0.0532 ACC:0.99 F1:0.99 REC:0.98 PRE:0.99 LR:0.00002362


VALID LOSS:0.1726 ACC:0.97 F1:0.97 REC:0.97 PRE:0.96: 100%|██████████| 105/105 [00:07<00:00, 14.10it/s]


(Epoch 6) VALID LOSS:0.1726 ACC:0.97 F1:0.97 REC:0.97 PRE:0.96


(Epoch 7) TRAIN LOSS:0.0408 LR:0.00002126: 100%|██████████| 840/840 [01:19<00:00, 10.55it/s]


(Epoch 7) TRAIN LOSS:0.0408 ACC:0.99 F1:0.99 REC:0.98 PRE:0.99 LR:0.00002126


VALID LOSS:0.1727 ACC:0.97 F1:0.96 REC:0.97 PRE:0.95: 100%|██████████| 105/105 [00:07<00:00, 14.30it/s]


(Epoch 7) VALID LOSS:0.1727 ACC:0.97 F1:0.96 REC:0.97 PRE:0.95


(Epoch 8) TRAIN LOSS:0.0304 LR:0.00001913: 100%|██████████| 840/840 [01:19<00:00, 10.59it/s]


(Epoch 8) TRAIN LOSS:0.0304 ACC:1.00 F1:0.99 REC:0.99 PRE:0.99 LR:0.00001913


VALID LOSS:0.1725 ACC:0.97 F1:0.96 REC:0.97 PRE:0.95: 100%|██████████| 105/105 [00:07<00:00, 14.10it/s]


(Epoch 8) VALID LOSS:0.1725 ACC:0.97 F1:0.96 REC:0.97 PRE:0.95


(Epoch 9) TRAIN LOSS:0.0229 LR:0.00001722: 100%|██████████| 840/840 [01:19<00:00, 10.59it/s]


(Epoch 9) TRAIN LOSS:0.0229 ACC:1.00 F1:0.99 REC:0.99 PRE:1.00 LR:0.00001722


VALID LOSS:0.1817 ACC:0.97 F1:0.96 REC:0.97 PRE:0.96: 100%|██████████| 105/105 [00:07<00:00, 14.22it/s]


(Epoch 9) VALID LOSS:0.1817 ACC:0.97 F1:0.96 REC:0.97 PRE:0.96


(Epoch 10) TRAIN LOSS:0.0198 LR:0.00001550: 100%|██████████| 840/840 [01:19<00:00, 10.58it/s]


(Epoch 10) TRAIN LOSS:0.0198 ACC:1.00 F1:0.99 REC:0.99 PRE:1.00 LR:0.00001550


VALID LOSS:0.1932 ACC:0.97 F1:0.96 REC:0.98 PRE:0.96: 100%|██████████| 105/105 [00:07<00:00, 14.25it/s]

(Epoch 10) VALID LOSS:0.1932 ACC:0.97 F1:0.96 REC:0.98 PRE:0.96





In [41]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):  
    batch_seq = batch_data[-1]        
    loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

    # Calculate evaluation metrics
    list_hyp += batch_hyp
    list_label += batch_label

metrics = pos_tag_metrics_fn(list_hyp, list_label)
print(metrics_to_string(metrics))

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df) 

100%|██████████| 105/105 [00:03<00:00, 28.52it/s]

ACC:0.97 F1:0.95 REC:0.95 PRE:0.96
     index                                              label
0        0  [B-NNO, B-NNO, B-NNP, B-VBI, B-PPO, B-NNP, B-S...
1        1  [B-NNO, B-VBP, B-PPO, B-NNO, B-SYM, B-NNO, B-N...
2        2  [B-SYM, B-NNO, B-PPO, B-NNO, B-VBI, B-PPO, B-N...
3        3  [B-SYM, B-PRN, B-NNO, B-ART, B-NNO, B-NNO, B-S...
4        4  [B-KUA, B-KUA, B-NNO, B-NNO, B-ADV, B-ADJ, B-C...
..     ...                                                ...
835    835  [B-NNP, B-ADV, B-VBT, B-NNO, B-NNP, B-PPO, B-V...
836    836  [B-SYM, B-CCN, B-VBP, B-NNO, B-VBT, B-NNO, B-N...
837    837  [B-NNP, B-NNP, B-NNO, B-ART, B-ADJ, B-VBP, B-P...
838    838  [B-NNP, B-SYM, B-NNP, B-SYM, B-NNP, B-NNO, B-N...
839    839  [B-NNO, B-NNO, B-NNP, B-SYM, B-NNP, B-SYM, B-N...

[840 rows x 2 columns]





In [40]:
torch.save(model.state_dict(), 'postagger_indobert.pth')

# Use Pos Tagger Model for Verb Extraction

## Load and Tokenize IndoSum Data

In [6]:
indosum_data = load_dataset("maryantocinn/indosum", trust_remote_code=True)

In [None]:
tokenized_indosum = []
sentence_data = []
for i in range (len(indosum_data['train'])):
    sentences = indosum_data['train'][i]['document'].split('. ')
    for j in range (len(sentences)):
        tokens = re.findall(r'\w+|[^\w\s]', sentences[j].lower())
        tokens.append('.')
        pos_tags = ['B-NNP' for token in range (len(tokens))]
        tokenized_indosum.append({'tokens': tokens, 'pos_tags': pos_tags})
        sentence_data.append({'sentence': sentences[j].lower(), 'article_id': i, 'sentence_id': j})
sentence_data[:5]

[{'sentence': 'ketua mpr zulkifli hasan menyesalkan kisruh yang terjadi antara pelaku sarana transportasi online dan tradisional',
  'article_id': 0,
  'sentence_id': 0},
 {'sentence': 'zulkifli menyarankan adanya pertemuan bersama antara pemerintah, pelaku transportasi online dan transportasi tradisional demi meredam kisruh yang masih belum terselesaikan',
  'article_id': 0,
  'sentence_id': 1},
 {'sentence': 'zulkifli menilai aturan yang dikeluarkan pemerintah seharusnya tidak hanya membahas tarif tapi juga mekanisme yang dapat menguntungkan semua pihak, baik pelaku transportasi online maupun tradisional',
  'article_id': 0,
  'sentence_id': 2},
 {'sentence': '" tidak hanya tarif tapi apa saja harus diatur',
  'article_id': 0,
  'sentence_id': 3},
 {'sentence': 'dipanggil keduanya untuk berbicara masing-masing, musyawarah, duduk bareng kemudian dibuat aturan yang saling menguntungkan',
  'article_id': 0,
  'sentence_id': 4}]

In [121]:
tokenized_indosum.pop(253116)
sentence_data.pop(253116)

{'sentence': 'penulis: wisnu nova wistowo \n                    \n                                             editor: jalu wisnu wirajati \n                                        \n                                         sumber: \n                                             bbc \n                                        \n                                    \n            \n            \n           \n                 topik: \n                                \n                                              \n                             paul pogba \n                        \n                                              \n                             manchester united \n                        \n                                              \n                             premier league \n                        \n                                    \n            \n        \n\n        \n                \n        \n            \n                \n                     komentar \n         

In [141]:
test_indosum = PosTagProsaDataset(tokenized_indosum, tokenizer, lowercase=True)
test_indosum_loader = PosTagDataLoader(dataset=test_indosum, max_seq_len=512, batch_size=8, shuffle=False)

## Load Model and Extract Verb

In [37]:
model.load_state_dict(torch.load('postagger_indobert.pth'))

  model.load_state_dict(torch.load('postagger_indobert.pth'))


<All keys matched successfully>

In [None]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_indosum_loader, leave=True, total=len(test_indosum_loader))
for i, batch_data in enumerate(pbar):  
    batch_seq = batch_data[-1]        
    loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

    # Calculate evaluation metrics
    list_hyp += batch_hyp
    list_label += batch_label

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('indosum_postags_train.csv', index=False)

print(df) 

100%|██████████| 1643/1643 [00:57<00:00, 28.56it/s]


       index                                              label
0          0  [B-NNO, B-NNP, B-NNP, B-NNP, B-VBT, B-ADJ, B-P...
1          1  [B-NNP, B-VBT, B-NNO, B-NNO, B-VBI, B-PPO, B-N...
2          2  [B-NNP, B-VBT, B-NNO, B-PRR, B-VBP, B-NNO, B-A...
3          3  [B-SYM, B-NEG, B-ADV, B-NNO, B-CCN, B-PRI, B-A...
4          4  [B-VBP, B-NUM, B-PPO, B-VBI, B-PRN, B-SYM, B-P...
...      ...                                                ...
13139  13139  [B-PPO, B-NNO, B-NUM, B-VBT, B-NNO, B-NNP, B-P...
13140  13140  [B-SYM, B-CSN, B-SYM, B-PRN, B-VBP, B-VBT, B-N...
13141  13141  [B-NNP, B-NNP, B-PAR, B-VBI, B-VBP, B-KUA, B-N...
13142  13142  [B-SYM, B-CCN, B-PAR, B-PRN, B-ADV, B-VBT, B-P...
13143  13143         [B-SYM, B-NNP, B-SYM, B-NNP, B-SYM, B-SYM]

[13144 rows x 2 columns]


## Save Output

In [143]:
sentence_postag = []
for i in range (len(sentence_data)):
    for j in range (len(df.loc[i]['label'])):
        if w2i[df.loc[i]['label'][j]] in [4,6,9,10,25]:
            sentence_postag.append({'sentence': sentence_data[i]['sentence'], 'article_id': sentence_data[i]['article_id'], 'sentence_id': sentence_data[i]['sentence_id'], 'verb': tokenized_indosum[i]['tokens'][j]})
sentence_postag[:5]

[{'sentence': 'ketua mpr zulkifli hasan menyesalkan kisruh yang terjadi antara pelaku sarana transportasi online dan tradisional',
  'article_id': 0,
  'sentence_id': 0,
  'verb': 'menyesalkan'},
 {'sentence': 'ketua mpr zulkifli hasan menyesalkan kisruh yang terjadi antara pelaku sarana transportasi online dan tradisional',
  'article_id': 0,
  'sentence_id': 0,
  'verb': 'terjadi'},
 {'sentence': 'zulkifli menyarankan adanya pertemuan bersama antara pemerintah, pelaku transportasi online dan transportasi tradisional demi meredam kisruh yang masih belum terselesaikan',
  'article_id': 0,
  'sentence_id': 1,
  'verb': 'menyarankan'},
 {'sentence': 'zulkifli menyarankan adanya pertemuan bersama antara pemerintah, pelaku transportasi online dan transportasi tradisional demi meredam kisruh yang masih belum terselesaikan',
  'article_id': 0,
  'sentence_id': 1,
  'verb': 'bersama'},
 {'sentence': 'zulkifli menyarankan adanya pertemuan bersama antara pemerintah, pelaku transportasi online d

In [144]:
verb_df = pd.DataFrame(sentence_postag)
verb_df.head()

Unnamed: 0,sentence,article_id,sentence_id,verb
0,ketua mpr zulkifli hasan menyesalkan kisruh ya...,0,0,menyesalkan
1,ketua mpr zulkifli hasan menyesalkan kisruh ya...,0,0,terjadi
2,zulkifli menyarankan adanya pertemuan bersama ...,0,1,menyarankan
3,zulkifli menyarankan adanya pertemuan bersama ...,0,1,bersama
4,zulkifli menyarankan adanya pertemuan bersama ...,0,1,meredam


In [None]:
verb_df.to_csv('indosum_verb_train.csv', index=False)