In [1]:
from google.colab import drive
drive._mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers seqeval[gpu] -q
!pip install fairseq -q
!pip install fastBPE -q
!pip install pytorch-crf -q

[K     |████████████████████████████████| 3.4 MB 5.5 MB/s 
[K     |████████████████████████████████| 43 kB 1.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 36.7 MB/s 
[K     |████████████████████████████████| 895 kB 46.0 MB/s 
[K     |████████████████████████████████| 596 kB 45.8 MB/s 
[K     |████████████████████████████████| 61 kB 380 kB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.7 MB 5.3 MB/s 
[K     |████████████████████████████████| 145 kB 48.4 MB/s 
[K     |████████████████████████████████| 90 kB 8.0 MB/s 
[K     |████████████████████████████████| 112 kB 53.6 MB/s 
[K     |████████████████████████████████| 74 kB 2.4 MB/s 
[?25h  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Building wheel for fastBPE (setup.py) ... [?25l[?25hdone


In [3]:
%cd /content/drive/MyDrive/NLP/project_nlp

/content/drive/MyDrive/NLP/project_nlp


In [16]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchcrf import CRF

from transformers import RobertaConfig, RobertaPreTrainedModel, RobertaModel, RobertaForTokenClassification, RobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
from transformers.modeling_outputs import TokenClassifierOutput, SequenceClassifierOutput

import seqeval
from seqeval.metrics import classification_report, f1_score

import pandas as pd
import numpy as np
import argparse
import time
import tqdm

from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary

from sklearn.utils.class_weight import compute_class_weight

from src.dataset import *

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
BATCHSIZE_TRAIN = 8
BATCHSIZE_VAL = 4
LEARNING_RATE = 5e-5
MAX_LEN = 128
NUM_EPOCH = 20
SEED = 42
NUM_CLASS = 5

## Data


In [7]:
data = pd.read_csv('./data/joint/data_joint.csv')
data

Unnamed: 0.1,Unnamed: 0,sentence,word_labels,label
0,0,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-DES,B-DE...",-1
1,1,mình mua áo có cổ màu trắng lại ship tới cho m...,"O,O,B-DES,B-DES,I-DES,B-DES,I-DES,O,O,O,O,O,B-...",-1
2,2,giao sai hàng . tôi muốn trả hàng . đặt be đậm...,"O,O,O,O,O,O,O,O,O,O,B-DES,I-DES,O,B-DES,I-DES",-1
3,3,sản_xuất việt_nam nhưng thấy in chữ trung_quốc...,"O,O,O,O,B-DES,I-DES,O,O,O,O,O,O,O",-1
4,4,mình đặt áo sơ_mi trắng dài tay mà shop giao c...,"O,O,B-DES,I-DES,B-DES,B-DES,I-DES,O,O,O,O,O,O,...",-1
...,...,...,...,...
3652,3652,chất_lượng sản_phẩm giống mô tả . giao hàng nh...,"O,O,O,O,O,O,O,O,O,O",1
3653,3653,"hài_lòng vô_cùng , giao nhanh , nhân_viên giao...","O,O,O,O,O,O,O,O,O,O,O,B-DES,O,B-DES",1
3654,3654,sản_phẩm ổn . giá phải_chăng . thật_sự là nhận...,"O,O,O,B-PRI,B-PRI,O,O,O,O,O,O,O,O,O,O,O,O,O",1
3655,3655,hàng đẹp chuẩn chất_lượng . nếu áo có đai ngan...,"O,B-DES,O,O,O,O,B-DES,O,O,O,B-DES,O,B-DES,O,O,O",1


In [8]:
parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="./PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("./PhoBERT_base_transformers/dict.txt")

labels_to_ids = {'B-DES': 1, 'B-PRI': 3, 'I-DES': 2, 'I-PRI': 4, 'O': 0, 'X': -100}
ids_to_labels = {0: 'O', 1: 'B-DES', 2: 'I-DES', 3:'B-PRI', 4:'I-PRI'}

X, Y_label, Y_mask = convert_lines(
    data.sentence.values, 
    data.word_labels.values, 
    vocab, 
    bpe, 
    labels_to_ids, 
    max_sequence_length=MAX_LEN)

print('X shape: ', X.shape)
print('Y label shape', Y_label.shape)
print('Y mask shape', Y_mask.shape)

train_size = 0.8
def train_test_split(data, train_size):
    X_df = pd.DataFrame(data)
    X_train = X_df.sample(frac = train_size, random_state=200)
    X_test = X_df.drop(X_train.index).reset_index(drop=True)
    X_train = X_train.reset_index(drop=True)
    return X_train.values, X_test .values

X_train, X_test = train_test_split(X, train_size)
Y_label_train, Y_label_test = train_test_split(Y_label, train_size)
Y_mask_train, Y_mask_test = train_test_split(Y_mask, train_size)

class_weight = compute_class_weight(
    class_weight='balanced', 
    classes = np.array([0,1,2,3,4]), 
    y=Y_label_train.flatten()[Y_label_train.flatten()>=0])

print(class_weight)

100%|██████████| 3657/3657 [00:01<00:00, 2483.53it/s]


X shape:  (3657, 128)
Y label shape (3657, 128)
Y mask shape (3657, 128)
[ 0.24328616  1.55779008  6.52840909 15.97682503 31.33636364]


In [9]:
train_dataset = TensorDataset(
    torch.tensor(X_train,dtype=torch.long), 
    torch.tensor(Y_label_train,dtype=torch.long)
    )
valid_dataset = TensorDataset(
    torch.tensor(X_test,dtype=torch.long), 
    torch.tensor(Y_label_test,dtype=torch.long)
    )

train_loader = torch.utils.data.DataLoader(
    train_dataset, 
    batch_size=BATCHSIZE_TRAIN, 
    shuffle=True
    )
valid_loader = torch.utils.data.DataLoader(
    valid_dataset, 
    batch_size=BATCHSIZE_VAL, 
    shuffle=False
    )

In [10]:
class argu():
    def __init__(self):
        self.dict_path = "./PhoBERT_base_transformers/dict.txt"
        self.config_path = "./PhoBERT_base_transformers/config.json"
        self.max_sequence_length = MAX_LEN
        self.accumulation_steps = 1
        self.epochs = NUM_EPOCH
        self.seed = SEED
        self.bpe_codes = "./PhoBERT_base_transformers/bpe.codes"
args = argu()

config = RobertaConfig.from_pretrained(
    args.config_path,
    output_hidden_states=True,
    return_dict=True,
    num_labels=NUM_CLASS,
    pad_token_id = 1,
    bos_token_id = 0,
    eos_token_id = 2,
    attention_probs_dropout_prob = 0.1,
    classifier_dropout=0.5,
    gradient_checkpointing=False,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    layer_norm_eps=1e-05,
    max_position_embeddings=258,
    model_type="roberta",
    num_attention_heads=12,
    num_hidden_layers=12,
    position_embedding_type="absolute",
    tokenizer_class="PhobertTokenizer",
    transformers_version="4.15.0",
    type_vocab_size=1,
    use_cache=True,
    vocab_size=64001
)

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


In [None]:
def train(epoch, verbose = False):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(train_loader):
        ids, labels = batch
        ids = ids.to(device)
        labels = labels.to(device)
        mask = ids!=1

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        
        tr_logits = outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0 and verbose:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        try: 
            scheduler0.step()
        except:
            scheduler.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    labels = [ids_to_labels[id.item()] for id in tr_labels]
    predictions = [ids_to_labels[id.item()] for id in tr_preds]
    f1 = seqeval.metrics.f1_score([labels], [predictions])

    print(f"Training loss epoch: {epoch_loss}", f"Training F1 epoch: {f1}")

def valid(model, test_loader, verbose=False):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(test_loader):
            
            ids, labels = batch
            ids = ids.to(device)
            labels = labels.to(device)
            mask = ids!=1 

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            eval_logits = outputs[1]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0 and verbose:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    f1 = seqeval.metrics.f1_score([labels], [predictions])
    print(f"Validation Loss: {eval_loss}", f"Validation F1: {f1}")

    return labels, predictions, f1

def train_crf(epoch, verbose = False):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    model.train()
    
    for idx, batch in enumerate(train_loader):
        ids, labels = batch
        ids = ids.to(device)
        labels = labels.to(device)
        mask = ids!=1
        
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
                
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0 and verbose:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        flattened_predictions = outputs[1].view(-1)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        try: 
            scheduler0.step()
        except:
            scheduler.step()
            
    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    labels = [ids_to_labels[id.item()] for id in tr_labels]
    predictions = [ids_to_labels[id.item()] for id in tr_preds]
    f1 = seqeval.metrics.f1_score([labels], [predictions])

    print(f"Training loss epoch: {epoch_loss}", f"Training F1 epoch: {f1}")

def valid_crf(model, test_loader, verbose=False):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(test_loader):
            
            ids, labels = batch
            ids = ids.to(device)
            labels = labels.to(device)
            mask = ids!=1 

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0 and verbose:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            flattened_predictions = outputs[1].view(-1)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
            active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    # eval_accuracy = eval_accuracy / nb_eval_steps
    f1 = seqeval.metrics.f1_score([labels], [predictions])
    print(f"Validation Loss: {eval_loss}", f"Validation F1: {f1}")

    return labels, predictions, f1

## Default Roberta sequence classification


In [18]:
class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class RobertaForSequenceClassification(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
checkpoint_path = './checkpoints/RobertaCNN.pth'     
model = RobertaForSequenceClassification('vinai/phobert-base', config=config)
model.cuda()

TypeError: ignored

## Phobert + LSTM + CRF + Tiki dataset

https://github.com/hemingkx/CLUENER2020/tree/main/BERT-Softmax

In [None]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train_modified(epoch, verbose = False):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    labels_sent_ar = []
    y_preds = None
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(train_loader):
        ids, labels, labels_sent = batch
        ids = ids.to(device)
        labels = labels.to(device)
        labels_sent = labels_sent.to(device)
        mask = ids!=1
        
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels, labels_sent=labels_sent)
        loss = outputs[0]
        # predictions = outputs[1]
        # print('prediction:', predictions, predictions.shape)

        y_pred = outputs[2].squeeze().detach().cpu().numpy()
        y_preds = np.atleast_1d(y_pred) if y_preds is None else np.concatenate([y_preds, np.atleast_1d(y_pred)])
                
        # tr_logits = outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0 and verbose:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        # active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        # flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        flattened_predictions = outputs[1].view(-1)
        
        # only compute accuracy at active labels
        # print(labels)
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        # print(flattened_predictions.shape, active_accuracy.shape)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)
        labels_sent_ar.extend(labels_sent.cpu().detach().numpy())

        # tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        # tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    labels = [ids_to_labels[id.item()] for id in tr_labels]
    # print(tr_preds, tr_preds.shape)
    predictions = [ids_to_labels[id.item()] for id in tr_preds]
    # labels_se = [ids_to_labels[id.item()] for id in labels_sent_ar]
    # print(val_preds.shape)
    # print(len(labels_sent_ar))
    # print(labels_sent_ar.shape)
    f1_token = seqeval.metrics.f1_score([labels], [predictions])
    val_preds = np.argmax(softmax(y_preds), axis=1)
    # for item in zip(labels_sent_ar, val_preds):
    #   print(item)
    f1_sent = f1_score(labels_sent_ar, val_preds, average='micro')

    print(f"Training loss epoch: {epoch_loss}", f"Training F1 epoch: {f1_token}")
    print(f"Training loss epoch: {epoch_loss}", f"Training F1 epoch: {f1_sent}")

In [None]:
def valid_modified(model, test_loader, verbose=False):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    labels_sent_ar = []
    val_preds = None
    
    with torch.no_grad():
        for idx, batch in enumerate(test_loader):
            
            ids, labels, labels_sent = batch
            ids = ids.to(device)
            labels = labels.to(device)
            labels_sent = labels_sent.to(device)
            mask = ids!=1 

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels, labels_sent=labels_sent)
            loss = outputs[0]
            # eval_logits = outputs[1]

            y_pred = outputs[2].squeeze().detach().cpu().numpy()
            # print(y_pred.shape)
            val_preds = np.atleast_1d(y_pred) if val_preds is None else np.concatenate([val_preds, np.atleast_1d(y_pred)])

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0 and verbose:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            # active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            # flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            flattened_predictions = outputs[1].view(-1)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
            active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            labels_sent_ar.extend(labels_sent.cpu().detach().numpy())
            
            # tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            # eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    # labels_se = [ids_to_labels[id.item()] for id in labels_sent_ar]
    
    eval_loss = eval_loss / nb_eval_steps
    # eval_accuracy = eval_accuracy / nb_eval_steps
    f1_token = seqeval.metrics.f1_score([labels], [predictions])
    val_preds = np.argmax(softmax(val_preds), axis=1)
    # print(val_preds.shape)
    # print(len(labels_sent_ar))
    # print(labels_sent_ar.shape)
    f1_sent = f1_score(labels_sent_ar, val_preds, average='micro')

    print(f"Validation Loss: {eval_loss}", f"Validation F1: {f1_token}")
    print(f"Validation Loss: {eval_loss}", f"Validation F1: {f1_sent}")

    return labels, predictions

In [None]:
import torch.nn.functional as F
EMBEDDING_SIZE = 768
NUM_FILTERS = 10

class RobertaLSTMCRF(RobertaPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.bilstm = nn.LSTM(config.hidden_size, (config.hidden_size) // 2, dropout=0.1, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.crf = CRF(num_tags=self.num_labels, batch_first=True)
        self.post_init()

        window_sizes=(1,2,3,5)

        self.convs = nn.ModuleList([nn.Conv2d(1, NUM_FILTERS, [window_size, EMBEDDING_SIZE], padding=(window_size - 1, 0)) for window_size in window_sizes])
       
        self.fc = nn.Linear(NUM_FILTERS * len(window_sizes), 3)

        # self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        labels_sent=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
        )
        # print(type(outputs[0]))
        # print(outputs[0].shape)
        # print(len(outputs[1]))

        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, hc = self.bilstm(sequence_output)
        logits_lstm = self.classifier(lstm_output)

        cls_output = torch.cat((outputs[1][-1][:,0, ...],outputs[1][-2][:,0, ...], outputs[1][-3][:,0, ...], outputs[1][-4][:,0, ...]),-1)

        xs = []
        for conv in self.convs:
            x2 = F.relu(conv(outputs[0].unsqueeze(1)))
            x2 = torch.squeeze(x2, -1)
            x2 = F.max_pool1d(x2, x2.size(2))
            xs.append(x2)
        x = torch.cat(xs, 2)       
        x = x.view(x.size(0), -1)
        logits = self.fc(x)

        loss_cls = nn.CrossEntropyLoss()
        # print(logits.shape)
        # print(labels_sent.shape)
        loss_cls =  loss_cls(logits, labels_sent.squeeze(1))

        loss = None
        if labels is not None:
            labels[labels==-100] = 0
            log_likelihood, tags = self.crf(logits_lstm, labels), self.crf.decode(logits_lstm)
            loss = 0 - log_likelihood
        else:
            tags = self.crf.decode(logits_lstm)
        tags = torch.Tensor(tags)
        tags = tags.to(device)

        if not return_dict:
            output = (tags,) + outputs[2:]
            return (((0.05 * loss)+loss_cls,) + output) if loss is not None else output

        return (0.05 * loss)+loss_cls, tags, logits

In [None]:
model = RobertaLSTMCRF.from_pretrained('vinai/phobert-base', num_labels=5)
model.cuda()

In [None]:
# Creating optimizer and lr schedulers
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = int(args.epochs*len(train_dataset)/args.batch_size/args.accumulation_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
scheduler0 = get_constant_schedule(optimizer)  # PyTorch scheduler

tsfm = model.roberta
for child in tsfm.children():
    for param in child.parameters():
        if not param.requires_grad:
            print("whoopsies")
        param.requires_grad = False
frozen = True

EPOCHS = 10
MAX_GRAD_NORM = 1

import time

for epoch in range(EPOCHS):
    if epoch > 0 and frozen:
        for child in tsfm.children():
            for param in child.parameters():
                param.requires_grad = True
        frozen = False
        del scheduler0
        torch.cuda.empty_cache()
    st = time.time()
    print(f"Training epoch: {epoch + 1}")
    train_modified(epoch)
    labels, predictions = valid_modified(model, valid_loader)
    
    print('Time: ',time.time() - st)

Training epoch: 1
Training loss epoch: 3.469600145206895 Training F1 epoch: 0.3770815692915609
Training loss epoch: 3.469600145206895 Training F1 epoch: 0.5923392612859097
Validation Loss: 1.5113120947798637 Validation F1: 0.6185177409195884
Validation Loss: 1.5113120947798637 Validation F1: 0.7109589041095891
Time:  302.3198871612549
Training epoch: 2
Training loss epoch: 1.5101695066437675 Training F1 epoch: 0.7605388516109205
Training loss epoch: 1.5101695066437675 Training F1 epoch: 0.7055403556771546
Validation Loss: 0.8301835503888457 Validation F1: 0.8645441961637335
Validation Loss: 0.8301835503888457 Validation F1: 0.7575342465753426
Time:  377.74051785469055
Training epoch: 3
Training loss epoch: 0.8452368989566446 Training F1 epoch: 0.8881943021110883
Training loss epoch: 0.8452368989566446 Training F1 epoch: 0.8033515731874145
Validation Loss: 0.7875342660417703 Validation F1: 0.897861283230062
Validation Loss: 0.7875342660417703 Validation F1: 0.7698630136986301
Time:  374

In [None]:
a = torch.rand((2,3), dtype=torch.float32)
b = torch.tensor([[1],[2]], dtype=torch.long)
l = nn.CrossEntropyLoss()


In [None]:
l(a,b.squeeze())

In [None]:
b.squeeze().shape

torch.Size([2])

In [None]:
torch.rand((1,1)).squeeze(1).shape

torch.Size([1])

# Temp


In [None]:
from vncorenlp import VnCoreNLP

# To perform word segmentation, POS tagging, NER and then dependency parsing
annotator = VnCoreNLP("/content/VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg,pos,ner,parse", max_heap_size='-Xmx2g') 
 
    
# Input 
text = "Ông Nguyễn Khắc Chúc  đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây."

# To perform word segmentation, POS tagging, NER and then dependency parsing
annotated_text = annotator.annotate(text)

# To perform word segmentation only
word_segmented_text = annotator.tokenize(text) 


In [None]:
# Input 
text = "thô lỗ"

# To perform word segmentation, POS tagging, NER and then dependency parsing
annotated_text = annotator.annotate(text)

# To perform word segmentation only
word_segmented_text = annotator.tokenize(text) 
word_segmented_text

[['thô_lỗ']]