In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder used below is reachable.
# NOTE(review): `_mount` is underscore-prefixed (private by Python convention);
# confirm whether the public `drive.mount` was intended here.
drive._mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/NLP/project_nlp

/content/drive/MyDrive/NLP/project_nlp


### Environment

#### Install dependencies

In [None]:
!pip install transformers seqeval[gpu] -q
!pip install fairseq -q
!pip install fastBPE -q

[K     |████████████████████████████████| 3.4 MB 10.0 MB/s 
[K     |████████████████████████████████| 43 kB 1.4 MB/s 
[K     |████████████████████████████████| 61 kB 451 kB/s 
[K     |████████████████████████████████| 596 kB 51.1 MB/s 
[K     |████████████████████████████████| 895 kB 50.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 26.7 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.7 MB 8.4 MB/s 
[K     |████████████████████████████████| 145 kB 50.9 MB/s 
[K     |████████████████████████████████| 90 kB 8.7 MB/s 
[K     |████████████████████████████████| 74 kB 2.8 MB/s 
[K     |████████████████████████████████| 112 kB 52.8 MB/s 
[?25h  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Building wheel for fastBPE (setup.py) ... [?25l[?25hdone


#### Import libs and check environment

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import re

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
from transformers import RobertaModel, RobertaConfig, BertPreTrainedModel, RobertaForTokenClassification
from transformers.modeling_outputs  import TokenClassifierOutput

from torch.utils.data import TensorDataset

import seqeval
from seqeval.metrics import classification_report, f1_score

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


### Dataset

#### Read dataframe

In [None]:
# Load the token-level annotations, drop the stale index column and rename the
# columns to the "Sentence # / Word / Tag" layout used by the rest of the notebook.
data = (
    pd.read_csv("./data/seq_tag/tokens_labeled_no_whitelist_col_price.csv", encoding='utf-8')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'sentence': 'Sentence #', 'tokens': 'Word', 'tag': 'Tag'})
)
# Sentence ids are shifted to start at 1 and turned into a readable string key.
data['Sentence #'] = data['Sentence #'].apply(lambda x: f'Sentence: {int(x+1)}')
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,combo,O
1,Sentence: 1,3,O
2,Sentence: 1,cái,O
3,Sentence: 1,giao,O
4,Sentence: 1,có,O


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89860 entries, 0 to 89859
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentence #  89860 non-null  object
 1   Word        89860 non-null  object
 2   Tag         89860 non-null  object
dtypes: object(3)
memory usage: 2.1+ MB


In [None]:
data.count()

Sentence #    89860
Word          89860
Tag           89860
dtype: int64

In [None]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 5


O        86762
B-COL     1145
B-PRI      985
I-PRI      496
I-COL      472
Name: Tag, dtype: int64

In [None]:
# Aggregate token counts per entity type by merging the B- and I- variants of each tag.
tags = {}
for tag, count in frequencies.items():
    if tag == "O":
        continue
    entity = tag[2:5]  # strip the "B-" / "I-" prefix
    tags[entity] = tags.get(entity, 0) + count

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('COL', 1617), ('PRI', 1481)]


In [None]:
# Number the tags in order of first appearance and build both lookup directions.
ids_to_labels = dict(enumerate(data.Tag.unique()))
labels_to_ids = {label: idx for idx, label in ids_to_labels.items()}
labels_to_ids

{'B-COL': 1, 'B-PRI': 3, 'I-COL': 2, 'I-PRI': 4, 'O': 0}

In [None]:
# Forward-fill missing values from the last non-NaN value above them.
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')`, whose `method`
# argument is deprecated in recent pandas releases.
data = data.ffill()
data.head(20)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,combo,O
1,Sentence: 1,3,O
2,Sentence: 1,cái,O
3,Sentence: 1,giao,O
4,Sentence: 1,có,O
5,Sentence: 1,1,O
6,Sentence: 1,cái,O
7,Sentence: 1,",",O
8,Sentence: 1,thành_ra,O
9,Sentence: 1,đặt,O


In [None]:
# let's create a new column called "sentence" which groups the words by sentence 
data['sentence'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
data['word_labels'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,Sentence #,Word,Tag,sentence,word_labels
0,Sentence: 1,combo,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Sentence: 1,3,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,Sentence: 1,cái,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Sentence: 1,giao,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,Sentence: 1,có,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data

Unnamed: 0,sentence,word_labels
0,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,mình mua áo có cổ màu trắng lại ship tới cho m...,"O,O,O,O,O,B-COL,I-COL,O,O,O,O,O,O,O,O,B-COL,I-..."
2,giao sai hàng . tôi muốn trả hàng . đặt be đậm...,"O,O,O,O,O,O,O,O,O,O,B-COL,I-COL,O,B-COL,I-COL"
3,sản_xuất việt_nam nhưng thấy in chữ trung_quốc...,"O,O,O,O,B-COL,I-COL,O,O,O,O,O,O,O"
4,mình đặt áo sơ_mi trắng dài tay mà shop giao c...,"O,O,O,O,B-COL,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
...,...,...
3652,chất_lượng sản_phẩm giống mô tả.giao hàng nhan...,"O,O,O,O,O,O,O,O"
3653,"hài_lòng vô_cùng , giao nhanh , nhân_viên giao...","O,O,O,O,O,O,O,O,O,O,O,O,O,O"
3654,sản_phẩm ổn . giá phải_chăng . thật_sự là nhận...,"O,O,O,B-PRI,B-PRI,O,O,O,O,O,O,O,O,O,O,O,O,O"
3655,hàng đẹp chuẩn chất_lượng . nếu áo có đai ngan...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


#### Expand the labels with subword based tokenizer

*PhoBERT sử dụng RDRSegmenter của VnCoreNLP (đã thực hiện trong dataframe) để tách từ cho dữ liệu đầu vào trước khi qua BPE encoder.*

In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

# fastBPE is constructed from an argparse-style namespace, so a tiny parser is
# built here only to carry the path of the BPE codes file.
parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="./PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
# parse_known_args() returns unrecognised command-line arguments in `unknown`
# instead of raising on them.
args, unknown = parser.parse_known_args()
# NOTE(review): presumably fastBPE reads `args.bpe_codes` — confirm against the fairseq source.
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("./PhoBERT_base_transformers/dict.txt")

In [None]:
labels_to_ids['X'] = -100

In [None]:
labels_to_ids

{'B-COL': 1, 'B-PRI': 3, 'I-COL': 2, 'I-PRI': 4, 'O': 0, 'X': -100}

In [None]:
from tqdm import tqdm
def convert_lines(lines, tags, vocab, bpe, max_sequence_length=256):
    """
    Encode sentences and their word-level tags into fixed-length arrays.

    Every sentence is BPE-encoded, wrapped in ``<s>`` / ``</s>`` and mapped to
    vocabulary ids. The word-level tags are expanded to subword level: the
    first piece of a word keeps the word's tag, and each piece that follows a
    ``@@`` continuation piece receives the ``'X'`` tag. The tags are converted
    to ids through the notebook-level ``labels_to_ids`` mapping.

    lines: list of input sentences
    tags: list of comma-separated tag strings, one per sentence
    vocab: dictionary used to encode the subwords into ids
    bpe: BPE encoder exposing ``encode(sentence)``
    max_sequence_length: length every row is truncated / padded to

    Returns a tuple ``(input_ids, label_ids, attention_mask)`` of int32 arrays,
    each of shape ``(len(lines), max_sequence_length)``.
    """
    # Ids of the special tokens: eos (end of sentence) and padding.
    eos_id = 2
    pad_id = 1
    # Label id for positions that carry no tag (first token, padding, truncated tail).
    ignore_id = -100

    shape = (len(lines), max_sequence_length)
    outputs = np.zeros(shape, dtype=np.int32)
    outputs_labels = np.zeros(shape, dtype=np.int32)
    outputs_attention_mask = np.zeros(shape, dtype=np.int32)

    for idx, row in tqdm(enumerate(lines), total=len(lines)):
        # Encode the sentence into subwords with byte pair encoding (BPE)
        subwords = '<s> ' + bpe.encode(row) + ' </s>'
        input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()

        tag_list = ['O'] + tags[idx].split(',') + ['O']
        # Walk the pieces by position: looking positions up with list.index()
        # would return the first occurrence of a repeated piece and insert the
        # 'X' label at the wrong place.
        for piece_idx, piece in enumerate(subwords.split()):
            if '@@' in piece:
                tag_list.insert(piece_idx + 1, 'X')
        labels = [labels_to_ids[label] for label in tag_list]

        # Truncate the input when it is longer than max_sequence_length,
        # otherwise pad it up to that length.
        truncated = len(input_ids) > max_sequence_length
        if truncated:
            input_ids = input_ids[:max_sequence_length]
            input_ids[-1] = eos_id
        else:
            input_ids = input_ids + [pad_id] * (max_sequence_length - len(input_ids))

        # Labels are sized independently of the ids so that a row always has
        # exactly max_sequence_length entries.
        labels = labels[:max_sequence_length]
        labels = labels + [ignore_id] * (max_sequence_length - len(labels))
        if truncated:
            labels[-1] = ignore_id

        labels[0] = ignore_id
        outputs[idx, :] = np.array(input_ids)
        outputs_labels[idx, :] = np.array(labels)
        outputs_attention_mask[idx, np.array(input_ids) != pad_id] = 1

    return outputs, outputs_labels, outputs_attention_mask

# One hand-labelled sentence used to sanity-check convert_lines().
lines = ['mua được giá tốt lại được freeship mừng rơi nước_mắt đối_với tỉnh_lẻ thì tiền ship là 1 trở_ngại sản_phẩm quá ổn cảm_ơn shop cảm_ơn tiki this is english sentences'] 
tags = ['O,O,B-PRI,O,O,O,B-PRI,O,O,O,O,O,O,B-PRI,I-PRI,O,O,O,O,O,O,O,O,O,O,O,O,O,O']

ids, labels, masks = convert_lines(lines, tags, vocab, bpe)
# `.shape` is printed (not `.size`, which is the total element count) to match the "shape" caption.
print('input_ids tensor encode: {}\n, shape: {}\n'.format(ids[:10], ids.shape))
print('label_ids tensor encode: {}\n, shape: {}\n'.format(labels[:10], labels.shape))
print('masks tensor encode: {}\n, shape: {}\n'.format(masks[:10], masks.shape))
# print('x1 tensor decode: ', phoBERT_cls.decode(torch.tensor(x1))[:103])

100%|██████████| 1/1 [00:00<00:00, 15.26it/s]

input_ids tensor encode: [[    0   188    11   133   167    44    11 18288  2438 56679  2766   891
   2396   190 27159    54   123 16132     8    99  5769   265   204  4752
   2321  9405  2321  2081  5418 22304  2573 15601  2455 14641  1302  6502
  26442     2     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1




In [None]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [None]:
X, Y_label, Y_mask = convert_lines(data.sentence.values, data.word_labels.values, vocab, bpe, max_sequence_length=MAX_LEN)
print('X shape: ', X.shape)
print('Y label shape', Y_label.shape)
print('Y mask shape', Y_mask.shape)

100%|██████████| 3657/3657 [00:01<00:00, 2593.87it/s]

X shape:  (3657, 256)
Y label shape (3657, 256)
Y mask shape (3657, 256)





In [None]:
import pickle

def _save_pkl(path, obj):
    """Serialize `obj` with pickle and write it to the file at `path`."""
    with open(path, 'wb') as handle:
        handle.write(pickle.dumps(obj))

def _load_pkl(path):
    """Read the file at `path` and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

_save_pkl('./data/processed/X_col.pkl', X)
_save_pkl('./data/processed/Y_label_col.pkl', Y_label)
_save_pkl('./data/processed/Y_mask_col.pkl', Y_mask)

In [None]:
X = _load_pkl('./data/processed/X_col.pkl')
Y_label = _load_pkl('./data/processed/Y_label_col.pkl')
Y_mask = _load_pkl('./data/processed/Y_mask_col.pkl')

print('length of X: ', len(X))
print('length of y: ', len(Y_label))
print('length of y: ', len(Y_mask))

length of X:  3657
length of y:  3657
length of y:  3657


Let's have a look at the first training example:

Let's verify that the input ids and corresponding targets are correct:

In [None]:
def decode(tokens: torch.LongTensor, labels: torch.LongTensor):
    """
    Turn a 1-D tensor of token ids and its 1-D label tensor back into text and labels.

    A leading ``<s>`` id is dropped, the ids are split wherever two ``</s>``
    ids are adjacent, and each segment is converted to a string with the
    notebook-level ``vocab`` and ``bpe``. For the labels, every ``-100`` entry
    is removed and the last remaining entry of each segment is dropped.

    tokens: 1-D tensor of vocabulary ids
    labels: 1-D tensor of label ids

    Returns ``(sentence, labels)`` when a single segment is found, otherwise
    ``(list_of_sentences, list_of_label_arrays)``.
    """
    assert tokens.dim() == 1
    assert labels.dim() == 1
    tokens = tokens.numpy()
    # Was `labels - labels.numpy()`: a discarded subtraction instead of an assignment.
    labels = labels.numpy()
    if tokens[0] == vocab.bos():
        tokens = tokens[1:]  # remove <s>
    eos_mask = tokens == vocab.eos()
    doc_mask = eos_mask[1:] & eos_mask[:-1]
    split_points = doc_mask.nonzero()[0] + 1
    sentences = np.split(tokens, split_points)
    labels = np.split(labels, split_points)
    sentences = [
        bpe.decode(vocab.string(s)) for s in sentences
    ]
    labels = [np.delete(l, np.where(l == -100))[:-1] for l in labels]
    if len(sentences) == 1:
        return sentences[0], labels[0]
    return sentences, labels

In [None]:
test_idx = 54
sentences, labels = decode(torch.tensor(X[test_idx]), torch.tensor(Y_label[test_idx]))
for token, label in zip(sentences.split(), labels):
  print('{0:10}  {1:10} {2:10}'.format(token, label, ids_to_labels[int(label)]))

1
tôi                  0 O         
đặt                  0 O         
áo                   0 O         
bra                  0 O         
màu                  1 B-COL     
trắng                2 I-COL     
mà                   0 O         
shop                 0 O         
gửi                  0 O         
màu                  1 B-COL     
đen                  2 I-COL     
,                    0 O         
và                   0 O         
chất_lượng           0 O         
vải                  0 O         
và                   0 O         
mút                  0 O         
đệm                  0 O         
ngực                 0 O         
thì                  0 O         
quá                  0 O         
tệ                   0 O         
so                   0 O         
với                  0 O         
giá                  3 B-PRI     
tiền                 4 I-PRI     
của                  0 O         
sản_phẩm             0 O         
!                    0 O         


### Training

#### **Defining the model**

In [None]:
class argu():
    """Plain attribute container holding the paths and hyper-parameters used by the training cells."""
    def __init__(self):
        # self.train_path = './data/train.csv'
        # Files of the local PhoBERT checkpoint (dictionary, config, weights).
        self.dict_path = "./PhoBERT_base_transformers/dict.txt"
        self.config_path = "./PhoBERT_base_transformers/config.json"
        # self.rdrsegmenter_path = '/content/vncorenlp/VnCoreNLP-1.1.1.jar'
        self.pretrained_path = './PhoBERT_base_transformers/model.bin'
        self.max_sequence_length = 256
        # NOTE(review): the data loaders in this notebook are built from
        # TRAIN_BATCH_SIZE / VALID_BATCH_SIZE, not from this value.
        self.batch_size = 8
        self.accumulation_steps = 1
        self.epochs = 10
        self.seed = 69
        self.fold = 0
        self.lr= 1e-5
        self.ckpt_path = './checkpoints'
        self.bpe_codes = "./PhoBERT_base_transformers/bpe.codes"
# Rebinds `args`, replacing the argparse namespace created for fastBPE earlier in the notebook.
args = argu()

In [None]:
# Build the model configuration from the local PhoBERT config file.
# `classifier_dropout` is a dropout probability. It was previously set to `True`,
# which is handed to nn.Dropout as p=1 and zeroes the classifier input during training.
config = RobertaConfig.from_pretrained(
    args.config_path,
    output_hidden_states=True,
    return_dict=True,
    num_labels=5,
    classifier_dropout=0.1,
)

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


In [None]:
train_size = 0.8
def train_test_split(data, train_size):
    """
    Split the rows of `data` into a train part and a test part.

    A fraction `train_size` of the rows is drawn with a fixed random state
    (200), so arrays with the same number of rows are split at the same row
    positions. The remaining rows form the test part.

    Returns `(train_values, test_values)` as NumPy arrays.
    """
    frame = pd.DataFrame(data)
    train_part = frame.sample(frac=train_size, random_state=200)
    test_part = frame.drop(train_part.index)
    return (
        train_part.reset_index(drop=True).values,
        test_part.reset_index(drop=True).values,
    )

X_train, X_test = train_test_split(X, train_size)
Y_label_train, Y_label_test = train_test_split(Y_label, train_size)
Y_mask_train, Y_mask_test = train_test_split(Y_mask, train_size)

In [None]:
train_dataset = TensorDataset(torch.tensor(X_train,dtype=torch.long), 
                              torch.tensor(Y_label_train,dtype=torch.long))

valid_dataset = TensorDataset(torch.tensor(X_test,dtype=torch.long), 
                              torch.tensor(Y_label_test,dtype=torch.long))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [None]:
Y_label_train.flatten().shape

(749056,)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
y_org = Y_label_train
class_weight = compute_class_weight(class_weight='balanced', classes = np.array([0,1,2,3,4]), y=y_org.flatten()[y_org.flatten()>=0])

In [None]:
class_weight

array([ 0.20681367, 16.49712974, 41.89212828, 18.54064516, 38.01322751])

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class Roberta_SeqTag(BertPreTrainedModel):
    """
    RoBERTa encoder followed by dropout and a linear token-classification head.

    The loss is a cross-entropy weighted by the notebook-level ``class_weight``
    array; positions where ``attention_mask`` is not 1 are excluded from it.
    """
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super(Roberta_SeqTag, self).__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        classifier_dropout = getattr(config, "classifier_dropout", None)
        if isinstance(classifier_dropout, bool):
            # A bool is a switch, not a probability: nn.Dropout would read True as p=1
            # and zero every activation during training.
            classifier_dropout = config.hidden_dropout_prob if classifier_dropout else 0.0
        elif classifier_dropout is None:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Compute per-token logits and, when ``labels`` is given, the weighted loss.

        Returns a ``TokenClassifierOutput`` when ``return_dict`` resolves to
        True, otherwise a tuple ``(loss, logits, ...)`` with ``loss`` omitted
        when no labels are passed.
        """
        # Resolve None against the config so the return type follows the configuration.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # Build the weights on the same device as the logits instead of relying on a global.
            weights = torch.as_tensor(class_weight, dtype=torch.float, device=logits.device)
            loss_fct = nn.CrossEntropyLoss(weight=weights)
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            if attention_mask is not None:
                # Only keep active parts of the loss: masked positions get the ignore index.
                active_loss = attention_mask.view(-1) == 1
                flat_labels = torch.where(
                    active_loss, flat_labels, torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
            loss = loss_fct(flat_logits, flat_labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [None]:
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification

# Load the local PhoBERT weights into the class-weighted tagger defined above.
# The size of the classification head comes from `config.num_labels`.
model = Roberta_SeqTag.from_pretrained(args.pretrained_path, config=config)
model.to(device)

# Alternative baselines (no class weighting). They are kept commented out: the
# AutoModel variant used to run unconditionally with num_labels=9, which replaced
# the model above and did not match the 5 tags of this dataset.
# model = RobertaForTokenClassification.from_pretrained(args.pretrained_path, config=config)
# model = AutoModelForTokenClassification.from_pretrained("vinai/phobert-base", num_labels=config.num_labels)
# model.to(device)

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch, verbose = False):
    """
    Run one training pass over ``train_loader`` and print the epoch loss and F1.

    Uses the notebook-level ``model``, ``optimizer``, ``train_loader``,
    ``device``, ``MAX_GRAD_NORM`` and ``ids_to_labels``.

    epoch: index of the current epoch (accepted for the caller; not used in the computation)
    verbose: when True, print the running mean loss every 100 steps
    """
    tr_loss, nb_tr_steps = 0.0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(train_loader):
        ids, labels = batch
        ids = ids.to(device)
        labels = labels.to(device)
        mask = ids != 1  # 1 is the padding id

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        tr_logits = outputs[1]

        # Backward pass. Gradients are clipped after backward() and before step(),
        # so the clipping acts on the gradients of this batch.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100==0 and verbose:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # Collect targets and predictions at the active (non -100) positions only.
        flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
        flattened_predictions = torch.argmax(tr_logits.detach(), dim=-1).view(-1)
        active = flattened_targets != -100
        # .tolist() moves each batch to the host once instead of keeping one tensor per token.
        tr_labels.extend(flattened_targets[active].tolist())
        tr_preds.extend(flattened_predictions[active].tolist())

    if nb_tr_steps == 0:
        print("Training loader is empty; nothing to train on.")
        return

    epoch_loss = tr_loss / nb_tr_steps

    labels = [ids_to_labels[label_id] for label_id in tr_labels]
    predictions = [ids_to_labels[label_id] for label_id in tr_preds]
    f1 = seqeval.metrics.f1_score([labels], [predictions])

    print(f"Training loss epoch: {epoch_loss}", f"Training F1 epoch: {f1}")

In [None]:
def valid(model, test_loader, verbose=False):
    """
    Evaluate ``model`` on ``test_loader`` and print the mean loss and F1.

    Uses the notebook-level ``device`` and ``ids_to_labels``.

    model: the token-classification model to evaluate
    test_loader: iterable of ``(input_ids, labels)`` batches
    verbose: when True, print the running mean loss every 100 steps

    Returns ``(labels, predictions)``: two flat lists of tag strings covering
    every position whose label is not -100.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, nb_eval_steps = 0.0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(test_loader):
            ids, labels = batch
            ids = ids.to(device)
            labels = labels.to(device)
            mask = ids != 1  # 1 is the padding id

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            eval_logits = outputs[1]
            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100==0 and verbose:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Collect targets and predictions at the active (non -100) positions only.
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            flattened_predictions = torch.argmax(eval_logits, dim=-1).view(-1)
            active = flattened_targets != -100
            # .tolist() moves each batch to the host once instead of keeping one tensor per token.
            eval_labels.extend(flattened_targets[active].tolist())
            eval_preds.extend(flattened_predictions[active].tolist())

    if nb_eval_steps == 0:
        print("Validation loader is empty; nothing to evaluate.")
        return [], []

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    f1 = seqeval.metrics.f1_score([labels], [predictions])
    print(f"Validation Loss: {eval_loss}", f"Validation F1: {f1}")

    return labels, predictions

And let's train the model!

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

In [None]:
# Creating optimizer and lr schedulers
param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# NOTE(review): the step count is derived from args.epochs / args.batch_size, while the
# loaders use TRAIN_BATCH_SIZE and the training loop runs EPOCHS — confirm which is intended.
num_train_optimization_steps = int(args.epochs*len(train_dataset)/args.batch_size/args.accumulation_steps)
# NOTE(review): lr is hard-coded to 5e-5 here; LEARNING_RATE and args.lr are both 1e-5.
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
# NOTE(review): no `scheduler.step()` call appears in the training cells visible in this
# notebook, so neither schedule below changes the learning rate as written — confirm.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
scheduler0 = get_constant_schedule(optimizer)  # PyTorch scheduler

# Freeze every parameter of the encoder (`model.roberta`); the training loop
# re-enables them once the first epoch is done.
tsfm = model.roberta
for child in tsfm.children():
    for param in child.parameters():
        # Printed when a parameter was already frozen before this cell ran.
        if not param.requires_grad:
            print("whoopsies")
        param.requires_grad = False
frozen = True

In [None]:
EPOCHS = 50
import time

for epoch in range(EPOCHS):
    if epoch > 0 and frozen:
        for child in tsfm.children():
            for param in child.parameters():
                param.requires_grad = True
        frozen = False
        del scheduler0
        torch.cuda.empty_cache()
    st = time.time()
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
    labels, predictions = valid(model, valid_loader)
    
    print('Time: ',time.time() - st)

Training epoch: 1
Training loss epoch: 1.601916303074425 Training F1 epoch: 0.0
Validation Loss: 1.5978773343758506 Validation F1: 0.013726581339269153
Time:  42.21763038635254
Training epoch: 2
Training loss epoch: 1.5942021004163502 Training F1 epoch: 0.0
Validation Loss: 1.5868488765153728 Validation F1: 0.014876978828914741
Time:  118.25689506530762
Training epoch: 3
Training loss epoch: 1.5893093720485605 Training F1 epoch: 0.0
Validation Loss: 1.5777369985163539 Validation F1: 0.014016385351890238
Time:  132.99086046218872
Training epoch: 4
Training loss epoch: 1.5851695277000386 Training F1 epoch: 0.0
Validation Loss: 1.5694774444637403 Validation F1: 0.013484523444682806
Time:  118.61079287528992
Training epoch: 5
Training loss epoch: 1.5793420068227528 Training F1 epoch: 0.0
Validation Loss: 1.561345940079194 Validation F1: 0.012985630654603512
Time:  117.08675813674927
Training epoch: 6
Training loss epoch: 1.5778584133406155 Training F1 epoch: 0.0
Validation Loss: 1.55410355

KeyboardInterrupt: ignored

In [None]:
for item in zip(labels, predictions):
    print(item)

In [None]:
from seqeval.metrics import classification_report
# Compare the gold tags with the model predictions. Passing `labels` twice would
# always report a perfect score.
print(classification_report([labels], [predictions]))

In [None]:
# `data` now holds one row per sentence (`sentence`, `word_labels`), so the tag
# frequencies are rebuilt by splitting the comma-separated label strings.
frequencies = data.word_labels.str.split(',').explode().value_counts()
print("Number of tags: {}".format(len(frequencies)))
frequencies

#### **Inference**

The fun part is when we can quickly test the model on new, unseen sentences. 
Here, we use the prediction of the **first word piece of every word** (which is how the model was trained). 

*In other words, the code below does not handle the case where the predictions for different word pieces of the same word disagree.*

In [None]:
ids_to_labels[-100] = 'X'

In [None]:
# Reuse the hand-labelled demo sentence (`lines` / `tags`) so that the
# predictions can be printed next to the gold tags.
ids, labels, masks = convert_lines(lines, tags, vocab, bpe)

# move to gpu
ids = torch.tensor(ids, dtype=torch.long).to(device)
mask = torch.tensor(masks, dtype=torch.long).to(device)
flattened_labels = torch.tensor(labels, dtype=torch.long).view(-1)

# forward pass (evaluation mode, no gradients needed for inference)
model.eval()
with torch.no_grad():
    outputs = model(ids, attention_mask=mask)
logits = outputs[0]

# shape (batch_size*seq_len,) - predictions at the token level
flattened_predictions = torch.argmax(logits, dim=-1).view(-1).cpu()
# Blank out the positions labelled -100 (<s>, subword continuations, padding) so that
# decode() drops them and the predictions stay aligned with the words and gold tags.
flattened_predictions = flattened_predictions.masked_fill(flattened_labels == -100, -100)

sentences, predictions = decode(ids.squeeze(0).cpu(), flattened_predictions)
_, labels = decode(ids.squeeze(0).cpu(), flattened_labels)
for token, pred, label in zip(sentences.split(), predictions, labels):
  print('{0:10}  {1:10} {2:10} {3:10}'.format(token, pred, ids_to_labels[int(pred)], ids_to_labels[int(label)]))


# tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
# token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
# wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

# prediction = []
# for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
#   #only predictions on first word pieces are important
#   if mapping[0] == 0 and mapping[1] != 0:
#     prediction.append(token_pred[1])
#   else:
#     continue

# print(sentence.split())
# print(prediction)

#### **Saving the model for future use**

Finally, let's save the vocabulary (.txt) file, model weights (.bin) and the model's configuration (.json) to a directory, so that both the tokenizer and model can be re-loaded using the `from_pretrained()` class method.


In [None]:
import os
import shutil

directory = "./model"
os.makedirs(directory, exist_ok=True)

# No `tokenizer` object is defined in this notebook: text is encoded with the fairseq
# `vocab` dictionary and the fastBPE codes, so those two files are saved instead.
vocab.save(os.path.join(directory, "dict.txt"))
shutil.copy(args.bpe_codes, os.path.join(directory, "bpe.codes"))
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')

## Legacy

The following code blocks were used during the development of this notebook, but are not included anymore.

Labelling subwords: each word's tag is copied to all of its BPE pieces.

In [None]:
# text = '<s> '+'Hôm_nay trời nóng quá nên tôi ở nhà viết Viblo!' +' </s>'
text = 'nên xem_lại tư_duy bán hàng , bán thua hàng_chợ thì cần xem_lại , đặt giá gốc cao xong hạ saleoff lừa à !' 
subwords = bpe.encode(text)
subwords = '<s> '+subwords +' </s>'
input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
print(subwords)
print(input_ids)
subword_idx = [subwords.split().index(word) for word in subwords.split() if '@@' in word]
print(subword_idx)
tag_list = 'O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-PRI,I-PRI,I-PRI,O,B-PRI,B-PRI,O,O,O,O'.split(',')
for i, idx in enumerate(subword_idx):
    orig_idx = idx - i
    tag_list.insert(orig_idx+1, tag_list[orig_idx])
for pair in zip(subwords.split(), tag_list):
    print(pair)

In [None]:
def prepare_sentence(sentence, tokenizer, maxlen):
    """
    Tokenize one sentence into fixed-length id and attention-mask tensors.

    The tokens are wrapped in "[CLS]" / "[SEP]", cut to `maxlen` when longer
    and filled with "[PAD]" otherwise. The mask is 1 for every token that is
    not "[PAD]".

    Returns a dict with the long tensors 'ids' and 'mask'.
    """
    pad_token = '[PAD]'

    # tokenize and add the special tokens
    tokens = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]

    # truncate or pad to exactly maxlen tokens
    missing = maxlen - len(tokens)
    if missing < 0:
        tokens = tokens[:maxlen]
    else:
        tokens = tokens + [pad_token] * missing

    # attention mask and vocabulary ids
    attention = [0 if tok == pad_token else 1 for tok in tokens]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    result = {}
    result['ids'] = torch.tensor(token_ids, dtype=torch.long)
    result['mask'] = torch.tensor(attention, dtype=torch.long)
    return result

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Tokenize a sentence word by word and repeat each word's label per piece.

    Tokenizing one whitespace-separated word at a time makes the mapping from
    word labels to word pieces trivial: every piece produced for a word gets
    that word's label.

    Args:
        sentence: Text whose words are separated by whitespace.
        text_labels: Comma-separated labels, one per word. Words and labels
            are paired with ``zip``, so surplus items on either side are
            silently ignored.
        tokenizer: Object providing ``tokenize(word)``.

    Returns:
        ``(pieces, piece_labels)`` - two lists of equal length.
    """
    words = sentence.strip().split()
    word_labels = text_labels.split(",")

    pieces = []
    piece_labels = []
    for current_word, current_label in zip(words, word_labels):
        word_pieces = tokenizer.tokenize(current_word)
        pieces.extend(word_pieces)
        # one copy of the word's label for each piece it was broken into
        piece_labels.extend(current_label for _ in range(len(word_pieces)))

    return pieces, piece_labels

In [None]:
class dataset(Dataset):
    """Dataset yielding fixed-length BERT-style inputs with per-token label ids.

    Each item is built from one row of ``dataframe``: the ``sentence`` column
    holds whitespace-separated words and the ``word_labels`` column the
    matching comma-separated labels. Rows are looked up with
    ``dataframe.<column>[index]``.

    Relies on the notebook-level helper ``tokenize_and_preserve_labels`` and
    the notebook-level mapping ``labels_to_ids``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the data source, the tokenizer and the target sequence length."""
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the 'ids', 'mask' and 'targets' tensors for row ``index``.

        All three tensors are ``torch.long``. 'mask' is 1 for every
        non-"[PAD]" token and 0 for padding.
        """
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: cut over-long inputs BEFORE adding the special tokens so the
        # closing [SEP] (and its label) is never sliced away
        maxlen = self.max_len
        room = max(maxlen - 2, 0)
        tokenized_sentence = tokenized_sentence[:room]
        labels = labels[:room]

        # step 3: add special tokens (and corresponding labels).
        # The [SEP] label is appended at the very end; list.insert(-1, ...)
        # would place it before the last real token and shift that token's
        # label onto [SEP].
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]

        # step 4: padding (the slices only matter for degenerate max_len < 2)
        tokenized_sentence = tokenized_sentence[:maxlen]
        labels = labels[:maxlen]
        tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))
        # padding positions are labelled "O"; they can only be told apart
        # from real "O" tokens through 'mask'
        labels = labels + ["O"] * (maxlen - len(labels))

        # step 5: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 6: convert tokens and labels to ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [labels_to_ids[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows captured from the dataframe at construction time."""
        return self.len

In [None]:
# Inspect how a sentence that is already split into words gets tokenized, and
# which offsets are reported for every resulting token.
# Relies on `tokenizer` being defined by an earlier cell.
# NOTE(review): `return_offsets_mapping` presumably requires a fast tokenizer - confirm.
sentence = "this is a test @huggingface".strip().split()

# `is_split_into_words=True` marks the list as ONE sentence made of words.
# The former keyword `is_pretokenized` is deprecated; where it is not
# honoured, the list is interpreted as a batch of five separate sentences.
inputs = tokenizer(sentence, is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
token_offsets = inputs["offset_mapping"]
print(tokens)
print(token_offsets)

In [None]:
# Tokenize a single raw word and show its tokens next to the offset mapping
# the tokenizer reports for them. Relies on `tokenizer` from an earlier cell.
word = "@huggingface"

inputs = tokenizer(
    word,
    return_offsets_mapping=True,
    padding='max_length',
    truncation=True,
)
token_offsets = inputs["offset_mapping"]
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
print(tokens, token_offsets, sep='\n')

In [None]:
# NOTE(review): this fragment appears to have been copied out of an evaluation
# loop. It relies on `mask`, `flattened_targets` and `flattened_predictions`
# already being defined, and was dedented so the cell parses on its own.

# now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
targets = torch.masked_select(flattened_targets, active_accuracy)
predictions = torch.masked_select(flattened_predictions, active_accuracy)

In [None]:
# Sanity check: loss of the model on a single training batch.
# Relies on `train_loader`, `model` and `device` from earlier cells.
input_ids, labels = next(iter(train_loader))
# Printed explicitly: as a bare expression in the middle of the cell the
# shapes were evaluated and then silently discarded.
print(input_ids.shape, labels.shape)

input_ids = input_ids.to(device)
labels = labels.to(device)

# NOTE(review): positions whose token id equals 1 are masked out, i.e. id 1 is
# presumably the padding id of the vocabulary in use - confirm.
outputs = model(input_ids, attention_mask=(input_ids != 1), labels=labels)
# NOTE(review): the first output element is taken to be the loss - confirm for this model class.
initial_loss = outputs[0]
initial_loss