In [None]:
from google.colab import drive
drive._mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/NLP/project_nlp

/content/drive/MyDrive/NLP/project_nlp


### Environment

#### Install dependency

In [None]:
!pip install transformers seqeval[gpu] -q
!pip install fairseq -q
!pip install fastBPE -q
!pip install pytorch-crf



#### Import libs and check environment

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import re

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
from transformers import RobertaModel, RobertaConfig, BertPreTrainedModel, RobertaForTokenClassification
from transformers.modeling_outputs  import TokenClassifierOutput

from torch.utils.data import TensorDataset

import seqeval
from seqeval.metrics import classification_report, f1_score

from torchcrf import CRF

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


### Dataset

#### Read dataframe

In [None]:
data = pd.read_csv("./data/seq_tag/tokens_labeled_no_whitelist.csv", encoding='utf-8')
data.drop(columns=['Unnamed: 0'], inplace=True)
data.rename(columns={'sentence': 'Sentence #', 'tokens': 'Word', 'tag': 'Tag'}, inplace=True)
data['Sentence #'] = data['Sentence #'].apply(lambda x: f'Sentence: {int(x+1)}')
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,combo,O
1,Sentence: 1,3,O
2,Sentence: 1,cái,O
3,Sentence: 1,giao,O
4,Sentence: 1,có,O


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89860 entries, 0 to 89859
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentence #  89860 non-null  object
 1   Word        89860 non-null  object
 2   Tag         89860 non-null  object
dtypes: object(3)
memory usage: 2.1+ MB


In [None]:
data.count()

Sentence #    89860
Word          89860
Tag           89860
dtype: int64

In [None]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 5


O        74100
B-DES    11522
I-DES     2757
B-PRI      985
I-PRI      496
Name: Tag, dtype: int64

In [None]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('DES', 14279), ('PRI', 1481)]


In [None]:
labels_to_ids = {k: v for v, k in enumerate(data.Tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(data.Tag.unique())}
labels_to_ids

{'B-DES': 1, 'B-PRI': 3, 'I-DES': 2, 'I-PRI': 4, 'O': 0}

In [None]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
data = data.fillna(method='ffill')
data.head(20)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,combo,O
1,Sentence: 1,3,O
2,Sentence: 1,cái,O
3,Sentence: 1,giao,O
4,Sentence: 1,có,O
5,Sentence: 1,1,O
6,Sentence: 1,cái,O
7,Sentence: 1,",",O
8,Sentence: 1,thành_ra,O
9,Sentence: 1,đặt,O


In [None]:

# let's create a new column called "sentence" which groups the words by sentence 
data['sentence'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
data['word_labels'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,Sentence #,Word,Tag,sentence,word_labels
0,Sentence: 1,combo,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-DES,B-DE..."
1,Sentence: 1,3,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-DES,B-DE..."
2,Sentence: 1,cái,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-DES,B-DE..."
3,Sentence: 1,giao,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-DES,B-DE..."
4,Sentence: 1,có,O,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-DES,B-DE..."


In [None]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data

Unnamed: 0,sentence,word_labels
0,"combo 3 cái giao có 1 cái , thành_ra đặt 6 cái...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-DES,B-DE..."
1,mình mua áo có cổ màu trắng lại ship tới cho m...,"O,O,B-DES,B-DES,I-DES,B-DES,I-DES,O,O,O,O,O,B-..."
2,giao sai hàng . tôi muốn trả hàng . đặt be đậm...,"O,O,O,O,O,O,O,O,O,O,B-DES,I-DES,O,B-DES,I-DES"
3,sản_xuất việt_nam nhưng thấy in chữ trung_quốc...,"O,O,O,O,B-DES,I-DES,O,O,O,O,O,O,O"
4,mình đặt áo sơ_mi trắng dài tay mà shop giao c...,"O,O,B-DES,I-DES,B-DES,B-DES,I-DES,O,O,O,O,O,O,..."
...,...,...
3652,chất_lượng sản_phẩm giống mô tả.giao hàng nhan...,"O,O,O,O,O,O,O,O"
3653,"hài_lòng vô_cùng , giao nhanh , nhân_viên giao...","O,O,O,O,O,O,O,O,O,O,O,B-DES,O,B-DES"
3654,sản_phẩm ổn . giá phải_chăng . thật_sự là nhận...,"O,O,O,B-PRI,B-PRI,O,O,O,O,O,O,O,O,O,O,O,O,O"
3655,hàng đẹp chuẩn chất_lượng . nếu áo có đai ngan...,"O,B-DES,O,O,O,O,B-DES,O,O,O,B-DES,O,B-DES,O,O,O"


#### Expand the labels with subword based tokenizer

*PhoBERT sử dụng RDRSegmenter của VnCoreNLP (đã thực hiện trong dataframe) để tách từ cho dữ liệu đầu vào trước khi qua BPE encoder.*

In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="./PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("./PhoBERT_base_transformers/dict.txt")

In [None]:
labels_to_ids['X'] = -100

In [None]:
labels_to_ids

{'B-DES': 1, 'B-PRI': 3, 'I-DES': 2, 'I-PRI': 4, 'O': 0, 'X': -100}

In [None]:
from tqdm import tqdm
def convert_lines(lines, tags, vocab, bpe, max_sequence_length=256):
    """
    lines: list các văn bản input
    tags: list các chuỗi tag
    vocab: từ điển dùng để encoding subwords
    bpe: 
    """
    # Index của các token cls (đầu câu), eos (cuối câu), padding (padding token)
    outputs = np.zeros((len(lines), max_sequence_length), dtype=np.int32) # --> shape (number_lines, max_seq_len)
    outputs_labels = np.zeros((len(lines), max_sequence_length), dtype=np.int32)
    outputs_attention_mask = np.zeros((len(lines), max_sequence_length), dtype=np.int32)
    # Index của các token cls (đầu câu), eos (cuối câu), padding (padding token)
    cls_id = 0
    eos_id = 2
    pad_id = 1
    
    for idx, row in tqdm(enumerate(lines), total=len(lines)): 
        # Mã hóa subwords theo byte pair encoding(bpe)
        subwords = bpe.encode(row)
        subwords = '<s> '+ subwords +' </s>'
        input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
        
        tag_list = ['O'] + tags[idx].split(',') + ['O']
        subword_idx = [subwords.split().index(word) for word in subwords.split() if '@@' in word]
        for i, orig_idx in enumerate(subword_idx):
            tag_list.insert(orig_idx+1, 'X')
        # print(tag_list)
        labels = [labels_to_ids[label] for label in tag_list] 

        # Truncate input nếu độ dài vượt quá max_seq_len
        if len(input_ids) > max_sequence_length: 
            input_ids = input_ids[:max_sequence_length]
            input_ids[-1] = eos_id
            labels = labels[:max_sequence_length]
            labels[-1] = -100
        else:
        # Padding nếu độ dài câu chưa bằng max_seq_len
            input_ids = input_ids + [pad_id, ]*(max_sequence_length - len(input_ids))
            labels = labels + [-100, ]*(max_sequence_length - len(labels))
        
        labels[0] = -100
        # print(len(labels))
        labels[np.where(np.array(input_ids)==eos_id)[0][0]] = -100
        # print(np.where(np.array(input_ids)==eos_id)[0][0])
        # labels[input_ids==eos_id] = -100
        outputs[idx,:] = np.array(input_ids)
        outputs_labels[idx,:] = np.array(labels)
        outputs_attention_mask[idx, np.array(input_ids)!=pad_id] = 1

    return outputs, outputs_labels, outputs_attention_mask

lines = ['mua được giá tốt lại được freeship mừng rơi nước_mắt đối_với tỉnh_lẻ thì tiền ship là 1 trở_ngại sản_phẩm quá ổn cảm_ơn shop cảm_ơn tiki this is english sentences cảm_ơn'] 
tags = ['O,O,B-PRI,O,O,O,B-PRI,O,O,O,O,O,O,B-PRI,I-PRI,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O']

ids, labels, masks = convert_lines(lines, tags, vocab, bpe)
# print('input_ids tensor encode: {}\n, shape: {}\n'.format(ids[:10], ids.size))
# print('label_ids tensor encode: {}\n, shape: {}\n'.format(labels[:10], labels.size))
# print('masks tensor encode: {}\n, shape: {}\n'.format(masks[:10], masks.size))
# print('x1 tensor decode: ', phoBERT_cls.decode(torch.tensor(x1))[:103])

100%|██████████| 1/1 [00:00<00:00, 423.03it/s]


In [None]:
for item in zip(ids[0], labels[0], masks[0]):
    print(item)

(0, -100, 1)
(188, 0, 1)
(11, 0, 1)
(133, 3, 1)
(167, 0, 1)
(44, 0, 1)
(11, 0, 1)
(18288, 3, 1)
(2438, -100, 1)
(56679, -100, 1)
(2766, 0, 1)
(891, 0, 1)
(2396, 0, 1)
(190, 0, 1)
(27159, 0, 1)
(54, 0, 1)
(123, 3, 1)
(16132, 4, 1)
(8, 0, 1)
(99, 0, 1)
(5769, 0, 1)
(265, 0, 1)
(204, 0, 1)
(4752, 0, 1)
(2321, 0, 1)
(9405, 0, 1)
(2321, 0, 1)
(2081, 0, 1)
(5418, -100, 1)
(22304, 0, 1)
(2573, 0, 1)
(15601, 0, 1)
(2455, -100, 1)
(14641, -100, 1)
(1302, 0, 1)
(6502, -100, 1)
(26442, -100, 1)
(2321, 0, 1)
(2, -100, 1)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, -100, 0)
(1, 

In [None]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [None]:
X, Y_label, Y_mask = convert_lines(data.sentence.values, data.word_labels.values, vocab, bpe, max_sequence_length=MAX_LEN)
print('X shape: ', X.shape)
print('Y label shape', Y_label.shape)
print('Y mask shape', Y_mask.shape)

100%|██████████| 3657/3657 [00:01<00:00, 2933.63it/s]

X shape:  (3657, 128)
Y label shape (3657, 128)
Y mask shape (3657, 128)





In [None]:
import pickle

def _save_pkl(path, obj):
  with open(path, 'wb') as f:
    pickle.dump(obj, f)

def _load_pkl(path):
  with open(path, 'rb') as f:
    obj = pickle.load(f)
  return obj

_save_pkl('./data/processed/X.pkl', X)
_save_pkl('./data/processed/Y_label.pkl', Y_label)
_save_pkl('./data/processed/Y_mask.pkl', Y_mask)

In [None]:
X = _load_pkl('./data/processed/X.pkl')
Y_label = _load_pkl('./data/processed/Y_label.pkl')
Y_mask = _load_pkl('./data/processed/Y_mask.pkl')

print('length of X: ', len(X))
print('length of y: ', len(Y_label))
print('length of y: ', len(Y_mask))

length of X:  3657
length of y:  3657
length of y:  3657


Let's have a look at the first training example:

Let's verify that the input ids and corresponding targets are correct:

In [None]:
def decode(tokens: torch.LongTensor, labels: torch.LongTensor):
    print(tokens.dim())
    assert tokens.dim() == 1
    assert labels.dim() == 1
    tokens = tokens.numpy()
    labels - labels.numpy()
    if tokens[0] == vocab.bos():
        tokens = tokens[1:]  # remove <s>
    eos_mask = tokens == vocab.eos()
    doc_mask = eos_mask[1:] & eos_mask[:-1]
    sentences = np.split(tokens,  doc_mask.nonzero()[0] + 1)
    labels = np.split(labels, doc_mask.nonzero()[0] + 1)
    sentences = [
        bpe.decode(vocab.string(s)) for s in sentences
    ]
    labels = [np.delete(l, np.where(l == -100))[:-1] for l in labels]
    if len(sentences) == 1:
        return sentences[0], labels[0]
    return sentences, labels

In [None]:
test_idx = 0
sentences, labels = decode(torch.tensor(X[test_idx]), torch.tensor(Y_label[test_idx]))
for token, label in zip(sentences.split(), labels):
  print('{0:10}  {1:10} {2:10}'.format(token, label, ids_to_labels[int(label)]))

1
combo                0 O         
3                    0 O         
cái                  0 O         
giao                 0 O         
có                   0 O         
1                    0 O         
cái                  0 O         
,                    0 O         
thành_ra             0 O         
đặt                  0 O         
6                    0 O         
cái                  0 O         
nhận                 0 O         
được                 0 O         
4                    0 O         
,                    0 O         
hàng                 0 O         
thì                  0 O         
vải                  1 B-DES     
xấu                  1 B-DES     
giống                0 O         
vải                  1 B-DES     
áo_mưa               1 B-DES     
,                    0 O         
con                  0 O         
trai                 0 O         
chê                  0 O         
vứt                  0 O         
đi                   0 O         


### Training

#### **Defining the model**

In [None]:
class argu():
    def __init__(self):
        # self.train_path = './data/train.csv'
        self.dict_path = "./PhoBERT_base_transformers/dict.txt"
        self.config_path = "./PhoBERT_base_transformers/config.json"
        # self.rdrsegmenter_path = '/content/vncorenlp/VnCoreNLP-1.1.1.jar'
        self.pretrained_path = './PhoBERT_base_transformers/model.bin'
        self.max_sequence_length = 128
        self.batch_size = 8
        self.accumulation_steps = 1
        self.epochs = 10
        self.seed = 69
        self.fold = 0
        self.lr= 1e-3
        self.ckpt_path = './checkpoints'
        self.bpe_codes = "./PhoBERT_base_transformers/bpe.codes"
args = argu()

In [None]:
config = RobertaConfig.from_pretrained(
    args.config_path,
    output_hidden_states=True,
    return_dict=True,
    num_labels=5,
    classifier_dropout = True,
    pad_token_id = 1,
    bos_token_id = 0,
    eos_token_id = 2
)

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


In [None]:
train_size = 0.8
def train_test_split(data, train_size):
    X_df = pd.DataFrame(data)
    X_train = X_df.sample(frac = train_size, random_state=200)
    X_test = X_df.drop(X_train.index).reset_index(drop=True)
    X_train = X_train.reset_index(drop=True)
    return X_train.values, X_test .values

X_train, X_test = train_test_split(X, train_size)
Y_label_train, Y_label_test = train_test_split(Y_label, train_size)
Y_mask_train, Y_mask_test = train_test_split(Y_mask, train_size)

In [None]:
train_dataset = TensorDataset(torch.tensor(X_train,dtype=torch.long), 
                              torch.tensor(Y_label_train,dtype=torch.long))

valid_dataset = TensorDataset(torch.tensor(X_test,dtype=torch.long), 
                              torch.tensor(Y_label_test,dtype=torch.long))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
y_org = Y_label_train
class_weight = compute_class_weight(class_weight='balanced', classes = np.array([0,1,2,3,4]), y=y_org.flatten()[y_org.flatten()>=0])

In [None]:
class_weight

array([ 0.24253478,  1.55774552,  6.60394231, 17.83922078, 36.43554377])

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class Roberta_SeqTag(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"
    def __init__(self, config):
        super(Roberta_SeqTag, self).__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        classifier_dropout = (
                config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
            )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # self.lstm = nn.LSTM(config.hidden_size, 256, num_layers=1, bidirectional=True)
        # self.classifier = nn.Linear(256*2, config.num_labels)
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        # x, _ = self.lstm(sequence_output)
        # sequence_output = torch.tanh(x)
        logits = self.classifier(sequence_output)
        # print(logits.shape)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weight, dtype=torch.float).to(device))
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                # print(active_logits, active_labels)
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class BertNER(BertPreTrainedModel):
    def __init__(self, config):
        super(BertNER, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)

        self.init_weights()

    def forward(self, input_data, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, inputs_embeds=None, head_mask=None):
        input_ids, input_token_starts = input_data
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)
        sequence_output = outputs[0]

        # 去除[CLS]标签等位置，获得与label对齐的pre_label表示
        origin_sequence_output = [layer[starts.nonzero().squeeze(1)]
                                  for layer, starts in zip(sequence_output, input_token_starts)]
        # 将sequence_output的pred_label维度padding到最大长度
        padded_sequence_output = pad_sequence(origin_sequence_output, batch_first=True)
        # dropout pred_label的一部分feature
        padded_sequence_output = self.dropout(padded_sequence_output)
        # 得到判别值
        logits = self.classifier(padded_sequence_output)
        outputs = (logits,)
        if labels is not None:
            loss_mask = labels.gt(-1)
            loss = self.crf(logits, labels, loss_mask) * (-1)
            outputs = (loss,) + outputs

        # contain: (loss), scores
        return outputs



In [None]:

# model = Roberta_SeqTag.from_pretrained(args.pretrained_path, config=config)
# model.cuda()

# model = BertNER.from_pretrained(args.pretrained_path, config=config)
# model.cuda()


# model = RobertaForTokenClassification.from_pretrained(args.pretrained_path, config=config)
# model.cuda()

from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("vinai/phobert-base", num_labels=5)
model.cuda()

In [None]:
def check(ids, mask, labels):
    print(ids.shape, mask.shape, labels.shape)
    for id, mask, label in zip(ids, mask, labels):
        for item in zip(id, mask, label):
            print(item)
        # break

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch, verbose = False):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(train_loader):
        ids, labels = batch
        ids = ids.to(device)
        labels = labels.to(device)
        mask = ids!=1
        
        # check(ids, mask, labels)
        # break

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        
        tr_logits = outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0 and verbose:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        # print(active_logits)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # print(flattened_predictions)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    labels = [ids_to_labels[id.item()] for id in tr_labels]
    predictions = [ids_to_labels[id.item()] for id in tr_preds]
    f1 = seqeval.metrics.f1_score([labels], [predictions])

    print(f"Training loss epoch: {epoch_loss}", f"Training F1 epoch: {f1}")

In [None]:
def valid(model, test_loader, verbose=False):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(test_loader):
            
            ids, labels = batch
            ids = ids.to(device)
            labels = labels.to(device)
            mask = ids!=1 

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            eval_logits = outputs[1]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0 and verbose:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    f1 = seqeval.metrics.f1_score([labels], [predictions])
    print(f"Validation Loss: {eval_loss}", f"Validation F1: {f1}")

    return labels, predictions

And let's train the model!

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

In [None]:
# Creating optimizer and lr schedulers
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = int(args.epochs*len(train_dataset)/args.batch_size/args.accumulation_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
scheduler0 = get_constant_schedule(optimizer)  # PyTorch scheduler


In [None]:
tsfm = model.roberta
# tsfm = model.bert
for child in tsfm.children():
    for param in child.parameters():
        if not param.requires_grad:
            print("whoopsies")
        param.requires_grad = False
frozen = True

In [None]:
EPOCHS = 50
import time

for epoch in range(EPOCHS):
    if epoch > 0 and frozen:
        for child in tsfm.children():
            for param in child.parameters():
                param.requires_grad = True
        frozen = False
        del scheduler0
        torch.cuda.empty_cache()
    st = time.time()
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
    labels, predictions = valid(model, valid_loader)
    
    print('Time: ',time.time() - st)

Training epoch: 1
Training loss epoch: 0.5083288725852315 Training F1 epoch: 0.3209125475285171
Validation Loss: 0.3665421425766958 Validation F1: 0.5707162284678151
Time:  23.1031973361969
Training epoch: 2
Training loss epoch: 0.1407387124016993 Training F1 epoch: 0.8277699420153581
Validation Loss: 0.07579330769952773 Validation F1: 0.9144363341443633
Time:  72.79584550857544
Training epoch: 3
Training loss epoch: 0.05879130292651396 Training F1 epoch: 0.9288209834697815
Validation Loss: 0.0694106772027196 Validation F1: 0.9253974724826742
Time:  72.46054148674011
Training epoch: 4
Training loss epoch: 0.03974647820874561 Training F1 epoch: 0.9483019458500703
Validation Loss: 0.08786922149225634 Validation F1: 0.9119804400977995
Time:  72.0927665233612
Training epoch: 5
Training loss epoch: 0.023982190624452126 Training F1 epoch: 0.9653699942717284
Validation Loss: 0.06650996672552076 Validation F1: 0.9375127213515164
Time:  72.27134609222412
Training epoch: 6
Training loss epoch: 0

KeyboardInterrupt: ignored

In [None]:
print(f1_score([labels], [predictions]))
print(classification_report([labels], [predictions]))

### BiLSTM-CRF

In [None]:
class BertLstmCRF(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.bilstm = nn.LSTM(config.hidden_size, (config.hidden_size) // 2, dropout=0.1, batch_first=True,
                              bidirectional=True)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, hc = self.bilstm(sequence_output)
        logits = self.classifier(lstm_output)

        loss = None
        if labels is not None:
            # print(labels)
            labels[labels==-100] = 0
            log_likelihood, tags = self.crf(logits, labels), self.crf.decode(logits)
            loss = 0 - log_likelihood
        else:
            tags = self.crf.decode(logits)
        tags = torch.Tensor(tags)

        if not return_dict:
            output = (tags,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return loss, tags

In [None]:
from transformers import BertPreTrainedModel, BertModel, RobertaModel

model = BertLstmCRF.from_pretrained("vinai/phobert-base", num_labels=5)
model.cuda()

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
  "num_layers={}".format(dropout, num_layers))
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing BertLstmCRF: ['roberta.encoder.layer.1.intermediate.dense.bias', 'roberta.encoder.layer.4.attention.self.value.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.10.attention.self.query.bias', 'roberta.encoder.layer.11.attention.self.value.bias', 'roberta.encoder.layer.7.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.9.intermediate.dense.bias', 'roberta.encoder.layer.7.attention.output.dense.weight', 'roberta.encoder.layer.6.attention.self.query.bias', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.11.output.dense.bias', 'roberta.encoder.layer.5.attention.self.query.weight', 'rober

BertLstmCRF(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train_modified(epoch, verbose = False):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(train_loader):
        ids, labels = batch
        ids = ids.to(device)
        labels = labels.to(device)
        mask = ids!=1
        
        # check(ids, mask, labels)
        # break

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        
        print(outputs[1])
        
        tr_logits = outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0 and verbose:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        # print(active_logits)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # print(flattened_predictions)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    labels = [ids_to_labels[id.item()] for id in tr_labels]
    predictions = [ids_to_labels[id.item()] for id in tr_preds]
    f1 = seqeval.metrics.f1_score([labels], [predictions])

    print(f"Training loss epoch: {epoch_loss}", f"Training F1 epoch: {f1}")

In [None]:
# Creating optimizer and lr schedulers
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = int(args.epochs*len(train_dataset)/args.batch_size/args.accumulation_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
scheduler0 = get_constant_schedule(optimizer)  # PyTorch scheduler

# tsfm = model.roberta
tsfm = model.bert
for child in tsfm.children():
    for param in child.parameters():
        if not param.requires_grad:
            print("whoopsies")
        param.requires_grad = False
frozen = True

EPOCHS = 50
import time

for epoch in range(EPOCHS):
    if epoch > 0 and frozen:
        for child in tsfm.children():
            for param in child.parameters():
                param.requires_grad = True
        frozen = False
        del scheduler0
        torch.cuda.empty_cache()
    st = time.time()
    print(f"Training epoch: {epoch + 1}")
    train_modified(epoch)
    labels, predictions = valid(model, valid_loader)
    
    print('Time: ',time.time() - st)

whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies
whoopsies


RuntimeError: ignored

In [None]:
512/128

4.0

#### **Inference**

The fun part is when we can quickly test the model on new, unseen sentences. 
Here, we use the prediction of the **first word piece of every word** (which is how the model was trained). 

*In other words, the code below does not take into account when predictions of different word pieces that belong to the same word do not match.*

In [None]:
ids_to_labels[-100] = 'X'

In [None]:
sentence = ["Adam is a company based in New York, but is also has employees working in Paris"]
ids, labels, masks = convert_lines(lines, tags, vocab, bpe)

# move to gpu
ids = torch.tensor(ids).to(device)
mask = torch.tensor(masks).to(device)
# forward pass
outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

sentences, predictions = decode(ids.squeeze(0).cpu(), flattened_predictions.cpu())
_, labels = decode(ids.squeeze(0).cpu(), torch.tensor(labels).squeeze())
for token, pred, label in zip(sentences.split(), predictions, labels.squeeze()):
  print('{0:10}  {1:10} {2:10} {3:10}'.format(token, pred, ids_to_labels[int(pred)], ids_to_labels[int(label)]))


# tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
# token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
# wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

# prediction = []
# for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
#   #only predictions on first word pieces are important
#   if mapping[0] == 0 and mapping[1] != 0:
#     prediction.append(token_pred[1])
#   else:
#     continue

# print(sentence.split())
# print(prediction)

100%|██████████| 1/1 [00:00<00:00, 188.83it/s]


1
1
mua                  0 O          O         
được                 0 O          O         
giá                  0 O          B-PRI     
tốt                  0 O          O         
lại                  0 O          O         
được                 0 O          O         
freeship             0 O          B-PRI     
mừng                 0 O          O         
rơi                  0 O          O         
nước_mắt             0 O          O         
đối_với              0 O          O         
tỉnh_lẻ              0 O          O         
thì                  0 O          O         
tiền                 0 O          B-PRI     
ship                 0 O          I-PRI     
là                   0 O          O         
1                    0 O          O         
trở_ngại             0 O          O         
sản_phẩm             0 O          O         
quá                  0 O          O         
ổn                   0 O          O         
cảm_ơn               0 O          O         
shop  

#### **Saving the model for future use**

Finally, let's save the vocabulary (.txt) file, model weights (.bin) and the model's configuration (.json) to a directory, so that both the tokenizer and model can be re-loaded using the `from_pretrained()` class method.


In [None]:
import os

directory = "./model"

if not os.path.exists(directory):
    os.makedirs(directory)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')

### Test new data

In [None]:
from google.colab import drive
drive._mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/NLP/project_nlp

/content/drive/MyDrive/NLP/project_nlp


In [None]:
import pandas as pd

https://github.com/minhpqn/vietner/tree/master/vlsp2016_exp

In [None]:
test_path = './data/VSLP/test.txt'
train_path = './data/VSLP/train.txt'

train_data = pd.read_csv(train_path,  sep='\t', header=None, usecols=[0,3], names=['Word', 'Tag'], skip_blank_lines=False)
test_data = pd.read_csv(test_path,  sep='\t', header=None, usecols=[0,3], names=['Word', 'Tag'], skip_blank_lines=False)

In [None]:
train_data['Sentence #'] = [None] * len(train_data)
sent_count = 1
for idx in range(len(train_data)):
    train_data['Sentence #'][idx] = f'Sentence: {sent_count}'
    if train_data[['Word']].iloc[idx].isnull().any():
        # print(idx)
        sent_count += 1
train_data.dropna(inplace=True)
train_data.isnull().sum()

Word          0
Tag           0
Sentence #    0
dtype: int64

In [None]:
test_data['Sentence #'] = [None] * len(test_data)
sent_count = 1
for idx in range(len(test_data)):
    test_data['Sentence #'][idx] = f'Sentence: {sent_count}'
    if test_data[['Word']].iloc[idx].isnull().any():
        # print(idx)
        sent_count += 1
test_data.dropna(inplace=True)
test_data.isnull().sum()

Word          0
Tag           0
Sentence #    0
dtype: int64

In [None]:
print("Number of tags: {}".format(len(train_data.Tag.unique())))
frequencies = train_data.Tag.value_counts()
frequencies

Number of tags: 9


O         1793
B-LOC       55
B-PER       23
I-LOC       22
I-PER       22
B-ORG        8
I-ORG        7
I-MISC       1
B-MISC       1
Name: Tag, dtype: int64

In [None]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('LOC', 77), ('PER', 45), ('ORG', 15), ('MIS', 2)]


In [None]:
labels_to_ids = {k: v for v, k in enumerate(train_data.Tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(train_data.Tag.unique())}
labels_to_ids

{'B-LOC': 1,
 'B-MISC': 7,
 'B-ORG': 2,
 'B-PER': 4,
 'I-LOC': 3,
 'I-MISC': 8,
 'I-ORG': 6,
 'I-PER': 5,
 'O': 0}

In [None]:
# let's create a new column called "sentence" which groups the words by sentence 
train_data['sentence'] = train_data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
train_data['word_labels'] = train_data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
train_data.head()

Unnamed: 0,Word,Tag,Sentence #,sentence,word_labels
0,Đó,O,Sentence: 1,Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_...,"O,O,O,O,O,O,O,O,O,O,B-LOC,O,B-LOC,O,O,O,O,O,O,..."
1,là,O,Sentence: 1,Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_...,"O,O,O,O,O,O,O,O,O,O,B-LOC,O,B-LOC,O,O,O,O,O,O,..."
2,con,O,Sentence: 1,Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_...,"O,O,O,O,O,O,O,O,O,O,B-LOC,O,B-LOC,O,O,O,O,O,O,..."
3,đường,O,Sentence: 1,Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_...,"O,O,O,O,O,O,O,O,O,O,B-LOC,O,B-LOC,O,O,O,O,O,O,..."
4,biển,O,Sentence: 1,Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_...,"O,O,O,O,O,O,O,O,O,O,B-LOC,O,B-LOC,O,O,O,O,O,O,..."


In [None]:
# let's create a new column called "sentence" which groups the words by sentence 
test_data['sentence'] = test_data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
test_data['word_labels'] = test_data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
test_data.head()

Unnamed: 0,Word,Tag,Sentence #,sentence,word_labels
0,Chị,O,Sentence: 1,Chị Minh ôm đứa con_gái mới hơn hai tháng rưỡi...,"O,B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Minh,B-PER,Sentence: 1,Chị Minh ôm đứa con_gái mới hơn hai tháng rưỡi...,"O,B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,ôm,O,Sentence: 1,Chị Minh ôm đứa con_gái mới hơn hai tháng rưỡi...,"O,B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,đứa,O,Sentence: 1,Chị Minh ôm đứa con_gái mới hơn hai tháng rưỡi...,"O,B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,con_gái,O,Sentence: 1,Chị Minh ôm đứa con_gái mới hơn hai tháng rưỡi...,"O,B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="./PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("./PhoBERT_base_transformers/dict.txt")

In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="./PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("./PhoBERT_base_transformers/dict.txt")

In [None]:
labels_to_ids['X'] = -100
labels_to_ids

{'B-LOC': 1,
 'B-MISC': 7,
 'B-ORG': 2,
 'B-PER': 4,
 'I-LOC': 3,
 'I-MISC': 8,
 'I-ORG': 6,
 'I-PER': 5,
 'O': 0,
 'X': -100}

In [None]:
from tqdm import tqdm
def convert_lines(lines, tags, vocab, bpe, max_sequence_length=256):
    """
    lines: list các văn bản input
    tags: list các chuỗi tag
    vocab: từ điển dùng để encoding subwords
    bpe: 
    """
    # Index của các token cls (đầu câu), eos (cuối câu), padding (padding token)
    outputs = np.zeros((len(lines), max_sequence_length), dtype=np.int32) # --> shape (number_lines, max_seq_len)
    outputs_labels = np.zeros((len(lines), max_sequence_length), dtype=np.int32)
    outputs_attention_mask = np.zeros((len(lines), max_sequence_length), dtype=np.int32)
    # Index của các token cls (đầu câu), eos (cuối câu), padding (padding token)
    cls_id = 0
    eos_id = 2
    pad_id = 1
    
    for idx, row in tqdm(enumerate(lines), total=len(lines)): 
        # Mã hóa subwords theo byte pair encoding(bpe)
        subwords = bpe.encode(row)
        subwords = '<s> '+ subwords +' </s>'
        input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
        
        tag_list = ['O'] + tags[idx].split(',') + ['O']
        subword_idx = [subwords.split().index(word) for word in subwords.split() if '@@' in word]
        for i, orig_idx in enumerate(subword_idx):
            tag_list.insert(orig_idx+1, 'X')
        # print(tag_list)
        labels = [labels_to_ids[label] for label in tag_list] 

        # Truncate input nếu độ dài vượt quá max_seq_len
        if len(input_ids) > max_sequence_length: 
            input_ids = input_ids[:max_sequence_length]
            input_ids[-1] = eos_id
            labels = labels[:max_sequence_length]
            labels[-1] = -100
        else:
        # Padding nếu độ dài câu chưa bằng max_seq_len
            input_ids = input_ids + [pad_id, ]*(max_sequence_length - len(input_ids))
            labels = labels + [-100, ]*(max_sequence_length - len(labels))
        
        labels[0] = -100
        outputs[idx,:] = np.array(input_ids)
        outputs_labels[idx,:] = np.array(labels)
        outputs_attention_mask[idx, np.array(input_ids)!=pad_id] = 1

    return outputs, outputs_labels, outputs_attention_mask

lines = ['Chị Minh ôm đứa con_gái mới hơn hai tháng rưỡi tuổi nấc lên từng tiếng thảm_thiết khi kể lại cho chúng_tôi nghe về cái chết của chồng .'] 
tags = ['O,B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O']

ids, labels, masks = convert_lines(lines, tags, vocab, bpe)
print('input_ids tensor encode: {}\n, shape: {}\n'.format(ids[:10], ids.size))
print('label_ids tensor encode: {}\n, shape: {}\n'.format(labels[:10], labels.size))
print('masks tensor encode: {}\n, shape: {}\n'.format(masks[:10], masks.size))
# print('x1 tensor decode: ', phoBERT_cls.decode(torch.tensor(x1))[:103])

In [None]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [None]:
class argu():
    def __init__(self):
        # self.train_path = './data/train.csv'
        self.dict_path = "./PhoBERT_base_transformers/dict.txt"
        self.config_path = "./PhoBERT_base_transformers/config.json"
        # self.rdrsegmenter_path = '/content/vncorenlp/VnCoreNLP-1.1.1.jar'
        self.pretrained_path = './PhoBERT_base_transformers/model.bin'
        self.max_sequence_length = 256
        self.batch_size = 8
        self.accumulation_steps = 1
        self.epochs = 10
        self.seed = 69
        self.fold = 0
        self.lr= 1e-5
        self.ckpt_path = './checkpoints'
        self.bpe_codes = "./PhoBERT_base_transformers/bpe.codes"
args = argu()

config = RobertaConfig.from_pretrained(
    args.config_path,
    output_hidden_states=True,
    return_dict=True,
    num_labels=9,
    classifier_dropout = True
)

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


In [None]:
X_train, Y_label_train, Y_mask_train = convert_lines(train_data.sentence.values, train_data.word_labels.values, vocab, bpe, max_sequence_length=MAX_LEN)
X_test, Y_label_test, Y_mask_test = convert_lines(test_data.sentence.values, test_data.word_labels.values, vocab, bpe, max_sequence_length=MAX_LEN)

train_dataset = TensorDataset(torch.tensor(X_train,dtype=torch.long), 
                              torch.tensor(Y_label_train,dtype=torch.long))

valid_dataset = TensorDataset(torch.tensor(X_test,dtype=torch.long), 
                              torch.tensor(Y_label_test,dtype=torch.long))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

100%|██████████| 1932/1932 [00:01<00:00, 1278.14it/s]
100%|██████████| 934/934 [00:00<00:00, 1273.62it/s]


In [None]:
from sklearn.utils.class_weight import compute_class_weight
y_org = Y_label_train
class_weight = compute_class_weight(class_weight='balanced', classes = np.array([0,1,2,3,4,5,6,7,8]), y=y_org.flatten()[y_org.flatten()>=0])

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class Roberta_SeqTag(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"
    def __init__(self, config):
        super(Roberta_SeqTag, self).__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        classifier_dropout = (
                config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
            )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # self.lstm = nn.LSTM(config.hidden_size, 256, num_layers=1, bidirectional=True)
        # self.classifier = nn.Linear(256*2, config.num_labels)
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        # x, _ = self.lstm(sequence_output)
        # sequence_output = torch.tanh(x)
        logits = self.classifier(sequence_output)
        # print(logits.shape)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weight, dtype=torch.float).to(device))
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                # print(
                #     'active loss', active_loss, active_loss.shape,
                #     'active_logits', active_logits, active_logits.shape,
                #     'active_labels', active_labels, active_labels.shape)
                loss = loss_fct(active_logits, active_labels)
                # quit()
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [None]:
model = Roberta_SeqTag.from_pretrained(args.pretrained_path, config=config)
model.cuda()


In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch, verbose = False):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(train_loader):
        ids, labels = batch
        ids = ids.to(device)
        labels = labels.to(device)
        mask = ids!=1
        
        # check(ids, mask, labels)
        # break

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        tr_logits = outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0 and verbose:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        # print(active_logits)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # print(flattened_predictions)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    labels = [ids_to_labels[id.item()] for id in tr_labels]
    predictions = [ids_to_labels[id.item()] for id in tr_preds]
    f1 = seqeval.metrics.f1_score([labels], [predictions])

    print(f"Training loss epoch: {epoch_loss}", f"Training F1 epoch: {f1}")

In [None]:
def valid(model, test_loader, verbose=False):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(test_loader):
            
            ids, labels = batch
            ids = ids.to(device)
            labels = labels.to(device)
            mask = ids!=1 

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            eval_logits = outputs[1]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0 and verbose:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    f1 = seqeval.metrics.f1_score([labels], [predictions])
    print(f"Validation Loss: {eval_loss}", f"Validation F1: {f1}")

    return labels, predictions

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

# Creating optimizer and lr schedulers
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = int(args.epochs*len(train_dataset)/args.batch_size/args.accumulation_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
scheduler0 = get_constant_schedule(optimizer)  # PyTorch scheduler

tsfm = model.roberta
for child in tsfm.children():
    for param in child.parameters():
        if not param.requires_grad:
            print("whoopsies")
        param.requires_grad = False
frozen = True

EPOCHS = 5
import time

for epoch in range(EPOCHS):
    if epoch > 0 and frozen:
        for child in tsfm.children():
            for param in child.parameters():
                param.requires_grad = True
        frozen = False
        del scheduler0
        torch.cuda.empty_cache()
    st = time.time()
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
    labels, predictions = valid(model, valid_loader)
    
    print('Time: ',time.time() - st)

Training epoch: 1
Training loss epoch: 0.41969581694953434 Training F1 epoch: 0.059007832898172324
Validation Loss: 0.26131024922401797 Validation F1: 0.0
Time:  32.54197573661804
Training epoch: 2
Training loss epoch: 0.018631261992490232 Training F1 epoch: 0.9401363050285503
Validation Loss: 0.046891400712337265 Validation F1: 0.7998414585810543
Time:  83.0831093788147
Training epoch: 3
Training loss epoch: 0.0003756325251053782 Training F1 epoch: 1.0
Validation Loss: 0.04141283615487836 Validation F1: 0.8848776574408344
Time:  82.51424026489258
Training epoch: 4
Training loss epoch: 0.0017992888610816045 Training F1 epoch: 0.9979496738117428
Validation Loss: 0.043928374911619855 Validation F1: 0.923015873015873
Time:  82.69051885604858
Training epoch: 5
Training loss epoch: 0.006399250457575617 Training F1 epoch: 0.9865420560747664
Validation Loss: 0.02750404675098408 Validation F1: 0.9050807404489958
Time:  82.56903052330017


In [None]:
for item in zip(labels, predictions):
    print(item)

In [None]:
from seqeval.metrics import classification_report
print(classification_report([labels], [predictions]))

In [None]:
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification
phobert = AutoModel.from_pretrained("vinai/phobert-base")

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
phobert

In [None]:
# model = RobertaForTokenClassification.from_pretrained("vinai/phobert-base", config=config)
# model.cuda()

model = AutoModelForTokenClassification.from_pretrained("vinai/phobert-base", num_labels=9)
model.cuda()

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mo

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
model_name = 'phobert'
args = TrainingArguments(
    f"{model_name}-finetuned-{'ner'}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

## Legacy

The following code blocks were used during the development of this notebook, but are not included anymore.

label for subword 

In [None]:
# text = '<s> '+'Hôm_nay trời nóng quá nên tôi ở nhà viết Viblo!' +' </s>'
text = 'nên xem_lại tư_duy bán hàng , bán thua hàng_chợ thì cần xem_lại , đặt giá gốc cao xong hạ saleoff lừa à !' 
subwords = bpe.encode(text)
subwords = '<s> '+subwords +' </s>'
input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
print(subwords)
print(input_ids)
subword_idx = [subwords.split().index(word) for word in subwords.split() if '@@' in word]
print(subword_idx)
tag_list = 'O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-PRI,I-PRI,I-PRI,O,B-PRI,B-PRI,O,O,O,O'.split(',')
for i, idx in enumerate(subword_idx):
    orig_idx = idx - i
    tag_list.insert(orig_idx+1, tag_list[orig_idx])
for pair in zip(subwords.split(), tag_list):
    print(pair)

In [None]:
def prepare_sentence(sentence, tokenizer, maxlen):    
      # step 1: tokenize the sentence
      tokenized_sentence = tokenizer.tokenize(sentence)
      
      # step 2: add special tokens 
      tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] 

      # step 3: truncating/padding
      if (len(tokenized_sentence) > maxlen):
        # truncate
        tokenized_sentence = tokenized_sentence[:maxlen]
      else:
        # pad
        tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]

      # step 4: obtain the attention mask
      attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
      
      # step 5: convert tokens to input ids
      ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)
      
      return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
      }

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]  
        word_labels = self.data.word_labels[index]  
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)
        
        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [labels_to_ids[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]
        
        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [None]:
sentence = "this is a test @huggingface".strip().split()

inputs = tokenizer(sentence, is_pretokenized=True, return_offsets_mapping=True, padding='max_length', truncation=True)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
token_offsets = inputs["offset_mapping"]
print(tokens)
print(token_offsets)

In [None]:
word = "@huggingface"

inputs = tokenizer(word, return_offsets_mapping=True, padding='max_length', truncation=True)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
token_offsets = inputs["offset_mapping"]
print(tokens)
print(token_offsets)

In [None]:
# now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

In [None]:
# check for initial loss
input_ids, labels = next(iter(train_loader))
input_ids.shape, labels.shape

input_ids = input_ids.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=input_ids!=1, labels=labels)
initial_loss = outputs[0]
initial_loss

In [None]:
import torch
import numpy as np
from src.loss import FocalLoss
from torch.nn import CrossEntropyLoss
# test focal loss
logits = torch.rand(3, 3, 3)
labels = torch.LongTensor([[0,1,1],[1, 2, 2],[2,0,1]])
focal_loss = FocalLoss(gamma = 0, alpha = 1)
print(focal_loss(logits, labels))
loss_fct = CrossEntropyLoss()
seq_loss = loss_fct(logits.permute(0, 2, 1), labels)
print(seq_loss)

tensor(1.1434)
tensor(1.0161)


  log_p = F.log_softmax(logits)


In [None]:
import torch
from src.loss import *

criterion = SelfAdjDiceLoss(reduction="none")
# (batch_size, num_tokens, num_classes)
logits = torch.rand(3, 3, 3)
targets = torch.LongTensor([[0,1,1],[1, 2, 2],[2,0,1]])
# logits = torch.rand(128, 40, 10, requires_grad=True)
# targets = torch.randint(0, 10, size=(128, 40))

loss = criterion(logits.view(-1, 3), targets.view(-1))
loss = loss.reshape(-1, 9).mean(-1).mean()
# loss.backward()
seq_loss = loss_fct(logits.permute(0, 2, 1), labels)
print(seq_loss)
loss

tensor(1.0425)


tensor(0.3492)

In [None]:
import torch
from src.loss import *

criterion = SelfAdjDiceLoss(reduction="none")
# (batch_size, num_tokens, num_classes)
logits = torch.rand(3, 3, 3)
targets = torch.LongTensor([[0,1,1],[1, 2, 2],[2,0,1]])
# logits = torch.rand(128, 40, 10, requires_grad=True)
# targets = torch.randint(0, 10, size=(128, 40))

loss = criterion(logits.view(-1, 3), targets.view(-1))
loss = loss.reshape(-1, 9).mean(-1).mean()
# loss.backward()
loss

In [None]:
%cd /content

/content


In [None]:
!git clone https://github.com/Gxzzz/BiLSTM-CRF.git

Cloning into 'BiLSTM-CRF'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 38 (delta 12), reused 32 (delta 7), pack-reused 0[K
Unpacking objects: 100% (38/38), done.


In [None]:
%cd  BiLSTM-CRF

/content/BiLSTM-CRF


In [None]:
!sh run.sh train

num of training examples: 40526
num of development examples: 10132
start training...
log: epoch 1, iter 10, 19481.0 words/sec, avg_loss 99.965345, time 0.8 sec
log: epoch 1, iter 20, 20281.8 words/sec, avg_loss 67.051926, time 0.7 sec
log: epoch 1, iter 30, 20410.7 words/sec, avg_loss 57.057752, time 0.7 sec
log: epoch 1, iter 40, 21280.2 words/sec, avg_loss 56.877848, time 0.7 sec
log: epoch 1, iter 50, 20123.4 words/sec, avg_loss 53.036066, time 0.7 sec
log: epoch 1, iter 60, 18880.7 words/sec, avg_loss 48.014140, time 0.7 sec
log: epoch 1, iter 70, 20524.0 words/sec, avg_loss 51.402844, time 0.7 sec
log: epoch 1, iter 80, 20496.1 words/sec, avg_loss 47.476018, time 0.7 sec
log: epoch 1, iter 90, 19278.2 words/sec, avg_loss 46.728068, time 0.7 sec
log: epoch 1, iter 100, 19415.3 words/sec, avg_loss 48.263848, time 0.8 sec
log: epoch 1, iter 110, 20988.8 words/sec, avg_loss 44.111340, time 0.7 sec
log: epoch 1, iter 120, 20159.1 words/sec, avg_loss 44.468773, time 0.7 sec
log: epoch 1

In [None]:
!sh run.sh test

num of test samples: 4631
start testing...
using device cuda
processed 172601 tokens with 6192 phrases; found: 5308 phrases; correct: 4532.
accuracy:  97.46%; precision:  85.38%; recall:  73.19%; FB1:  78.82
              LOC: precision:  86.14%; recall:  79.74%; FB1:  82.82  2663
              ORG: precision:  84.59%; recall:  65.59%; FB1:  73.89  1032
              PER: precision:  84.62%; recall:  68.80%; FB1:  75.90  1613
