In [1]:
import re
import numpy as np
import os
import json
import pandas as pd
#from dateparser.search import search_dates
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from vncorenlp import VnCoreNLP
from collections import defaultdict, Counter
from transformers import AutoModel, AutoTokenizer, BertPreTrainedModel, RobertaModel, RobertaConfig

In [2]:
text = {}
with open("./data.txt", encoding='utf-8-sig') as f:
    text = json.loads(f.read())

seq = [text["data_direction"][i]["subject"] for i in range(len(text["data_direction"]))]
lab = [text["data_direction"][i]["category"] for i in range(len(text["data_direction"]))]

In [3]:
data_pd = pd.DataFrame({"Sequence": seq, "Label": lab})

acrronym = {"NQ": "Nghị quyết", "CP": "Chính phủ", "TTTT": "Thông tin truyền thông",
            "CBCCVCLĐ" : "Cán bộ công chức viên chức người lao động"}

In [12]:
def find_accronym(data_file):
    accronym = []
    for i in range(data_file.shape[0]):
        sent = data_file.iloc[i]["Sequence"]
        sent_acc = re.findall(r'[A-ZĐ]{2,}', sent)
        if sent_acc:
            for ele in sent_acc:
                accronym.append(ele)
    return set(accronym)

def regex_sentence(s):
    s = re.sub('((www\.[^s]+)|(https://[^\s]+))', 'URL', s)  # replace url
    s = re.sub("V/v", "", s)
    s = re.sub("v/v", "", s)
    s = re.sub("Về việc", "", s)
    s = re.sub(r'[-–()/"#@;:<>{}`+=~|.!?,&“”%*⋅…]', ' ', s)
    s = re.sub(r"\b\d+\b", '', s)  # remove number, date, etc...
    #s = re.sub("TTg", "", s)
    #s = re.sub("CTr]", "", s)
    #s = re.sub(r'\b[A-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠ]\b', "", s)  #remove single uppercase character
    s = re.sub(r'\b[BCEXHVICJFQPKcvhđmgbs]\b',"",s)
    #s = re.sub(r'[A-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠ]{2,}', "", s)  #remove 2 consecutive uppercase character
    #s = re.sub('[\n]+', '', s)  #remove white space
    s = s.replace('\n', '').replace('\r', '').replace("\\", "")
    s = s.strip()
    s = ' '.join(word for word in s.split())  #
    s = s.lower()
    return s


def get_single_letter(s):
    return [word for word in s.split() if len(word)==1]

In [13]:
clean_data = data_pd.copy()
clean_data["Sequence"] = clean_data["Sequence"].apply(regex_sentence)

In [6]:
def single_letters(data):
    single = defaultdict(list)
    for i,seq in enumerate(data["Sequence"]):
        ok = get_single_letter(seq)
        for lt in ok:
            single[lt].append(i)
    return single

single = single_letters(clean_data)

single.keys()

dict_keys(['ý', 'y', 'ở', 'ô', 'á', 'e', 'a', 'ạ', 'ỷ', 'i'])

In [7]:
def get_unique_label(dataset):
    dict_text = defaultdict(list)
    for k, v in zip(dataset["Sequence"], dataset["Label"]):
        if v not in dict_text[k]:
            #print(f"{v} not in {dict_text[k]}")
            dict_text[k].append(v)
        else:
            #print(f"{v} in {dict_text[k]}")
            pass
    return dict_text

rdrsegmenter = VnCoreNLP("/Users/Slaton/vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

def segmenter(s):
    sentence = rdrsegmenter.tokenize(s)[0]
    return " ".join(sentence)

def data_label_dict(dataframe):
    unseg_dict = get_unique_label(dataframe)
    seg_dict = dict()  #defaultdict(list)
    for k, v in unseg_dict.items():
        seg_dict[segmenter(k)] = v
    return seg_dict

def create_pd_dummies_label(dictionary, label_list):
    empty_pd = pd.DataFrame(index=range(len(dictionary)), columns=["Sequence", *label_list])
    for i, k in enumerate(dictionary.keys()):
        empty_pd.iloc[i]["Sequence"] = k
        for lab in dictionary[k]:
            empty_pd.iloc[i][lab] = 1
    empty_pd = empty_pd.fillna(0)
    return empty_pd

In [8]:
labels = list(np.unique(clean_data["Label"]))
segmented_dict = data_label_dict(clean_data)
vocab = set([word for k in segmented_dict.keys() for word in k.split()])  #3938

In [9]:
final_data = create_pd_dummies_label(segmented_dict, labels)

In [10]:
final_data

Unnamed: 0,Sequence,Báo chí xuất bản,Báo cáo,Chỉ thị,Công văn,Giấy mời,Hướng dẫn,Kế hoạch,Quyết định,Thông báo,Thông tư,Tờ trình
0,thống_kê danh_sách cá_nhân gia_đình hiến máu t...,0,0,0,1,0,0,0,0,0,0,0
1,xin cấp tên_miền cho trang thông_tin điện_tử t...,0,0,0,1,0,0,0,0,0,0,0
2,đề_nghị hỗ_trợ tập_huấn triển_khai ứng_dụng ch...,0,0,0,1,0,0,0,0,0,0,0
3,thống_kê chỉ_tiêu theo nghị_quyết nq cp ngày s...,0,0,0,1,0,0,0,0,0,0,0
4,vận_động cbccvclđ tham_gia hiến máu tình_nguyệ...,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10135,phân_công nhiệm_vụ chủ_tịch các phó chủ_tịch u...,0,0,0,0,0,0,0,1,0,0,0
10136,tham_gia ý_kiến đối_với dự_thảo xin chủ_trương...,0,0,0,1,0,0,0,0,0,0,0
10137,tiếp_tục thực_hiện một_số biện_pháp cấp_bách p...,0,0,0,1,0,0,0,0,0,0,0
10138,phúc_đáp công_văn số stttt bcvt cntt ngày của ...,0,0,0,1,0,0,0,0,0,0,0


# Train Test split

In [14]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [120]:
from sklearn.utils import compute_sample_weight
weight = 1/np.sum(y_train,axis=0)

In [119]:
np.sum(y_train,axis=0)

array([  65,  573,   46, 4240,  582,   41,  316,  746,  411,   48,   92])

In [15]:
labels = final_data[[lab for lab in final_data.columns if not lab.startswith("Sequence")]].values
Xfinal = final_data["Sequence"].values
X_train, X_val, y_train, y_val = train_test_split(Xfinal, labels, test_size=0.3, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [16]:
encoded_cv = [tokenizer.encode(sent, add_special_tokens=True) for sent in final_data["Sequence"]]
max_len = max([len(sent) for sent in encoded_cv])

In [17]:
def bert_preprocessing(data):
    inputs_id = []
    att_masks = []
    for sent in data:
        encoded_sent = tokenizer.encode_plus(text=sent, add_special_tokens=True, return_attention_mask=True,max_length=114,padding='max_length',)
        #print(len(encoded_sent.get('input_ids')))
        inputs_id.append(encoded_sent.get('input_ids'))
        att_masks.append(encoded_sent.get('attention_mask'))
    inputs_id = torch.tensor(inputs_id)
    att_masks = torch.tensor(att_masks)
    return inputs_id, att_masks

In [18]:
train_inputs_id, train_mask = bert_preprocessing(X_train)
val_inputs_id, val_mask = bert_preprocessing(X_val)
test_inputs_id, test_mask = bert_preprocessing(X_test)

# DataLoader 

In [19]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [20]:
train_lab = torch.tensor(y_train,dtype=torch.float)
val_lab = torch.tensor(y_val,dtype=torch.float)
test_lab = torch.tensor(y_test, dtype=torch.float)

In [21]:
batch_size = 32
train_data = TensorDataset(train_inputs_id, train_mask, train_lab)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs_id, val_mask, val_lab)
val_sampler = RandomSampler(val_data)
val_loader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs_id, test_mask, test_lab)
test_sampler = RandomSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Model 

In [127]:
class PhoBertClassifier(nn.Module):
    def __init__(self, freeze=True):
        super(PhoBertClassifier,self).__init__()
        d_in, hidden, d_out = 768, 64, 11
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        
        self.classifier = nn.Sequential(
            nn.Linear(d_in, hidden),
            nn.ReLU(),
            nn.Linear(hidden, d_out))
        self.sgm = nn.Sigmoid()
        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, inp_id, att_msk):
        output = self.bert(input_ids = inp_id, attention_mask = att_msk)
        last_hidden = output[0][:,0,:]
        output_cls = self.classifier(last_hidden)
        output_sgm = self.sgm(output_cls)
        return output_sgm

In [128]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    phobert = PhoBertClassifier()
    optimizer = AdamW(phobert.parameters(),
                     lr=3e-5,
                     eps=1e-8)
    total_steps = len(train_loader)*epochs
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                               num_warmup_steps=0,
                                               num_training_steps=total_steps)
    return phobert, optimizer, scheduler

In [129]:
import random
import time
from sklearn.metrics import precision_score, recall_score, f1_score

loss_fn = nn.BCELoss(reduction="none")

def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    
def train(model,optimizer,scheduler,train_dataloader, val_dataloader, epochs=4,evaluation=False):
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val F1':^9} | {'Val Precision':^9} | {'Val Recall':^9}")
        #print("-"*70)
        #print(f"----------Epoch {epoch_i}----------")
        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()
        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()
        
        for step, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            batch_counts+=1
            b_input_id, b_att_mask, b_label = batch
            #model.zero_grad()
            
            output = model(b_input_id, b_att_mask)
            tmp_loss = loss_fn(output, b_label)
            weighted_loss = tmp_loss*torch.tensor(weight)
            loss = weighted_loss.mean()
            
            batch_loss += loss.item()
            total_loss += loss.item()
            with torch.enable_grad():
                loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
            #scheduler.step()
            
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch
                
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {'-':^10} | {'-':^10}")
                
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
            avg_train_loss = total_loss/len(train_dataloader)
            #print("-"*70)
            # =======================================
            #               Evaluation
            # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            #val_loss, val_accuracy = evaluate(model, val_dataloader)
            val_loss, val_f1, val_precision, val_recall = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_f1:^10.6f} | {val_precision:^10.6f} | {val_recall:^10.6f}")
                #print("-"*70)
        print("\n")
    
    print("Training complete!")
    
def calculate_metrics(pred, target, threshold=0.5):
    pred = np.array(pred > threshold, dtype=float)
    return {'micro/precision': precision_score(y_true=target, y_pred=pred, average='micro'),
            'micro/recall': recall_score(y_true=target, y_pred=pred, average='micro'),
            'micro/f1': f1_score(y_true=target, y_pred=pred, average='micro'),
            'macro/precision': precision_score(y_true=target, y_pred=pred, average='macro'),
            'macro/recall': recall_score(y_true=target, y_pred=pred, average='macro'),
            'macro/f1': f1_score(y_true=target, y_pred=pred, average='macro'),
            'samples/precision': precision_score(y_true=target, y_pred=pred, average='samples'),
            'samples/recall': recall_score(y_true=target, y_pred=pred, average='samples'),
            'samples/f1': f1_score(y_true=target, y_pred=pred, average='samples'),
            }
    


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    #val_accuracy = []
    val_loss = []
    val_f1, val_prec, val_recall = [], [], []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = batch #tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        tmp_loss = loss_fn(logits, b_labels)
        weighted_loss = tmp_loss*torch.tensor(weight)
        loss = weighted_loss.mean()
        val_loss.append(loss.item())

        # Get the predictions
        #preds = torch.argmax(logits, dim=1).flatten()
        preds = np.array(logits>0.5, dtype=float)

        # Calculate the accuracy rate
        #ccuracy = (preds == b_labels).cpu().numpy().mean() * 100
        precision = precision_score(b_labels.numpy(), preds,average='weighted', zero_division=0)
        f1 = f1_score(b_labels.numpy(), preds,average='weighted', zero_division=0)
        recall = recall_score(b_labels.numpy(), preds,average='weighted', zero_division=0)
        
        val_f1.append(f1)
        val_recall.append(recall)
        val_prec.append(precision)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_f1 = np.mean(val_f1)
    val_prec = np.mean(val_prec)
    val_recall = np.mean(val_recall)
    #val_accuracy = np.mean(val_accuracy)

    return val_loss, val_f1, val_prec, val_recall

In [130]:
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=5)
train(bert_classifier,optimizer, scheduler, train_loader, val_loader, epochs=5, evaluation=True)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val F1   | Val Precision | Val Recall
   1    |   20    |   0.006196   |     -      |     -     |     -      |     -     
   1    |   40    |   0.005848   |     -      |     -     |     -      |     -     
   1    |   60    |   0.005520   |     -      |     -     |     -      |     -     
   1    |   80    |   0.005153   |     -      |     -     |     -      |     -     
   1    |   100   |   0.004783   |     -      |     -     |     -      |     -     
   1    |   120   |   0.004402   |     -      |     -     |     -      |     -     
   1    |   140   |   0.004074   |     -      |     -     |     -      |     -     
   1    |   160   |   0.003746   |     -      |     -     |     -      |     -     
   1    |   180   |   0.003481   |     -      |     -     |     -      |     -     
   1    |   200   |   0.003245   |     -      |     -     |     -      |     -     
   1    |   220   |   0.002964   |     -      |     - 

In [131]:
test_loss, test_f1, test_prec, test_recall = evaluate(bert_classifier, test_loader)

In [132]:
test_loss, test_f1, test_prec, test_recall

(0.0005686048640899743,
 0.4421930938862923,
 0.35645496977896557,
 0.5888542727697139)

In [133]:
batch = iter(test_loader).next()
test = bert_classifier(batch[0], batch[1])

In [134]:
haha = np.array(test>0.5, dtype=float)

In [135]:
batch[2]

tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0.,

In [136]:
haha

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.