In [33]:
import json
import os
import re
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
#from dateparser.search import search_dates
from transformers import AutoModel, AutoTokenizer, BertPreTrainedModel, RobertaModel, RobertaConfig
from vncorenlp import VnCoreNLP

In [6]:
# Parse the JSON export. The "utf-8-sig" codec drops a leading byte-order mark if present.
with open("./data.txt", encoding='utf-8-sig') as f:
    text = json.load(f)

# Split the records under "data_direction" into two parallel lists:
# the subject text of each record and the category assigned to it.
seq = [record["subject"] for record in text["data_direction"]]
lab = [record["category"] for record in text["data_direction"]]

In [7]:
# Two-column frame: the raw subject text ("Sequence") and its category ("Label").
data_pd = pd.DataFrame({"Sequence": seq, "Label": lab})

# Abbreviation -> expansion lookup. No visible cell reads it; presumably meant for
# expanding abbreviations during cleaning — confirm before relying on or removing it.
acrronym = {"NQ": "Nghị quyết", "CP": "Chính phủ", "TTTT": "Thông tin truyền thông",
            "CBCCVCLĐ" : "Cán bộ công chức viên chức người lao động"}

In [8]:
def find_accronym(data_file):
    """Collect the distinct upper-case runs found in the ``Sequence`` column.

    A run is two or more consecutive characters drawn from ``A-Z`` or ``Đ``.

    Args:
        data_file: frame with a ``Sequence`` column of strings.

    Returns:
        A ``set`` of every matched run across all rows.
    """
    return {
        match
        for sentence in data_file["Sequence"]
        for match in re.findall(r'[A-ZĐ]{2,}', sentence)
    }

def regex_sentence(s):
    """Normalise a raw subject line before word segmentation.

    Steps, in order:
      1. replace URLs (``www.`` or ``http(s)://`` prefixed) with the token ``URL``;
      2. drop the boilerplate prefixes "V/v", "v/v" and "Về việc";
      3. turn punctuation into spaces;
      4. remove standalone digit runs;
      5. remove a fixed set of standalone single letters;
      6. delete newlines, carriage returns and backslashes;
      7. collapse whitespace and lower-case the result.

    Args:
        s: the raw text.

    Returns:
        The cleaned, lower-cased string (may be empty).
    """
    # Fix: the character class used to be `[^s]` (anything but the letter "s"),
    # which cut a `www.` URL at its first "s" and left the tail behind.
    # It is now "anything but whitespace". Plain http:// links are covered too.
    s = re.sub(r'(www\.\S+)|(https?://\S+)', 'URL', s)
    s = re.sub("V/v", "", s)
    s = re.sub("v/v", "", s)
    s = re.sub("Về việc", "", s)
    s = re.sub(r'[-–()/"#@;:<>{}`+=~|.!?,&“”%*⋅…]', ' ', s)
    s = re.sub(r'\b\d+\b', '', s)  # standalone numbers (dates, document numbers, ...)
    s = re.sub(r'\b[BCEXHVICJFQPKcvhđmgbs]\b', '', s)  # stray single letters
    # NOTE(review): line breaks are deleted, not replaced by a space, so words on
    # either side of a newline are glued together — confirm this is intended.
    s = s.replace('\n', '').replace('\r', '').replace("\\", "")
    # split() with no argument already discards leading/trailing/repeated whitespace.
    return ' '.join(s.split()).lower()


def get_single_letter(s):
    """Return the one-character tokens of ``s`` (split on whitespace), in order."""
    singles = []
    for token in s.split():
        if len(token) == 1:
            singles.append(token)
    return singles

In [9]:
# New frame with the same rows as data_pd, where "Sequence" has been run
# through regex_sentence. data_pd itself is left untouched.
clean_data = data_pd.assign(Sequence=data_pd["Sequence"].apply(regex_sentence))

In [10]:
def single_letters(data):
    """Index the one-character tokens of every sentence in ``data["Sequence"]``.

    Returns:
        A ``defaultdict(list)`` mapping each single-character token to the
        enumeration positions of the sentences containing it. A position is
        repeated if the token occurs more than once in that sentence.
    """
    positions = defaultdict(list)
    for row_no, sentence in enumerate(data["Sequence"]):
        for letter in get_single_letter(sentence):
            positions[letter].append(row_no)
    return positions

# Index the one-character tokens that survive cleaning, keyed by the token.
single = single_letters(clean_data)

# Show which single characters remain in the cleaned text.
single.keys()

dict_keys(['ý', 'y', 'ở', 'ô', 'á', 'e', 'a', 'ạ', 'ỷ', 'i'])

In [11]:
def get_unique_label(dataset):
    """Group labels by sentence, keeping each label once per sentence.

    Args:
        dataset: mapping/frame exposing parallel ``"Sequence"`` and ``"Label"`` columns.

    Returns:
        A ``defaultdict(list)`` mapping each sentence to its distinct labels,
        in order of first appearance.
    """
    labels_by_text = defaultdict(list)
    for sentence, label in zip(dataset["Sequence"], dataset["Label"]):
        seen = labels_by_text[sentence]
        if label in seen:
            continue  # already recorded for this sentence
        seen.append(label)
    return labels_by_text

# VnCoreNLP client restricted to the word-segmentation annotator ("wseg").
# The jar location can be overridden with the VNCORENLP_JAR environment variable.
# The previous hard-coded path stays as the default so existing setups keep working.
rdrsegmenter = VnCoreNLP(
    os.environ.get("VNCORENLP_JAR", "/Users/Slaton/vncorenlp/VnCoreNLP-1.1.1.jar"),
    annotators="wseg",
    max_heap_size='-Xmx500m',
)

def segmenter(s):
    """Word-segment ``s`` with VnCoreNLP and return the tokens joined by spaces.

    Args:
        s: the text to segment.

    Returns:
        All tokens of all detected sentences, space-separated.
        Returns ``""`` when nothing is tokenised.
    """
    # tokenize() yields one token list per detected sentence. The previous code
    # kept only the first one ([0]), which silently dropped any later sentence
    # and raised IndexError when the result was empty.
    return " ".join(token for sent in rdrsegmenter.tokenize(s) for token in sent)

def data_label_dict(dataframe):
    """Map each word-segmented sentence to its list of distinct labels.

    Sentences are first grouped by their unsegmented text via
    ``get_unique_label``, then each key is replaced by ``segmenter(key)``.
    If two keys segment to the same string, the later entry wins.
    """
    return {
        segmenter(sentence): sentence_labels
        for sentence, sentence_labels in get_unique_label(dataframe).items()
    }

def create_pd_dummies_label(dictionary, label_list):
    """Build a multi-hot label table from a sentence -> labels mapping.

    Args:
        dictionary: mapping of sentence -> iterable of labels assigned to it.
        label_list: the labels to create indicator columns for, in column order.

    Returns:
        A DataFrame with one row per key of ``dictionary`` (in iteration order).
        It has a ``"Sequence"`` column plus one integer 0/1 column per entry
        of ``label_list``. Labels in ``dictionary`` that are not listed in
        ``label_list`` are ignored.
    """
    label_list = list(label_list)
    # Build all rows first and construct the frame once. The previous version
    # used chained assignment (`df.iloc[i][col] = ...`), which writes to a
    # temporary under pandas copy-on-write and leaves the frame untouched.
    # It also filled the table one cell at a time.
    rows = []
    for sentence, assigned in dictionary.items():
        assigned = set(assigned)
        row = {"Sequence": sentence}
        for label in label_list:
            row[label] = int(label in assigned)
        rows.append(row)
    return pd.DataFrame(rows, columns=["Sequence", *label_list])

In [12]:
# Distinct category names found in the cleaned data.
labels = list(np.unique(clean_data["Label"]))
# Segmented sentence -> list of its labels.
segmented_dict = data_label_dict(clean_data)
# Distinct whitespace-delimited tokens over all segmented sentences
# (author's note on the original run: 3938).
vocab = {word for sentence in segmented_dict for word in sentence.split()}

In [13]:
# One row per segmented sentence, with one indicator column per category in `labels`.
final_data = create_pd_dummies_label(segmented_dict, labels)

In [14]:
# Display the encoded table (pandas truncates the middle rows in the rendered output).
final_data

Unnamed: 0,Sequence,Báo chí xuất bản,Báo cáo,Chỉ thị,Công văn,Giấy mời,Hướng dẫn,Kế hoạch,Quyết định,Thông báo,Thông tư,Tờ trình
0,thống_kê danh_sách cá_nhân gia_đình hiến máu t...,0,0,0,1,0,0,0,0,0,0,0
1,xin cấp tên_miền cho trang thông_tin điện_tử t...,0,0,0,1,0,0,0,0,0,0,0
2,đề_nghị hỗ_trợ tập_huấn triển_khai ứng_dụng ch...,0,0,0,1,0,0,0,0,0,0,0
3,thống_kê chỉ_tiêu theo nghị_quyết nq cp ngày s...,0,0,0,1,0,0,0,0,0,0,0
4,vận_động cbccvclđ tham_gia hiến máu tình_nguyệ...,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10135,phân_công nhiệm_vụ chủ_tịch các phó chủ_tịch u...,0,0,0,0,0,0,0,1,0,0,0
10136,tham_gia ý_kiến đối_với dự_thảo xin chủ_trương...,0,0,0,1,0,0,0,0,0,0,0
10137,tiếp_tục thực_hiện một_số biện_pháp cấp_bách p...,0,0,0,1,0,0,0,0,0,0,0
10138,phúc_đáp công_văn số stttt bcvt cntt ngày của ...,0,0,0,1,0,0,0,0,0,0,0


In [18]:
# Load the pretrained PhoBERT tokenizer and encoder from the "vinai/phobert-base" checkpoint.
# use_fast=False presumably selects the non-"fast" tokenizer implementation — confirm against the transformers docs.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
phobert = AutoModel.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# Encode the first segmented sentence and run it through the encoder as a batch of one.
sentence = final_data.iloc[0]["Sequence"]
input_ids = torch.tensor([tokenizer.encode(sentence)])
# no_grad: this is inference only, so gradient tracking is switched off.
with torch.no_grad():
    features = phobert(input_ids)  # indexable output; features[0] is inspected below

In [36]:
# Shape of the first encoder output; the recorded result is (1, 15, 768).
features[0].shape

torch.Size([1, 15, 768])

In [34]:
class RobertaForAIViVN(BertPreTrainedModel):
    """RoBERTa encoder with a linear head over the last four hidden-state layers.

    The head input is the first-position vector taken from each of the last
    four entries of the encoder's hidden-state tuple, concatenated along the
    feature axis (hence the ``4 * hidden_size`` input width).
    """
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        """Build the encoder and the classification head from ``config``.

        Reads ``config.num_labels`` and ``config.hidden_size``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        # Attribute name kept unchanged so previously saved weights still map onto it.
        self.qa_outputs = nn.Linear(4 * config.hidden_size, self.num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                start_positions=None, end_positions=None):
        """Return classification logits for ``input_ids``.

        ``token_type_ids``, ``start_positions`` and ``end_positions`` are
        accepted for signature compatibility but are not used.

        Returns:
            The output of the linear head applied to the concatenated
            first-position vectors of the last four hidden-state layers.
        """
        # Request hidden states explicitly. Without this, `outputs[2]` exists
        # only when the config happens to enable output_hidden_states, and
        # indexing fails otherwise.
        outputs = self.roberta(input_ids,
                               attention_mask=attention_mask,
                               position_ids=position_ids,
                               head_mask=head_mask,
                               output_hidden_states=True)
        hidden_states = outputs[2]
        # Position 0 of each of the last four layers, newest layer first.
        cls_output = torch.cat([hidden_states[-k][:, 0, ...] for k in range(1, 5)], dim=-1)
        logits = self.qa_outputs(cls_output)
        return logits

In [35]:
# Token ids of the first segmented sentence as a 1-D tensor. The recorded output starts with 0
# and ends with 2 — presumably the special start/end tokens added by encode(); confirm.
torch.tensor(tokenizer.encode(final_data.iloc[0]["Sequence"]))

tensor([   0, 1799, 1092,  435,  161, 3312,  764, 4047,  101,  682,   72,  102,
          11, 3968,    2])