In [1]:
pip install vncorenlp

Collecting vncorenlp
  Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: vncorenlp
  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645932 sha256=c93035ee6b0d9b8170ec73dbce86cd04330f98d71bfd6f4ddc02f8be79b2f80f
  Stored in directory: /root/.cache/pip/wheels/5d/d9/b3/41f6c6b1ab758561fd4aab55dc0480b9d7a131c6aaa573a3fa
Successfully built vncorenlp
Installing collected packages: vncorenlp
Successfully installed vncorenlp-1.0.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import string

In [3]:
import requests
url = "https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords-dash.txt"
response = requests.get(url)
if response.status_code == 200:
    stop_words = response.text.splitlines()
    print(stop_words[:10])
else:
    print("Failed to fetch the stopwords file.")

['a_lô', 'a_ha', 'ai', 'ai_ai', 'ai_nấy', 'ai_đó', 'alô', 'amen', 'anh', 'anh_ấy']


In [4]:
from vncorenlp import VnCoreNLP

vncorenlp = VnCoreNLP("/kaggle/input/vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg,pos,ner,parse", max_heap_size='-Xmx2g')

In [5]:
df = pd.read_csv('/kaggle/input/economic-documents/vietnamese_economic_docs.csv')
df = df.dropna(subset=['text'])

def preprocess_text(text):
    text = re.sub(r'\n+', '. ', text)
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\r', '', text)
    text = re.sub(r"Chú thích\.|Xem thêm\.|Liên kết ngoài\.", "", text)
    text = text.strip()
    return text

df['cleaned_text'] = df['text'].apply(preprocess_text)
def tokenize_text(text):
    try:
        tokenized_text = vncorenlp.tokenize(text)
        filtered_sentences = [
            " ".join([word for word in sentence if word not in string.punctuation and word != '–'])
            for sentence in tokenized_text
        ]
        return ". ".join(filtered_sentences)
    except Exception as e:
        print(f"Error tokenizing text {text[:500]} ...: {e}\n")
        return ""
df['tokenized_text'] = df['cleaned_text'].apply(tokenize_text)
corpus = df['tokenized_text'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(X.T.toarray(), index=words, columns=[f"Doc {i+1}" for i in range(len(corpus))])
df_tfidf

Error tokenizing text Kinh tế Bắc Triều Tiên phản ánh những quan hệ sản xuất, cơ cấu kinh tế và tình hình kinh tế, đời sống tại CHDCND Triều Tiên. Nhìn chung, nền kinh tế CHDCND Triều Tiên là một nền kinh tế công nghiệp với việc sản xuất công nghiệp là hoạt động chính và một nền nông nghiệp gần như tự cung tự cấp do bị cấm vận, đây cũng là một nền kinh tế gần như hoàn toàn thuộc Chính phủ và phát triển theo kế hoạch nhà nước. Quốc gia này có nền kinh tế kế hoạch hóa tập trung cao độ, đất nông nghiệp được tập thể hóa, ...: 400: Unable to parse form content

Error tokenizing text Chủ nghĩa tư bản (Tiếng Anh: capitalism) là một hệ thống kinh tế dựa trên quyền sở hữu tư nhân đối với tư liệu sản xuất và hoạt động sản xuất vì lợi nhuận. Các đặc điểm đặc trưng của chủ nghĩa tư bản bao gồm: tài sản tư nhân, tích lũy tư bản, lao động tiền lương, trao đổi tự nguyện, một hệ thống giá cả và thị trường cạnh tranh. Trong nền kinh tế thị trường tư bản, việc điều hành và đầu tư được quyết định bởi chủ

Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10,...,Doc 1993,Doc 1994,Doc 1995,Doc 1996,Doc 1997,Doc 1998,Doc 1999,Doc 2000,Doc 2001,Doc 2002
00,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0000,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
000,0.007755,0.0,0.0,0.010932,0.0,0.0,0.0,0.0,0.0106,0.044452,...,0.013742,0.027241,0.0,0.01061,0.028694,0.0,0.0,0.0,0.0,0.0
00001,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0000,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
0000120172,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0000,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
00019,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0000,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
에스케이그룹,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0000,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
임금,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0000,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
중화인민공화국_위안,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0000,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
페소,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0000,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0


In [6]:
top_words_list = []

for doc_index in range(len(corpus)):
    important_words = df_tfidf.iloc[:, doc_index]
    vocab_size = (important_words > 0).sum()
    
    N = round(vocab_size * 0.15)
    
    top_important_words = important_words.sort_values(ascending=False).head(N)
    
    rounded_scores = [round(score, 4) for score in top_important_words.values.tolist()]
    
    top_words_list.append({
        "Document": f"Doc {doc_index + 1}",
        "Vocabulary Size": vocab_size,
        "N": N,
        "Top N Words": top_important_words.index.tolist(),
        "TF-IDF Scores": rounded_scores
    })

top_words_df = pd.DataFrame(top_words_list)
top_words_df

Unnamed: 0,Document,Vocabulary Size,N,Top N Words,TF-IDF Scores
0,Doc 1,421,63,"[đhqg, hcm, đào_tạo, đại_học, trường, luật, kh...","[0.4295, 0.3597, 0.3286, 0.3016, 0.268, 0.1838..."
1,Doc 2,96,14,"[lbo, mua, doanh_nghiệp, leveraged, lại, vay, ...","[0.4033, 0.3629, 0.2648, 0.2551, 0.2197, 0.213..."
2,Doc 3,694,104,"[fed, dự_trữ, liên_bang, ngân_hàng, lãi_suất, ...","[0.4592, 0.3641, 0.3329, 0.3112, 0.1681, 0.166..."
3,Doc 4,804,121,"[lừa_đảo, nạn_nhân, bọn, lừa, séc, nigeria, bị...","[0.4677, 0.3966, 0.3251, 0.2037, 0.195, 0.1737..."
4,Doc 5,162,24,"[công_ty, đại_chúng, phát_hành, chi_phí, publi...","[0.455, 0.3189, 0.3077, 0.1863, 0.1823, 0.1623..."
...,...,...,...,...,...
1997,Doc 1998,186,28,"[sữa, vinamilk, việt_nam, mai_kiều_liên, viii,...","[0.3809, 0.2503, 0.2021, 0.1978, 0.1665, 0.160..."
1998,Doc 1999,267,40,"[khai_hoang, miền, lào_cai, miền_xuôi, núi, ki...","[0.4484, 0.3086, 0.255, 0.2471, 0.2195, 0.1876..."
1999,Doc 2000,41,6,"[win, nguyên_tắc, thắng, đàm_phán, lợi, hợp_tác]","[0.8472, 0.3128, 0.1913, 0.1532, 0.127, 0.1094]"
2000,Doc 2001,234,35,"[madoff, vụ, lừa_đảo, ông, bernard, gian_lận, ...","[0.537, 0.1942, 0.1823, 0.1652, 0.1651, 0.1633..."


In [7]:
labeled_sentences = []

for doc_index in range(len(corpus)):
    document = corpus[doc_index]
    
    sentences = document.split('.')

    for sentence in sentences:
        if sentence == []: continue
        tokens = sentence.split()
        
        top_important_words = top_words_df.iloc[doc_index]["Top N Words"]
        
        sentence_labels = []
        for token in tokens:
            token_lower = token.lower()
            if token_lower in stop_words:
                sentence_labels.append(0)
            elif token_lower in top_important_words:
                sentence_labels.append(1)
            else:
                sentence_labels.append(0)
        
        labeled_sentences.append({
            "text": sentence,
            "tokens": tokens,
            "labels": sentence_labels
        })

labeled_df = pd.DataFrame(labeled_sentences)
labeled_df

Unnamed: 0,text,tokens,labels
0,Trường Đại_học Kinh_tế Luật UEL là trường đại_...,"[Trường, Đại_học, Kinh_tế, Luật, UEL, là, trườ...","[1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, ..."
1,Tiền_thân của Trường là Khoa_Kinh_tế thuộc Đạ...,"[Tiền_thân, của, Trường, là, Khoa_Kinh_tế, thu...","[0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0]"
2,Trường đã đạt được nhiều thành_tựu trong đào_...,"[Trường, đã, đạt, được, nhiều, thành_tựu, tron...","[1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ..."
3,,[],[]
4,,[],[]
...,...,...,...
154515,Cựu sinh_viên Viện Công_nghệ Massachusetts,"[Cựu, sinh_viên, Viện, Công_nghệ, Massachusetts]","[1, 1, 1, 0, 1]"
154516,Cựu sinh_viên Trường Kinh_tế London,"[Cựu, sinh_viên, Trường, Kinh_tế, London]","[1, 1, 0, 1, 0]"
154517,Giáo_sư Đại_học Johns_Hopkins,"[Giáo_sư, Đại_học, Johns_Hopkins]","[1, 1, 0]"
154518,Cựu sinh_viên Đại_học British_Columbia,"[Cựu, sinh_viên, Đại_học, British_Columbia]","[1, 1, 1, 1]"


In [8]:
labeled_df = labeled_df[labeled_df['tokens'].apply(lambda x: x != [])]
labeled_df

Unnamed: 0,text,tokens,labels
0,Trường Đại_học Kinh_tế Luật UEL là trường đại_...,"[Trường, Đại_học, Kinh_tế, Luật, UEL, là, trườ...","[1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, ..."
1,Tiền_thân của Trường là Khoa_Kinh_tế thuộc Đạ...,"[Tiền_thân, của, Trường, là, Khoa_Kinh_tế, thu...","[0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0]"
2,Trường đã đạt được nhiều thành_tựu trong đào_...,"[Trường, đã, đạt, được, nhiều, thành_tựu, tron...","[1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ..."
5,Lịch_sử hình_thành và phát_triển,"[Lịch_sử, hình_thành, và, phát_triển]","[0, 0, 0, 0]"
6,Ngày 9 tháng 7 năm 1996 theo Quyết_định số 28...,"[Ngày, 9, tháng, 7, năm, 1996, theo, Quyết_địn...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...
154515,Cựu sinh_viên Viện Công_nghệ Massachusetts,"[Cựu, sinh_viên, Viện, Công_nghệ, Massachusetts]","[1, 1, 1, 0, 1]"
154516,Cựu sinh_viên Trường Kinh_tế London,"[Cựu, sinh_viên, Trường, Kinh_tế, London]","[1, 1, 0, 1, 0]"
154517,Giáo_sư Đại_học Johns_Hopkins,"[Giáo_sư, Đại_học, Johns_Hopkins]","[1, 1, 0]"
154518,Cựu sinh_viên Đại_học British_Columbia,"[Cựu, sinh_viên, Đại_học, British_Columbia]","[1, 1, 1, 1]"


In [9]:
import torch
from transformers import RobertaForTokenClassification, Trainer, TrainingArguments
model = RobertaForTokenClassification.from_pretrained('vinai/phobert-base-v2', num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

In [10]:
!git clone --single-branch --branch fast_tokenizers_BARTpho_PhoBERT_BERTweet https://github.com/datquocnguyen/transformers.git

Cloning into 'transformers'...
remote: Enumerating objects: 138580, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 138580 (delta 27), reused 72 (delta 26), pack-reused 138504 (from 1)[K
Receiving objects: 100% (138580/138580), 160.44 MiB | 29.32 MiB/s, done.
Resolving deltas: 100% (95481/95481), done.


In [11]:
%cd transformers

/kaggle/working/transformers


In [12]:
pip install -e .

Obtaining file:///kaggle/working/transformers
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.32.0.dev0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: transformers
  Building editable for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.32.0.dev0-0.editable-py3-none-any.whl size=38321 sha256=29f06d60aaf1995a80ca024cd5679c513ef38

In [13]:
from src.transformers.models.phobert.tokenization_phobert_fast import PhobertTokenizerFast
from datasets import Dataset, DatasetDict
import torch

tokenizer = PhobertTokenizerFast.from_pretrained('vinai/phobert-base-v2')



vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
data = []

for index, row in labeled_df.iterrows():
    data.append({
        'tokens': row['tokens'],
        'labels': row['labels']
    })

dataset = Dataset.from_pandas(labeled_df)
split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

In [15]:
def tokenize_and_align_labels(row):
    tokenized_inputs = tokenizer(row['tokens'], padding='max_length', max_length=128, truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    
    labels = row['labels']
    aligned_labels = []
    
    for i, word_id in enumerate(word_ids):
        if word_id is None:
            aligned_labels.append(0)
        else:
            aligned_labels.append(labels[word_id])

    if len(aligned_labels) > 128:
        aligned_labels = aligned_labels[:128]
    elif len(aligned_labels) < 128:
        aligned_labels = aligned_labels + [0] * (128 - len(aligned_labels))
    
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

train_tokenized_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
train_tokenized_dataset

Map:   0%|          | 0/83449 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'tokens', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 83449
})

In [16]:
# a = ['Do_đó', 'theo', 'nguyên_tắc', 'về', 'tính', 'đơn_giản', 'của', 'thuế', 'nhiều', 'nước', 'áp_dụng', 'thuế_suất', '0%', 'đối_với', 'nhóm', 'người', 'này', 'hay', 'nói', 'cách', 'khác', 'chỉ', 'thu', 'thuế', 'từ', 'những', 'người', 'có', 'thu_nhập', 'cao', 'hơn', 'một', 'ngưỡng', 'nhất_định']
# tokenized_inputs = tokenizer(a, padding='max_length', max_length=128, truncation=True, is_split_into_words=True)
# tokens = tokenized_inputs.tokens()
# word_ids = tokenized_inputs.word_ids()
# for token, word_id in zip(tokens, word_ids):
#     print(f"Token: {token}, Word ID: {word_id}")

# print(train_tokenized_dataset[1])

In [17]:
eval_tokenized_dataset = eval_dataset.map(tokenize_and_align_labels, batched=False)
eval_tokenized_dataset

Map:   0%|          | 0/20863 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'tokens', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 20863
})

In [18]:
model.train()

training_args = TrainingArguments(
    output_dir='/kaggle/working/',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    report_to='none',
    evaluation_strategy="epoch",
    eval_steps=None,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.0382,0.03665
2,0.0323,0.033749
3,0.0281,0.034055




TrainOutput(global_step=3912, training_loss=0.040314546635789615, metrics={'train_runtime': 3357.6155, 'train_samples_per_second': 74.561, 'train_steps_per_second': 1.165, 'total_flos': 1.6353714789646848e+16, 'train_loss': 0.040314546635789615, 'epoch': 3.0})

In [19]:
model.eval()
test_sentences = [
    "Trường đại học này dạy kinh tế rất tốt",
    "Tôi muốn học về quản trị kinh doanh",
    "Kinh tế vĩ mô và vi mô học rất quan trọng trong ngành tài chính",
    "Chính sách tiền tệ có ảnh hưởng trực tiếp đến nền kinh tế quốc gia",
    "Các công ty đa quốc gia ảnh hưởng lớn đến nền kinh tế toàn cầu",
    "Tăng trưởng kinh tế là một chỉ số quan trọng để đo lường sự phát triển của quốc gia",
    "Chúng ta cần có một chiến lược phát triển bền vững trong ngành nông nghiệp",
    "Lạm phát có thể ảnh hưởng nghiêm trọng đến sức mua của người dân",
    "Ngành tài chính cần phải có các biện pháp kiểm soát rủi ro để duy trì sự ổn định",
    "Cải cách thuế là một phần quan trọng trong việc tăng cường hiệu quả nền kinh tế"
]
for test_sentence in test_sentences:
    test_tokens = tokenize_text(test_sentence).split()
    inputs = tokenizer(test_tokens, truncation=True, padding='max_length', max_length=128, return_tensors="pt", is_split_into_words=True)
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1)
    predicted_labels = predictions.cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    print(f"Testing Sentence: {test_sentence}")
    for token, label in zip(tokens, predicted_labels[0]):
        if token != '<pad>': 
            print(f"Token: {token}, Prediction: {label}")
    print("\n" + "-"*50 + "\n")

Testing Sentence: Trường đại học này dạy kinh tế rất tốt
Token: <s>, Prediction: 0
Token: Trường</w>, Prediction: 1
Token: đại_học</w>, Prediction: 1
Token: này</w>, Prediction: 0
Token: dạy</w>, Prediction: 0
Token: kinh_tế</w>, Prediction: 1
Token: rất</w>, Prediction: 0
Token: tốt</w>, Prediction: 0
Token: </s>, Prediction: 0

--------------------------------------------------

Testing Sentence: Tôi muốn học về quản trị kinh doanh
Token: <s>, Prediction: 0
Token: Tôi</w>, Prediction: 0
Token: muốn</w>, Prediction: 0
Token: học</w>, Prediction: 1
Token: về</w>, Prediction: 0
Token: quản_trị</w>, Prediction: 1
Token: kinh_doanh</w>, Prediction: 1
Token: </s>, Prediction: 0

--------------------------------------------------

Testing Sentence: Kinh tế vĩ mô và vi mô học rất quan trọng trong ngành tài chính
Token: <s>, Prediction: 0
Token: Kinh_tế</w>, Prediction: 1
Token: vĩ_mô</w>, Prediction: 0
Token: và</w>, Prediction: 0
Token: vi_mô</w>, Prediction: 0
Token: học</w>, Prediction: 0