In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Step 1: Preprocess Data

In [1]:
import pandas as pd
from transformers import BertTokenizerFast

# B∆∞·ªõc 1: T·∫£i v√† ƒë·ªãnh d·∫°ng d·ªØ li·ªáu
# H√†m load_data ƒë·ªçc c√°c c√¢u v√† nh√£n t·ª´ c√°c t·ªáp trong ƒë·ªãnh d·∫°ng NER c·ª• th·ªÉ:
# M·ªói d√≤ng ch·ª©a m·ªôt t·ª´ v√† th·∫ª t∆∞∆°ng ·ª©ng.
# C√°c c√¢u ƒë∆∞·ª£c ph√¢n t√°ch b·∫±ng c√°c d√≤ng tr·ªëng.
# Khi g·∫∑p d√≤ng tr·ªëng, c√¢u hi·ªán t·∫°i v√† c√°c nh√£n c·ªßa n√≥ s·∫Ω ƒë∆∞·ª£c th√™m v√†o c√°c danh s√°ch sentences v√† labels.
def load_data(filepath):
    sentences, labels = [], []
    with open(filepath, 'r') as f:
        sentence, label = [], []
        for line in f:
            if line.startswith('-DOCSTART-') or line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                sentence, label = [], []
            else:
                word, _, _, tag = line.strip().split()
                sentence.append(word)
                label.append(tag)
        if sentence:
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# Sau khi t·∫£i:
# C√°c bi·∫øn sau gi·ªØ c√°c danh s√°ch c√¢u v√† nh√£n NER t∆∞∆°ng ·ª©ng.
train_sentences, train_labels = load_data('/content/drive/MyDrive/Colab_Notebooks/MIDTERM_NLP/train.txt') #train.txt
test_sentences, test_labels = load_data('/content/drive/MyDrive/Colab_Notebooks/MIDTERM_NLP/test.txt')  # test.txt
valid_sentences, valid_labels = load_data('/content/drive/MyDrive/Colab_Notebooks/MIDTERM_NLP/valid.txt') #valid.txt

# Kh·ªüi t·∫°o c√°c tokenizer:
# BertTokenizerFast ƒë∆∞·ª£c s·ª≠ d·ª•ng ƒë·ªÉ token h√≥a c√°c c√¢u.
# is_split_into_words=True cho bi·∫øt r·∫±ng c√°c c√¢u ƒë√£ ƒë∆∞·ª£c chia th√†nh c√°c t·ª´ ri√™ng l·∫ª. => R·∫•t h·ªØu √≠ch khi ƒë√£ c√≥ danh s√°ch c√°c t·ª´ v√† mu·ªën token h√≥a ch√∫ng.
# return_offsets_mapping=True tr·∫£ v·ªÅ v·ªã tr√≠ k√Ω t·ª± b·∫Øt ƒë·∫ßu v√† k·∫øt th√∫c c·ªßa m·ªói token trong c√¢u => Gi√∫p cƒÉn ch·ªânh c√°c token v·ªõi c√°c nh√£n t∆∞∆°ng ·ª©ng.
# padding=True ƒë·∫£m b·∫£o r·∫±ng t·∫•t c·∫£ c√°c c√¢u ƒë·ªÅu c√≥ c√πng ƒë·ªô d√†i b·∫±ng c√°ch th√™m c√°c token ƒë·ªám (padding tokens) v√†o cu·ªëi c√°c c√¢u ng·∫Øn h∆°n.
# truncation=True c·∫Øt b·ªõt c√°c c√¢u d√†i h∆°n ƒë·ªô d√†i t·ªëi ƒëa cho ph√©p, ƒë·∫£m b·∫£o r·∫±ng t·∫•t c·∫£ c√°c c√¢u ƒë·ªÅu c√≥ ƒë·ªô d√†i ph√π h·ª£p.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_sentences, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
test_encodings = tokenizer(test_sentences, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
valid_encodings = tokenizer(valid_sentences, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



# Step 2: Define the Model

In [2]:
from transformers import BertForTokenClassification
import torch

# T·∫°o m·ªôt danh s√°ch c√°c th·∫ª c√≥ th·ªÉ c√≥ (nh∆∞ O, B-ORG, ...) v√† √°nh x·∫° t·ª´ng th·∫ª v√†o m·ªôt s·ªë nguy√™n duy nh·∫•t.

# label_list ch·ª©a t·∫•t c·∫£ c√°c th·∫ª NER c√≥ th·ªÉ c√≥.
label_list = ["O", "B-ORG", "I-ORG", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]
# label_map l√† m·ªôt t·ª´ ƒëi·ªÉn √°nh x·∫° m·ªói th·∫ª v√†o m·ªôt ID s·ªë nguy√™n,
# gi√∫p d·ªÖ d√†ng chuy·ªÉn ƒë·ªïi nh√£n th√†nh ID ƒë·ªÉ model hu·∫•n luy·ªán.
label_map = {label: i for i, label in enumerate(label_list)}

# H√†m encode_labels cƒÉn ch·ªânh c√°c nh√£n v·ªõi c√°c token:
def encode_labels(labels, encodings):
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        #C√°c token kh√¥ng c√≥ nh√£n c·ª• th·ªÉ s·∫Ω ƒë∆∞·ª£c g√°n nh√£n "O" (ƒë·ªëi v·ªõi c√°c th·ª±c th·ªÉ ngo√†i).
        doc_enc_labels = [label_map["O"]] * len(doc_offset)
        # V·ªõi m·ªói token trong m·ªôt c√¢u, n√≥ g√°n m·ªôt ID nh√£n t∆∞∆°ng ·ª©ng t·ª´ label_map.
        for i, label in enumerate(doc_labels):
            doc_enc_labels[i] = label_map[label]
        encoded_labels.append(doc_enc_labels)
    return encoded_labels
train_labels = encode_labels(train_labels, train_encodings)
test_labels = encode_labels(test_labels, test_encodings)
valid_labels = encode_labels(valid_labels, valid_encodings)

# M√¥ h√¨nh BertForTokenClassification t·ª´ Hugging Face ƒë∆∞·ª£c kh·ªüi t·∫°o v·ªõi
# s·ªë l∆∞·ª£ng nh√£n ƒë·∫ßu ra l√† k√≠ch th∆∞·ªõc c·ªßa danh s√°ch c√°c ID t∆∞∆°ng ·ª©ng vs c√°c th·∫ª NER.
# M√¥ h√¨nh BERT ƒë∆∞·ª£c thi·∫øt l·∫≠p ƒë·ªÉ ph√¢n lo·∫°i token v·ªõi 9 nh√£n ƒë·∫ßu ra (m·ªôt cho m·ªói th·∫ª trong label_list).
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Step 3: Training

In [3]:
from transformers import Trainer, TrainingArguments

# Prepare data for training
# L·ªõp NERDataset bao b·ªçc c√°c ƒë·∫ßu v√†o ƒë√£ ƒë∆∞·ª£c token h√≥a v√† c√°c nh√£n ƒë√£ ƒë∆∞·ª£c m√£ h√≥a ƒë·ªÉ API Trainer c√≥ th·ªÉ s·ª≠ d·ª•ng n√≥
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    # tr·∫£ v·ªÅ ƒë·∫ßu v√†o token h√≥a v√† c√°c nh√£n t·∫°i m·ªôt ch·ªâ m·ª•c nh·∫•t ƒë·ªãnh.
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    # tr·∫£ v·ªÅ s·ªë l∆∞·ª£ng m·ª•c trong t·∫≠p d·ªØ li·ªáu
    def __len__(self):
        return len(self.labels)

# C√°c t·∫≠p d·ªØ li·ªáu sau ƒë∆∞·ª£c t·∫°o cho train, test v√† valid
train_dataset = NERDataset(train_encodings, train_labels)
test_dataset = NERDataset(test_encodings, test_labels)
valid_dataset = NERDataset(valid_encodings, valid_labels)

# Define training arguments
# C√°c tham s·ªë hu·∫•n luy·ªán ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a b·∫±ng TrainingArguments:
training_args = TrainingArguments(
    # Th∆∞ m·ª•c l∆∞u tr·ªØ k·∫øt qu·∫£ hu·∫•n luy·ªán
    output_dir='./results',
    # evaluation_strategy="epoch" ch·ªâ ƒë·ªãnh vi·ªác ƒë√°nh gi√° v√†o cu·ªëi m·ªói epoch
    #(m·ªôt l·∫ßn duy·ªát qua to√†n b·ªô d·ªØ li·ªáu hu·∫•n luy·ªán).
    evaluation_strategy="epoch",
    # learning_rate=2e-5: T·ªëc ƒë·ªô h·ªçc c·ªßa m√¥ h√¨nh, x√°c ƒë·ªãnh m·ª©c ƒë·ªô ƒëi·ªÅu ch·ªânh tr·ªçng s·ªë sau m·ªói l·∫ßn c·∫≠p nh·∫≠t.
    learning_rate=2e-5,
    # K√≠ch th∆∞·ªõc batch cho m·ªói thi·∫øt b·ªã trong qu√° tr√¨nh hu·∫•n luy·ªán.
    per_device_train_batch_size=16,
    # K√≠ch th∆∞·ªõc batch cho m·ªói thi·∫øt b·ªã trong qu√° tr√¨nh ƒë√°nh gi√°.
    per_device_eval_batch_size=16,
    # S·ªë l∆∞·ª£ng epoch ƒë·ªÉ hu·∫•n luy·ªán m√¥ h√¨nh.
    num_train_epochs=3,
    # H·ªá s·ªë gi·∫£m tr·ªçng s·ªë, gi√∫p tr√°nh overfitting b·∫±ng c√°ch gi·∫£m gi√° tr·ªã c·ªßa c√°c tr·ªçng s·ªë l·ªõn.
    weight_decay=0.01,
)

# Define Trainer
# Kh·ªüi t·∫°o m·ªôt ƒë·ªëi t∆∞·ª£ng Trainer t·ª´ th∆∞ vi·ªán Transformers c·ªßa Hugging Face
#Trainer m·ªôt l·ªõp gi√∫p b·∫°n d·ªÖ d√†ng hu·∫•n luy·ªán v√† ƒë√°nh gi√° m√¥ h√¨nh.
trainer = Trainer(
    # model=model: ƒê√¢y l√† m√¥ h√¨nh b·∫°n mu·ªën hu·∫•n luy·ªán.
    #N√≥ c√≥ th·ªÉ l√† b·∫•t k·ª≥ m√¥ h√¨nh n√†o t·ª´ th∆∞ vi·ªán Transformers, ch·∫≥ng h·∫°n nh∆∞ BERT, GPT-2, v.v.
    model=model,
    # ƒê√¢y l√† c√°c tham s·ªë hu·∫•n luy·ªán m√† ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a ·ªü tr√™n,
    #ch·∫≥ng h·∫°n nh∆∞ t·ªëc ƒë·ªô h·ªçc, s·ªë l∆∞·ª£ng epoch, k√≠ch th∆∞·ªõc batch, v.v.
    args=training_args,
    # train_dataset=train_dataset l√† t·∫≠p d·ªØ li·ªáu hu·∫•n luy·ªán d√πng ƒë·ªÉ hu·∫•n luy·ªán m√¥ h√¨nh.
    train_dataset=train_dataset,
    # eval_dataset=test_dataset l√† t·∫≠p d·ªØ li·ªáu ƒë√°nh gi√° d√πng ƒë·ªÉ ƒë√°nh gi√° m√¥ h√¨nh trong qu√° tr√¨nh hu·∫•n luy·ªán.
    eval_dataset=test_dataset
)

# S·ª≠ d·ª•ng method train() c·ªßa trainer ƒë·ªÉ train model.
# Sau khi train xong, model s·∫Ω s·∫µn s√†ng ƒë·ªÉ ƒë√°nh gi√° v√† s·ª≠ d·ª•ng cho c√°c t√°c v·ª• d·ª± ƒëo√°n.
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.0774,0.03072
2,0.0219,0.026646
3,0.0154,0.023995


TrainOutput(global_step=2634, training_loss=0.03145351578271471, metrics={'train_runtime': 1430.0282, 'train_samples_per_second': 29.456, 'train_steps_per_second': 1.842, 'total_flos': 3525775734542472.0, 'train_loss': 0.03145351578271471, 'epoch': 3.0})

# Step 4: Evaluation

In [6]:
import numpy as np

# Evaluate the model on the validation dataset
# M√¥ h√¨nh ƒë∆∞·ª£c ƒë√°nh gi√° tr√™n t·∫≠p valid_dataset b·∫±ng c√°ch s·ª≠ d·ª•ng trainer.evaluate().
#N√≥ tr·∫£ v·ªÅ c√°c ch·ªâ s·ªë nh∆∞ ƒë·ªô ch√≠nh x√°c v√† ƒë·ªô m·∫•t m√°t tr√™n valid_dataset.
evaluation_results = trainer.evaluate(eval_dataset=valid_dataset)
# in ra {'eval_loss': 0.021987076848745346,
#'eval_runtime': 25.482,
#'eval_samples_per_second': 127.541,
#'eval_steps_per_second': 8.006,
#'epoch': 3.0}
print(evaluation_results)

# Get predictions on the test set
# method trainer.predict() ƒë∆∞·ª£c s·ª≠ d·ª•ng ƒë·ªÉ l·∫•y c√°c d·ª± ƒëo√°n tr√™n t·∫≠p test_dataset
#labels l√† ID nh√£n th·ª±c t·ª´ t·∫≠p test_dataset.
predictions, labels, _ = trainer.predict(test_dataset)
# predictions ch·ª©a c√°c ƒëi·ªÉm s·ªë th√¥ cho m·ªói l·ªõp,
#ƒë∆∞·ª£c chuy·ªÉn ƒë·ªïi th√†nh ID nh√£n b·∫±ng c√°ch l·∫•y argmax.
predictions = np.argmax(predictions, axis=2)

# Define a function to convert the label IDs back to label names
def convert_labels(predictions, labels, label_list):
    pred_list = []
    true_list = []
    for pred, label in zip(predictions, labels):
        pred_list.append([label_list[p] for p in pred])
        true_list.append([label_list[l] for l in label])
    return pred_list, true_list

# Convert the predictions and labels back to their original format
pred_labels, true_labels = convert_labels(predictions, test_labels, label_list)
# Generate a classification report
from sklearn.metrics import classification_report
# Flatten the lists for easier comparison
true_labels_flat = [item for sublist in true_labels for item in sublist]
pred_labels_flat = [item for sublist in pred_labels for item in sublist]
# Generate and print the classification report
print(classification_report(true_labels_flat, pred_labels_flat, target_names=label_list))
# ·ªû c·ªôt precision v·ªõi label_name l√† B-ORG l√† 0.64,
#nghƒ©a l√† 64% trong s·ªë c√°c d·ª± ƒëo√°n B-ORG c·ªßa m√¥ h√¨nh l√† ch√≠nh x√°c.
# ·ªû c·ªôt recall v·ªõi label_name l√† B-ORG l√† 0.57,
#nghƒ©a l√† m√¥ h√¨nh ch·ªâ nh·∫≠n di·ªán ƒë√∫ng 57% trong s·ªë c√°c m·∫´u th·ª±c s·ª± l√† B-ORG.
# f1-score: Trung b√¨nh ƒëi·ªÅu h√≤a gi·ªØa ƒë·ªô ch√≠nh x√°c v√† ƒë·ªô thu h·ªìi, d√πng ƒë·ªÉ c√¢n b·∫±ng hai ch·ªâ s·ªë n√†y.
# ·ªû c·ªôt f1-score v·ªõi label_name l√† B-ORG l√† 0.60,
#cho th·∫•y m√¥ h√¨nh c√≥ hi·ªáu su·∫•t ·ªü m·ª©c trung b√¨nh v√† d·ª± ƒëo√°n ƒë·∫ßu ra kh√¥ng ƒë∆∞·ª£c ch√≠nh x√°c l·∫Øm
# support: S·ªë l∆∞·ª£ng m·∫´u th·ª±c s·ª± thu·ªôc label_name ƒë√≥ trong t·∫≠p test_dataset.
#V√≠ d·ª•: B-ORG c√≥ 702 m·∫´u.
# T·ªïng k·∫øt (3 d√≤ng cu·ªëi):
# accuracy: ƒê·ªô ch√≠nh x√°c t·ªïng th·ªÉ c·ªßa m√¥ h√¨nh tr√™n t·∫≠p ki·ªÉm tra l√† 0.99,
#t·ª©c l√† m√¥ h√¨nh d·ª± ƒëo√°n ƒë√∫ng cho 99% c√°c m·∫´u.
# macro avg: Trung b√¨nh c√°c ch·ªâ s·ªë tr√™n cho t·∫•t c·∫£ c√°c label m√† kh√¥ng c√¢n nh·∫Øc ƒë·∫øn t·∫ßn su·∫•t c·ªßa label.
#ƒê·ªô ch√≠nh x√°c trung b√¨nh l√† 0.73, ƒë·ªô thu h·ªìi trung b√¨nh l√† 0.70, v√† f1-score trung b√¨nh l√† 0.71.
# weighted avg: Trung b√¨nh c√°c ch·ªâ s·ªë nh∆∞ng c√≥ c√¢n nh·∫Øc t·∫ßn su·∫•t c·ªßa m·ªói label.
#ƒê·ªô ch√≠nh x√°c v√† f1-score ·ªü ƒë√¢y r·∫•t cao (0.99) v√¨ c√°c label nh∆∞ I-MISC chi·∫øm ƒëa s·ªë.

{'eval_loss': 0.021987076848745346, 'eval_runtime': 24.9254, 'eval_samples_per_second': 130.389, 'eval_steps_per_second': 8.184, 'epoch': 3.0}
              precision    recall  f1-score   support

           O       0.78      0.72      0.75      1668
       B-ORG       0.64      0.57      0.60       702
       I-ORG       0.79      0.71      0.75      1661
       B-PER       0.79      0.69      0.74      1617
       I-PER       0.66      0.70      0.68       257
       B-LOC       0.47      0.50      0.48       216
       I-LOC       0.71      0.72      0.72       835
      B-MISC       0.71      0.72      0.71      1156
      I-MISC       1.00      1.00      1.00    475308

    accuracy                           0.99    483420
   macro avg       0.73      0.70      0.71    483420
weighted avg       0.99      0.99      0.99    483420

