## **Dataset**

In [None]:
!pip install -q datasets==3.2.0

In [None]:
from datasets import load_dataset

# Load the ABTE restaurants dataset by its Hub id; the recorded output of this cell
# shows a train split (3602 examples) and a test split (1119 examples).
ds = load_dataset("thainq107/abte-restaurants")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/445 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/191k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/59.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Initialize the tokenizer; the same object is reused by the preprocessing function,
# the data collator and the Trainer further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def _parse_list_field(raw):
    """Turn one ``Tokens``/``Tags`` cell into a Python list.

    Cells are expected to be stringified lists such as ``"['The', 'bread']"`` or
    ``"[0, 1]"``. Values that are already sequences are simply copied.
    """
    import ast  # local import: keeps this cell independent of imports made elsewhere

    if not isinstance(raw, str):
        return list(raw)

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)

    # Fallback for text that is not a valid Python literal: split on ", " and
    # drop the surrounding brackets and quote characters by hand.
    inner = raw.strip().strip("[]")
    if not inner:
        return []
    return [item.strip().strip("'\"") for item in inner.split(", ")]


def tokenize_and_align_labels(examples):
    """Tokenize a batch into sub-word ids and repeat each word's tag per sub-word.

    Args:
        examples: Batch mapping with a ``'Tokens'`` and a ``'Tags'`` column; each
            entry holds the words / integer tags of one sentence (normally as a
            stringified list).

    Returns:
        dict with ``'input_ids'`` (one list of sub-word ids per sentence) and
        ``'labels'`` (one tag per sub-word, same length as the matching ids).

    Raises:
        ValueError: If a sentence has a different number of words and tags.
    """
    tokenized_inputs = []
    labels = []

    for raw_tokens, raw_tags in zip(examples['Tokens'], examples['Tags']):
        words = _parse_list_field(raw_tokens)
        word_tags = [int(tag) for tag in _parse_list_field(raw_tags)]
        if len(words) != len(word_tags):
            raise ValueError(
                f"Tokens/Tags length mismatch: {len(words)} tokens vs {len(word_tags)} tags"
            )

        bert_tokens = []
        bert_tags = []
        for word, tag in zip(words, word_tags):
            pieces = tokenizer.tokenize(str(word))
            bert_tokens.extend(pieces)
            # One label per sub-word piece, so labels stay aligned with input_ids.
            bert_tags.extend([tag] * len(pieces))

        tokenized_inputs.append(tokenizer.convert_tokens_to_ids(bert_tokens))
        labels.append(bert_tags)

    return {
        'input_ids': tokenized_inputs,
        'labels': labels
    }

# Run the preprocessing over the dataset; batched=True because the function
# iterates over whole columns (examples['Tokens'], examples['Tags']).
preprocessed_ds = ds.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import DataCollatorForTokenClassification
# Batch collator for token classification, built around the same tokenizer
# that produced the input ids.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [None]:
!pip install -q seqeval ==1.2.2
import numpy as np
from seqeval . metrics import accuracy_score

def compute_metrics(p):
    """Compute token-level accuracy, skipping positions whose label is -100.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-token class
            scores (arg-maxed over axis 2) and ``labels`` the gold class ids,
            with -100 marking positions to ignore.

    Returns:
        dict with a single ``'accuracy'`` entry.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Filter token by token inside each sequence; comparing a whole row to
        # -100 would yield an array, not a boolean.
        kept = [
            (str(pred), str(gold))
            for pred, gold in zip(predicted_row, label_row)
            if gold != -100
        ]
        true_predictions.append([pred for pred, _ in kept])
        true_labels.append([gold for _, gold in kept])

    # accuracy_score takes the gold labels first, then the predictions.
    results = accuracy_score(true_labels, true_predictions)
    return {'accuracy': results}

In [None]:
from transformers import AutoModelForTokenClassification

# Class id -> tag name mapping handed to the model configuration.
id2label = dict(enumerate(["O", "B-Term", "I-Term"]))

# Inverse lookup, derived from id2label so the two mappings cannot drift apart.
label2id = {tag_name: class_id for class_id, tag_name in id2label.items()}
# Load the base checkpoint for token classification; the number of output
# classes follows the label mapping defined above (three entries).
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

In [None]:
import inspect
import os
from transformers import TrainingArguments, Trainer

# Disable Weights & Biases reporting for this run.
os.environ['WANDB_DISABLED'] = 'true'


def _supported_kwarg(target, preferred, legacy):
    """Return `preferred` if `target` accepts it as a keyword argument, else `legacy`."""
    return preferred if preferred in inspect.signature(target).parameters else legacy


# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (TrainingArguments) and `tokenizer` to `processing_class` (Trainer). The notebook
# does not pin transformers, so use whichever spelling the installed version accepts.
eval_strategy_kwarg = _supported_kwarg(TrainingArguments, "eval_strategy", "evaluation_strategy")
tokenizer_kwarg = _supported_kwarg(Trainer, "processing_class", "tokenizer")

# Training hyper-parameters: evaluate and checkpoint once per epoch and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="abte-restaurants-distilbert-base-uncased",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    **{eval_strategy_kwarg: "epoch"},
)

# Build the Trainer from the model, data, collator and metric defined in earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# Start training.
trainer.train()

In [None]:
from transformers import pipeline

# Build the token-classification pipeline from the checkpoint id below.
token_classifier = pipeline(
    aggregation_strategy="simple",
    model="thainq107/abte-restaurants-distilbert-base-uncased",
)

# Sample sentence to run through the pipeline.
test_sentence = "The bread is top notch as well"

# Classify the sentence and print the raw pipeline output.
results = token_classifier(test_sentence)
print(results)  # the output is expected to include "bread"