## **1. Import Libraries**

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

## **2. Load Dataset**

In [None]:
data_path = "data/shuffled_dataset.csv"
df = pd.read_csv(data_path).sample(frac=1, random_state=42)  # Shuffle dataset
df.head(10)

Unnamed: 0,text,label
621,áo thun nam GAP trắng xanh giá bao nhiêu,product
1427,bạn like chơi bóng đá hay bóng rổ,else
188,Quần âu nam có màu xám không,product
307,Đổi hàng có cần chụp ảnh hóa đơn không,terms
964,áo thun nữ cổ tròn cotton đen còn hok,product
530,tôi muốn mua áo thun bé trai xanh size s,product
251,Hôm nay weather đẹp không bạn,else
240,bạn có biết nấu phở ngon không,else
780,bạn có thích đi bộ không,else
478,bạn biết cách làm đồ handmade không,else


## **3. Preprocess Data**

In [7]:
labels = ["product", "terms", "store_info", "else"]
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.25, random_state=42 
)

## **4. Tokenization**

In [9]:
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

def tokenize_texts(texts):
    tokens = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    del tokens["token_type_ids"]  # Bỏ `token_type_ids` để tránh lỗi
    return tokens


## **5. Create Dataset Class**

In [11]:
class IntentDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenize_texts(texts)
        self.labels = torch.tensor(labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]

train_dataset = IntentDataset(train_texts, train_labels)
val_dataset = IntentDataset(val_texts, val_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

## **6. Build Transformer Model with MHA**

In [13]:
class TransformerClassifier(nn.Module):
    def __init__(self, num_classes):
        super(TransformerClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.attention = nn.MultiheadAttention(embed_dim=768, num_heads=8)
        self.fc = nn.Linear(768, num_classes)
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :].unsqueeze(0)
        attn_output, _ = self.attention(cls_embedding, cls_embedding, cls_embedding)
        output = self.fc(self.dropout(attn_output.squeeze(0)))
        return output

## **7. Training Setup**

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = len(labels)
model = TransformerClassifier(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

## **8. Training Loop**

In [17]:
def train_model(model, train_loader, val_loader, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss, total_acc = 0, 0
        for batch, labels in train_loader:
            batch = {key: val.to(device) for key, val in batch.items()}
            labels = labels.to(device)
            optimizer.zero_grad()
            batch.pop("token_type_ids", None)  # Loại bỏ token_type_ids nếu có
            outputs = model(**batch)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_acc += (outputs.argmax(dim=1) == labels).sum().item()
        print(f"Epoch {epoch+1}: Loss {total_loss / len(train_loader):.4f}, Acc {total_acc / len(train_loader.dataset):.4f}")

## **9. Train Model**

In [19]:
def save_model(model, filename="model.pkl"):
    print("Saving model")
    with open(filename, "wb") as f:
        pickle.dump(model, f)

def load_model(filename="model.pkl"):
    print("Loading model")
    with open(filename, "rb") as f:
        return pickle.load(f)

In [None]:
#model = load_model()
train_model(model, train_loader, val_loader, epochs=100)
save_model(model)

Epoch 1: Loss 1.3895, Acc 0.3426
Epoch 2: Loss 1.3598, Acc 0.3649
Epoch 3: Loss 1.3637, Acc 0.3432
Epoch 4: Loss 1.3594, Acc 0.3575
Epoch 5: Loss 1.3491, Acc 0.3587
Epoch 6: Loss 1.3586, Acc 0.3605
Epoch 7: Loss 1.3535, Acc 0.3618
Epoch 8: Loss 1.3542, Acc 0.3723
Epoch 9: Loss 1.3509, Acc 0.3556
Epoch 10: Loss 1.3538, Acc 0.3612
Epoch 11: Loss 1.3475, Acc 0.3655
Epoch 12: Loss 1.3491, Acc 0.3686


In [None]:
def predict(text, model):
    model.eval()
    inputs = tokenize_texts([text])
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_label = torch.argmax(outputs, dim=1).item()
    return label_encoder.inverse_transform([predicted_label])[0]

# Load model and test prediction
model = load_model()
print(predict("tôi muốn mua quần", model))