BƯỚC 0: THIẾT LẬP MÔI TRƯỜNG VÀ TẢI DỮ LIỆU

In [4]:
import pandas as pd

import os

# Folder that holds train.csv / val.csv / test.csv.
# Set the HWU_DATA_DIR environment variable to point somewhere else; the
# default is the original local folder, so existing setups keep working.
DATA_DIR = os.environ.get(
    "HWU_DATA_DIR",
    "C:\\Users\\DoubleDD\\HUS\\NLP&DL\\datasets\\hwu",
)

# Data paths (plain strings, built from DATA_DIR)
train_path = os.path.join(DATA_DIR, "train.csv")
val_path = os.path.join(DATA_DIR, "val.csv")
test_path = os.path.join(DATA_DIR, "test.csv")

# Read the three data files
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

# Fail early with a clear message if a split lacks the columns the later
# cells read ("text" and "category").
assert all(
    {"text", "category"} <= set(split.columns)
    for split in (train_df, val_df, test_df)
), "each split must contain the 'text' and 'category' columns"

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)
train_df.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


In [None]:
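# Encode the "category" labels with one shared LabelEncoder:
#   - fit it on the union of the labels from train + val + test,
#   - then transform each split separately.
# This avoids "unseen labels" errors and keeps the label -> id mapping
# consistent across splits. Note: fitting on all splits is acceptable here
# only because these are target labels; feature transforms should still be
# fit on the training split alone.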

from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Inputs: the three dataframes train_df, val_df and test_df,
# each of which has a "category" column.

# 1. Gather the category values of every split to fit the encoder on
#    (only the label Series are concatenated, not the dataframes)
all_categories = pd.concat([
    train_df["category"],
    val_df["category"],
    test_df["category"]
]).astype(str)

# 2. Fit a single LabelEncoder on all the category values
le = LabelEncoder()
le.fit(all_categories)

# 3. Transform each split separately with the shared encoder
train_df["category_encoded"] = le.transform(train_df["category"].astype(str))
val_df["category_encoded"] = le.transform(val_df["category"].astype(str))
test_df["category_encoded"] = le.transform(test_df["category"].astype(str))

# Inspect the label -> id mapping.
# A class's id is its position in le.classes_, so enumerate() reproduces the
# ids without a second transform() call. It also yields plain Python str/int,
# which print as 'alarm_query': 0 on every NumPy version (NumPy >= 2 shows
# NumPy scalars as np.int64(0)).
mapping = {str(label): code for code, label in enumerate(le.classes_)}
print(mapping)


{'alarm_query': 0, 'alarm_remove': 1, 'alarm_set': 2, 'audio_volume_down': 3, 'audio_volume_mute': 4, 'audio_volume_up': 5, 'calendar_query': 6, 'calendar_remove': 7, 'calendar_set': 8, 'cooking_recipe': 9, 'datetime_convert': 10, 'datetime_query': 11, 'email_addcontact': 12, 'email_query': 13, 'email_querycontact': 14, 'email_sendemail': 15, 'general_affirm': 16, 'general_commandstop': 17, 'general_confirm': 18, 'general_dontcare': 19, 'general_explain': 20, 'general_joke': 21, 'general_negate': 22, 'general_praise': 23, 'general_quirky': 24, 'general_repeat': 25, 'iot_cleaning': 26, 'iot_coffee': 27, 'iot_hue_lightchange': 28, 'iot_hue_lightdim': 29, 'iot_hue_lightoff': 30, 'iot_hue_lighton': 31, 'iot_hue_lightup': 32, 'iot_wemo_off': 33, 'iot_wemo_on': 34, 'lists_createoradd': 35, 'lists_query': 36, 'lists_remove': 37, 'music_likeness': 38, 'music_query': 39, 'music_settings': 40, 'news_query': 41, 'play_audiobook': 42, 'play_game': 43, 'play_music': 44, 'play_podcasts': 45, 'play_r

Nhiệm vụ 1: (Warm-up Ôn bài cũ) Pipeline TF-IDF + Logistic Regression

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Inputs come from the "text" column, targets from the "category_encoded" column.
X_train, y_train = train_df["text"], train_df["category_encoded"]
X_test, y_test = test_df["text"], test_df["category_encoded"]

# 1. Assemble the TF-IDF + Logistic Regression pipeline
tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000),
)

# 2. Train the pipeline on the training split
tfidf_lr_pipeline.fit(X_train, y_train)

# 3. Predict on the test split
y_pred = tfidf_lr_pipeline.predict(X_test)

# 4. Evaluate: print the per-class precision / recall / f1 report
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.77      0.89      0.83        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.45      0.53      0.49        19
           7       0.89      0.89      0.89        19
           8       0.87      0.68      0.76        19
           9       0.59      0.68      0.63        19
          10       0.67      0.75      0.71         8
          11       0.74      0.89      0.81        19
          12       0.78      0.88      0.82         8
          13       0.83      0.79      0.81        19
          14       0.92      0.63      0.75        19
          15       0.81      0.89      0.85        19
          16       1.00      1.00      1.00        19
          17       1.00    

Nhiệm vụ 2: (Warm-up Ôn bài cũ) Pipeline Word2Vec (Trung bình) + Dense Layer