BƯỚC 0: THIẾT LẬP MÔI TRƯỜNG VÀ TẢI DỮ LIỆU

In [1]:
import pandas as pd

# Data paths
train_path = "C:\\Users\\DoubleDD\\HUS\\NLP&DL\\datasets\\hwu\\train.csv"
val_path = "C:\\Users\\DoubleDD\\HUS\\NLP&DL\\datasets\\hwu\\val.csv"
test_path = "C:\\Users\\DoubleDD\\HUS\\NLP&DL\\datasets\\hwu\\test.csv"

# Đọc các file dữ liệu
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)
train_df.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


In [2]:
# LabelEncoder theo đúng nguyên tắc ML:
    # fit trên toàn bộ tập dữ liệu “category” (gồm train + val + test)
    # sau đó transform riêng cho train/val/test
    # tránh lỗi “unseen labels” và đảm bảo mapping nhất quán.

from sklearn.preprocessing import LabelEncoder
import pandas as pd

# 3 dataframe:
# train_df, val_df, test_df
# và mỗi dataframe có cột: "category"

# 1. Gộp toàn bộ giá trị category để fit encoder (không ghép dataframe)
all_categories = pd.concat([
    train_df["category"],
    val_df["category"],
    test_df["category"]
]).astype(str)

# 2. Fit LabelEncoder trên toàn bộ unique categories
le = LabelEncoder()
le.fit(all_categories)

# 3. Transform từng tập riêng biệt
train_df["category_encoded"] = le.transform(train_df["category"].astype(str))
val_df["category_encoded"] = le.transform(val_df["category"].astype(str))
test_df["category_encoded"] = le.transform(test_df["category"].astype(str))

# Kiểm tra mapping
mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(mapping)

# Số lượng categories
import numpy as np
import pandas as pd

num_classes = len(train_df["category_encoded"].unique())



{'alarm_query': 0, 'alarm_remove': 1, 'alarm_set': 2, 'audio_volume_down': 3, 'audio_volume_mute': 4, 'audio_volume_up': 5, 'calendar_query': 6, 'calendar_remove': 7, 'calendar_set': 8, 'cooking_recipe': 9, 'datetime_convert': 10, 'datetime_query': 11, 'email_addcontact': 12, 'email_query': 13, 'email_querycontact': 14, 'email_sendemail': 15, 'general_affirm': 16, 'general_commandstop': 17, 'general_confirm': 18, 'general_dontcare': 19, 'general_explain': 20, 'general_joke': 21, 'general_negate': 22, 'general_praise': 23, 'general_quirky': 24, 'general_repeat': 25, 'iot_cleaning': 26, 'iot_coffee': 27, 'iot_hue_lightchange': 28, 'iot_hue_lightdim': 29, 'iot_hue_lightoff': 30, 'iot_hue_lighton': 31, 'iot_hue_lightup': 32, 'iot_wemo_off': 33, 'iot_wemo_on': 34, 'lists_createoradd': 35, 'lists_query': 36, 'lists_remove': 37, 'music_likeness': 38, 'music_query': 39, 'music_settings': 40, 'news_query': 41, 'play_audiobook': 42, 'play_game': 43, 'play_music': 44, 'play_podcasts': 45, 'play_r

Nhiệm vụ 1: (Warm-up Ôn bài cũ) Pipeline TF-IDF + Logistic Regression

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

X_train = train_df["text"]
y_train = train_df["category_encoded"]

X_test = test_df["text"]
y_test = test_df["category_encoded"]
# 1. Tạo pipeline TF-IDF + Logistic Regression
tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000)
)

# 2. Huấn luyện pipeline trên tập train
tfidf_lr_pipeline.fit(X_train, y_train)

# 3. Dự đoán trên tập test
y_pred = tfidf_lr_pipeline.predict(X_test)

# 4. Đánh giá mô hình
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.77      0.89      0.83        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.45      0.53      0.49        19
           7       0.89      0.89      0.89        19
           8       0.87      0.68      0.76        19
           9       0.59      0.68      0.63        19
          10       0.67      0.75      0.71         8
          11       0.74      0.89      0.81        19
          12       0.78      0.88      0.82         8
          13       0.83      0.79      0.81        19
          14       0.92      0.63      0.75        19
          15       0.81      0.89      0.85        19
          16       1.00      1.00      1.00        19
          17       1.00    

Nhiệm vụ 2: (Warm-up Ôn bài cũ) Pipeline Word2Vec (Trung bình) + Dense
Layer

In [4]:
# 2.1 Huấn luyện word2vec
from gensim.models import Word2Vec

sentences = [text.split() for text in train_df["text"]]

w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)


In [5]:
# 2.2 Hàm chuyển câu → vector trung bình
def sentence_to_avg_vector(text, model):
    words = text.split()
    vectors = [model.wv[w] for w in words if w in model.wv]
    
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    
    return np.mean(vectors, axis=0)


In [6]:
# 2.3 Tạo dữ liệu vector cho train / val / test
X_train_avg = np.vstack([
    sentence_to_avg_vector(text, w2v_model)
    for text in train_df["text"]
])

X_val_avg = np.vstack([
    sentence_to_avg_vector(text, w2v_model)
    for text in val_df["text"]
])

X_test_avg = np.vstack([
    sentence_to_avg_vector(text, w2v_model)
    for text in test_df["text"]
])

y_train = train_df["category_encoded"]
y_val = val_df["category_encoded"]
y_test = test_df["category_encoded"]


In [None]:
# 2.4 Mô hình Dense
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

dense_model = Sequential([
    Dense(128, activation='relu', input_shape=(w2v_model.vector_size,)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

dense_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, Input

# dense_model = Sequential([
#     Input(shape=(w2v_model.vector_size,)),   # Khai báo input rõ ràng
#     Dense(128, activation='relu'),
#     Dropout(0.5),
#     Dense(num_classes, activation='softmax')
# ])

# dense_model.compile(
#     optimizer='adam',
#     loss='sparse_categorical_crossentropy',
#     metrics=['accuracy']
# )


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# 2.5 Huấn luyện và đánh giá
dense_model.fit(
    X_train_avg, y_train,
    validation_data=(X_val_avg, y_val),
    epochs=20,
    batch_size=32,
    verbose=1
)

dense_model.evaluate(X_test_avg, y_test)


Epoch 1/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.0219 - loss: 4.1472 - val_accuracy: 0.0344 - val_loss: 4.1140
Epoch 2/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0362 - loss: 4.1086 - val_accuracy: 0.0716 - val_loss: 4.0670
Epoch 3/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0506 - loss: 4.0408 - val_accuracy: 0.0669 - val_loss: 3.9751
Epoch 4/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0561 - loss: 3.9497 - val_accuracy: 0.0641 - val_loss: 3.8679
Epoch 5/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0640 - loss: 3.8551 - val_accuracy: 0.0892 - val_loss: 3.7616
Epoch 6/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0726 - loss: 3.7680 - val_accuracy: 0.0985 - val_loss: 3.6706
Epoch 7/20
[1m280/280[0m 

[3.203279733657837, 0.17750929296016693]

Nhiệm vụ 3: Mô hình Nâng cao (Embedding Pre-trained + LSTM)

In [9]:
# 3.1 Tokenizer và padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
max_len = 50

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<UNK>")
tokenizer.fit_on_texts(train_df["text"])

X_train_seq = tokenizer.texts_to_sequences(train_df["text"])
X_val_seq = tokenizer.texts_to_sequences(val_df["text"])
X_test_seq = tokenizer.texts_to_sequences(test_df["text"])

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')


In [None]:
# 3.2 Tạo embedding matrix từ Word2vec
embedding_dim = w2v_model.vector_size
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]


In [11]:
# 3.3 Mô hình LSTM (Embedding pre-trained)
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.callbacks import EarlyStopping

lstm_model_pretrained = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])

lstm_model_pretrained.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)




In [12]:
# 3.4 Huấn luyện và đánh giá
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

lstm_model_pretrained.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[early_stop]
)

lstm_model_pretrained.evaluate(X_test_pad, y_test)


Epoch 1/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.0198 - loss: 4.1334 - val_accuracy: 0.0400 - val_loss: 4.0386
Epoch 2/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - accuracy: 0.0356 - loss: 4.0026 - val_accuracy: 0.0502 - val_loss: 3.8625
Epoch 3/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.0380 - loss: 3.9966 - val_accuracy: 0.0483 - val_loss: 3.8370
Epoch 4/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 0.0482 - loss: 3.8708 - val_accuracy: 0.0539 - val_loss: 3.7793
Epoch 5/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - accuracy: 0.0533 - loss: 3.8285 - val_accuracy: 0.0743 - val_loss: 3.7240
Epoch 6/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 30ms/step - accuracy: 0.0586 - loss: 3.7932 - val_accuracy: 0.0716 - val_loss: 3.6785
Epoch 7/20
[1m280/2

[3.3832900524139404, 0.10408922284841537]

Nhiệm vụ 4: Mô hình Nâng cao (Embedding học từ đầu + LSTM)

In [13]:
lstm_model_scratch = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=100,
        input_length=max_len
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])

lstm_model_scratch.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

lstm_model_scratch.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[early_stop]
)

lstm_model_scratch.evaluate(X_test_pad, y_test)


Epoch 1/20




[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 40ms/step - accuracy: 0.0153 - loss: 4.1441 - val_accuracy: 0.0177 - val_loss: 4.1289
Epoch 2/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.0152 - loss: 4.1367 - val_accuracy: 0.0177 - val_loss: 4.1299
Epoch 3/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 34ms/step - accuracy: 0.0168 - loss: 4.1349 - val_accuracy: 0.0177 - val_loss: 4.1271
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0177 - loss: 4.1289


[4.128947734832764, 0.017657993361353874]