In [48]:
import pandas as pd
import numpy as np

# Load dataset

In [49]:
# Read the three preprocessed splits (train / dev / test) from CSV.
train = pd.read_csv("./VLSP2018_Hotel/Preprocessed/1-VLSP2018-SA-Hotel-train-clean.csv")
dev = pd.read_csv("./VLSP2018_Hotel/Preprocessed/2-VLSP2018-SA-Hotel-dev-clean.csv")
test = pd.read_csv("./VLSP2018_Hotel/Preprocessed/3-VLSP2018-SA-Hotel-test-clean.csv")

# For every split, X_* is the `review_clean` column as a plain Python list and
# y_* is a new DataFrame holding all remaining columns (the source frame is
# left untouched).
X_train, y_train = train['review_clean'].to_list(), train.drop(columns='review_clean')
X_dev, y_dev = dev['review_clean'].to_list(), dev.drop(columns='review_clean')
X_test, y_test = test['review_clean'].to_list(), test.drop(columns='review_clean')

In [None]:
# Set the 'ROOM_AMENITIES#PRICES' label of the row with index label 0 to 1.
# A single .loc assignment is used instead of chained indexing
# (`y_train[col][0] = 1`): the chained form assigns through an intermediate
# Series, triggers pandas' chained-assignment warning, and does not modify
# y_train at all when copy-on-write is enabled.
# NOTE(review): presumably this forces the column to contain more than one
# class in the training labels — confirm, because it alters a ground-truth label.
y_train.loc[0, 'ROOM_AMENITIES#PRICES'] = 1

# TF-IDF

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-grams; min_df=2 and max_df=0.9 prune terms by document
# frequency (too rare / too common).
vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, ngram_range=(1, 3))

# Vocabulary and IDF weights are learned from the training reviews only; the
# dev and test reviews are merely projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev_tfidf, X_test_tfidf = (vectorizer.transform(docs) for docs in (X_dev, X_test))

# Machine Learning Model

## Score Definition

In [52]:
from sklearn.metrics import f1_score, classification_report

# Convert an (N, K) sentiment-index matrix into an (N, 3*K) binary matrix for F1 scoring.
def multioutput_to_multilabel(y_sentiment_indices):
    """Expand per-aspect sentiment indices into a flat multi-label indicator matrix.

    Parameters
    ----------
    y_sentiment_indices : pandas.DataFrame or array-like of shape (N, K)
        One row per sample and one column per aspect. A value of 0 sets no
        label for that aspect; a value ``s`` in 1..3 sets the label at offset
        ``s - 1`` inside that aspect's group of three output columns.

    Returns
    -------
    numpy.ndarray of bool, shape (N, 3 * K)
        ``out[i, 3 * j + (s - 1)]`` is True exactly when the input holds the
        non-zero value ``s`` at position ``(i, j)``.

    Raises
    ------
    ValueError
        If the input is not 2-D, or holds a value other than 0, 1, 2 or 3.
        Such a value would otherwise mark a column belonging to a different
        aspect or index past the end of the row.
    """
    # np.asarray handles DataFrames, ndarrays and nested lists uniformly.
    labels = np.asarray(y_sentiment_indices)
    if labels.ndim != 2:
        raise ValueError(
            f"expected a 2-D (samples, aspects) matrix, got {labels.ndim} dimension(s)"
        )
    if not np.isin(labels, (0, 1, 2, 3)).all():
        raise ValueError("sentiment indices must be 0 (absent), 1, 2 or 3")

    nrow, ncol = labels.shape  # number of samples, number of aspects

    # Boolean multi-label array of size rows x (3 * columns).
    multilabel = np.zeros((nrow, 3 * ncol), dtype=bool)

    # Coordinates of every (sample, aspect) pair that carries a sentiment.
    rows, cols = np.nonzero(labels)
    # Cast so that float-typed labels (e.g. 1.0) remain usable as indices.
    sentiment = labels[rows, cols].astype(np.intp, copy=False)
    multilabel[rows, 3 * cols + (sentiment - 1)] = True
    return multilabel

# Compute the F1-score on the binary multi-label representation.
def custom_f1_score(y_true, y_pred, average='micro', **kwargs):
    """Return the F1-score of aspect/sentiment predictions, rounded to 4 decimals.

    Both inputs are first expanded with ``multioutput_to_multilabel`` and the
    resulting binary matrices are scored with ``sklearn.metrics.f1_score``.

    Parameters
    ----------
    y_true, y_pred : (N, K) sentiment-index matrices (ground truth / prediction).
    average : averaging mode forwarded to ``f1_score`` (default ``'micro'``).
    **kwargs : any further keyword arguments forwarded to ``f1_score``.
    """
    score = f1_score(
        multioutput_to_multilabel(y_true),
        multioutput_to_multilabel(y_pred),
        average=average,
        **kwargs
    )
    return round(score, 4)

# Build a classification report on the binary multi-label representation.
def custom_classification_report(y_true, y_pred, **kwargs):
    """Return sklearn's classification report for aspect/sentiment predictions.

    Both (N, K) sentiment-index matrices are expanded with
    ``multioutput_to_multilabel`` before being handed to
    ``classification_report``; ``**kwargs`` is forwarded unchanged.
    """
    binary_true, binary_pred = (
        multioutput_to_multilabel(matrix) for matrix in (y_true, y_pred)
    )
    return classification_report(binary_true, binary_pred, **kwargs)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier as MOC

## Base Linear SVM

In [55]:
# Baseline: LinearSVC wrapped in MultiOutputClassifier, with only the random
# seed set explicitly.
clf1 = MOC(LinearSVC(random_state=42))
clf1.fit(X_train_tfidf, y_train)

# Report the custom F1-score of the baseline on each split.
for caption, split_features, split_labels in (
    ("Train F1-score:", X_train_tfidf, y_train),
    ("Dev F1-score:", X_dev_tfidf, y_dev),
    ("Test F1-score:", X_test_tfidf, y_test),
):
    print(caption, custom_f1_score(split_labels, clf1.predict(split_features)))

Train F1-score: 0.9961
Dev F1-score: 0.6068
Test F1-score: 0.5911


## Hyperparameter Optimization

In [56]:
import optuna
from optuna.samplers import TPESampler

In [57]:
def callback(study, trial):
    """Keep the fitted model of the best trial so far on the study.

    Intended to be passed through ``callbacks=[...]`` to ``study.optimize``.
    When the trial that just finished is the study's current best trial, the
    object stored under the trial's ``'model'`` user attribute is copied to
    the study's ``'best_model'`` user attribute; otherwise nothing happens.

    Parameters
    ----------
    study : object exposing ``best_trial.number`` and ``set_user_attr``.
    trial : object exposing ``number`` and a ``user_attrs`` mapping.
    """
    if study.best_trial.number != trial.number:
        # An earlier trial is still the best one: keep its model.
        return
    study.set_user_attr(key='best_model', value=trial.user_attrs['model'])

### Linear SVM

In [58]:
def linearsvc_objective(trial):
    """Optuna objective: fit a multi-output LinearSVC and score it on the dev split.

    Samples ``C`` (log scale, 1e-9 to 1e2), ``class_weight`` and ``loss`` from
    the trial, trains on the TF-IDF training matrix, stores the fitted
    classifier under the trial's ``"model"`` user attribute and returns the
    custom F1-score of its dev-set predictions.
    """
    # Suggestions are drawn in a fixed order: C, class_weight, loss.
    c_value = trial.suggest_float('C', 1e-9, 1e2, log=True)
    weighting = trial.suggest_categorical('class_weight', ['balanced', None])
    loss_name = trial.suggest_categorical('loss', ['hinge', 'squared_hinge'])

    base_estimator = LinearSVC(
        C=c_value,
        class_weight=weighting,
        loss=loss_name,
        max_iter=2000,
        random_state=42,
    )
    model = MOC(base_estimator)
    model.fit(X_train_tfidf, y_train)

    # Keep the fitted model on the trial so that `callback` can retrieve it.
    trial.set_user_attr(key="model", value=model)

    return custom_f1_score(y_dev, model.predict(X_dev_tfidf))

# TPE sampler with a fixed seed, handed to the study below.
sampler = TPESampler(seed=22)
# Higher objective values (dev F1-score) are better.
linearsvc_study = optuna.create_study(sampler=sampler, direction='maximize')
# 30 trials; `callback` copies the best trial's fitted model onto the study.
linearsvc_study.optimize(linearsvc_objective, n_trials=30, callbacks=[callback])

# Fitted classifier of the best trial, as stored by `callback`.
clf2 = linearsvc_study.user_attrs['best_model']

[I 2025-11-09 11:49:39,670] A new study created in memory with name: no-name-251da09c-bf4f-4fa3-9958-e093d27f7323
[I 2025-11-09 11:49:40,138] Trial 0 finished with value: 0.1862 and parameters: {'C': 1.9636582699290402e-07, 'class_weight': 'balanced', 'loss': 'hinge'}. Best is trial 0 with value: 0.1862.
[I 2025-11-09 11:49:40,694] Trial 1 finished with value: 0.2441 and parameters: {'C': 5.339536586472381e-06, 'class_weight': None, 'loss': 'squared_hinge'}. Best is trial 1 with value: 0.2441.
[I 2025-11-09 11:49:41,025] Trial 2 finished with value: 0.2441 and parameters: {'C': 1.3055563380836963e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 1 with value: 0.2441.
[I 2025-11-09 11:49:41,348] Trial 3 finished with value: 0.2441 and parameters: {'C': 1.1682869614143264e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 1 with value: 0.2441.
[I 2025-11-09 11:49:44,411] Trial 4 finished with value: 0.5598 and parameters: {'C': 0.2804917948703948, 'class_weight': 'balanc

In [59]:
# Custom F1-score of the tuned LinearSVC on every split.
print("Linear SVM with Best Hyperparameters F1-scores:")
for caption, split_features, split_labels in (
    ('train:', X_train_tfidf, y_train),
    ('dev:  ', X_dev_tfidf, y_dev),
    ('test: ', X_test_tfidf, y_test),
):
    print(caption, custom_f1_score(split_labels, clf2.predict(split_features)))

# Full parameter set of the first per-label estimator inside the wrapper.
print(clf2.estimators_[0].get_params())

# Only the hyperparameters that Optuna actually searched over.
print("\nBest hyperparameters:")
print(linearsvc_study.best_params)

Linear SVM with Best Hyperparameters F1-scores:
train: 0.9963
dev:   0.6287
test:  0.6173
{'C': 2.23635063517357, 'class_weight': 'balanced', 'dual': 'auto', 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': 42, 'tol': 0.0001, 'verbose': 0}

Best hyperparameters:
{'C': 2.23635063517357, 'class_weight': 'balanced', 'loss': 'squared_hinge'}


### Non-Linear SVM

In [64]:
def svc_objective(trial):
    """Optuna objective: fit a multi-output kernel SVC and score it on the dev split.

    Samples ``class_weight``, ``kernel`` and ``gamma`` from the trial, trains
    on the TF-IDF training matrix, stores the fitted classifier under the
    trial's ``"model"`` user attribute and returns the custom F1-score of its
    dev-set predictions.
    """
    # Suggestions are drawn in a fixed order: class_weight, kernel, gamma.
    weighting = trial.suggest_categorical('class_weight', ['balanced', None])
    kernel_name = trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid'])
    gamma_mode = trial.suggest_categorical('gamma', ['auto', 'scale'])

    base_estimator = SVC(
        class_weight=weighting,
        kernel=kernel_name,
        gamma=gamma_mode,
        max_iter=2000,
        random_state=42,
    )
    model = MOC(base_estimator)
    model.fit(X_train_tfidf, y_train)

    # Keep the fitted model on the trial so that `callback` can retrieve it.
    trial.set_user_attr(key="model", value=model)

    return custom_f1_score(y_dev, model.predict(X_dev_tfidf))

# TPE sampler with a fixed seed. It must be passed to create_study: without
# `sampler=sampler` the study uses its own unseeded default sampler, so the
# seed above has no effect and the search differs from run to run.
sampler = TPESampler(seed=22)
# Higher objective values (dev F1-score) are better.
svc_study = optuna.create_study(sampler=sampler, direction='maximize')
# 10 trials; `callback` copies the best trial's fitted model onto the study.
svc_study.optimize(svc_objective, n_trials=10, callbacks=[callback])

# Fitted classifier of the best trial, as stored by `callback`.
clf3 = svc_study.user_attrs['best_model']

[I 2025-11-09 12:11:33,885] A new study created in memory with name: no-name-401063b0-0d95-448c-86fa-eb25c4a31450
[I 2025-11-09 12:15:21,255] Trial 0 finished with value: 0.4721 and parameters: {'class_weight': None, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.4721.
[I 2025-11-09 12:20:12,052] Trial 1 finished with value: 0.2456 and parameters: {'class_weight': None, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 0 with value: 0.4721.
[I 2025-11-09 12:21:11,052] Trial 2 finished with value: 0.2441 and parameters: {'class_weight': None, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 0 with value: 0.4721.
[I 2025-11-09 12:26:25,279] Trial 3 finished with value: 0.2846 and parameters: {'class_weight': 'balanced', 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 0 with value: 0.4721.
[I 2025-11-09 12:31:50,867] Trial 4 finished with value: 0.2456 and parameters: {'class_weight': None, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 0 with value: 0.4721.
[I

In [65]:
# Custom F1-score of the tuned kernel SVC on every split.
print("Non-Linear SVM with Best Hyperparameters F1-scores:")
for caption, split_features, split_labels in (
    ('train:', X_train_tfidf, y_train),
    ('dev:  ', X_dev_tfidf, y_dev),
    ('test: ', X_test_tfidf, y_test),
):
    print(caption, custom_f1_score(split_labels, clf3.predict(split_features)))

# Full parameter set of the first per-label estimator inside the wrapper.
print(clf3.estimators_[0].get_params())
# Only the hyperparameters that Optuna actually searched over.
print("\nBest hyperparameters:")
print(svc_study.best_params)

Non-Linear SVM with Best Hyperparameters F1-scores:
train: 0.9198
dev:   0.615
test:  0.6147
{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': 'balanced', 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': 2000, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}

Best hyperparameters:
{'class_weight': 'balanced', 'kernel': 'sigmoid', 'gamma': 'scale'}


### LogisticRegression

In [60]:
def logistic_objective(trial):
    """Optuna objective: fit a multi-output LogisticRegression and score it on the dev split.

    Samples ``class_weight`` and ``C`` (linear scale, 1e-5 to 20) from the
    trial, trains on the TF-IDF training matrix, stores the fitted classifier
    under the trial's ``"model"`` user attribute and returns the custom
    F1-score of its dev-set predictions.
    """
    # Suggestions are drawn in a fixed order: class_weight, C.
    weighting = trial.suggest_categorical('class_weight', ['balanced', None])
    c_value = trial.suggest_float('C', 1e-5, 20)

    base_estimator = LogisticRegression(
        class_weight=weighting,
        C=c_value,
        random_state=42,
        max_iter=200,
    )
    model = MOC(base_estimator)
    model.fit(X_train_tfidf, y_train)

    # Keep the fitted model on the trial so that `callback` can retrieve it.
    trial.set_user_attr(key="model", value=model)

    return custom_f1_score(y_dev, model.predict(X_dev_tfidf))

# TPE sampler with a fixed seed, handed to the study below.
sampler = TPESampler(seed=22)
# Higher objective values (dev F1-score) are better.
logistic_study = optuna.create_study(sampler=sampler, direction='maximize')
# 10 trials; `callback` copies the best trial's fitted model onto the study.
logistic_study.optimize(logistic_objective, n_trials=10, callbacks=[callback])

# Fitted classifier of the best trial, as stored by `callback`.
clf4 = logistic_study.user_attrs['best_model']

[I 2025-11-09 11:53:15,574] A new study created in memory with name: no-name-9069ccf9-0674-4dfb-b099-d05c07869c75
[I 2025-11-09 11:53:35,682] Trial 0 finished with value: 0.5929 and parameters: {'class_weight': None, 'C': 8.41076650090714}. Best is trial 0 with value: 0.5929.
[I 2025-11-09 11:53:54,553] Trial 1 finished with value: 0.626 and parameters: {'class_weight': 'balanced', 'C': 6.777285823434402}. Best is trial 1 with value: 0.626.
[I 2025-11-09 11:54:13,368] Trial 2 finished with value: 0.5809 and parameters: {'class_weight': None, 'C': 4.408098128709346}. Best is trial 1 with value: 0.626.
[I 2025-11-09 11:54:36,095] Trial 3 finished with value: 0.6272 and parameters: {'class_weight': 'balanced', 'C': 11.224078321223027}. Best is trial 3 with value: 0.6272.
[I 2025-11-09 11:54:59,969] Trial 4 finished with value: 0.6236 and parameters: {'class_weight': 'balanced', 'C': 3.7822352140976876}. Best is trial 3 with value: 0.6272.
[I 2025-11-09 11:55:26,764] Trial 5 finished with 

In [61]:
# Custom F1-score of the tuned logistic regression on every split.
print("Logistic Regression with Best Hyperparameters F1-scores:")
for caption, split_features, split_labels in (
    ('train:', X_train_tfidf, y_train),
    ('dev:  ', X_dev_tfidf, y_dev),
    ('test: ', X_test_tfidf, y_test),
):
    print(caption, custom_f1_score(split_labels, clf4.predict(split_features)))

# Full parameter set of the first per-label estimator inside the wrapper.
print(clf4.estimators_[0].get_params())
# Only the hyperparameters that Optuna actually searched over.
print("\nBest hyperparameters:")
print(logistic_study.best_params)

Logistic Regression with Best Hyperparameters F1-scores:
train: 0.9937
dev:   0.6289
test:  0.6204
{'C': 15.3598571591844, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 200, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Best hyperparameters:
{'class_weight': 'balanced', 'C': 15.3598571591844}


# Export model

In [None]:
# Clear the cached vietnamese_processor module: removing its entry from
# sys.modules makes the next `import` of it load the module again instead of
# reusing the already-loaded object. The `None` default keeps this from raising
# when the module has not been imported yet.
import sys
sys.modules.pop('processors.vietnamese_processor', None)

<module 'processors.vietnamese_processor' from 'D:\\DS102\\processors\\vietnamese_processor.py'>

In [None]:
from processors.vietnamese_processor import CustomPreprocessorTransformer
from sklearn.pipeline import make_pipeline
import joblib

# Chain text preprocessing, the already-fitted TF-IDF vectorizer and the tuned
# logistic-regression classifier (clf4) into a single estimator.
# NOTE(review): the models were trained on the `review_clean` column; this
# presumably relies on CustomPreprocessorTransformer reproducing that same
# cleaning for raw text — confirm against the preprocessing code.
pipe = make_pipeline(
    CustomPreprocessorTransformer(use_vncorenlp=True, vncorenlp_dir="./processors/VnCoreNLP"),
    vectorizer,
    clf4
)

# Serialize the whole pipeline to 'pipe.joblib' in the current working directory.
joblib.dump(pipe, 'pipe.joblib')

['pipe.joblib']

# Test

In [95]:
# Sample Vietnamese review used as a smoke test; roughly: "Friendly staff, good
# service, hotel far from the centre, spacious and clean rooms".
text = "Nhân viên thân thiện, phục vụ tốt, khách sạn xa trung tâm, phòng rộng rãi, sạch sẽ"

# The review is wrapped in a one-element list before being passed to predict.
pred = pipe.predict([text])
print(pred)

VnCoreNLP word segmenter is loaded successfully.
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1]]
