In [1]:
import pandas as pd
import numpy as np

In [3]:
from preprocess import preprocess_fn

In [4]:
data = pd.read_csv(r'relabel\restaurant\Restaurant-train.csv')

In [6]:
X_train = data['Review'].apply(preprocess_fn)
X_train

0       _ ảnh chụp từ hôm_qua đi chơi với gia_đình và ...
1       _hương_vị thơm ngon ăn cay_cay rất thích nêm_n...
2       1 bàn tiệc hoành_tráng 3 đứa ăn_no muốn tắt_th...
3       các bạn nhìn cái chảo này có to không face_wit...
4       cháo có nhiều hương cho các bạn chọn nhưng mìn...
                              ...                        
2956                                 y hệt_vị đà_lạt luôn
2957    yaourt trái_cây mát_lạnh có thêm viên kem ở tr...
2958                   zumi zumi lễ vẫn bán nhé mọi người
2959    set này có 2 tầng bánh và 1 ấm trà mà chỉ có 1...
2960    lạnh trời thế_này mà ngồi ăn_chả cá lăng_xèo x...
Name: Review, Length: 2961, dtype: object

In [8]:
data.head(3)

Unnamed: 0,Review,AMBIENCE,QUALITY,PRICES,LOCATION,SERVICE
0,"_ Ảnh chụp từ hôm qua, đi chơi với gia đình và...",0,1,0,0,0
1,"_Hương vị thơm ngon, ăn cay cay rất thích, nêm...",1,1,1,0,1
2,- 1 bàn tiệc hoành tráng 3 đứa ăn no muốn tắt ...,1,1,1,1,1


In [10]:
key = ['Review','AMBIENCE', 'QUALITY', 'PRICES', 'LOCATION', 'SERVICE']
y_train = data[key[1:]]
y_train

Unnamed: 0,AMBIENCE,QUALITY,PRICES,LOCATION,SERVICE
0,0,1,0,0,0
1,1,1,1,0,1
2,1,1,1,1,1
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
2956,0,1,0,0,0
2957,0,1,1,1,0
2958,0,0,0,0,0
2959,1,0,1,0,0


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=0.05, max_df=0.9)

In [14]:
X_trainVec = vectorizer.fit_transform(X_train)

In [15]:
X_trainVec.shape

(2961, 166)

In [60]:
dataDev = pd.read_csv(r'relabel\restaurant\Restaurant-dev.csv')
dataTest = pd.read_csv(r'relabel\restaurant\Restaurant-train.csv')

In [61]:
X_dev = dataDev['Review'].apply(preprocess_fn)
X_devVec = vectorizer.transform(X_dev)

In [62]:
y_dev = dataDev[key[1:]]

In [None]:
X_test = dataTest['Review'].apply(preprocess_fn)
X_testVec = vectorizer.transform(X_test)

In [None]:
y_test = dataTest[key[1:]]

In [80]:
from sklearn.metrics import f1_score
# ['weighted', 'macro', 'micro']
def f1_up(y_test, y_pred, average='weighted'):
    return round(f1_score(y_test, y_pred, average=average, zero_division=0), 4) 

In [90]:
from sklearn.multioutput import MultiOutputClassifier as MOC
from eval3 import aspect_eval

### LinearSVC

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [63]:
clf0 = MOC(LinearSVC(random_state=5))

In [64]:
clf0.fit(X_trainVec, y_train)

In [69]:
y_pred = clf0.predict(X_testVec)

In [50]:
aspect_eval(y_test, y_pred)

In [81]:
f1_up(y_test, y_pred)

0.8124

### Decision Tree

In [51]:
from sklearn.tree import DecisionTreeClassifier

In [57]:
clf1 = MOC(DecisionTreeClassifier(random_state=5))

In [53]:
clf1.fit(X_trainVec, y_train)

In [54]:
y_pred = clf1.predict(X_testVec)

In [56]:
aspect_eval(y_test, y_pred)

### Tuning

In [58]:
import optuna
from optuna.samplers import TPESampler

  from .autonotebook import tqdm as notebook_tqdm


In [59]:
def callback(study, trial):
    if study.best_trial.number == trial.number:
        study.set_user_attr(key='best_model', value=trial.user_attrs['model'])

In [95]:
def linearsvc_objective(trial):
    params = dict(
        C=trial.suggest_float('C', 1e-9, 1e2, log=True),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        loss=trial.suggest_categorical('loss', ['hinge', 'squared_hinge']),
        max_iter=2000,
        random_state=5
    )

    clf = MOC(LinearSVC(**params))
    clf.fit(X_trainVec, y_train)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(X_devVec)
    return f1_up(y_dev, y_pred)

sampler = TPESampler(seed=22)
linearsvc_study = optuna.create_study(sampler=sampler, direction='maximize')
linearsvc_study.optimize(linearsvc_objective, n_trials=50, callbacks=[callback])


clf2 = linearsvc_study.user_attrs['best_model']

print(clf2.estimators_[0].get_params())
print(linearsvc_study.best_params)

[I 2024-08-05 12:54:49,956] A new study created in memory with name: no-name-7d309f3d-1808-4681-85ae-4c497cdd5fe2
[I 2024-08-05 12:54:49,980] Trial 0 finished with value: 0.7355 and parameters: {'C': 1.9636582699290402e-07, 'class_weight': 'balanced', 'loss': 'hinge'}. Best is trial 0 with value: 0.7355.
[I 2024-08-05 12:54:49,995] Trial 1 finished with value: 0.6731 and parameters: {'C': 5.339536586472381e-06, 'class_weight': None, 'loss': 'squared_hinge'}. Best is trial 0 with value: 0.7355.
[I 2024-08-05 12:54:50,008] Trial 2 finished with value: 0.6731 and parameters: {'C': 1.3055563380836963e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.7355.
[I 2024-08-05 12:54:50,023] Trial 3 finished with value: 0.6731 and parameters: {'C': 1.1682869614143264e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.7355.
[I 2024-08-05 12:54:50,051] Trial 4 finished with value: 0.7854 and parameters: {'C': 0.2804917948703948, 'class_weight': 'balanc

{'C': 5.028833918716646, 'class_weight': None, 'dual': 'auto', 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'hinge', 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': 5, 'tol': 0.0001, 'verbose': 0}
{'C': 5.028833918716646, 'class_weight': None, 'loss': 'hinge'}


In [96]:
y_pred = clf2.predict(X_testVec)

In [97]:
aspect_eval(y_test, y_pred,1)

### Non Linear

In [101]:
from sklearn.svm import SVC

def svc_objective(trial):
    params = dict(
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        kernel=trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        gamma=trial.suggest_categorical('gamma', ['auto', 'scale']),
        max_iter=3000,
        random_state=32
    )

    clf = MOC(SVC(**params))
    clf.fit(X_trainVec, y_train)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(X_devVec)
    return f1_up(y_dev, y_pred)

sampler = TPESampler(seed=0)
svc_study = optuna.create_study(direction='maximize')
svc_study.optimize(svc_objective, n_trials=20, callbacks=[callback])


clf3 = svc_study.user_attrs['best_model']

print(clf3.estimators_[0].get_params())
print(svc_study.best_params)

[I 2024-08-05 13:01:10,076] A new study created in memory with name: no-name-faecf45a-0d58-4d64-a60b-58bd86bfc02d
[I 2024-08-05 13:01:16,267] Trial 0 finished with value: 0.8215 and parameters: {'class_weight': 'balanced', 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 0 with value: 0.8215.
[I 2024-08-05 13:01:20,087] Trial 1 finished with value: 0.6731 and parameters: {'class_weight': None, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 0 with value: 0.8215.
[I 2024-08-05 13:01:23,121] Trial 2 finished with value: 0.8248 and parameters: {'class_weight': None, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 2 with value: 0.8248.
[I 2024-08-05 13:01:29,203] Trial 3 finished with value: 0.8215 and parameters: {'class_weight': 'balanced', 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 2 with value: 0.8248.
[I 2024-08-05 13:01:33,603] Trial 4 finished with value: 0.7717 and parameters: {'class_weight': 'balanced', 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 2 wit

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': 3000, 'probability': False, 'random_state': 32, 'shrinking': True, 'tol': 0.001, 'verbose': False}
{'class_weight': None, 'kernel': 'sigmoid', 'gamma': 'scale'}


In [102]:
y_pred = clf3.predict(X_testVec)

In [103]:
aspect_eval(y_test, y_pred,2)