In [1]:
import pandas as pd
import numpy as np

# Load dataset

In [None]:
train = pd.read_csv("./Data/Preprocessed/ViCTSD_train-clean.csv")
val = pd.read_csv("./Data/Preprocessed/ViCTSD_valid-clean.csv")
test = pd.read_csv("./Data/Preprocessed/ViCTSD_test-clean.csv")

X_train = train['Comment_clean'].tolist()
y_train = train['Constructiveness']

X_val = val['Comment_clean'].tolist()
y_val = val['Constructiveness']

X_test = test['Comment_clean'].tolist()
y_test = test['Constructiveness']

# **Machine Learning Model**

## TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.9)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

## PhoW2V Embedding

In [None]:
train_phow2v = pd.read_csv('./Data/Embedding/ViCTSD_train-phoW2V.csv')
val_phow2v = pd.read_csv('./Data/Embedding/ViCTSD_val-phoW2V.csv')
test_phow2v = pd.read_csv('./Data/Embedding/ViCTSD_test-phoW2V.csv')

In [5]:
X_train_phow2v = train_phow2v.iloc[:, :100]
X_val_phow2v = val_phow2v.iloc[:, :100]
X_test_phow2v = test_phow2v.iloc[:, :100]

## Search Hyperparameter Optimization

In [6]:
import optuna
from optuna.samplers import TPESampler

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def callback(study, trial):
    if study.best_trial.number == trial.number:
        study.set_user_attr(key='best_model', value=trial.user_attrs['model'])

## Logistic Regression

In [8]:
from sklearn.metrics import f1_score, classification_report

In [None]:
from sklearn.linear_model import LogisticRegression

### TF-IDF

In [None]:
def logistic_objective(trial):
    params = dict(
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        C=trial.suggest_float('C', 1e-5, 20),
        random_state=42,
        max_iter=200
    )    

    clf = LogisticRegression(**params)
    clf.fit(X_train_tfidf, y_train)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(X_val_tfidf)
    return f1_score(y_val, y_pred)

sampler = TPESampler(seed=22)
logistic_study = optuna.create_study(sampler=sampler, direction='maximize')
logistic_study.optimize(logistic_objective, n_trials=20, callbacks=[callback])

lgr = logistic_study.user_attrs['best_model']

[I 2025-12-22 10:27:05,943] A new study created in memory with name: no-name-0dd9377a-029b-4fa8-b11c-efff62d88bd0


[I 2025-12-22 10:27:06,195] Trial 0 finished with value: 0.7088255733148019 and parameters: {'class_weight': None, 'C': 8.41076650090714}. Best is trial 0 with value: 0.7088255733148019.
[I 2025-12-22 10:27:06,266] Trial 1 finished with value: 0.7113665389527458 and parameters: {'class_weight': 'balanced', 'C': 6.777285823434402}. Best is trial 1 with value: 0.7113665389527458.
[I 2025-12-22 10:27:06,567] Trial 2 finished with value: 0.7001414427157001 and parameters: {'class_weight': None, 'C': 4.408098128709346}. Best is trial 1 with value: 0.7113665389527458.
[I 2025-12-22 10:27:06,688] Trial 3 finished with value: 0.7092198581560284 and parameters: {'class_weight': 'balanced', 'C': 11.224078321223027}. Best is trial 1 with value: 0.7113665389527458.
[I 2025-12-22 10:27:06,789] Trial 4 finished with value: 0.7130214917825537 and parameters: {'class_weight': 'balanced', 'C': 3.7822352140976876}. Best is trial 4 with value: 0.7130214917825537.
[I 2025-12-22 10:27:07,069] Trial 5 finis

In [16]:
print("Logistic Regression with Best Hyperparameters F1-scores:")
print('train:', f1_score(y_train, lgr.predict(X_train_tfidf), average='macro'))
print('val:  ', f1_score(y_val  , lgr.predict(X_val_tfidf), average='macro'))
print('val:  ', classification_report(y_val  , lgr.predict(X_val_tfidf)))
print('test: ', f1_score(y_test , lgr.predict(X_test_tfidf), average='macro'))
print('test: ', classification_report(y_test , lgr.predict(X_test_tfidf)))

print(lgr.get_params())
print("\nBest hyperparameters:")
print(logistic_study.best_params)

Logistic Regression with Best Hyperparameters F1-scores:
train: 0.9973599930552288
val:   0.7701563885453535
val:                 precision    recall  f1-score   support

           0       0.85      0.79      0.82      1271
           1       0.68      0.76      0.72       729

    accuracy                           0.78      2000
   macro avg       0.77      0.78      0.77      2000
weighted avg       0.79      0.78      0.78      2000

test:  0.78658855528821
test:                precision    recall  f1-score   support

           0       0.88      0.79      0.83       636
           1       0.69      0.80      0.74       364

    accuracy                           0.80      1000
   macro avg       0.78      0.80      0.79      1000
weighted avg       0.81      0.80      0.80      1000

{'C': 15.423831970037199, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 200, 'multi_class': 'deprecated', 'n_jobs': None, 'pe

### PhoW2V

In [17]:
def logistic_objective(trial):
    params = dict(
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        C=trial.suggest_float('C', 1e-5, 20),
        random_state=42,
        max_iter=200
    )    

    clf = LogisticRegression(**params)
    clf.fit(X_train_phow2v, y_train)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(X_val_phow2v)
    return f1_score(y_val, y_pred)

sampler = TPESampler(seed=22)
logistic_study = optuna.create_study(sampler=sampler, direction='maximize')
logistic_study.optimize(logistic_objective, n_trials=20, callbacks=[callback])

lgr = logistic_study.user_attrs['best_model']

[I 2025-12-22 10:31:24,189] A new study created in memory with name: no-name-63163a58-d55d-4c08-9b42-5bc78601aeb4


[I 2025-12-22 10:31:24,279] Trial 0 finished with value: 0.5197472353870458 and parameters: {'class_weight': None, 'C': 8.41076650090714}. Best is trial 0 with value: 0.5197472353870458.
[I 2025-12-22 10:31:24,354] Trial 1 finished with value: 0.6190754827384435 and parameters: {'class_weight': 'balanced', 'C': 6.777285823434402}. Best is trial 1 with value: 0.6190754827384435.
[I 2025-12-22 10:31:24,389] Trial 2 finished with value: 0.5231259968102073 and parameters: {'class_weight': None, 'C': 4.408098128709346}. Best is trial 1 with value: 0.6190754827384435.
[I 2025-12-22 10:31:24,449] Trial 3 finished with value: 0.6206896551724138 and parameters: {'class_weight': 'balanced', 'C': 11.224078321223027}. Best is trial 3 with value: 0.6206896551724138.
[I 2025-12-22 10:31:24,502] Trial 4 finished with value: 0.6194073213248111 and parameters: {'class_weight': 'balanced', 'C': 3.7822352140976876}. Best is trial 3 with value: 0.6206896551724138.
[I 2025-12-22 10:31:24,583] Trial 5 finis

In [19]:
print("Logistic Regression with Best Hyperparameters F1-scores:")
print('train:', f1_score(y_train, lgr.predict(X_train_phow2v), average='macro'))
print('val:  ', f1_score(y_val  , lgr.predict(X_val_phow2v), average='macro'))
print('val:  ', classification_report(y_val  , lgr.predict(X_val_phow2v)))
print('test: ', f1_score(y_test , lgr.predict(X_test_phow2v), average='macro'))
print('test: ', classification_report(y_test , lgr.predict(X_test_phow2v)))

print(lgr.get_params())
print("\nBest hyperparameters:")
print(logistic_study.best_params)

Logistic Regression with Best Hyperparameters F1-scores:
train: 0.6924891327619445
val:   0.6721060292653029
val:                 precision    recall  f1-score   support

           0       0.81      0.65      0.72      1271
           1       0.54      0.73      0.62       729

    accuracy                           0.68      2000
   macro avg       0.68      0.69      0.67      2000
weighted avg       0.71      0.68      0.68      2000

test:  0.6722265242490915
test:                precision    recall  f1-score   support

           0       0.81      0.66      0.72       636
           1       0.55      0.72      0.62       364

    accuracy                           0.68      1000
   macro avg       0.68      0.69      0.67      1000
weighted avg       0.71      0.68      0.69      1000

{'C': 12.304120349802874, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 200, 'multi_class': 'deprecated', 'n_jobs': None, '

## Linear SVC

In [20]:
from sklearn.svm import LinearSVC

### TF-IDF

In [21]:
def linearsvc_objective(trial):
    params = dict(
        C=trial.suggest_float('C', 1e-9, 1e2, log=True),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        loss=trial.suggest_categorical('loss', ['hinge', 'squared_hinge']),
        max_iter=10000,
        random_state=42
    )
    
    if params['loss'] == 'hinge':
        params['dual'] = True
    clf = (LinearSVC(**params))
    clf.fit(X_train_tfidf, y_train)
    trial.set_user_attr(key="model", value=clf)
    
    y_pred = clf.predict(X_val_tfidf)
    return f1_score(y_val, y_pred)

sampler = TPESampler(seed=22)
svc_study = optuna.create_study(sampler=sampler, direction='maximize')
svc_study.optimize(linearsvc_objective, n_trials=20, callbacks=[callback])

svc_linear = svc_study.user_attrs['best_model']

[I 2025-12-22 10:33:31,231] A new study created in memory with name: no-name-6e760abd-5fb3-4492-8b5c-f6c497ca44e2
[I 2025-12-22 10:33:31,255] Trial 0 finished with value: 0.5721021611001965 and parameters: {'C': 1.9636582699290402e-07, 'class_weight': 'balanced', 'loss': 'hinge'}. Best is trial 0 with value: 0.5721021611001965.
[I 2025-12-22 10:33:31,268] Trial 1 finished with value: 0.0 and parameters: {'C': 5.339536586472381e-06, 'class_weight': None, 'loss': 'squared_hinge'}. Best is trial 0 with value: 0.5721021611001965.
[I 2025-12-22 10:33:31,277] Trial 2 finished with value: 0.0 and parameters: {'C': 1.3055563380836963e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.5721021611001965.
[I 2025-12-22 10:33:31,288] Trial 3 finished with value: 0.0 and parameters: {'C': 1.1682869614143264e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.5721021611001965.
[I 2025-12-22 10:33:31,304] Trial 4 finished with value: 0.7052307692307692 an

In [None]:
print("Linear SVC with Best Hyperparameters F1-scores:")
print('train:', f1_score(y_train, svc_linear.predict(X_train_tfidf), average='macro'))
print('val:  ', f1_score(y_val  , svc_linear.predict(X_val_tfidf), average='macro'))
print('val:  ', classification_report(y_val  , svc_linear.predict(X_val_tfidf)))
print('test: ', f1_score(y_test , svc_linear.predict(X_test_tfidf), average='macro'))
print('test: ', classification_report(y_test , svc_linear.predict(X_test_tfidf)))

print(svc_linear.get_params())
print("\nBest hyperparameters:")
print(svc_study.best_params)

Linear SVC with Best Hyperparameters F1-scores:
train: 0.9726172502532795
val:   0.7660341825629515
val:                 precision    recall  f1-score   support

           0       0.86      0.78      0.82      1271
           1       0.67      0.78      0.72       729

    accuracy                           0.78      2000
   macro avg       0.76      0.78      0.77      2000
weighted avg       0.79      0.78      0.78      2000

test:  0.7824744873648752
test:                precision    recall  f1-score   support

           0       0.89      0.77      0.82       636
           1       0.67      0.83      0.74       364

    accuracy                           0.79      1000
   macro avg       0.78      0.80      0.78      1000
weighted avg       0.81      0.79      0.79      1000

{'C': 0.35855697016049437, 'class_weight': 'balanced', 'dual': 'auto', 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 10000, 'multi_class': 'ovr', 'penalty': 'l2', 'rand

### PhoW2V

In [24]:
def linearsvc_objective(trial):
    params = dict(
        C=trial.suggest_float('C', 1e-9, 1e2, log=True),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        loss=trial.suggest_categorical('loss', ['hinge', 'squared_hinge']),
        max_iter=100000,
        random_state=42
    )
    
    if params['loss'] == 'hinge':
        params['dual'] = True
    clf = (LinearSVC(**params))
    clf.fit(X_train_phow2v, y_train)
    trial.set_user_attr(key="model", value=clf)
    
    y_pred = clf.predict(X_val_phow2v)
    return f1_score(y_val, y_pred)
sampler = TPESampler(seed=22)
svc_study = optuna.create_study(sampler=sampler, direction='maximize')
svc_study.optimize(linearsvc_objective, n_trials=20, callbacks=[callback])

svc_linear = svc_study.user_attrs['best_model']

[I 2025-12-22 10:34:44,882] A new study created in memory with name: no-name-fb4175e5-bdbc-4c2c-9eea-343049af156a
[I 2025-12-22 10:34:44,916] Trial 0 finished with value: 0.5580322828593389 and parameters: {'C': 1.9636582699290402e-07, 'class_weight': 'balanced', 'loss': 'hinge'}. Best is trial 0 with value: 0.5580322828593389.
[I 2025-12-22 10:34:44,956] Trial 1 finished with value: 0.0 and parameters: {'C': 5.339536586472381e-06, 'class_weight': None, 'loss': 'squared_hinge'}. Best is trial 0 with value: 0.5580322828593389.
[I 2025-12-22 10:34:44,976] Trial 2 finished with value: 0.0 and parameters: {'C': 1.3055563380836963e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.5580322828593389.
[I 2025-12-22 10:34:44,998] Trial 3 finished with value: 0.0 and parameters: {'C': 1.1682869614143264e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.5580322828593389.
[I 2025-12-22 10:34:45,070] Trial 4 finished with value: 0.6150490730643402 an

In [25]:
print("Linear SVC with Best Hyperparameters F1-scores:")
print('train:', f1_score(y_train, svc_linear.predict(X_train_phow2v), average='macro'))
print('val:  ', f1_score(y_val  , svc_linear.predict(X_val_phow2v), average='macro'))
print('val:  ', classification_report(y_val  , svc_linear.predict(X_val_phow2v)))
print('test: ', f1_score(y_test , svc_linear.predict(X_test_phow2v), average='macro'))
print('test: ', classification_report(y_test , svc_linear.predict(X_test_phow2v)))

print(svc_linear.get_params())
print("\nBest hyperparameters:")
print(svc_study.best_params)

Linear SVC with Best Hyperparameters F1-scores:
train: 0.6873688897951102
val:   0.6661054540156384
val:                 precision    recall  f1-score   support

           0       0.82      0.61      0.70      1271
           1       0.53      0.77      0.63       729

    accuracy                           0.67      2000
   macro avg       0.68      0.69      0.67      2000
weighted avg       0.72      0.67      0.68      2000

test:  0.6633119556707596
test:                precision    recall  f1-score   support

           0       0.82      0.62      0.70       636
           1       0.53      0.76      0.62       364

    accuracy                           0.67      1000
   macro avg       0.67      0.69      0.66      1000
weighted avg       0.71      0.67      0.67      1000

{'C': 80.12422295059812, 'class_weight': 'balanced', 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'hinge', 'max_iter': 100000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': 

## Non-Linear SVC

In [26]:
from sklearn.svm import SVC

### TF-IDF

In [27]:
def nonlinear_svc_objective(trial):
    kernel_choice = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])
    
    params = dict(
        C=trial.suggest_float('C', 1e-3, 100, log=True),
        kernel=kernel_choice,
        gamma=trial.suggest_categorical('gamma', ['scale', 'auto']), 
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        random_state=42,
        max_iter=10000 
    )
    
    if kernel_choice == 'poly':
        params['degree'] = trial.suggest_int('degree', 2, 4)

    clf = (SVC(**params))
    clf.fit(X_train_tfidf, y_train)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(X_val_tfidf)
    return f1_score(y_val, y_pred)

sampler = TPESampler(seed=22)
svc_nonlinear_study = optuna.create_study(sampler=sampler, direction='maximize')
svc_nonlinear_study.optimize(nonlinear_svc_objective, n_trials=5, callbacks=[callback])

nonlinear_svc = svc_nonlinear_study.user_attrs['best_model']

[I 2025-12-22 10:35:29,956] A new study created in memory with name: no-name-79293957-96ac-4af2-bb99-c25144a7403d
[I 2025-12-22 10:35:40,498] Trial 0 finished with value: 0.0 and parameters: {'kernel': 'poly', 'C': 19.765599562374707, 'gamma': 'auto', 'class_weight': None, 'degree': 2}. Best is trial 0 with value: 0.0.
[I 2025-12-22 10:35:54,698] Trial 1 finished with value: 0.6992805755395683 and parameters: {'kernel': 'rbf', 'C': 11.71199658483352, 'gamma': 'scale', 'class_weight': None}. Best is trial 1 with value: 0.6992805755395683.
[I 2025-12-22 10:36:07,456] Trial 2 finished with value: 0.6997840172786177 and parameters: {'kernel': 'rbf', 'C': 6.9177316302209935, 'gamma': 'scale', 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6997840172786177.
[I 2025-12-22 10:36:19,868] Trial 3 finished with value: 0.6120313862249346 and parameters: {'kernel': 'poly', 'C': 41.338016019468874, 'gamma': 'scale', 'class_weight': 'balanced', 'degree': 2}. Best is trial 2 with value: 0.

In [28]:
print("Non-Linear SVC with Best Hyperparameters F1-scores:")
print('train:', f1_score(y_train, nonlinear_svc.predict(X_train_tfidf), average='macro'))
print('val:  ', f1_score(y_val  , nonlinear_svc.predict(X_val_tfidf), average='macro'))
print('val:  ', classification_report(y_val  , nonlinear_svc.predict(X_val_tfidf)))
print('test: ', f1_score(y_test , nonlinear_svc.predict(X_test_tfidf), average='macro'))
print('test: ', classification_report(y_test , nonlinear_svc.predict(X_test_tfidf)))

print(nonlinear_svc.get_params())
print("\nBest hyperparameters:")
print(svc_nonlinear_study.best_params)

Non-Linear SVC with Best Hyperparameters F1-scores:
train: 0.9995337055295797
val:   0.7700375467473135
val:                 precision    recall  f1-score   support

           0       0.82      0.86      0.84      1271
           1       0.74      0.67      0.70       729

    accuracy                           0.79      2000
   macro avg       0.78      0.76      0.77      2000
weighted avg       0.79      0.79      0.79      2000

test:  0.790960617196901
test:                precision    recall  f1-score   support

           0       0.85      0.85      0.85       636
           1       0.74      0.73      0.73       364

    accuracy                           0.81      1000
   macro avg       0.79      0.79      0.79      1000
weighted avg       0.81      0.81      0.81      1000

{'C': 6.9177316302209935, 'break_ties': False, 'cache_size': 200, 'class_weight': 'balanced', 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1

### PhoW2V

In [29]:
def nonlinear_svc_objective(trial):
    kernel_choice = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])
    
    params = dict(
        C=trial.suggest_float('C', 1e-3, 100, log=True),
        kernel=kernel_choice,
        gamma=trial.suggest_categorical('gamma', ['scale', 'auto']), 
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        random_state=42,
        max_iter=10000 
    )
    
    if kernel_choice == 'poly':
        params['degree'] = trial.suggest_int('degree', 2, 4)

    clf = (SVC(**params))
    clf.fit(X_train_phow2v, y_train)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(X_val_phow2v)
    return f1_score(y_val, y_pred)

sampler = TPESampler(seed=22)
svc_nonlinear_study = optuna.create_study(sampler=sampler, direction='maximize')
svc_nonlinear_study.optimize(nonlinear_svc_objective, n_trials=5, callbacks=[callback])

nonlinear_svc = svc_nonlinear_study.user_attrs['best_model']

[I 2025-12-22 10:37:21,335] A new study created in memory with name: no-name-0b20271b-b115-4b7e-8ff5-88d8f1cc2749
[I 2025-12-22 10:37:23,542] Trial 0 finished with value: 0.0 and parameters: {'kernel': 'poly', 'C': 19.765599562374707, 'gamma': 'auto', 'class_weight': None, 'degree': 2}. Best is trial 0 with value: 0.0.
[I 2025-12-22 10:37:26,665] Trial 1 finished with value: 0.7051630434782609 and parameters: {'kernel': 'rbf', 'C': 11.71199658483352, 'gamma': 'scale', 'class_weight': None}. Best is trial 1 with value: 0.7051630434782609.
[I 2025-12-22 10:37:29,749] Trial 2 finished with value: 0.7100115074798619 and parameters: {'kernel': 'rbf', 'C': 6.9177316302209935, 'gamma': 'scale', 'class_weight': 'balanced'}. Best is trial 2 with value: 0.7100115074798619.
[I 2025-12-22 10:37:32,341] Trial 3 finished with value: 0.695067264573991 and parameters: {'kernel': 'poly', 'C': 41.338016019468874, 'gamma': 'scale', 'class_weight': 'balanced', 'degree': 2}. Best is trial 2 with value: 0.7

In [30]:
print("Non-Linear SVC with Best Hyperparameters F1-scores:")
print('train:', f1_score(y_train, nonlinear_svc.predict(X_train_phow2v), average='macro'))
print('val:  ', f1_score(y_val  , nonlinear_svc.predict(X_val_phow2v), average='macro'))
print('val:  ', classification_report(y_val  , nonlinear_svc.predict(X_val_phow2v)))
print('test: ', f1_score(y_test , nonlinear_svc.predict(X_test_phow2v), average='macro'))
print('test: ', classification_report(y_test , nonlinear_svc.predict(X_test_phow2v)))

print(nonlinear_svc.get_params())
print("\nBest hyperparameters:")
print(svc_nonlinear_study.best_params)

Non-Linear SVC with Best Hyperparameters F1-scores:
train: 0.780850815099297
val:   0.7435999181961643
val:                 precision    recall  f1-score   support

           0       0.89      0.69      0.78      1271
           1       0.61      0.85      0.71       729

    accuracy                           0.75      2000
   macro avg       0.75      0.77      0.74      2000
weighted avg       0.79      0.75      0.75      2000

test:  0.75506105103303
test:                precision    recall  f1-score   support

           0       0.89      0.71      0.79       636
           1       0.63      0.85      0.72       364

    accuracy                           0.76      1000
   macro avg       0.76      0.78      0.76      1000
weighted avg       0.79      0.76      0.76      1000

{'C': 6.9177316302209935, 'break_ties': False, 'cache_size': 200, 'class_weight': 'balanced', 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 100

## MultinomialNB

In [31]:
from sklearn.naive_bayes import MultinomialNB

### TF-IDF

In [32]:
def multinomial_nb_objective(trial):
    params = dict(
        alpha=trial.suggest_float('alpha', 1e-3, 10, log=True),
        fit_prior=trial.suggest_categorical('fit_prior', [True, False])
    )
    
    clf = (MultinomialNB(**params))
    clf.fit(X_train_tfidf, y_train)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(X_val_tfidf)
    return f1_score(y_val, y_pred)

sampler = TPESampler(seed=22)
nb_study = optuna.create_study(sampler=sampler, direction='maximize')
nb_study.optimize(multinomial_nb_objective, n_trials=50, callbacks=[callback])

nb_model = nb_study.user_attrs['best_model']

[I 2025-12-22 10:38:29,921] A new study created in memory with name: no-name-81eaedeb-346b-4927-937f-91004cb50043
[I 2025-12-22 10:38:29,941] Trial 0 finished with value: 0.6450809464508095 and parameters: {'alpha': 0.006820907334120959, 'fit_prior': True}. Best is trial 0 with value: 0.6450809464508095.
[I 2025-12-22 10:38:29,955] Trial 1 finished with value: 0.6077998528329654 and parameters: {'alpha': 2.733556118022411, 'fit_prior': False}. Best is trial 0 with value: 0.6450809464508095.
[I 2025-12-22 10:38:29,990] Trial 2 finished with value: 0.6425855513307985 and parameters: {'alpha': 0.012081791403069187, 'fit_prior': True}. Best is trial 0 with value: 0.6450809464508095.
[I 2025-12-22 10:38:29,999] Trial 3 finished with value: 0.6188466947960619 and parameters: {'alpha': 1.7693089816650007, 'fit_prior': False}. Best is trial 0 with value: 0.6450809464508095.
[I 2025-12-22 10:38:30,016] Trial 4 finished with value: 0.46277665995975853 and parameters: {'alpha': 1.7984764264034472

In [34]:
print("MultinomialNB with Best Hyperparameters F1-scores:")
print('train:', f1_score(y_train, nb_model.predict(X_train_tfidf), average='macro'))
print('val:  ', f1_score(y_val  , nb_model.predict(X_val_tfidf), average='macro'))
print('val:  ', classification_report(y_val  , nb_model.predict(X_val_tfidf)))
print('test: ', f1_score(y_test , nb_model.predict(X_test_tfidf), average='macro'))
print('test: ', classification_report(y_test , nb_model.predict(X_test_tfidf)))

print(nb_model.get_params())
print("\nBest hyperparameters:")
print(nb_study.best_params)

MultinomialNB with Best Hyperparameters F1-scores:
train: 0.9828919288030509
val:   0.7011037146799258
val:                 precision    recall  f1-score   support

           0       0.81      0.71      0.76      1271
           1       0.58      0.72      0.65       729

    accuracy                           0.71      2000
   macro avg       0.70      0.71      0.70      2000
weighted avg       0.73      0.71      0.72      2000

test:  0.7014607973041501
test:                precision    recall  f1-score   support

           0       0.84      0.67      0.75       636
           1       0.57      0.77      0.66       364

    accuracy                           0.71      1000
   macro avg       0.70      0.72      0.70      1000
weighted avg       0.74      0.71      0.71      1000

{'alpha': 0.0033857817135395417, 'class_prior': None, 'fit_prior': True, 'force_alpha': True}

Best hyperparameters:
{'alpha': 0.0033857817135395417, 'fit_prior': True}


## Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier

### TF-IDF

In [36]:
def rf_objective(trial):
    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 10, 100),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 15),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
        n_jobs=-1, 
        random_state=42
    )
    
    clf = (RandomForestClassifier(**params))
    clf.fit(X_train_tfidf, y_train)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(X_val_tfidf)
    return f1_score(y_val, y_pred)

sampler = TPESampler(seed=22)
rf_study = optuna.create_study(sampler=sampler, direction='maximize')
rf_study.optimize(rf_objective, n_trials=10, callbacks=[callback])

rf_model = rf_study.user_attrs['best_model']

[I 2025-12-22 10:39:24,104] A new study created in memory with name: no-name-d7e6092e-22d4-4779-b807-35a215f72720
[I 2025-12-22 10:39:24,442] Trial 0 finished with value: 0.7070967741935484 and parameters: {'n_estimators': 81, 'max_depth': 53, 'min_samples_split': 7, 'min_samples_leaf': 9, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.7070967741935484.
[I 2025-12-22 10:39:25,431] Trial 1 finished with value: 0.6836734693877551 and parameters: {'n_estimators': 154, 'max_depth': 30, 'min_samples_split': 13, 'min_samples_leaf': 1, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.7070967741935484.
[I 2025-12-22 10:39:25,631] Trial 2 finished with value: 0.2823803967327888 and parameters: {'n_estimators': 78, 'max_depth': 10, 'min_samples_split': 12, 'min_samples_leaf': 10, 'class_weight': None}. Best is trial 0 with value: 0.7070967741935484.
[I 2025-12-22 10:39:26,327] Trial 3 finished with value: 0.716514954486346 and parameters: {'n_estimators'

In [37]:
print("RandomForestClassifier with Best Hyperparameters F1-scores:")
print('train:', f1_score(y_train, rf_model.predict(X_train_tfidf), average='macro'))
print('val:  ', f1_score(y_val  , rf_model.predict(X_val_tfidf), average='macro'))
print('val:  ', classification_report(y_val  , rf_model.predict(X_val_tfidf)))
print('test: ', f1_score(y_test , rf_model.predict(X_test_tfidf), average='macro'))
print('test: ', classification_report(y_test , rf_model.predict(X_test_tfidf)))

print(rf_model.get_params())
print("\nBest hyperparameters:")
print(rf_study.best_params)

RandomForestClassifier with Best Hyperparameters F1-scores:
train: 0.8093801326459344
val:   0.770439728740657
val:                 precision    recall  f1-score   support

           0       0.86      0.79      0.82      1271
           1       0.68      0.77      0.72       729

    accuracy                           0.78      2000
   macro avg       0.77      0.78      0.77      2000
weighted avg       0.79      0.78      0.78      2000

test:  0.7900017865519651
test:                precision    recall  f1-score   support

           0       0.88      0.79      0.83       636
           1       0.69      0.81      0.75       364

    accuracy                           0.80      1000
   macro avg       0.79      0.80      0.79      1000
weighted avg       0.81      0.80      0.80      1000

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 56, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease