In [1]:
from bert_logistic import prepare_data_for_model, read_texts_from_dir
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_path = "/home/thangquang09/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/train"
test_path = "/home/thangquang09/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/test"
gt_path = "/home/thangquang09/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/train.csv"
print("Loading data...")
df_train = read_texts_from_dir(train_path)
df_test = read_texts_from_dir(test_path)
df_train_gt = pd.read_csv(gt_path)
y_train = df_train_gt["real_text_id"].values


# DATA AUGMENTATION

# Prepare the training dataframe
df_train['label'] = y_train - 1

# SWAP DATA
df_swap = df_train.copy()
df_swap['file_1'], df_swap['file_2'] = df_swap['file_2'], df_swap['file_1']
df_swap['label'] = 1 - df_swap['label']
# CONCAT AUGMENTED DATA TO REAL DATA
df_train = pd.concat((df_train, df_swap), axis=0).reset_index(drop=True)

# Update y_train to match the new df_train
y_train = df_train['label'].values + 1
df_train.drop(columns=['label'], inplace=True)

df_train.shape, y_train.shape

Loading data...
Number of directories: 95
Number of directories: 1068


((190, 2), (190,))

In [3]:
df_train.tail()

Unnamed: 0,file_1,file_2
185,A key focus of modern cosmology is to understa...,A main focus of modern cosmology is to underst...
186,"APEX, as its name suggests, serves as a guide ...","APEX, as its name suggests, serves as a guide ..."
187,FORS1 and FORS2 are early instruments of the V...,FORS1 and FORS2 are early instruments of the V...
188,The observations of the Pluto-Charon binary an...,The observations of the Pluto-Charon system an...
189,The new detector system was first tested on 30...,The new detector system was first tested on 30...


In [4]:
np.unique_counts(df_train_gt["real_text_id"].values)

UniqueCountsResult(values=array([1, 2]), counts=array([46, 49]))

In [5]:
np.unique_counts(y_train)

UniqueCountsResult(values=array([1, 2]), counts=array([95, 95]))

In [3]:
model_name = 'intfloat/multilingual-e5-small'

print("Preparing training data...")
X_train, embedding_extractor = prepare_data_for_model(
    df_train, 
    fit_embedding=True, 
    model_name=model_name
)

print(f"Feature matrix shape: {X_train.shape}")

Preparing training data...
Using 8 CPU cores for feature extraction...
Step 1: Extracting top importance features...
Using 8 cores for top features extraction...


Extracting top features (multi-threaded): 100%|██████████| 190/190 [00:01<00:00, 123.66it/s]

Step 2: Extracting rule-based features...
Using 8 cores for rule-based features extraction...



Extracting rule-based features (multi-threaded): 100%|██████████| 190/190 [01:31<00:00,  2.08it/s]


Step 3: Extracting statistical features...
Step 4: Extracting embedding features...
Loading embedding model: intfloat/multilingual-e5-small
Loaded as SentenceTransformer model
Extracting embedding features for file_1...


Extracting embeddings: 100%|██████████| 3/3 [00:10<00:00,  3.44s/it]


Extracting embedding features for file_2...


Extracting embeddings: 100%|██████████| 3/3 [00:11<00:00,  3.73s/it]

Step 5: Extracting pairwise features...
Using 8 cores for pairwise features extraction...



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- A

Step 6: Combining features...
Final feature matrix shape: (190, 1658)
Top features: 26, Rule: 81, Stat: 6, Embedding: 1538, Pairwise: 7
Feature matrix shape: (190, 1658)


In [4]:
X = X_train.copy()
y = y_train.copy()

### Finetune LightGBM

In [11]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb


def objective(trial):
    # 1. Định nghĩa không gian tìm kiếm cho các siêu tham số
    params = {
        'objective': 'binary',
        'random_state': 42,
        'n_estimators': 1000,
        'verbosity': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }
    
    
    # 2. Chạy Cross-Validation với bộ tham số được gợi ý
    NFOLDS = 5
    folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    oof_preds = np.zeros(X.shape[0])

    for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='accuracy',
                  callbacks=[lgb.early_stopping(100, verbose=False)])
        
        val_preds = model.predict(X_val)
        oof_preds[val_idx] = val_preds

    # 3. Trả về kết quả để Optuna đánh giá
    cv_accuracy = accuracy_score(y, oof_preds)
    return cv_accuracy

In [12]:
study = optuna.create_study(direction='maximize')

# Bắt đầu quá trình tối ưu, ví dụ chạy 50 lần thử nghiệm (trials)
# Bạn có thể tăng/giảm n_trials. Càng nhiều thì càng có khả năng tìm ra kết quả tốt hơn.
study.optimize(objective, n_trials=50)

# In ra kết quả
print("\n🏁 Quá trình tối ưu hóa hoàn tất!")
print("Số lần thử nghiệm:", len(study.trials))
print("Tham số tốt nhất tìm được:")
best_params = study.best_params
print(best_params)

print(f"\nĐiểm CV Accuracy cao nhất: {study.best_value:.4f}")

[I 2025-08-18 11:16:43,482] A new study created in memory with name: no-name-24cc3acd-52bd-4abb-98e2-f2f67c2d1748
[I 2025-08-18 11:16:44,722] Trial 0 finished with value: 0.9052631578947369 and parameters: {'learning_rate': 0.014601804866941613, 'num_leaves': 67, 'max_depth': 3, 'feature_fraction': 0.6519543810190966, 'bagging_fraction': 0.9963083185438855, 'bagging_freq': 2, 'lambda_l1': 2.1756495426628035, 'lambda_l2': 5.2069357136333174e-05}. Best is trial 0 with value: 0.9052631578947369.
[I 2025-08-18 11:16:46,002] Trial 1 finished with value: 0.9315789473684211 and parameters: {'learning_rate': 0.03659073491750704, 'num_leaves': 189, 'max_depth': 10, 'feature_fraction': 0.8756079858227328, 'bagging_fraction': 0.6815724620295649, 'bagging_freq': 4, 'lambda_l1': 9.11285226962535e-06, 'lambda_l2': 0.05272980275968911}. Best is trial 1 with value: 0.9315789473684211.
[I 2025-08-18 11:16:46,862] Trial 2 finished with value: 0.9263157894736842 and parameters: {'learning_rate': 0.056791


🏁 Quá trình tối ưu hóa hoàn tất!
Số lần thử nghiệm: 50
Tham số tốt nhất tìm được:
{'learning_rate': 0.09077072379825946, 'num_leaves': 182, 'max_depth': 7, 'feature_fraction': 0.6020650991897541, 'bagging_fraction': 0.6799741863195304, 'bagging_freq': 3, 'lambda_l1': 0.003973275927695599, 'lambda_l2': 3.763939223557607e-06}

Điểm CV Accuracy cao nhất: 0.9526


In [None]:
lgbm_params = {'learning_rate': 0.09077072379825946, 'num_leaves': 182, 'max_depth': 7, 'feature_fraction': 0.6020650991897541, 'bagging_fraction': 0.6799741863195304, 'bagging_freq': 3, 'lambda_l1': 0.003973275927695599, 'lambda_l2': 3.763939223557607e-06}

### Finetune Catboost

In [18]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import numpy as np

def objective(trial):
    # 1. Định nghĩa không gian tìm kiếm cho các siêu tham số
    # SỬA LỖI: Bỏ 'subsample' ra khỏi định nghĩa ban đầu
    params = {
        'objective': 'Logloss',
        'eval_metric': 'Accuracy',
        'random_seed': 42,
        'n_estimators': 1000,
        'verbose': False,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 12),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
    }

    # SỬA LỖI: Chỉ thêm 'subsample' khi bootstrap_type không phải là 'Bayesian'
    if params['bootstrap_type'] in ['Bernoulli', 'MVS']:
        params['subsample'] = trial.suggest_float('subsample', 0.6, 1.0)

    # 2. Chạy Cross-Validation
    NFOLDS = 5
    folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    oof_preds = np.zeros(X.shape[0])

    for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]
        
        model = CatBoostClassifier(**params)
        
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=100,
                  verbose=False)
        
        val_preds = model.predict(X_val)
        oof_preds[val_idx] = val_preds.flatten()

    # 3. Trả về kết quả
    cv_accuracy = accuracy_score(y, oof_preds)
    return cv_accuracy

In [19]:
study = optuna.create_study(direction='maximize')

# Bắt đầu quá trình tối ưu, ví dụ chạy 50 lần thử nghiệm (trials)
# Bạn có thể tăng/giảm n_trials. Càng nhiều thì càng có khả năng tìm ra kết quả tốt hơn.
study.optimize(objective, n_trials=50)

# In ra kết quả
print("\n🏁 Quá trình tối ưu hóa hoàn tất!")
print("Số lần thử nghiệm:", len(study.trials))
print("Tham số tốt nhất tìm được:")
best_params = study.best_params
print(best_params)

print(f"\nĐiểm CV Accuracy cao nhất: {study.best_value:.4f}")

[I 2025-08-18 11:39:36,177] A new study created in memory with name: no-name-9309d416-9cfd-45ac-a5f1-122b79fb1b95
[I 2025-08-18 11:50:58,752] Trial 0 finished with value: 0.9263157894736842 and parameters: {'learning_rate': 0.02518068504774169, 'depth': 12, 'colsample_bylevel': 0.6248190648610378, 'l2_leaf_reg': 4.278948080289902e-05, 'bootstrap_type': 'Bernoulli', 'subsample': 0.753760284625704}. Best is trial 0 with value: 0.9263157894736842.
[I 2025-08-18 11:57:09,319] Trial 1 finished with value: 0.9263157894736842 and parameters: {'learning_rate': 0.08581705792546754, 'depth': 11, 'colsample_bylevel': 0.77086456206118, 'l2_leaf_reg': 0.36256072745165324, 'bootstrap_type': 'Bayesian'}. Best is trial 0 with value: 0.9263157894736842.
[I 2025-08-18 11:57:14,402] Trial 2 finished with value: 0.9526315789473684 and parameters: {'learning_rate': 0.04099455217142849, 'depth': 3, 'colsample_bylevel': 0.7731638752043458, 'l2_leaf_reg': 1.401756739378074e-06, 'bootstrap_type': 'MVS', 'subsa


🏁 Quá trình tối ưu hóa hoàn tất!
Số lần thử nghiệm: 50
Tham số tốt nhất tìm được:
{'learning_rate': 0.031472603665629095, 'depth': 3, 'colsample_bylevel': 0.7687785312158301, 'l2_leaf_reg': 1.994119921149311e-07, 'bootstrap_type': 'MVS', 'subsample': 0.8344833223801289}

Điểm CV Accuracy cao nhất: 0.9737


In [5]:
catboost_params = {'learning_rate': 0.031472603665629095, 'depth': 3, 'colsample_bylevel': 0.7687785312158301, 'l2_leaf_reg': 1.994119921149311e-07, 'bootstrap_type': 'MVS', 'subsample': 0.8344833223801289}


In [7]:
from catboost import CatBoostClassifier

best_model = CatBoostClassifier(**catboost_params)
best_model.fit(X, y)

0:	learn: 0.6603395	total: 54.3ms	remaining: 54.2s
1:	learn: 0.6152476	total: 60.8ms	remaining: 30.3s
2:	learn: 0.5868399	total: 67.8ms	remaining: 22.5s
3:	learn: 0.5357406	total: 76ms	remaining: 18.9s
4:	learn: 0.5141884	total: 82.7ms	remaining: 16.4s
5:	learn: 0.4914087	total: 88.2ms	remaining: 14.6s
6:	learn: 0.4594535	total: 94.8ms	remaining: 13.4s
7:	learn: 0.4149746	total: 101ms	remaining: 12.5s
8:	learn: 0.3989782	total: 107ms	remaining: 11.8s
9:	learn: 0.3796219	total: 114ms	remaining: 11.2s
10:	learn: 0.3430819	total: 119ms	remaining: 10.7s
11:	learn: 0.3256212	total: 124ms	remaining: 10.2s
12:	learn: 0.3169463	total: 131ms	remaining: 9.91s
13:	learn: 0.3066531	total: 135ms	remaining: 9.53s
14:	learn: 0.2990591	total: 140ms	remaining: 9.19s
15:	learn: 0.2920062	total: 146ms	remaining: 8.97s
16:	learn: 0.2797439	total: 151ms	remaining: 8.75s
17:	learn: 0.2614258	total: 156ms	remaining: 8.52s
18:	learn: 0.2552272	total: 162ms	remaining: 8.37s
19:	learn: 0.2488413	total: 168ms	re

<catboost.core.CatBoostClassifier at 0x7f927343b4f0>

In [8]:
X_test, _ = prepare_data_for_model(
        df_test, 
        embedding_extractor=embedding_extractor, 
        fit_embedding=False
    )

Using 8 CPU cores for feature extraction...
Step 1: Extracting top importance features...
Using 8 cores for top features extraction...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step 2: Extracting rule-based features...
Using 8 cores for rule-based features extraction...



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- A

Step 3: Extracting statistical features...
Step 4: Extracting embedding features...
Extracting embedding features for file_1...


Extracting embeddings: 100%|██████████| 17/17 [01:08<00:00,  4.02s/it]


Extracting embedding features for file_2...


Extracting embeddings: 100%|██████████| 17/17 [01:01<00:00,  3.60s/it]

Step 5: Extracting pairwise features...
Using 8 cores for pairwise features extraction...



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- A

Step 6: Combining features...
Final feature matrix shape: (1068, 1658)
Top features: 26, Rule: 81, Stat: 6, Embedding: 1538, Pairwise: 7


In [9]:
from pathlib import Path

print("Predicting on test ...")
test_pred = best_model.predict(X_test)

# --- Build submission -------------------------------------------------
submission = pd.DataFrame({
    "id": df_test.index,
    "real_text_id": test_pred.astype(int)
}).sort_values("id")

save_path = Path("submission_e5_catboost_improved_augmented_1.csv")
submission.to_csv(save_path, index=False)
print(f"✅ Submission saved to {save_path.resolve()}")

Predicting on test ...
✅ Submission saved to /home/thangquang09/CODE/CTAI_MachineLearning/notebooks/submission_e5_catboost_improved_augmented_1.csv
