In [1]:
from bert_logistic import prepare_data_for_model, read_texts_from_dir, train_and_evaluate, compute_rule_based_features
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_path = "/home/thangquang09/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/train"
test_path = "/home/thangquang09/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/test"
gt_path = "/home/thangquang09/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/train.csv"
print("Loading data...")
df_train = read_texts_from_dir(train_path)
df_test = read_texts_from_dir(test_path)
df_train_gt = pd.read_csv(gt_path)
y_train = df_train_gt["real_text_id"].values


# DATA AUGMENTATION

# Prepare the training dataframe
df_train['label'] = y_train - 1

# SWAP DATA
df_swap = df_train.copy()
df_swap['file_1'], df_swap['file_2'] = df_swap['file_2'], df_swap['file_1']
df_swap['label'] = 1 - df_swap['label']
# CONCAT AUGMENTED DATA TO REAL DATA
df_train = pd.concat((df_train, df_swap), axis=0).reset_index(drop=True)

# Update y_train to match the new df_train
y_train = df_train['label'].values + 1
df_train.drop(columns=['label'], inplace=True)

df_train.shape, y_train.shape

Loading data...
Number of directories: 95
Number of directories: 1068


((190, 2), (190,))

In [3]:
df_train.tail()

Unnamed: 0,file_1,file_2
185,A key focus of modern cosmology is to understa...,A main focus of modern cosmology is to underst...
186,"APEX, as its name suggests, serves as a guide ...","APEX, as its name suggests, serves as a guide ..."
187,FORS1 and FORS2 are early instruments of the V...,FORS1 and FORS2 are early instruments of the V...
188,The observations of the Pluto-Charon binary an...,The observations of the Pluto-Charon system an...
189,The new detector system was first tested on 30...,The new detector system was first tested on 30...


In [4]:
np.unique_counts(df_train_gt["real_text_id"].values)

UniqueCountsResult(values=array([1, 2]), counts=array([46, 49]))

In [5]:
np.unique_counts(y_train)

UniqueCountsResult(values=array([1, 2]), counts=array([95, 95]))

In [6]:
model_name = 'intfloat/multilingual-e5-small'

print("Preparing training data...")
X_train, embedding_extractor = prepare_data_for_model(
    df_train, 
    fit_embedding=True, 
    model_name=model_name
)

print(f"Feature matrix shape: {X_train.shape}")

Preparing training data...
Step 1: Extracting top importance features...


Extracting top features: 100%|██████████| 190/190 [00:01<00:00, 100.86it/s]


Step 2: Extracting rule-based features...


Extracting rule-based features: 100%|██████████| 190/190 [00:01<00:00, 164.43it/s]


Step 3: Extracting statistical features...
Step 4: Extracting embedding features...
Loading embedding model: intfloat/multilingual-e5-small
Loaded as SentenceTransformer model
Extracting embedding features for file_1...


Extracting embeddings: 100%|██████████| 3/3 [00:13<00:00,  4.58s/it]


Extracting embedding features for file_2...


Extracting embeddings: 100%|██████████| 3/3 [00:10<00:00,  3.58s/it]


Step 5: Extracting pairwise features...


Extracting pairwise features: 100%|██████████| 190/190 [00:00<00:00, 387.93it/s]

Step 6: Combining features...
Final feature matrix shape: (190, 1654)
Top features: 25, Rule: 78, Stat: 6, Embedding: 1538, Pairwise: 7
Feature matrix shape: (190, 1654)





In [7]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid, including the current parameters
param_grid = {
    'iterations': [500],  # Current value
    'learning_rate': [0.1],  # Current value
    'depth': [6],  # Current value
    'l2_leaf_reg': [1, 3, 5],  # Example additional parameter
    'border_count': [32, 64]  # Example additional parameter
}

# Initialize CatBoostClassifier
catboost_model = CatBoostClassifier(verbose=0)

# Perform GridSearchCV
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,  # 3-fold cross-validation
    verbose=1
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# best score
best_score = grid_search.best_score_
print(f"Best cross-validation score: {best_score:.4f}")

Fitting 3 folds for each of 6 candidates, totalling 18 fits


Best parameters: {'border_count': 32, 'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Best cross-validation score: 0.9156


In [8]:
X_test, _ = prepare_data_for_model(
        df_test, 
        embedding_extractor=embedding_extractor, 
        fit_embedding=False
    )

Step 1: Extracting top importance features...


Extracting top features: 100%|██████████| 1068/1068 [00:03<00:00, 324.28it/s]


Step 2: Extracting rule-based features...


Extracting rule-based features: 100%|██████████| 1068/1068 [00:05<00:00, 204.04it/s]


Step 3: Extracting statistical features...
Step 4: Extracting embedding features...
Extracting embedding features for file_1...


Extracting embeddings: 100%|██████████| 17/17 [01:08<00:00,  4.05s/it]


Extracting embedding features for file_2...


Extracting embeddings: 100%|██████████| 17/17 [01:12<00:00,  4.26s/it]


Step 5: Extracting pairwise features...


Extracting pairwise features: 100%|██████████| 1068/1068 [00:02<00:00, 428.27it/s]


Step 6: Combining features...
Final feature matrix shape: (1068, 1654)
Top features: 25, Rule: 78, Stat: 6, Embedding: 1538, Pairwise: 7


In [9]:
from pathlib import Path

print("Predicting on test ...")
test_pred = best_model.predict(X_test)

# --- Build submission -------------------------------------------------
submission = pd.DataFrame({
    "id": df_test.index,
    "real_text_id": test_pred.astype(int)
}).sort_values("id")

save_path = Path("submission_e5_catboost_improved_augmented.csv")
submission.to_csv(save_path, index=False)
print(f"✅ Submission saved to {save_path.resolve()}")

Predicting on test ...
✅ Submission saved to /home/thangquang09/CODE/CTAI_MachineLearning/notebooks/submission_e5_catboost_improved_augmented.csv
