## I. Read Data (Only Statistic Features)

In [22]:
import os
import sys
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")


# Find project root directory automatically
def find_project_root():
    """Locate the project root by walking up from the current working directory.

    A directory counts as the project root when it directly contains one of the
    marker files ``.gitignore``, ``requirements.txt``, ``setup.py`` or
    ``pyproject.toml``. The filesystem root itself is never inspected.

    Returns:
        str: Path of the first ancestor (starting with the working directory
        itself) that contains a marker, or the working directory when no
        ancestor qualifies.
    """
    markers = (".gitignore", "requirements.txt", "setup.py", "pyproject.toml")
    start_dir = os.getcwd()
    current_dir = start_dir

    while True:
        parent_dir = os.path.dirname(current_dir)
        # The top of the tree is reached when a directory is its own parent.
        # Comparing against a literal "/" would loop forever on Windows, where
        # the root looks like "C:\\".
        if parent_dir == current_dir:
            return start_dir  # fallback to current directory

        try:
            entries = os.listdir(current_dir)
        except PermissionError:
            # An unreadable ancestor cannot be identified as the root; keep climbing.
            entries = ()

        if any(marker in entries for marker in markers):
            return current_dir
        current_dir = parent_dir


project_root = find_project_root()


def _project_file(relative_path):
    """Return ``relative_path`` joined onto the detected project root."""
    return os.path.join(project_root, relative_path)


# Locations of the pre-computed feature tables (statistic, TF-IDF and BOW)
# for the train, validation and test splits.
train_stat_df_path = _project_file("data/train_statistic_features.csv")
val_stat_df_path = _project_file("data/val_statistic_features.csv")
train_tfidf_df_path = _project_file("data/train_tfidf_features.csv")
val_tfidf_df_path = _project_file("data/val_tfidf_features.csv")
train_bow_df_path = _project_file("data/train_bow_features.csv")
val_bow_df_path = _project_file("data/val_bow_features.csv")
test_stat_df_path = _project_file("data/test_statistic_features.csv")
test_tfidf_df_path = _project_file("data/test_tfidf_features.csv")
test_bow_df_path = _project_file("data/test_bow_features.csv")

# Only the statistic tables are loaded here; the other tables are read in later cells.
train_stat_df, val_stat_df, test_stat_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_stat_df_path, val_stat_df_path, test_stat_df_path)
)


print("Shape Train:", train_stat_df.shape)
print("Shape Validation:", val_stat_df.shape)
print("Columns Train:", train_stat_df.columns)

Shape Train: (152, 45)
Shape Validation: (19, 45)
Columns Train: Index(['file_1', 'file_2', 'label', 'file1_char_count', 'file1_word_count',
       'file1_sentence_count', 'file1_avg_sentence_length',
       'file1_english_word_ratio', 'file1_has_non_english_script',
       'file1_has_mixed_scripts', 'file1_unicode_control_chars',
       'file1_num_count', 'file1_repetition_score', 'file1_perplexity_score',
       'file1_ttr_ratio', 'file2_char_count', 'file2_word_count',
       'file2_sentence_count', 'file2_avg_sentence_length',
       'file2_english_word_ratio', 'file2_has_non_english_script',
       'file2_has_mixed_scripts', 'file2_unicode_control_chars',
       'file2_num_count', 'file2_repetition_score', 'file2_perplexity_score',
       'file2_ttr_ratio', 'diff_char_count', 'ratio_char_count',
       'diff_word_count', 'ratio_word_count', 'diff_sentence_count',
       'ratio_sentence_count', 'diff_avg_sentence_length',
       'diff_english_word_ratio', 'diff_has_non_english_scri

Có thể đọc mô tả các features ở [đây](../data/README.md#processed-features-information)

In [23]:
# Features are every column except the target ("label") and the two file
# identifier columns; the target is kept separately.
X_train = train_stat_df.drop(columns=["label", "file_1", "file_2"])
y_train = train_stat_df["label"]

X_val = val_stat_df.drop(columns=["label", "file_1", "file_2"])
y_val = val_stat_df["label"]

## Remove Mostly-Zero Features

In [None]:
# Sparsity screening: report how many zeros each statistic feature has in the
# training split, then drop the columns that are almost entirely zero.
#
# The statistics are computed from the full feature frames derived from
# `train_stat_df` / `val_stat_df` rather than from the current `X_train`, so
# re-running this cell reproduces the same `columns_to_remove`. Computing them
# from an already-pruned `X_train` would give an empty list on the second run
# and silently change the feature set used by later sections.
non_feature_columns = ["label", "file_1", "file_2"]
X_train_all = train_stat_df.drop(columns=non_feature_columns)
X_val_all = val_stat_df.drop(columns=non_feature_columns)

# Check for columns that contain only zeros
zero_columns = (X_train_all == 0).all()
zero_column_names = zero_columns[zero_columns].index.tolist()

print(f"Number of columns with all zeros: {len(zero_column_names)}")
print(f"Zero columns: {zero_column_names}")

# Also check the percentage of zeros in each column
report_threshold = 70  # percent; columns above this are listed for inspection
zero_percentages = (X_train_all == 0).mean() * 100
high_zero_columns = zero_percentages[
    zero_percentages > report_threshold
].sort_values(ascending=False)

# The message is built from the threshold so the two cannot drift apart.
print(f"\nColumns with >{report_threshold}% zeros:")
print(high_zero_columns)

# Remove columns whose share of zeros exceeds the removal threshold
high_zero_threshold = 80  # percent
columns_to_remove = zero_percentages[
    zero_percentages > high_zero_threshold
].index.tolist()

print(f"\nColumns to remove (>{high_zero_threshold}% zeros): {columns_to_remove}")

# Remove these columns from X_train and X_val
X_train = X_train_all.drop(columns=columns_to_remove)
X_val = X_val_all.drop(columns=columns_to_remove)

print("New shape after removing high-zero columns:")
print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")

Number of columns with all zeros: 0
Zero columns: []

Columns with >90% zeros:
diff_sentence_count             98.684211
file1_has_mixed_scripts         90.131579
file1_has_non_english_script    90.131579
file2_has_non_english_script    90.131579
file2_has_mixed_scripts         90.131579
diff_has_non_english_script     80.263158
diff_has_mixed_scripts          80.263158
dtype: float64

Columns to remove (>80% zeros): ['file1_has_non_english_script', 'file1_has_mixed_scripts', 'file2_has_non_english_script', 'file2_has_mixed_scripts', 'diff_sentence_count', 'diff_has_non_english_script', 'diff_has_mixed_scripts']
New shape after removing high-zero columns:
X_train: (152, 35)
X_val: (19, 35)


## Modeling

### 1. Decision Tree

In [25]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree, fitted on the training features and scored on
# the validation split.
tree_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)

y_pred = tree_model.predict(X_val)
print("Decision Tree: Validation Accuracy:", accuracy_score(y_val, y_pred))

Decision Tree: Validation Accuracy: 0.7894736842105263


### 2. Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, scored on the validation split.
rf_model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)

y_pred = rf_model.predict(X_val)
print("Random Forest: Validation Accuracy:", accuracy_score(y_val, y_pred))

Random Forest: Validation Accuracy: 0.8947368421052632


### 3. SVM

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Compare SVM kernels on standardised features and report the validation
# accuracy of each one.

# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

kernels = ["linear", "rbf", "poly", "sigmoid"]

for kernel in kernels:
    svm_model = SVC(kernel=kernel, random_state=42).fit(X_train_scaled, y_train)
    y_pred = svm_model.predict(X_val_scaled)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"SVM ({kernel}): Validation Accuracy: {accuracy:.4f}")

SVM (linear): Validation Accuracy: 0.8421
SVM (rbf): Validation Accuracy: 0.8421
SVM (poly): Validation Accuracy: 0.7895
SVM (sigmoid): Validation Accuracy: 0.8421


### 4. Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a raised iteration cap. It is fitted on X_train
# as-is, not on the standardised copy used for the SVMs.
log_reg_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train, y_train
)

y_pred = log_reg_model.predict(X_val)
print("Logistic Regression: Validation Accuracy:", accuracy_score(y_val, y_pred))

Logistic Regression: Validation Accuracy: 0.8947368421052632


### 5. Naive Bayes

In [29]:
# The features are continuous and can take negative values, so Gaussian Naive
# Bayes is the variant tried here.
from sklearn.naive_bayes import GaussianNB

# Uses the standardised matrices produced in the SVM cell.
nb_model = GaussianNB().fit(X_train_scaled, y_train)

y_pred = nb_model.predict(X_val_scaled)
print("Naive Bayes: Validation Accuracy:", accuracy_score(y_val, y_pred))

Naive Bayes: Validation Accuracy: 0.5789473684210527


### 6. Boosting Model

In [30]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# --- CatBoost (training log silenced) ---
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    verbose=0,
)
cat_model.fit(X_train, y_train)

y_pred = cat_model.predict(X_val)
print("CatBoost: Validation Accuracy:", accuracy_score(y_val, y_pred))

# --- LightGBM ---
lgb_model = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42,
)
lgb_model.fit(X_train, y_train)

y_pred = lgb_model.predict(X_val)
print("LightGBM: Validation Accuracy:", accuracy_score(y_val, y_pred))

# --- XGBoost ---
# Labels are shifted down by one for training and predictions are shifted back
# up before scoring. Presumably the labels are {1, 2} and XGBoost needs
# 0-based class ids; verify against the data.
y_train_xgboost = y_train - 1
xgb_model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb_model.fit(X_train, y_train_xgboost)

y_pred = xgb_model.predict(X_val) + 1
print("XGBoost: Validation Accuracy:", accuracy_score(y_val, y_pred))

CatBoost: Validation Accuracy: 0.8947368421052632
[LightGBM] [Info] Number of positive: 76, number of negative: 76
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1441
[LightGBM] [Info] Number of data points in the train set: 152, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM: Validation Accuracy: 0.8421052631578947
XGBoost: Validation Accuracy: 0.8421052631578947


## II. Data with TFIDF Features

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load the TF-IDF feature tables for the train and validation splits.
train_tfidf_df = pd.read_csv(train_tfidf_df_path)
val_tfidf_df = pd.read_csv(val_tfidf_df_path)

# Statistic features: drop the target/identifier columns and the sparse
# columns selected earlier. errors="ignore" tolerates names that are absent.
X_train_stat = train_stat_df.drop(columns=["label", "file_1", "file_2"]).drop(
    columns=columns_to_remove, errors="ignore"
)
X_val_stat = val_stat_df.drop(columns=["label", "file_1", "file_2"]).drop(
    columns=columns_to_remove, errors="ignore"
)

# Scale statistical features first (scaler fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_stat)
X_val_scaled = scaler.transform(X_val_stat)

# Concatenate scaled statistical features with TFIDF features.
# NOTE(review): this assumes the TF-IDF tables are row-aligned with the
# statistic tables. Confirm against the feature-generation step.
X_train = np.concatenate([X_train_scaled, train_tfidf_df.values], axis=1)
X_val = np.concatenate([X_val_scaled, val_tfidf_df.values], axis=1)
y_train = train_stat_df["label"]
y_val = val_stat_df["label"]
print(X_train.shape, X_val.shape)


(152, 85) (19, 85)


## Modeling

In [None]:
# Fits the same family of classifiers as section I on whatever X_train / y_train
# currently hold and prints each model's accuracy on X_val / y_val. In notebook
# order these are the statistic + TF-IDF matrices built in the previous cell.
# The classifier classes used here are imported in earlier cells, so this cell
# depends on those cells having been run.

# =============================================================================
# DECISION TREE MODEL
# =============================================================================
tree_model = DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=5)
tree_model.fit(X_train, y_train)

y_pred = tree_model.predict(X_val)
print("Decision Tree: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# RANDOM FOREST MODEL
# =============================================================================
rf_model = RandomForestClassifier(
    random_state=42, n_estimators=100, min_samples_split=5
)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_val)
print("Random Forest: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# SUPPORT VECTOR MACHINE (SVM) MODELS
# =============================================================================
# One SVM per kernel; X_train / X_val are used as-is (no scaling is applied in this cell).
kernels = ["linear", "rbf", "poly", "sigmoid"]

for kernel in kernels:
    svm_model = SVC(random_state=42, kernel=kernel)
    svm_model.fit(X_train, y_train)
    y_pred = svm_model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"SVM ({kernel}): Validation Accuracy: {accuracy:.4f}")

# =============================================================================
# LOGISTIC REGRESSION MODEL
# =============================================================================
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)
log_reg_model.fit(X_train, y_train)

y_pred = log_reg_model.predict(X_val)
print("Logistic Regression: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# CATBOOST MODEL
# =============================================================================
# verbose=0 silences CatBoost's per-iteration training log.
cat_model = CatBoostClassifier(
    random_state=42, iterations=1000, learning_rate=0.1, depth=6, verbose=0
)
cat_model.fit(X_train, y_train)

y_pred = cat_model.predict(X_val)
print("CatBoost: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# LIGHTGBM MODEL
# =============================================================================
lgb_model = LGBMClassifier(
    random_state=42,
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=31,
    force_col_wise=True,
)
lgb_model.fit(X_train, y_train)

y_pred = lgb_model.predict(X_val)
print("LightGBM: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# XGBOOST MODEL
# =============================================================================
# Labels are shifted down by one for training and predictions shifted back up
# before scoring. Presumably labels are {1, 2} and XGBoost needs 0-based class
# ids; TODO confirm against the data.
y_train_xgboost = y_train.copy() - 1
xgb_model = XGBClassifier(
    random_state=42, n_estimators=1000, learning_rate=0.1, max_depth=6
)
xgb_model.fit(X_train, y_train_xgboost)

y_pred = xgb_model.predict(X_val) + 1
print("XGBoost: Validation Accuracy:", accuracy_score(y_val, y_pred))


Decision Tree: Validation Accuracy: 0.7894736842105263
Random Forest: Validation Accuracy: 0.8421052631578947
SVM (linear): Validation Accuracy: 0.8421
SVM (rbf): Validation Accuracy: 0.8421
SVM (poly): Validation Accuracy: 0.8421
SVM (sigmoid): Validation Accuracy: 0.8421
Logistic Regression: Validation Accuracy: 0.8421052631578947
CatBoost: Validation Accuracy: 0.7894736842105263
[LightGBM] [Info] Number of positive: 76, number of negative: 76
[LightGBM] [Info] Total Bins 4062
[LightGBM] [Info] Number of data points in the train set: 152, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM: Validation Accuracy: 0.8421052631578947
XGBoost: Validation Accuracy: 0.8421052631578947


## II. Data with BOW Features

In [None]:
# Load the bag-of-words feature tables for the train and validation splits.
train_bow_df = pd.read_csv(train_bow_df_path)
val_bow_df = pd.read_csv(val_bow_df_path)

# Statistic features without the target/identifier columns and without the
# sparse columns selected earlier (names that are absent are ignored).
X_train_stat = train_stat_df.drop(columns=["label", "file_1", "file_2"]).drop(
    columns=columns_to_remove, errors="ignore"
)
X_val_stat = val_stat_df.drop(columns=["label", "file_1", "file_2"]).drop(
    columns=columns_to_remove, errors="ignore"
)

# Standardise the statistic features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_stat)
X_val_scaled = scaler.transform(X_val_stat)

# Append the BOW columns to the right of the scaled statistic features.
X_train = np.concatenate([X_train_scaled, train_bow_df.values], axis=1)
X_val = np.concatenate([X_val_scaled, val_bow_df.values], axis=1)
y_train = train_stat_df["label"]
y_val = val_stat_df["label"]

print(X_train.shape, X_val.shape)

(152, 85) (19, 85)


In [None]:
# Fits the same family of classifiers as the earlier sections on whatever
# X_train / y_train currently hold and prints each model's accuracy on
# X_val / y_val. In notebook order these are the statistic + BOW matrices built
# in the previous cell. The classifier classes used here are imported in
# earlier cells, so this cell depends on those cells having been run.

# =============================================================================
# DECISION TREE MODEL
# =============================================================================
tree_model = DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=5)
tree_model.fit(X_train, y_train)

y_pred = tree_model.predict(X_val)
print("Decision Tree: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# RANDOM FOREST MODEL
# =============================================================================
rf_model = RandomForestClassifier(
    random_state=42, n_estimators=100, min_samples_split=5
)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_val)
print("Random Forest: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# SUPPORT VECTOR MACHINE (SVM) MODELS
# =============================================================================
# One SVM per kernel; X_train / X_val are used as-is (no scaling is applied in this cell).
kernels = ["linear", "rbf", "poly", "sigmoid"]

for kernel in kernels:
    svm_model = SVC(random_state=42, kernel=kernel)
    svm_model.fit(X_train, y_train)
    y_pred = svm_model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"SVM ({kernel}): Validation Accuracy: {accuracy:.4f}")

# =============================================================================
# LOGISTIC REGRESSION MODEL
# =============================================================================
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)
log_reg_model.fit(X_train, y_train)

y_pred = log_reg_model.predict(X_val)
print("Logistic Regression: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# CATBOOST MODEL
# =============================================================================
# verbose=0 silences CatBoost's per-iteration training log.
cat_model = CatBoostClassifier(
    random_state=42, iterations=1000, learning_rate=0.1, depth=6, verbose=0
)
cat_model.fit(X_train, y_train)

y_pred = cat_model.predict(X_val)
print("CatBoost: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# LIGHTGBM MODEL
# =============================================================================
lgb_model = LGBMClassifier(
    random_state=42,
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=31,
    force_col_wise=True,
)
lgb_model.fit(X_train, y_train)

y_pred = lgb_model.predict(X_val)
print("LightGBM: Validation Accuracy:", accuracy_score(y_val, y_pred))

# =============================================================================
# XGBOOST MODEL
# =============================================================================
# Labels are shifted down by one for training and predictions shifted back up
# before scoring. Presumably labels are {1, 2} and XGBoost needs 0-based class
# ids; TODO confirm against the data.
y_train_xgboost = y_train.copy() - 1
xgb_model = XGBClassifier(
    random_state=42, n_estimators=1000, learning_rate=0.1, max_depth=6
)
xgb_model.fit(X_train, y_train_xgboost)

y_pred = xgb_model.predict(X_val) + 1
print("XGBoost: Validation Accuracy:", accuracy_score(y_val, y_pred))


Decision Tree: Validation Accuracy: 0.8421052631578947
Random Forest: Validation Accuracy: 0.8421052631578947
SVM (linear): Validation Accuracy: 0.7368
SVM (rbf): Validation Accuracy: 0.8947
SVM (poly): Validation Accuracy: 0.8421
SVM (sigmoid): Validation Accuracy: 0.8421
Logistic Regression: Validation Accuracy: 0.8421052631578947
CatBoost: Validation Accuracy: 0.8947368421052632
[LightGBM] [Info] Number of positive: 76, number of negative: 76
[LightGBM] [Info] Total Bins 4062
[LightGBM] [Info] Number of data points in the train set: 152, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM: Validation Accuracy: 0.8421052631578947
XGBoost: Validation Accuracy: 0.7894736842105263


### Hyper Params Tuning (Statistic + BOW)

In [None]:
import optuna


def objective_catboost(trial):
    """Optuna objective for CatBoost: validation accuracy of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy on the module-level ``X_val`` / ``y_val`` of a CatBoost model
        fitted on the module-level ``X_train`` / ``y_train``.
    """
    # Hyperparameter search space; values are sampled in the order written.
    sampled = dict(
        iterations=trial.suggest_int("iterations", 500, 2000),
        depth=trial.suggest_int("depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 10.0, log=True),
        random_strength=trial.suggest_float("random_strength", 1e-9, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 1.0),
        border_count=trial.suggest_int("border_count", 32, 255),
    )

    # Build and train the model with logging silenced and a fixed seed.
    model = CatBoostClassifier(verbose=0, random_state=42, **sampled)
    model.fit(X_train, y_train)

    # Score on the validation split.
    return accuracy_score(y_val, model.predict(X_val))

In [None]:
# Run the CatBoost hyperparameter search and report the best trial.
print("Bắt đầu tuning cho CatBoost...")
# Seed the sampler so the sequence of suggested hyperparameters, and therefore
# the reported best trial, is reproducible across kernel restarts.
study_catboost = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_catboost.optimize(objective_catboost, n_trials=50)  # run 50 trials


print("\nQuá trình tuning CatBoost hoàn tất!")
print(f"Best trial score (accuracy): {study_catboost.best_value:.4f}")
print("Best hyperparameters:")
for key, value in study_catboost.best_params.items():
    print(f"  {key}: {value}")
print("-" * 50)

In [None]:
def objective_svm(trial):
    """Optuna objective for an RBF SVM: validation accuracy of one sampled configuration.

    Args:
        trial: Optuna trial used to sample ``C`` and ``gamma``.

    Returns:
        Accuracy on the module-level ``X_val`` / ``y_val`` of an SVC fitted on
        the module-level ``X_train`` / ``y_train``.
    """
    # Hyperparameter search space: both values are sampled on a log scale.
    penalty = trial.suggest_float("C", 1e-2, 1e2, log=True)
    kernel_width = trial.suggest_float("gamma", 1e-4, 1e-1, log=True)

    # Build and train the model.
    model = SVC(C=penalty, gamma=kernel_width, kernel="rbf", random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the validation split.
    return accuracy_score(y_val, model.predict(X_val))


# Run the SVM hyperparameter search and report the best trial.
print("\nBắt đầu tuning cho SVM (RBF)...")
# Seed the sampler so the sequence of suggested hyperparameters, and therefore
# the best parameters used for the final model, is reproducible across kernel
# restarts.
study_svm = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_svm.optimize(
    objective_svm, n_trials=100, timeout=600
)  # stop after 100 trials or 10 minutes, whichever comes first

print("\nQuá trình tuning SVM hoàn tất!")
print(f"Best trial score (accuracy): {study_svm.best_value:.4f}")
print("Best hyperparameters:")
for key, value in study_svm.best_params.items():
    print(f"  {key}: {value}")
print("-" * 50)

[I 2025-08-21 23:08:17,097] A new study created in memory with name: no-name-ec0c9beb-746a-428c-9dce-44e328c2fccc
[I 2025-08-21 23:08:17,104] Trial 0 finished with value: 0.6842105263157895 and parameters: {'C': 55.08518668896078, 'gamma': 0.07173559798303256}. Best is trial 0 with value: 0.6842105263157895.
[I 2025-08-21 23:08:17,107] Trial 1 finished with value: 0.6842105263157895 and parameters: {'C': 0.8087790601484058, 'gamma': 0.021092949658385967}. Best is trial 0 with value: 0.6842105263157895.
[I 2025-08-21 23:08:17,111] Trial 2 finished with value: 0.7894736842105263 and parameters: {'C': 0.044668345692760486, 'gamma': 0.0025586678403146964}. Best is trial 2 with value: 0.7894736842105263.
[I 2025-08-21 23:08:17,114] Trial 3 finished with value: 0.8421052631578947 and parameters: {'C': 0.02864190179581928, 'gamma': 0.0012280421446768074}. Best is trial 3 with value: 0.8421052631578947.
[I 2025-08-21 23:08:17,117] Trial 4 finished with value: 0.8947368421052632 and parameters:


Bắt đầu tuning cho SVM (RBF)...


[I 2025-08-21 23:08:17,302] Trial 44 finished with value: 0.8947368421052632 and parameters: {'C': 7.387580805652581, 'gamma': 0.007055867227607033}. Best is trial 30 with value: 1.0.
[I 2025-08-21 23:08:17,308] Trial 45 finished with value: 0.7894736842105263 and parameters: {'C': 2.7976937508010087, 'gamma': 0.013944277462155387}. Best is trial 30 with value: 1.0.
[I 2025-08-21 23:08:17,314] Trial 46 finished with value: 0.9473684210526315 and parameters: {'C': 1.9213257394818228, 'gamma': 0.004215525423577248}. Best is trial 30 with value: 1.0.
[I 2025-08-21 23:08:17,320] Trial 47 finished with value: 0.8421052631578947 and parameters: {'C': 25.24656277018434, 'gamma': 0.0028338701257701764}. Best is trial 30 with value: 1.0.
[I 2025-08-21 23:08:17,326] Trial 48 finished with value: 0.6842105263157895 and parameters: {'C': 1.0184555248562452, 'gamma': 0.0287407224548322}. Best is trial 30 with value: 1.0.
[I 2025-08-21 23:08:17,332] Trial 49 finished with value: 0.9473684210526315 a


Quá trình tuning SVM hoàn tất!
Best trial score (accuracy): 1.0000
Best hyperparameters:
  C: 27.023593525639352
  gamma: 0.0038919385515236416
--------------------------------------------------


## III. Tạo Submission

In [None]:
# Rebuild the statistic + BOW matrices so that `scaler`, `X_train` and
# `y_train` are the ones the submission model is fitted with.
train_bow_df = pd.read_csv(train_bow_df_path)
val_bow_df = pd.read_csv(val_bow_df_path)

# Statistic features without the target/identifier columns and without the
# sparse columns selected earlier (names that are absent are ignored).
X_train_stat = train_stat_df.drop(columns=["label", "file_1", "file_2"]).drop(
    columns=columns_to_remove, errors="ignore"
)
X_val_stat = val_stat_df.drop(columns=["label", "file_1", "file_2"]).drop(
    columns=columns_to_remove, errors="ignore"
)

# Standardise the statistic features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_stat)
X_val_scaled = scaler.transform(X_val_stat)

# Append the BOW columns to the right of the scaled statistic features.
X_train = np.concatenate([X_train_scaled, train_bow_df.values], axis=1)
X_val = np.concatenate([X_val_scaled, val_bow_df.values], axis=1)
y_train = train_stat_df["label"]
y_val = val_stat_df["label"]

print(X_train.shape, X_val.shape)

In [46]:
import sys

# Make the project's `src` directory importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
src_dir = os.path.join(project_root, "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

from load_data import read_texts_from_dir

# Raw test texts; only the index of this frame is used later, as the submission ids.
df_test = read_texts_from_dir(
    os.path.join(project_root, "data/fake-or-real-the-impostor-hunt/data/test")
)

# Test features go through the same steps as the training features: drop the
# identifier columns and the sparse columns, apply the scaler that was fitted
# on the training split, then append the BOW columns.
test_bow_df = pd.read_csv(test_bow_df_path)
X_test_stat = test_stat_df.drop(columns=["file_1", "file_2"]).drop(
    columns=columns_to_remove, errors="ignore"
)
X_test_scaled = scaler.transform(X_test_stat)

X_test = np.concatenate([X_test_scaled, test_bow_df.values], axis=1)

# Predictions are later paired with df_test.index by position, so the two must
# have the same length.
# NOTE(review): this also assumes the feature CSV rows are in the same order as
# df_test. Confirm against the feature-generation step.
assert len(df_test) == X_test.shape[0], "test texts and test features differ in length"
print(X_test.shape)

Number of directories: 1068
(1068, 85)


In [49]:
# Refit the best configuration found by the Optuna study. The study only
# records the sampled values (C and gamma), so the fixed settings used inside
# `objective_svm` are repeated here explicitly. The final model then stays
# identical to the tuned one even if library defaults change.
best_svm_model = SVC(kernel="rbf", random_state=42, **study_svm.best_params)
best_svm_model.fit(X_train, y_train)

test_y_pred = best_svm_model.predict(X_test)

In [50]:
from pathlib import Path

# --- Build submission -------------------------------------------------
# One row per test item: the id comes from df_test's index and the predicted
# label is cast to int; rows are ordered by id before writing.
submission = pd.DataFrame(
    {"id": df_test.index, "real_text_id": test_y_pred.astype(int)}
)
submission = submission.sort_values("id")

# The file is written relative to the current working directory.
save_path = Path("submission_svm.csv")
submission.to_csv(save_path, index=False)
print(f"✅ Submission saved to {save_path.resolve()}")

✅ Submission saved to /home/thangquang09/CODE/CTAI_MachineLearning/notebooks/submission_svm.csv
