In [1]:
import pandas as pd
import numpy as np
# from thangquang.bert_logistic import read_texts_from_dir
import os

def read_texts_from_dir(dir_path):
    """
    Reads the texts from a given directory and saves them in the pd.DataFrame with columns ['id', 'file_1', 'file_2'].

    Params:
      dir_path (str): path to the directory with data
    """
    # Count number of directories in the provided path
    dir_count = sum(
        os.path.isdir(os.path.join(root, d))
        for root, dirs, _ in os.walk(dir_path)
        for d in dirs
    )
    data = [0 for _ in range(dir_count)]
    print(f"Number of directories: {dir_count}")

    # For each directory, read both file_1.txt and file_2.txt and save results to the list
    i = 0
    for folder_name in sorted(os.listdir(dir_path)):
        folder_path = os.path.join(dir_path, folder_name)
        if os.path.isdir(folder_path):
            try:
                with open(
                    os.path.join(folder_path, "file_1.txt"), "r", encoding="utf-8"
                ) as f1:
                    text1 = f1.read().strip()
                with open(
                    os.path.join(folder_path, "file_2.txt"), "r", encoding="utf-8"
                ) as f2:
                    text2 = f2.read().strip()
                index = int(folder_name[-4:])
                data[i] = (index, text1, text2)
                i += 1
            except Exception as e:
                print(f"Error reading directory {folder_name}: {e}")

    # Change list with results into pandas DataFrame
    df = pd.DataFrame(data, columns=["id", "file_1", "file_2"]).set_index("id")
    return df

In [2]:
df_train = read_texts_from_dir(r"C:\workspace\AI\Machine learning\CTAI_MachineLearning\data\train")
df_test = read_texts_from_dir(r"C:\workspace\AI\Machine learning\CTAI_MachineLearning\data\test")
df_train_gt = pd.read_csv(r"C:\workspace\AI\Machine learning\CTAI_MachineLearning\data\train.csv")
y_train = df_train_gt["real_text_id"].values
df_train['label'] = df_train_gt["real_text_id"]

Number of directories: 95
Number of directories: 1068


In [3]:
df_train.head()

Unnamed: 0_level_0,file_1,file_2,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2


In [4]:
from feature_engineering import prepare_data_for_model, EmbeddingExtractor
embedding_extractor = EmbeddingExtractor(model_name='google-bert/bert-base-uncased')
feature_matrix = prepare_data_for_model(df_train, embedding_extractor=embedding_extractor)
feature_matrix.shape

Using 8 CPU cores for feature extraction...
Step 1: Extracting top importance features...
Using 8 cores for top features extraction...


Extracting top features (single-threaded):   0%|          | 0/95 [00:00<?, ?it/s]

Step 2: Extracting rule-based features...
Using 8 cores for rule-based features extraction...


Extracting rule-based features (single-threaded):   0%|          | 0/95 [00:00<?, ?it/s]

Step 4: Extracting embedding features...


Extracting embeddings:   0%|          | 0/95 [00:00<?, ?it/s]

Step 6: Combining features...
Final feature matrix shape: (95, 853)
Top features: 25, Rule: 60, Embedding: 768


(95, 853)

In [None]:
feature_matrix_test = prepare_data_for_model(df_test, embedding_extractor=embedding_extractor)
feature_matrix_test.shape

Using 8 CPU cores for feature extraction...
Step 1: Extracting top importance features...
Using 8 cores for top features extraction...


Extracting top features (multi-threaded):   0%|          | 0/1068 [00:00<?, ?it/s]

In [None]:
# X_test = feature_matrix_test.copy()


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(feature_matrix, y_train, test_size=0.2, random_state=42)

In [9]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

((76, 853), (19, 853), (76,), (19,), (1068, 853))

### Lựa chọn mô hình phân loại với dữ liệu nhiều thuộc tính (853 features)
- **Tree-based models** như Random Forest, Gradient Boosting (XGBoost, LightGBM, CatBoost) rất phù hợp với dữ liệu nhiều chiều, không cần chuẩn hóa đặc trưng, tự động chọn thuộc tính quan trọng và chống overfitting tốt.
- **Logistic Regression** với regularization (L1/L2) cũng có thể thử, nhưng hiệu quả thường kém hơn tree-based khi dữ liệu phi tuyến tính và nhiều thuộc tính không quan trọng.
- **SVM** (Support Vector Machine) có thể dùng, nhưng với số chiều lớn sẽ tốn nhiều tài nguyên và thời gian.
- **Neural Network** (MLP) chỉ nên dùng nếu dữ liệu rất lớn và đã chuẩn hóa tốt.

**Khuyến nghị:**
- Ưu tiên thử Random Forest hoặc LightGBM/XGBoost đầu tiên.
- Có thể dùng Logistic Regression để baseline và kiểm tra feature importance.
- Nên dùng cross-validation để chọn mô hình tối ưu.

## RandomForestClassifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rdc = RandomForestClassifier()
rdc.fit(X_train, y_train)
y_pred = rdc.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.83      0.91        12
           2       0.78      1.00      0.88         7

    accuracy                           0.89        19
   macro avg       0.89      0.92      0.89        19
weighted avg       0.92      0.89      0.90        19



In [11]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [i for i in range(100, 201, 20)],
    'max_depth': [i for i in range(1, 21)],
}
rdc = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rdc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [13]:
grid_search.best_params_

{'max_depth': 9, 'n_estimators': 100}

In [12]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.75      0.86        12
           2       0.70      1.00      0.82         7

    accuracy                           0.84        19
   macro avg       0.85      0.88      0.84        19
weighted avg       0.89      0.84      0.84        19



In [15]:
df_train.columns

Index(['file_1', 'file_2', 'label', 'cleaned_file_1', 'cleaned_file_2',
       'text'],
      dtype='object')

In [18]:
y_test_pred = best_model.predict(X_test)
submission = pd.DataFrame({
    'id': [i for i in range(0, len(y_test_pred))],
    'real_text_id': y_test_pred
})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,real_text_id
0,0,2
1,1,2
2,2,1
3,3,2
4,4,1


## XGBoot

In [6]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 4.2 MB/s eta 0:00:14
   -- ------------------------------------- 4.2/56.8 MB 14.0 MB/s eta 0:00:04
   ------ --------------------------------- 9.4/56.8 MB 19.9 MB/s eta 0:00:03
   ---------- ----------------------------- 14.7/56.8 MB 21.6 MB/s eta 0:00:02
   -------------- ------------------------- 20.4/56.8 MB 22.2 MB/s eta 0:00:02
   ----------------- ---------------------- 24.9/56.8 MB 22.0 MB/s eta 0:00:02
   -------------------- ------------------- 28.8/56.8 MB 21.6 MB/s eta 0:00:02
   ----------------------- ---------------- 33.6/56.8 MB 21.6 MB/s eta 0:00:02
   -------------------------- ------------- 38.3/56.8 MB 21.8 MB/s eta 0:00:01
   ------------------------------ --------- 43.8/56.8 MB 22.2 MB/s eta 0:00:

In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

bst = XGBClassifier()
param_grid = {
    'n_estimators': [i for i in range(100, 201, 20)],
    'max_depth': [2, 4, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'objective': ['binary:logistic']
}
grid_search = GridSearchCV(n_jobs=-1, estimator=bst, param_grid=param_grid, cv=5, verbose=2)

# fit model
y_train_norm = y_train - 1  # Normalize labels to start from 0
grid_search.fit(X_train, y_train_norm)
# make predictions
preds = grid_search.predict(X_val) + 1  # Reverse normalization
print(classification_report(y_val, preds))

Fitting 5 folds for each of 54 candidates, totalling 270 fits
              precision    recall  f1-score   support

           1       0.88      0.58      0.70        12
           2       0.55      0.86      0.67         7

    accuracy                           0.68        19
   macro avg       0.71      0.72      0.68        19
weighted avg       0.75      0.68      0.69        19



In [11]:
grid_search.best_params_

{'learning_rate': 0.1,
 'max_depth': 2,
 'n_estimators': 100,
 'objective': 'binary:logistic'}

## SVM

In [15]:
from sklearn.svm import SVC
svc = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly']
}
grid_search = GridSearchCV(n_jobs=-1, estimator=svc, param_grid=param_grid, cv=5, verbose=2)
grid_search.fit(X_train, y_train_norm)
y_pred = grid_search.predict(X_val) + 1  # Reverse normalization

print(classification_report(y_val, y_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
              precision    recall  f1-score   support

           1       1.00      0.75      0.86        12
           2       0.70      1.00      0.82         7

    accuracy                           0.84        19
   macro avg       0.85      0.88      0.84        19
weighted avg       0.89      0.84      0.84        19



In [13]:
grid_search.best_params_

{'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}

## Logistỉc Regression

In [16]:
from sklearn.linear_model import LogisticRegression

lgt = LogisticRegression()
lgt.fit(X_train, y_train_norm)
y_pred = lgt.predict(X_val) + 1  # Reverse normalization

print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           1       1.00      0.83      0.91        12
           2       0.78      1.00      0.88         7

    accuracy                           0.89        19
   macro avg       0.89      0.92      0.89        19
weighted avg       0.92      0.89      0.90        19



In [None]:
y_pred_test = lgt.predict(X_test) + 1  # Reverse normalization

submission = pd.DataFrame({
    'id': [i for i in range(0, len(y_pred_test))],
    'real_text_id': y_pred_test
})
# submission.to_csv('submission_logistic_regression.csv', index=False)
len(y_pred_test)

19