## I. Read Data (Only Statistic Features)

In [35]:
import os
import sys
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
# Find project root directory automatically
def find_project_root():
    current_dir = os.getcwd()
    while current_dir != '/':
        if any(marker in os.listdir(current_dir) for marker in ['.gitignore', 'requirements.txt', 'setup.py', 'pyproject.toml']):
            return current_dir
        current_dir = os.path.dirname(current_dir)
    return os.getcwd()  # fallback to current directory

project_root = find_project_root()

train_stat_df_path = os.path.join(project_root, 'data/train_statistic_features.csv')
val_stat_df_path = os.path.join(project_root, 'data/val_statistic_features.csv')
train_tfidf_df_path = os.path.join(project_root, 'data/train_tfidf_features.csv')
val_tfidf_df_path = os.path.join(project_root, 'data/val_tfidf_features.csv')
train_bow_df_path = os.path.join(project_root, 'data/train_bow_features.csv')
val_bow_df_path = os.path.join(project_root, 'data/val_bow_features.csv')
test_stat_df_path = os.path.join(project_root, 'data/test_statistic_features.csv')
test_tfidf_df_path = os.path.join(project_root, 'data/test_tfidf_features.csv')
test_bow_df_path = os.path.join(project_root, 'data/test_bow_features.csv')

train_stat_df = pd.read_csv(train_stat_df_path)
val_stat_df = pd.read_csv(val_stat_df_path)
test_stat_df = pd.read_csv(test_stat_df_path)



print("Shape Train:", train_stat_df.shape)
print("Shape Validation:", val_stat_df.shape)
print("Columns Train:", train_stat_df.columns)

Shape Train: (152, 45)
Shape Validation: (19, 45)
Columns Train: Index(['file_1', 'file_2', 'label', 'file1_char_count', 'file1_word_count',
       'file1_sentence_count', 'file1_avg_sentence_length',
       'file1_english_word_ratio', 'file1_has_non_english_script',
       'file1_has_mixed_scripts', 'file1_unicode_control_chars',
       'file1_num_count', 'file1_repetition_score', 'file1_perplexity_score',
       'file1_ttr_ratio', 'file2_char_count', 'file2_word_count',
       'file2_sentence_count', 'file2_avg_sentence_length',
       'file2_english_word_ratio', 'file2_has_non_english_script',
       'file2_has_mixed_scripts', 'file2_unicode_control_chars',
       'file2_num_count', 'file2_repetition_score', 'file2_perplexity_score',
       'file2_ttr_ratio', 'diff_char_count', 'ratio_char_count',
       'diff_word_count', 'ratio_word_count', 'diff_sentence_count',
       'ratio_sentence_count', 'diff_avg_sentence_length',
       'diff_english_word_ratio', 'diff_has_non_english_scri

Có thể đọc mô tả các features ở [đây](../data/README.md#processed-features-information)

In [36]:
X_train, y_train = train_stat_df.drop(columns=["label", "file_1", "file_2"]), train_stat_df["label"]
X_val, y_val = val_stat_df.drop(columns=["label", "file_1", "file_2"]), val_stat_df["label"]

## 

In [37]:
# Check for columns that contain only zeros
zero_columns = (X_train == 0).all()
zero_column_names = zero_columns[zero_columns].index.tolist()

print(f"Number of columns with all zeros: {len(zero_column_names)}")
print(f"Zero columns: {zero_column_names}")

# Also check the percentage of zeros in each column
zero_percentages = (X_train == 0).mean() * 100
high_zero_columns = zero_percentages[zero_percentages > 70].sort_values(ascending=False)

print(f"\nColumns with >90% zeros:")
print(high_zero_columns)

Number of columns with all zeros: 0
Zero columns: []

Columns with >90% zeros:
diff_sentence_count             98.684211
file1_has_mixed_scripts         90.131579
file1_has_non_english_script    90.131579
file2_has_non_english_script    90.131579
file2_has_mixed_scripts         90.131579
diff_has_non_english_script     80.263158
diff_has_mixed_scripts          80.263158
dtype: float64


## Modeling

In [38]:
from catboost import CatBoostClassifier
from IPython.display import display
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier


# Initialize models
models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'SVC': SVC(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'catboost': CatBoostClassifier(random_state=42, verbose=0),
    'lightgbm': LGBMClassifier(random_state=42, verbose=0),
    'xgboost': XGBClassifier(random_state=42, verbosity=0)
}

# Initialize scaler
scaler = StandardScaler()

# Scale the features
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train and evaluate models
results = {}

y_train = y_train - 1
for model_name, model in models.items():
    # print(f"\n{'='*50}")
    # print(f"Training {model_name}")
    # print(f"{'='*50}")
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = model.predict(X_val_scaled)
    y_pred = y_pred + 1
    # Calculate metrics
    accuracy = accuracy_score(y_val, y_pred)
    
    # Store results
    results[model_name] = {
        'accuracy': accuracy,
        'predictions': y_pred
    }
    
    # print(f"Accuracy: {accuracy:.4f}")

# Convert results to DataFrame and sort by accuracy
results_df = pd.DataFrame.from_dict(
    {model: {'accuracy': result['accuracy']} for model, result in results.items()}, 
    orient='index'
).sort_values('accuracy', ascending=False)

# Summary of results
print(f"\n{'='*60}")
print("SUMMARY OF MODEL PERFORMANCE (Sorted by Accuracy)")
print(f"{'='*60}")
display(results_df)


SUMMARY OF MODEL PERFORMANCE (Sorted by Accuracy)


Unnamed: 0,accuracy
lightgbm,0.894737
SVC,0.842105
LogisticRegression,0.842105
RandomForest,0.842105
KNN,0.842105
catboost,0.842105
xgboost,0.842105


In [39]:

def read_texts_from_dir(dir_path):
    """
    Reads the texts from a given directory and saves them in the pd.DataFrame with columns ['id', 'file_1', 'file_2'].

    Params:
      dir_path (str): path to the directory with data
    """
    # Count number of directories in the provided path
    dir_count = sum(
        os.path.isdir(os.path.join(root, d))
        for root, dirs, _ in os.walk(dir_path)
        for d in dirs
    )
    data = [0 for _ in range(dir_count)]
    print(f"Number of directories: {dir_count}")

    # For each directory, read both file_1.txt and file_2.txt and save results to the list
    i = 0
    for folder_name in sorted(os.listdir(dir_path)):
        folder_path = os.path.join(dir_path, folder_name)
        if os.path.isdir(folder_path):
            try:
                with open(
                    os.path.join(folder_path, "file_1.txt"), "r", encoding="utf-8"
                ) as f1:
                    text1 = f1.read().strip()
                with open(
                    os.path.join(folder_path, "file_2.txt"), "r", encoding="utf-8"
                ) as f2:
                    text2 = f2.read().strip()
                index = int(folder_name[-4:])
                data[i] = (index, text1, text2)
                i += 1
            except Exception as e:
                print(f"Error reading directory {folder_name}: {e}")

    # Change list with results into pandas DataFrame
    df = pd.DataFrame(data, columns=["id", "file_1", "file_2"]).set_index("id")
    return df
    
def make_submission(y_pred, file_name):
    
    from pathlib import Path
    df_test = read_texts_from_dir(os.path.join(project_root, 'data/fake-or-real-the-impostor-hunt/data/test'))
    # --- Build submission -------------------------------------------------
    submission = pd.DataFrame({
        "id": df_test.index,
        "real_text_id": y_pred.astype(int)
    }).sort_values("id")
    # submission['real_text_id'] = submission['real_text_id'].map({0: 1, 1: 2})


    save_path = Path(f"{file_name}.csv")
    submission.to_csv(save_path, index=False)
    print(f"✅ Submission saved to {save_path.resolve()}")

In [41]:
X_test_scaled = scaler.transform(test_stat_df.drop(columns=["file_1", "file_2"]))
y_pred = LGBMClassifier(random_state=42, verbose=0).fit(X_train_scaled, y_train).predict(X_test_scaled)



In [None]:
y_pred


array([2, 2, 1, ..., 1, 2, 1], shape=(1068,))

In [None]:

make_submission(y_pred, file_name='test_pipeline')

Number of directories: 1068
✅ Submission saved to /home/thangquang09/CODE/CTAI_MachineLearning/notebooks/test_pipeline.csv


## II. Data with TFIDF Features

In [None]:
train_tfidf_df = pd.read_csv(train_tfidf_df_path)
val_tfidf_df = pd.read_csv(val_tfidf_df_path)


full_train_df = pd.concat([train_stat_df, train_tfidf_df], axis=1)
full_val_df = pd.concat([val_stat_df, val_tfidf_df], axis=1)

X_train, y_train = full_train_df.drop(columns=["label"]), full_train_df["label"]
X_val, y_val = full_val_df.drop(columns=["label"]), full_val_df["label"]

print(X_train.shape, X_val.shape)

(152, 92) (19, 92)


## II. Data with BOW Features

In [None]:
train_bow_df = pd.read_csv(train_bow_df_path)
val_bow_df = pd.read_csv(val_bow_df_path)

full_train_df = pd.concat([train_stat_df, train_bow_df], axis=1)
full_val_df = pd.concat([val_stat_df, val_bow_df], axis=1)

X_train, y_train = full_train_df.drop(columns=["label"]), full_train_df["label"]
X_val, y_val = full_val_df.drop(columns=["label"]), full_val_df["label"]

print(X_train.shape, X_val.shape)

(152, 92) (19, 92)


## Data with Pertained Model Embedding

In [42]:
from transformers import AutoModel, AutoTokenizer
import torch
from tqdm import tqdm

class UniversalEmbeddingExtractor:
    def __init__(
        self, model_name="bert-base-uncased", max_length=512, device=None, batch_size=32
    ):
        self.model_name = model_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.device = (
            device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        )

        print(f"Loading embedding model: {model_name}")

        # Detect model type and load accordingly
        self._load_model()

    def _load_model(self):
        """Load model based on model name/type."""
        model_name_lower = self.model_name.lower()

        # Check if it's a sentence-transformers model
        if any(
            keyword in model_name_lower
            for keyword in [
                "sentence-transformers",
                "all-minilm",
                "all-mpnet",
                "bge-",
                "e5-",
            ]
        ):
            self._load_sentence_transformer()
        # Check if it's a Vietnamese model
        elif any(
            keyword in model_name_lower
            for keyword in ["vinai", "vietnamese", "phobert"]
        ):
            self._load_transformers_model()
        # Check if it's OpenAI model
        elif "openai" in model_name_lower or "text-embedding" in model_name_lower:
            self._load_openai_model()
        # Default to transformers for BERT, RoBERTa, etc.
        else:
            self._load_transformers_model()

    def _load_sentence_transformer(self):
        """Load sentence-transformers model."""
        try:
            from sentence_transformers import SentenceTransformer

            self.model_type = "sentence_transformer"
            self.model = SentenceTransformer(self.model_name, device=self.device)
            self.tokenizer = None  # Not needed for sentence-transformers
            print(f"Loaded as SentenceTransformer model")
        except ImportError:
            print("sentence-transformers not installed, falling back to transformers")
            self._load_transformers_model()

    def _load_transformers_model(self):
        """Load standard transformers model (BERT, RoBERTa, etc.)."""
        self.model_type = "transformers"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model.to(self.device)
        self.model.eval()
        print(f"Loaded as Transformers model")

    def _load_openai_model(self):
        """Load OpenAI embedding model."""
        try:
            import openai

            self.model_type = "openai"
            self.model = None  # Will use API
            self.tokenizer = None
            print(f"Loaded as OpenAI model")
        except ImportError:
            print("openai package not installed, falling back to transformers")
            self._load_transformers_model()

    def get_embeddings(self, texts):
        """Extract embeddings for a list of texts using the appropriate method."""
        if self.model_type == "sentence_transformer":
            return self._get_sentence_transformer_embeddings(texts)
        elif self.model_type == "transformers":
            return self._get_transformers_embeddings(texts)
        elif self.model_type == "openai":
            return self._get_openai_embeddings(texts)
        else:
            raise ValueError(f"Unknown model type: {self.model_type}")

    def _get_sentence_transformer_embeddings(self, texts):
        """Extract embeddings using sentence-transformers."""
        embeddings = []

        # Process in batches
        for i in tqdm(
            range(0, len(texts), self.batch_size), desc="Extracting embeddings"
        ):
            batch_texts = texts[i : i + self.batch_size]
            batch_embeddings = self.model.encode(
                batch_texts,
                convert_to_tensor=False,
                normalize_embeddings=True,
                show_progress_bar=False,
            )
            embeddings.extend(batch_embeddings)

        return np.array(embeddings)

    def _get_transformers_embeddings(self, texts):
        """Extract embeddings using transformers (BERT-style)."""
        embeddings = []

        with torch.no_grad():
            
            for i in tqdm(
                range(0, len(texts), self.batch_size), desc="Extracting embeddings"
            ):
                batch_texts = texts[i : i + self.batch_size]
                batch_embeddings = []

                for text in batch_texts:
                    # Tokenize
                    inputs = self.tokenizer(
                        text,
                        max_length=self.max_length,
                        truncation=True,
                        padding="max_length",
                        return_tensors="pt",
                    )

                    # Move to device
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}

                    # Get embeddings
                    outputs = self.model(**inputs)

                    # Use [CLS] token embedding or mean pooling
                    if hasattr(outputs, "last_hidden_state"):
                        # For BERT-style models, use [CLS] token
                        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                    elif hasattr(outputs, "pooler_output"):
                        # Some models have pooler output
                        cls_embedding = outputs.pooler_output.cpu().numpy()
                    else:
                        # Fallback: mean pooling
                        cls_embedding = (
                            outputs.last_hidden_state.mean(dim=1).cpu().numpy()
                        )

                    batch_embeddings.append(cls_embedding.flatten())

                embeddings.extend(batch_embeddings)

        return np.array(embeddings)

    def _get_openai_embeddings(self, texts):
        """Extract embeddings using OpenAI API."""
        import openai

        embeddings = []

        for i in tqdm(
            range(0, len(texts), self.batch_size), desc="Extracting OpenAI embeddings"
        ):
            batch_texts = texts[i : i + self.batch_size]

            try:
                response = openai.Embedding.create(
                    model=self.model_name, input=batch_texts
                )
                batch_embeddings = [item["embedding"] for item in response["data"]]
                embeddings.extend(batch_embeddings)
            except Exception as e:
                print(f"Error with OpenAI API: {e}")
                # Fallback to zero embeddings
                embeddings.extend(
                    [np.zeros(1536) for _ in batch_texts]
                )  # OpenAI default dim

        return np.array(embeddings)


def extract_embedding_features(
    df: pd.DataFrame, embedding_extractor: UniversalEmbeddingExtractor
) -> np.ndarray:
    """Extract embedding features for both file_1 and file_2, including diff features."""
    print("Extracting embedding features for file_1...")
    emb_f1 = embedding_extractor.get_embeddings(df["file_1"].tolist())

    print("Extracting embedding features for file_2...")
    emb_f2 = embedding_extractor.get_embeddings(df["file_2"].tolist())

    # Create difference and similarity features
    emb_diff = emb_f1 - emb_f2
    emb_abs_diff = np.abs(emb_diff)

    # Cosine similarity
    cosine_sim = np.sum(emb_f1 * emb_f2, axis=1, keepdims=True) / (
        np.linalg.norm(emb_f1, axis=1, keepdims=True)
        * np.linalg.norm(emb_f2, axis=1, keepdims=True)
        + 1e-8
    )

    # Euclidean distance
    euclidean_dist = np.linalg.norm(emb_diff, axis=1, keepdims=True)

    # Concatenate all embedding features
    embedding_features = np.concatenate(
        [emb_f1, emb_f2, emb_diff, emb_abs_diff, cosine_sim, euclidean_dist], axis=1
    )

    return embedding_features.astype(np.float32)

In [49]:
embedding_features = extract_embedding_features(train_stat_df, embedding_extractor=UniversalEmbeddingExtractor(model_name='intfloat/multilingual-e5-small'))

val_embedding_features = extract_embedding_features(val_stat_df, embedding_extractor=UniversalEmbeddingExtractor(model_name='intfloat/multilingual-e5-small'))

Loading embedding model: intfloat/multilingual-e5-small
Loaded as SentenceTransformer model
Extracting embedding features for file_1...


Extracting embeddings: 100%|██████████| 5/5 [00:10<00:00,  2.09s/it]


Extracting embedding features for file_2...


Extracting embeddings: 100%|██████████| 5/5 [00:10<00:00,  2.09s/it]


Loading embedding model: intfloat/multilingual-e5-small
Loaded as SentenceTransformer model
Extracting embedding features for file_1...


Extracting embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


Extracting embedding features for file_2...


Extracting embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]


In [50]:
# concat with X_Train_scaled
X_train_new = np.concatenate([X_train_scaled, embedding_features], axis=1)
X_val_new = np.concatenate([X_val_scaled, val_embedding_features], axis=1)

In [55]:
model = LGBMClassifier(random_state=42, verbose=0)
model.fit(X_train_new, y_train)

y_pred = model.predict(X_val_new) + 1
accuracy_score(y_val, y_pred)



0.8947368421052632

In [56]:

# Thay thế cell hiện tại bằng code này:

# ===== Chuẩn bị data đúng cách =====
# Đảm bảo dùng cùng một DataFrame
print("Original shapes:")
print(f"train_stat_df: {train_stat_df.shape}")
print(f"val_stat_df: {val_stat_df.shape}")

# Extract embedding features từ chính các DataFrame đã được chuẩn bị
print("\nExtracting embedding features...")
embedding_extractor = UniversalEmbeddingExtractor(model_name='intfloat/multilingual-e5-small')

# Extract cho train data
train_embedding_features = extract_embedding_features(train_stat_df, embedding_extractor)

# Extract cho validation data  
val_embedding_features = extract_embedding_features(val_stat_df, embedding_extractor)

print(f"Train embedding features shape: {train_embedding_features.shape}")
print(f"Val embedding features shape: {val_embedding_features.shape}")

# ===== Chuẩn bị X, y từ cùng DataFrame =====
X_train_stat = train_stat_df.drop(columns=["label", "file_1", "file_2"])
y_train = train_stat_df["label"] - 1  # Convert to 0,1

X_val_stat = val_stat_df.drop(columns=["label", "file_1", "file_2"])  
y_val = val_stat_df["label"]

print(f"X_train_stat shape: {X_train_stat.shape}")
print(f"X_val_stat shape: {X_val_stat.shape}")

# ===== Scale statistical features =====
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_stat)
X_val_scaled = scaler.transform(X_val_stat)

# ===== Kết hợp statistical + embedding features =====
X_train_combined = np.concatenate([X_train_scaled, train_embedding_features], axis=1)
X_val_combined = np.concatenate([X_val_scaled, val_embedding_features], axis=1)

print(f"Combined train features shape: {X_train_combined.shape}")
print(f"Combined val features shape: {X_val_combined.shape}")

# ===== Train model =====
model = LGBMClassifier(random_state=42, verbose=0)
model.fit(X_train_combined, y_train)

y_pred = model.predict(X_val_combined) + 1
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy with combined features: {accuracy:.4f}")

Original shapes:
train_stat_df: (152, 45)
val_stat_df: (19, 45)

Extracting embedding features...
Loading embedding model: intfloat/multilingual-e5-small
Loaded as SentenceTransformer model
Extracting embedding features for file_1...


Extracting embeddings: 100%|██████████| 5/5 [00:10<00:00,  2.19s/it]


Extracting embedding features for file_2...


Extracting embeddings: 100%|██████████| 5/5 [00:10<00:00,  2.12s/it]


Extracting embedding features for file_1...


Extracting embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


Extracting embedding features for file_2...


Extracting embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.36s/it]

Train embedding features shape: (152, 1538)
Val embedding features shape: (19, 1538)
X_train_stat shape: (152, 42)
X_val_stat shape: (19, 42)
Combined train features shape: (152, 1580)
Combined val features shape: (19, 1580)
Accuracy with combined features: 0.8947



