In [8]:
!pip install --q numpy pandas scipy tqdm nltk textstat torch transformers

In [9]:
# ----------------------------- IMPORTS ------------------------------------
import math
import os
import random
import re
import warnings
from collections import Counter
# from pathlib import Path
from multiprocessing import Pool, cpu_count
# from functools import partial

# import joblib
# import lightgbm as lgb

# NLP
import nltk
import numpy as np
# import optuna
import pandas as pd
import textstat

# Deep Learning
import torch
# from langdetect import DetectorFactory, detect
# from langdetect.lang_detect_exception import LangDetectException
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.sparse import csr_matrix, hstack
# from sklearn.feature_selection import SelectKBest, f_classif
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# ML & utils
# from sklearn.model_selection import GridSearchCV, StratifiedKFold
# from sklearn.pipeline import Pipeline
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

warnings.filterwarnings("ignore")
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("stopwords", quiet=True)

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# DetectorFactory.seed = SEED

#----------------------------- BASE FUNCTION ------------------------------

def read_texts_from_dir(dir_path):
    """
    Reads the texts from a given directory and saves them in the pd.DataFrame with columns ['id', 'file_1', 'file_2'].

    Params:
      dir_path (str): path to the directory with data
    """
    # Count number of directories in the provided path
    dir_count = sum(
        os.path.isdir(os.path.join(root, d))
        for root, dirs, _ in os.walk(dir_path)
        for d in dirs
    )
    data = [0 for _ in range(dir_count)]
    print(f"Number of directories: {dir_count}")

    # For each directory, read both file_1.txt and file_2.txt and save results to the list
    i = 0
    for folder_name in sorted(os.listdir(dir_path)):
        folder_path = os.path.join(dir_path, folder_name)
        if os.path.isdir(folder_path):
            try:
                with open(
                    os.path.join(folder_path, "file_1.txt"), "r", encoding="utf-8"
                ) as f1:
                    text1 = f1.read().strip()
                with open(
                    os.path.join(folder_path, "file_2.txt"), "r", encoding="utf-8"
                ) as f2:
                    text2 = f2.read().strip()
                index = int(folder_name[-4:])
                data[i] = (index, text1, text2)
                i += 1
            except Exception as e:
                print(f"Error reading directory {folder_name}: {e}")

    # Change list with results into pandas DataFrame
    df = pd.DataFrame(data, columns=["id", "file_1", "file_2"]).set_index("id")
    return df

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # 2. Xóa các ký tự không mong muốn nhưng giữ lại ' và - nếu ở trong từ
    #   - Cho phép: chữ, số, khoảng trắng, ', -
    text = re.sub(r"[^a-z0-9\s'\-]", " ", text)
    # 3. Chuẩn hoá khoảng trắng
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ----------------------------- FEATURE ENGINEERING ------------------------------

def compute_advanced_features(text: str) -> dict:
    """
    Tính toán các features đếm và tỉ lệ, ... cho real or fake detection.
    """
    if not isinstance(text, str) or not text.strip():
        return {f: 0 for f in [
            'unique_word_count_ratio', 'latin_ratio', 'digit_count', 
            'flesch_reading_ease', 'dale_chall_readability',
            'coleman_liau_index', 'short_word_count_ratio', 'uppercase_ratio',
            'english_ratio', 'perplexity_score', 'sentence_count',
            'word_count', 'avg_word_length'
        ]}
    
    # Basic text stats
    cleaned_text = clean_text(text)
    words = cleaned_text.split()
    word_count = len(words)
    unique_words = len(set(words))
    char_count = len(text)

    # 1. Unique word count ratio (top feature!)
    unique_word_count_ratio = unique_words / max(word_count, 1)
    
    # 2. Latin ratio (character-based)
    latin_chars = len(re.findall(r'[a-zA-Z]', text))
    latin_ratio = latin_chars / max(char_count, 1)
    
    # 3. Digit count
    digit_count = len(re.findall(r'\d', text))
    
    # 4. Readability scores using textstat
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    dale_chall_readability = textstat.dale_chall_readability_score(text)
    coleman_liau_index = textstat.coleman_liau_index(text)

    # 5. Short word ratio
    short_words = [w for w in words if len(w) <= 3]
    short_word_count_ratio = len(short_words) / max(word_count, 1)

    # 6. Uppercase ratio
    uppercase_chars = len(re.findall(r'[A-Z]', text))
    uppercase_ratio = uppercase_chars / max(char_count, 1)

    # 7. English ratio (approximate using common English patterns)
    english_words = len(re.findall(r'\b[a-zA-Z]+\b', cleaned_text))
    english_ratio = english_words / max(word_count, 1)

    # 8. Simple perplexity approximation (entropy-based)
    def calculate_perplexity(words):
        if not words:
            return 0
        
        word_freq = Counter(words)
        total_words = len(words)  # Dùng total occurrences thay vì unique
        
        # Calculate entropy
        entropy = 0
        for freq in word_freq.values():
            prob = freq / total_words
            entropy -= prob * math.log2(prob)
        
        return 2 ** entropy if entropy > 0 else 1
    perplexity_score = calculate_perplexity(words)


    # 9. Sentence count
    sentences = re.split(r'[.!?]+', text)
    sentence_count = len([s for s in sentences if s.strip()])

    # 10. Average word length
    avg_word_length = sum(len(w) for w in words) / max(word_count, 1)
    
    return {
        'unique_word_count_ratio': unique_word_count_ratio,
        'latin_ratio': latin_ratio,
        'digit_count': digit_count,
        'flesch_reading_ease': flesch_reading_ease,
        'dale_chall_readability': dale_chall_readability,
        'coleman_liau_index': coleman_liau_index,
        'short_word_count_ratio': short_word_count_ratio,
        'uppercase_ratio': uppercase_ratio,
        'english_ratio': english_ratio,
        'perplexity_score': perplexity_score,
        'sentence_count': sentence_count,
        'word_count': word_count,
        'avg_word_length': avg_word_length
    }

def process_row_top_features(row_data):
    """Process single row for top features extraction (for multiprocessing)."""
    text1, text2 = row_data
    
    # Get features for both texts
    f1 = compute_advanced_features(text1)
    f2 = compute_advanced_features(text2)
    
    # Create difference and ratio features (theo pattern từ biểu đồ)
    feature_row = []
    
    # 1. unique_word_count_ratio (tỷ lệ giữa file1 và file2)
    unique_ratio = (f1['unique_word_count_ratio'] + 1e-8) / (f2['unique_word_count_ratio'] + 1e-8)
    unique_ratio = np.clip(unique_ratio, 0.1, 10.0)
    feature_row.append(unique_ratio)
    
    # 2. latin_ratio_diff (signed difference)
    latin_ratio_diff = f1['latin_ratio'] - f2['latin_ratio']
    feature_row.append(latin_ratio_diff)
    
    # 3. digit_count_diff (signed difference)
    digit_count_diff = f1['digit_count'] - f2['digit_count']
    feature_row.append(digit_count_diff)
    
    # 4. semantic_similarity 
    def cosine_similarity(text1, text2):
        words1 = Counter(text1.lower().split())
        words2 = Counter(text2.lower().split())
        # Get common words
        common_words = set(words1.keys()) & set(words2.keys())
        if not common_words:
            return 0.0
        # Calculate dot product and norms
        dot_product = sum(words1[word] * words2[word] for word in common_words)
        norm1 = math.sqrt(sum(count**2 for count in words1.values()))
        norm2 = math.sqrt(sum(count**2 for count in words2.values()))
        
        return dot_product / (norm1 * norm2) if norm1 * norm2 > 0 else 0.0
    semantic_similarity = cosine_similarity(text1, text2)
    feature_row.append(semantic_similarity)
    
    
    # 5. perplexity_diff (signed difference)
    perplexity_diff = f1['perplexity_score'] - f2['perplexity_score']
    feature_row.append(perplexity_diff)

    # 6. flesch_reading_ease_ratio
    flesch_ratio = (f1['flesch_reading_ease'] + 100) / (f2['flesch_reading_ease'] + 100)
    flesch_ratio = np.clip(flesch_ratio, 0.1, 10.0)
    feature_row.append(flesch_ratio)

    # 7. short_word_count_ratio
    short_ratio = (f1['short_word_count_ratio'] + 1e-8) / (f2['short_word_count_ratio'] + 1e-8)
    short_ratio = np.clip(short_ratio, 0.1, 10.0)
    feature_row.append(short_ratio)

    # 8. readability_avg_ratio
    readability_avg_1 = (f1['flesch_reading_ease'] + f1['dale_chall_readability']) / 2
    readability_avg_2 = (f2['flesch_reading_ease'] + f2['dale_chall_readability']) / 2
    readability_avg_ratio = (readability_avg_1 + 50) / (readability_avg_2 + 50)
    readability_avg_ratio = np.clip(readability_avg_ratio, 0.1, 10.0)
    feature_row.append(readability_avg_ratio)

    # 9. dale_chall_readability_score_diff (signed difference)
    dale_chall_diff = f1['dale_chall_readability'] - f2['dale_chall_readability']
    feature_row.append(dale_chall_diff)

    # 10. sentence_count_diff (signed difference)
    sentence_count_diff = f1['sentence_count'] - f2['sentence_count']
    feature_row.append(sentence_count_diff)

    # 11. perplexity_ratio
    perplexity_ratio = (f1['perplexity_score'] + 1e-8) / (f2['perplexity_score'] + 1e-8)
    perplexity_ratio = np.clip(perplexity_ratio, 0.1, 10.0)
    feature_row.append(perplexity_ratio)

    # 12. coleman_liau_index_diff (signed difference)
    coleman_diff = f1['coleman_liau_index'] - f2['coleman_liau_index']
    feature_row.append(coleman_diff)

    # 13. english_ratio_diff (signed difference)
    english_ratio_diff = f1['english_ratio'] - f2['english_ratio']
    feature_row.append(english_ratio_diff)

    # 14. word_count_diff (signed difference)
    word_count_diff = f1['word_count'] - f2['word_count']
    feature_row.append(word_count_diff)

    # 15. uppercase_ratio_diff (signed difference)
    uppercase_ratio_diff = f1['uppercase_ratio'] - f2['uppercase_ratio']
    feature_row.append(uppercase_ratio_diff)

    # 16. latin_ratio_ratio
    latin_ratio_ratio = (f1['latin_ratio'] + 1e-8) / (f2['latin_ratio'] + 1e-8)
    latin_ratio_ratio = np.clip(latin_ratio_ratio, 0.1, 10.0)
    feature_row.append(latin_ratio_ratio)

    # 17. english_ratio_ratio
    english_ratio_ratio = (f1['english_ratio'] + 1e-8) / (f2['english_ratio'] + 1e-8)
    english_ratio_ratio = np.clip(english_ratio_ratio, 0.1, 10.0)
    feature_row.append(english_ratio_ratio)

    # Add individual features as well
    feature_row.extend([
        f1['unique_word_count_ratio'], f2['unique_word_count_ratio'],
        f1['latin_ratio'], f2['latin_ratio'],
        f1['flesch_reading_ease'], f2['flesch_reading_ease'],
        f1['perplexity_score'], f2['perplexity_score']
    ])
    
    return feature_row

def extract_top_features(df: pd.DataFrame, n_jobs: int = None) -> np.ndarray:
    """Extract top features theo biểu đồ importance with multiprocessing."""
    if n_jobs is None:
        n_jobs = min(cpu_count(), 8)  # Limit to 8 cores max to avoid memory issues
    
    # Prepare data for multiprocessing
    row_data = [(row['file_1'], row['file_2']) for _, row in df.iterrows()]
    
    print(f"Using {n_jobs} cores for top features extraction...")
    
    if len(row_data) < 100 or n_jobs == 1:
        # For small datasets, use single process to avoid overhead
        features = []
        for data in tqdm(row_data, desc="Extracting top features (single-threaded)"):
            features.append(process_row_top_features(data))
    else:
        # Use multiprocessing for larger datasets
        with Pool(n_jobs) as pool:
            features = list(tqdm(
                pool.imap(process_row_top_features, row_data),
                total=len(row_data),
                desc="Extracting top features (multi-threaded)"
            ))
    
    return np.array(features).astype(np.float32)


# ----------------------- Extracting rule-based features ---------------------------
def compute_rule_based_features(text: str) -> dict:
    """Tính toán các đặc trưng rule-based cho một văn bản."""
    if not isinstance(text, str):
        text = ""

    # Existing features...
    cleaned_text = clean_text(text)
    word_count = len(cleaned_text.split())

    # === NEW FEATURES FOR FAKE DETECTION ===

    # 1. Multi-script detection (Garbage text pattern)
    cyrillic_count = len(re.findall(r"[\u0400-\u04FF]", text))  # Russian
    arabic_count = len(re.findall(r"[\u0600-\u06FF]", text))  # Arabic
    chinese_count = len(re.findall(r"[\u4e00-\u9fff]", text))  # Chinese
    emoji_count = len(
        re.findall(
            r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF]", text
        )
    )

    # Mixed script score
    script_diversity = sum(
        1 for count in [cyrillic_count, arabic_count, chinese_count] if count > 0
    )

    # 2. Storytelling pattern detection
    exclamation_ratio = text.count("!") / max(len(text), 1)
    question_ratio = text.count("?") / max(len(text), 1)

    # Informal language indicators
    informal_words = ["forget", "wow", "amazing", "incredible", "magic", "unicorn"]
    informal_count = sum(text.lower().count(word) for word in informal_words)

    # Bold/emphasis markers (markdown style)
    bold_count = text.count("**") + text.count("__")

    # 3. Tone analysis
    first_person_count = len(re.findall(r"\b(I|we|our|my|mine)\b", text, re.I))
    second_person_count = len(re.findall(r"\b(you|your|yours)\b", text, re.I))

    # Scientific vs casual tone
    scientific_terms = [
        "observation",
        "analysis",
        "telescope",
        "data",
        "measurement",
        "survey",
    ]
    scientific_count = sum(text.lower().count(term) for term in scientific_terms)

    # 4. Inconsistency detection
    # Sudden language change (character encoding issues)
    unicode_control_chars = len(re.findall(r"[\u0000-\u001F\u007F-\u009F]", text))

    # Suspicious name patterns (China relay network, etc.)
    suspicious_entities = ["china relay", "rainbow unicorn", "santa", "north pole"]
    suspicious_count = sum(text.lower().count(entity) for entity in suspicious_entities)


    return {
        # Multi-script features
        "cyrillic_count": cyrillic_count,
        "arabic_count": arabic_count,
        "chinese_count": chinese_count,
        "emoji_count": emoji_count,
        "script_diversity": script_diversity,
        # Storytelling features
        "exclamation_ratio": exclamation_ratio,
        "question_ratio": question_ratio,
        "informal_count": informal_count,
        "bold_count": bold_count,
        # Tone features
        "first_person_count": first_person_count,
        "second_person_count": second_person_count,
        "scientific_count": scientific_count,
        # Inconsistency features
        "unicode_control_chars": unicode_control_chars,
        "suspicious_count": suspicious_count,
        # Existing features
        "number_count": len(re.findall(r"\d+", text)),
        "unit_count": len(
            re.findall(
                r"\b(?:km|cm|m|s|kg|g|Hz|K|A|deg|arcsec|dex|A|petabytes|terabytes)\b",
                text,
                re.I,
            )
        ),
        "acronym_count": len(re.findall(r"\b[A-Z]{2,}\b", text)),
        "uppercase_word_count": len(re.findall(r"\b[A-Z][A-Z]+\b", text)),
        "exclamation_count": text.count("!"),
        "repetition_score": sum(
            [
                count
                for word, count in Counter(cleaned_text.lower().split()).items()
                if count > 3
            ]
        )
        / max(word_count, 1),
    }


def process_row_rule_based(row_data):
    """Process single row for rule-based features extraction (for multiprocessing)."""
    text1, text2 = row_data
    f1 = compute_rule_based_features(text1)
    f2 = compute_rule_based_features(text2)
    
    # Tạo diff features
    diff = {k: f1[k] - f2[k] for k in f1}
    
    # Kết hợp f1, f2, diff thành một vector
    feature_vector = list(f1.values()) + list(f2.values()) + list(diff.values())
    return feature_vector

def extract_rule_based_features(df: pd.DataFrame, n_jobs: int = None) -> np.ndarray:
    """Tạo ma trận đặc trưng rule-based với multiprocessing."""
    if n_jobs is None:
        n_jobs = min(cpu_count(), 8)
    
    # Prepare data for multiprocessing
    row_data = [(row['file_1'], row['file_2']) for _, row in df.iterrows()]
    
    print(f"Using {n_jobs} cores for rule-based features extraction...")
    
    if len(row_data) < 100 or n_jobs == 1:
        # For small datasets, use single process
        features = []
        for data in tqdm(row_data, desc="Extracting rule-based features (single-threaded)"):
            features.append(process_row_rule_based(data))
    else:
        # Use multiprocessing for larger datasets
        with Pool(n_jobs) as pool:
            features = list(tqdm(
                pool.imap(process_row_rule_based, row_data),
                total=len(row_data),
                desc="Extracting rule-based features (multi-threaded)"
            ))

    return np.array(features).astype(np.float32)

# ----------------------- PREPARE DATA FOR MODEL ---------------------------
class EmbeddingExtractor:
    def __init__(self, model_name: str, max_length: int = 512):
        self.model_name = model_name
        self.max_length = max_length
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(self.device)

    def get_embedding(self, text : str ) -> np.ndarray:
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        with torch.no_grad():
            embeddings = self.model(inputs['input_ids'].to(self.device), inputs['attention_mask'].to(self.device)).last_hidden_state[:, 0, :].cpu().numpy()
        return embeddings


def extract_embedding_features(df: pd.DataFrame, embedding_extractor ) -> np.ndarray:
    """Extract embedding features from the text data using the embedding extractor."""
    embeddings = []
    for text in tqdm(df['text'], desc="Extracting embeddings"):
        embedding = embedding_extractor.get_embedding(text )
        embeddings.append(embedding)
    return np.vstack(embeddings)

# ----------------------- PREPARE DATA FOR MODEL ---------------------------
def prepare_data_for_model(
    df: pd.DataFrame,
    embedding_extractor: EmbeddingExtractor = None,
    model_name: str = "bert-base-uncased",
    n_jobs: int = None,
):
    """
    Chuẩn bị dữ liệu cho model với focus vào top features theo importance chart.

    Args:
        df: DataFrame chứa dữ liệu thô
        embedding_extractor: Universal embedding extractor, nếu None sẽ tạo mới
        model_name: Tên model embedding để sử dụng
        n_jobs: Số lượng CPU cores để sử dụng cho multiprocessing

    Returns:
        feature_matrix: Ma trận features đã kết hợp
        embedding_extractor: Embedding extractor (để dùng cho test set)
    """
    if n_jobs is None:
        n_jobs = min(cpu_count(), 8)  # Default to 8 cores max
    
    print(f"Using {n_jobs} CPU cores for feature extraction...")
    
    # 0. clean text
    df['cleaned_file_1'] = df['file_1'].apply(clean_text)
    df['cleaned_file_2'] = df['file_2'].apply(clean_text)
    df['text'] = '[CLS] ' + df['cleaned_file_1'] + " [SEP] " + df['cleaned_file_2']

    # 1. Extract top features (most important) - with multiprocessing
    print("Step 1: Extracting top importance features...")
    top_features = extract_top_features(df, n_jobs=n_jobs)
    
    # 2. Extract rule-based features (existing) - with multiprocessing
    print("Step 2: Extracting rule-based features...")
    rule_features = extract_rule_based_features(df, n_jobs=n_jobs)
    
    # # 4. Extract embedding features (lighter approach) - handled by embedding extractor
    # print("Step 4: Extracting embedding features...")
    # if embedding_extractor is None:
    #     embedding_extractor = EmbeddingExtractor(
    #         model_name=model_name,
    #         max_length=512,
    #     )
    
    # embedding_features = extract_embedding_features(df, embedding_extractor)
    
    # 6. Combine all features with priority on top features
    print("Step 6: Combining features...")  
    # Priority: top features first, then others
    # feature_matrix = np.hstack([top_features, rule_features, embedding_features])
    feature_matrix = np.hstack([top_features, rule_features])

    print(f"Final feature matrix shape: {feature_matrix.shape}")
    # print(f"Top features: {top_features.shape[1]}, Rule: {rule_features.shape[1]}, Embedding: {embedding_features.shape[1]}")

    return feature_matrix

In [10]:
import pandas as pd
import numpy as np
import os

def read_texts_from_dir(dir_path):
    """
    Reads the texts from a given directory and saves them in the pd.DataFrame with columns ['id', 'file_1', 'file_2'].

    Params:
      dir_path (str): path to the directory with data
    """
    # Count number of directories in the provided path
    dir_count = sum(
        os.path.isdir(os.path.join(root, d))
        for root, dirs, _ in os.walk(dir_path)
        for d in dirs
    )
    data = [0 for _ in range(dir_count)]
    print(f"Number of directories: {dir_count}")

    # For each directory, read both file_1.txt and file_2.txt and save results to the list
    i = 0
    for folder_name in sorted(os.listdir(dir_path)):
        folder_path = os.path.join(dir_path, folder_name)
        if os.path.isdir(folder_path):
            try:
                with open(
                    os.path.join(folder_path, "file_1.txt"), "r", encoding="utf-8"
                ) as f1:
                    text1 = f1.read().strip()
                with open(
                    os.path.join(folder_path, "file_2.txt"), "r", encoding="utf-8"
                ) as f2:
                    text2 = f2.read().strip()
                index = int(folder_name[-4:])
                data[i] = (index, text1, text2)
                i += 1
            except Exception as e:
                print(f"Error reading directory {folder_name}: {e}")

    # Change list with results into pandas DataFrame
    df = pd.DataFrame(data, columns=["id", "file_1", "file_2"]).set_index("id")
    return df

In [11]:
df_train = read_texts_from_dir("/kaggle/input/real-or-fake/data/train")
df_test = read_texts_from_dir("/kaggle/input/real-or-fake/data/test")
df_train_gt = pd.read_csv("/kaggle/input/real-or-fake/data/train.csv")
y_train = df_train_gt["real_text_id"].values
df_train['label'] = df_train_gt["real_text_id"]

Number of directories: 95
Number of directories: 1068


In [12]:
df_train.head()

Unnamed: 0_level_0,file_1,file_2,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2


## Finetune Encoder for Data

In [13]:

df_finetune = df_train.copy()
df_finetune['file_1'] = df_finetune['file_1'].apply(clean_text)
df_finetune['file_2'] = df_finetune['file_2'].apply(clean_text)
df_finetune['label'] = df_finetune['label'] - 1
df_finetune['text'] = '[CLS] ' + df_finetune['file_1'] + " [SEP] " + df_finetune['file_2']
df_finetune.head()

Unnamed: 0_level_0,file_1,file_2,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,the virsa visible infrared survey telescope ar...,the china relay network has released a signifi...,0,[CLS] the virsa visible infrared survey telesc...
1,china the goal of this project involves achiev...,the project aims to achieve an accuracy level ...,1,[CLS] china the goal of this project involves ...
2,scientists can learn about how galaxies form a...,dinosaur eggshells offer clues about what dino...,0,[CLS] scientists can learn about how galaxies ...
3,china the study suggests that multiple star sy...,the importance for understanding how stars evo...,1,[CLS] china the study suggests that multiple s...
4,dinosaur rex was excited about his new toy set...,analyzing how fast stars rotate within a galax...,1,[CLS] dinosaur rex was excited about his new t...


In [7]:
from datasets import Dataset
finetune_dataset = Dataset.from_pandas(df_finetune[['text', 'label']])
split_datasets = finetune_dataset.train_test_split(test_size=0.2)
train_dataset = split_datasets['train']
val_dataset = split_datasets['test']

In [8]:
import torch
from transformers import AutoModel, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Sử dụng thiết bị:", device)
model_name = 'google-bert/bert-base-uncased'

d_model = 1024
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
tokenizer.add_special_tokens({'sep_token': '[SEP]'})
# encoder = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device)


max_length = 512
def tokenizer_fn(example):
    tokenized_inputs = tokenizer(example['text'], padding= 'max_length', truncation= True, return_tensors= 'pt', max_length= max_length).to(device)
    if 'label' in example.keys():
        tokenized_inputs['labels'] = torch.Tensor(example['label']).to(device)
    return tokenized_inputs

train_dataset = train_dataset.map(tokenizer_fn, batched=True)
val_dataset = val_dataset.map(tokenizer_fn, batched=True)


Sử dụng thiết bị: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

In [9]:
train_dataset

Dataset({
    features: ['text', 'label', 'id', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 76
})

In [10]:
import torch
import torch.nn as nn
import torch
from transformers import AutoModel, AutoTokenizer


class Encoder(nn.Module):
    def __init__(self, model_name, d_model, max_length = 512, num_label = 2):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('sử dụng', self.device)
        self.encoder = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        self.tokenizer.add_special_tokens({'sep_token': '[SEP]'})
        self.encoder.resize_token_embeddings(len(tokenizer))
        self.d_model = d_model
        self.max_length= max_length
        self.fc = nn.Linear(d_model, num_label)

        # setup loss function
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels = None ):
        embeddings = self.encoder(input_ids= input_ids, attention_mask= attention_mask).last_hidden_state[:, 0, :]
        logits = self.fc(embeddings)
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
    def get_embedding(self, text):
        cleaned_text = clean_text(text)
        inputs = self.tokenizer(cleaned_text, return_tensors="pt", padding='max_length', max_length= self.max_length, truncation=True).to(self.device)
        with torch.no_grad():
            embeddings = self.encoder(**inputs).last_hidden_state[:, 0, :].cpu().numpy()
        return embeddings

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'google-bert/bert-base-uncased'
encoder = Encoder(model_name = model_name, d_model = 768, max_length = 512).to(device)

sử dụng cuda


2025-08-21 16:00:36.103320: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755792036.252694      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755792036.296092      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./PairwiseClassification",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,   # giảm xuống 1
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,   # giữ effective batch size = 4
    learning_rate=3e-5,
    num_train_epochs=5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    logging_strategy="epoch",
    fp16=True,                       # nếu lỗi thì thử bf16 hoặc tắt
    optim="adamw_torch",
    load_best_model_at_end=True,
    weight_decay=0.03,
    metric_for_best_model="f1"
)


In [13]:
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
def compute_metrics(eval_pred):
    y_pred, y_true = np.argmax(eval_pred.predictions, -1), eval_pred.label_ids
    return {'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred)}

In [14]:
!pip --q install wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

wandb.login(key= user_secrets.get_secret("wandb_key"))

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtrongbg2692004[0m ([33mtrongbg2692004-post-and-telecommunications-institute-of-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [16]:
from transformers import Trainer

# Hàm preprocess để convert dict của dataset thành tensor đúng định dạng cho mô hình
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch])
    attention_mask = torch.tensor([item["attention_mask"] for item in batch])
    # Chuyển đổi nhãn sang kiểu long
    labels = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

trainer = Trainer(
    model=encoder,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset= val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics
)

In [17]:
num_params = sum(p.numel() for p in encoder.parameters())
print("Tổng số tham số của mô hình:", num_params)

Tổng số tham số của mô hình: 109483778


In [18]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4394,0.677759,0.526316,0.5,0.888889,0.64
2,0.4122,0.600469,0.578947,0.538462,0.777778,0.636364
3,0.3605,0.51336,0.736842,0.833333,0.555556,0.666667
4,0.3316,0.512176,0.736842,0.7,0.777778,0.736842
5,0.301,0.536611,0.684211,0.636364,0.777778,0.7


TrainOutput(global_step=10, training_loss=0.36893380284309385, metrics={'train_runtime': 44.381, 'train_samples_per_second': 8.562, 'train_steps_per_second': 0.225, 'total_flos': 0.0, 'train_loss': 0.36893380284309385, 'epoch': 5.0})

## Prepare Data for classification

In [14]:
# embedding_extractor = EmbeddingExtractor(model_name='Qwen/Qwen3-Embedding-0.6B', max_length = 1024)
feature_matrix = prepare_data_for_model(df_train, embedding_extractor=None)
feature_matrix.shape

Using 4 CPU cores for feature extraction...
Step 1: Extracting top importance features...
Using 4 cores for top features extraction...


Extracting top features (single-threaded):   0%|          | 0/95 [00:00<?, ?it/s]

Step 2: Extracting rule-based features...
Using 4 cores for rule-based features extraction...


Extracting rule-based features (single-threaded):   0%|          | 0/95 [00:00<?, ?it/s]

Step 6: Combining features...
Final feature matrix shape: (95, 85)


(95, 85)

In [16]:
feature_matrix_test = prepare_data_for_model(df_test, embedding_extractor=None)
feature_matrix_test.shape

Using 4 CPU cores for feature extraction...
Step 1: Extracting top importance features...
Using 4 cores for top features extraction...


Extracting top features (multi-threaded):   0%|          | 0/1068 [00:00<?, ?it/s]

Step 2: Extracting rule-based features...
Using 4 cores for rule-based features extraction...


Extracting rule-based features (multi-threaded):   0%|          | 0/1068 [00:00<?, ?it/s]

Step 6: Combining features...
Final feature matrix shape: (1068, 85)


(1068, 85)

In [17]:
X_test = feature_matrix_test.copy()

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(feature_matrix, y_train, test_size=0.2, random_state=42)

In [19]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

((76, 85), (19, 85), (76,), (19,), (1068, 85))

### Lựa chọn mô hình phân loại với dữ liệu nhiều thuộc tính (853 features)
- **Tree-based models** như Random Forest, Gradient Boosting (XGBoost, LightGBM, CatBoost) rất phù hợp với dữ liệu nhiều chiều, không cần chuẩn hóa đặc trưng, tự động chọn thuộc tính quan trọng và chống overfitting tốt.
- **Logistic Regression** với regularization (L1/L2) cũng có thể thử, nhưng hiệu quả thường kém hơn tree-based khi dữ liệu phi tuyến tính và nhiều thuộc tính không quan trọng.
- **SVM** (Support Vector Machine) có thể dùng, nhưng với số chiều lớn sẽ tốn nhiều tài nguyên và thời gian.
- **Neural Network** (MLP) chỉ nên dùng nếu dữ liệu rất lớn và đã chuẩn hóa tốt.

**Khuyến nghị:**
- Ưu tiên thử Random Forest hoặc LightGBM/XGBoost đầu tiên.
- Có thể dùng Logistic Regression để baseline và kiểm tra feature importance.
- Nên dùng cross-validation để chọn mô hình tối ưu.

## RandomForestClassifier

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rdc = RandomForestClassifier()
rdc.fit(X_train, y_train)
y_pred = rdc.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.75      0.86        12
           2       0.70      1.00      0.82         7

    accuracy                           0.84        19
   macro avg       0.85      0.88      0.84        19
weighted avg       0.89      0.84      0.84        19



In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [i for i in range(100, 201, 20)],
    'max_depth': [i for i in range(1, 21)],
}
rdc = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rdc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END ......................max_depth=1, n_estimators=100; total time=   0.2s
[CV] END ......................max_depth=1, n_estimators=120; total time=   0.3s
[CV] END ......................max_depth=1, n_estimators=140; total time=   0.3s
[CV] END ......................max_depth=1, n_estimators=160; total time=   0.4s
[CV] END ......................max_depth=1, n_estimators=160; total time=   0.4s
[CV] END ......................max_depth=1, n_estimators=180; total time=   0.5s
[CV] END ......................max_depth=1, n_estimators=200; total time=   0.4s
[CV] END ......................max_depth=2, n_estimators=100; total time=   0.2s
[CV] END ......................max_depth=2, n_estimators=100; total time=   0.2s
[CV] END ......................max_depth=2, n_estimators=120; total time=   0.3s
[CV] END ......................max_depth=2, n_estimators=140; total time=   0.3s
[CV] END ......................max_depth=2, n_

In [22]:
grid_search.best_params_

{'max_depth': 6, 'n_estimators': 200}

In [23]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.83      0.91        12
           2       0.78      1.00      0.88         7

    accuracy                           0.89        19
   macro avg       0.89      0.92      0.89        19
weighted avg       0.92      0.89      0.90        19



In [24]:
df_train.columns

Index(['file_1', 'file_2', 'label', 'cleaned_file_1', 'cleaned_file_2',
       'text'],
      dtype='object')

In [25]:
y_test_pred = best_model.predict(X_test)
submission = pd.DataFrame({
    'id': [i for i in range(0, len(y_test_pred))],
    'real_text_id': y_test_pred
})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,real_text_id
0,0,2
1,1,2
2,2,1
3,3,2
4,4,2


## XGBoot

In [26]:
!pip install xgboost



In [27]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

bst = XGBClassifier()
param_grid = {
    'n_estimators': [i for i in range(100, 201, 20)],
    'max_depth': [2, 4, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'objective': ['binary:logistic']
}
grid_search = GridSearchCV(n_jobs=-1, estimator=bst, param_grid=param_grid, cv=5, verbose=2)

# fit model
y_train_norm = y_train - 1  # Normalize labels to start from 0
grid_search.fit(X_train, y_train_norm)
# make predictions
preds = grid_search.predict(X_val) + 1  # Reverse normalization
print(classification_report(y_val, preds))

Fitting 5 folds for each of 54 candidates, totalling 270 fits

[CV] END .....................max_depth=14, n_estimators=160; total time=   0.4s
[CV] END .....................max_depth=14, n_estimators=160; total time=   0.4s
[CV] END .....................max_depth=14, n_estimators=180; total time=   0.5s
[CV] END .....................max_depth=14, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=15, n_estimators=100; total time=   0.2s
[CV] END .....................max_depth=15, n_estimators=120; total time=   0.3s
[CV] END .....................max_depth=15, n_estimators=120; total time=   0.3s
[CV] END .....................max_depth=15, n_estimators=140; total time=   0.3s
[CV] END .....................max_depth=15, n_estimators=160; total time=   0.4s
[CV] END .....................max_depth=15, n_estimators=180; total time=   0.4s
[CV] END .....................max_depth=15, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=15, n_

In [28]:
grid_search.best_params_

{'learning_rate': 0.01,
 'max_depth': 2,
 'n_estimators': 120,
 'objective': 'binary:logistic'}

## SVM

In [29]:
from sklearn.svm import SVC
svc = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly']
}
grid_search = GridSearchCV(n_jobs=-1, estimator=svc, param_grid=param_grid, cv=5, verbose=2)
grid_search.fit(X_train, y_train_norm)
y_pred = grid_search.predict(X_val) + 1  # Reverse normalization

print(classification_report(y_val, y_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
              precision    recall  f1-score   support

           1       1.00      0.75      0.86        12
           2       0.70      1.00      0.82         7

    accuracy                           0.84        19
   macro avg       0.85      0.88      0.84        19
weighted avg       0.89      0.84      0.84        19



In [30]:
grid_search.best_params_

{'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}

## Logistỉc Regression

In [31]:
from sklearn.linear_model import LogisticRegression

lgt = LogisticRegression()
lgt.fit(X_train, y_train_norm)
y_pred = lgt.predict(X_val) + 1  # Reverse normalization

print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           1       1.00      0.83      0.91        12
           2       0.78      1.00      0.88         7

    accuracy                           0.89        19
   macro avg       0.89      0.92      0.89        19
weighted avg       0.92      0.89      0.90        19



In [32]:
y_pred_test = lgt.predict(X_test) + 1  # Reverse normalization

submission = pd.DataFrame({
    'id': [i for i in range(0, len(y_pred_test))],
    'real_text_id': y_pred_test
})
submission.to_csv('submission_logistic_regression.csv', index=False)
len(y_pred_test)

1068


[CV] END learning_rate=0.1, max_depth=6, n_estimators=120, objective=binary:logistic; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=6, n_estimators=160, objective=binary:logistic; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=6, n_estimators=160, objective=binary:logistic; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=6, n_estimators=180, objective=binary:logistic; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=6, n_estimators=200, objective=binary:logistic; total time=   0.1s
[CV] END learning_rate=0.2, max_depth=2, n_estimators=100, objective=binary:logistic; total time=   0.0s
[CV] END learning_rate=0.2, max_depth=2, n_estimators=100, objective=binary:logistic; total time=   0.0s
[CV] END learning_rate=0.2, max_depth=2, n_estimators=120, objective=binary:logistic; total time=   0.1s
[CV] END learning_rate=0.2, max_depth=2, n_estimators=120, objective=binary:logistic; total time=   0.0s
[CV] END learning_rate=0.2, max_depth=2, n_estimators=