# Baseline Solution: Fake or Real - The Impostor Hunt in Texts 🔍

---

Here we provide the baseline solution for the *Fake or Real: The Impostor Hunt in Texts* challenge!
In this notebook, we walk you through two **simple, interpretable, and ML-free approaches** to tackle the problem of detecting fake texts.

### 💡 The overview of first approach:

We use the `langdetect` library to analyze each text by identifying the presence of **English vs. non-English words**. Here's the idea:

1. **Detect Language**: We break the text into words and determine the language of each.
2. **Calculate Proportion**: We then compute the percentage of English words in the entire text.
3. **Assign Label**: The text which gets higher percentage of English words is classified as **Real** and its number is saved to the results list.

---

### 📦 Getting Started: Install & Import Required Packages


In [32]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0mm-:--:--[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
[33m  DEPRECATION: Building 'langdetect' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'langdetect'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=6f6486e61971787709e7f5c4bbd55cee1297e11f810d3775007df2

In [33]:
import os
import string
import unicodedata

import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from sklearn.metrics import accuracy_score

DetectorFactory.seed = 42


def read_texts_from_dir(dir_path):
    """
    Reads the texts from a given directory and saves them in the pd.DataFrame with columns ['id', 'file_1', 'file_2'].

    Params:
      dir_path (str): path to the directory with data
    """
    # Count number of directories in the provided path
    dir_count = sum(
        os.path.isdir(os.path.join(root, d))
        for root, dirs, _ in os.walk(dir_path)
        for d in dirs
    )
    data = [0 for _ in range(dir_count)]
    print(f"Number of directories: {dir_count}")

    # For each directory, read both file_1.txt and file_2.txt and save results to the list
    i = 0
    for folder_name in sorted(os.listdir(dir_path)):
        folder_path = os.path.join(dir_path, folder_name)
        if os.path.isdir(folder_path):
            try:
                with open(
                    os.path.join(folder_path, "file_1.txt"), "r", encoding="utf-8"
                ) as f1:
                    text1 = f1.read().strip()
                with open(
                    os.path.join(folder_path, "file_2.txt"), "r", encoding="utf-8"
                ) as f2:
                    text2 = f2.read().strip()
                index = int(folder_name[-4:])
                data[i] = (index, text1, text2)
                i += 1
            except Exception as e:
                print(f"Error reading directory {folder_name}: {e}")

    # Change list with results into pandas DataFrame
    df = pd.DataFrame(data, columns=["id", "file_1", "file_2"]).set_index("id")
    return df


# =================== LOAD DATA ================================

# Use the above function to load both train and test data
# train_path="/kaggle/input/fake-or-real-the-impostor-hunt/data/train"
# df_train=read_texts_from_dir(train_path)
# test_path="/kaggle/input/fake-or-real-the-impostor-hunt/data/test"
# df_test=read_texts_from_dir(test_path)


# Use the above function to load both train and test data
train_path = "/home/thangquang/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/train"
df_train = read_texts_from_dir(train_path)
test_path = "/home/thangquang/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/test"
df_test = read_texts_from_dir(test_path)

# Load ground truth for train data
df_train_gt = pd.read_csv(
    "/home/thangquang/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data/train.csv"
)
# df_train_gt

Number of directories: 95
Number of directories: 1068


In [76]:
# ----------------------------- 1. IMPORTS ------------------------------------
import os, re, gc, warnings, random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# ML & utils
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix

warnings.filterwarnings("ignore")
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ----------------------- 2. TEXT PRE-PROCESSING ------------------------------
lemmatizer = WordNetLemmatizer()
stop_words  = set(stopwords.words("english"))

def clean_text(text: str) -> str:
    """Làm sạch + chuẩn hoá một câu văn."""
    if not isinstance(text, str):
        return ""
    
    text = re.sub(r"http\S+", " ", text)          # xoá URL
    text = re.sub(r"\d+", " NUM ", text)          # thay số = token NUM
    text = re.sub(r"[^\w\s]", " ", text)          # bỏ punctuation
    text = text.lower()
    tokens = [lemmatizer.lemmatize(tok)
              for tok in word_tokenize(text)
              if tok.isalpha() and tok not in stop_words and len(tok) > 2]
    return " ".join(tokens)

def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Áp dụng clean_text cho 2 cột file_1 và file_2 + tạo cột combined."""
    tqdm.pandas(desc="Cleaning text")
    df["file_1"] = df["file_1"].progress_apply(clean_text)
    df["file_2"] = df["file_2"].progress_apply(clean_text)
    df["combined"] = df["file_1"] + " [SEP] " + df["file_2"]
    return df

# ----------------------- 3. STATISTICAL FEATURES -----------------------------
def statistical_features(df: pd.DataFrame) -> np.ndarray:
    """Sinh đặc trưng thống kê/đếm đơn giản ở dạng dense numpy array."""
    len_1   = df["file_1"].str.len()
    len_2   = df["file_2"].str.len()
    words_1 = df["file_1"].str.split().apply(len)
    words_2 = df["file_2"].str.split().apply(len)

    features = pd.DataFrame({
        "len_diff"       : (len_1 - len_2).abs(),
        "word_diff"      : (words_1 - words_2).abs(),
        "len_ratio"      : (len_1 + 1) / (len_2 + 1),
        "words_ratio"    : (words_1 + 1) / (words_2 + 1),
        "avg_word_len_1" : df["file_1"].apply(lambda x: np.mean([len(w) for w in x.split()]) if x else 0),
        "avg_word_len_2" : df["file_2"].apply(lambda x: np.mean([len(w) for w in x.split()]) if x else 0),
    })
    return features.values.astype(np.float32)

# ----------------------- 4. VECTORIZERS --------------------------------------
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    analyzer="word",
    max_features=20_000,
    sublinear_tf=True,
)

char_vectorizer = TfidfVectorizer(
    ngram_range=(3, 6),
    analyzer="char_wb",
    max_features=15_000,
    sublinear_tf=True,
)

# ----------------------- 5. MODEL + PIPELINE ---------------------------------
def build_feature_matrix(df: pd.DataFrame, fit: bool = False):
    """Trả về sparse matrix = [word_tfidf | char_tfidf | stats_dense]."""
    # TF-IDF
    if fit:
        X_word = word_vectorizer.fit_transform(df["combined"])
        X_char = char_vectorizer.fit_transform(df["combined"])
    else:
        X_word = word_vectorizer.transform(df["combined"])
        X_char = char_vectorizer.transform(df["combined"])

    # Statistical (dense) -> convert to sparse for hstack
    X_stats = csr_matrix(statistical_features(df))

    return hstack([X_word, X_char, X_stats]).tocsr()

def train_and_evaluate(X, y):
    """Cross-validate + grid search LogisticRegression, trả về best model."""
    # Feature selection bên trong pipeline để không rò rỉ dữ liệu
    clf = Pipeline([
        ("select", SelectKBest(chi2, k=20_000)),
        ("clf", LogisticRegression(max_iter=5_000, solver="liblinear", n_jobs=-1))
    ])

    param_grid = {
        "select__k"   : [15_000, 20_000, 25_000],
        "clf__C"      : [0.1, 0.5, 1, 2, 5],
        "clf__penalty": ["l1", "l2"]
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    grid = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring="accuracy",
        cv=skf,
        n_jobs=-1,
        verbose=2
    )
    grid.fit(X, y)
    print(f"Best CV accuracy: {grid.best_score_:.4f}")
    print(f"Best params    : {grid.best_params_}")
    return grid.best_estimator_

# ----------------------- 6. MAIN EXECUTION -----------------------------------
if __name__ == "__main__":
    # --- 6.1 Pre-process ------------------------------------------------------
    print("Pre-processing train ...")
    df_train = preprocess_dataframe(df_train)
    print("Pre-processing test  ...")
    df_test  = preprocess_dataframe(df_test)

    # --- 6.2 Build feature matrices -----------------------------------------
    print("Vectorizing & building features ...")
    X_train = build_feature_matrix(df_train, fit=True)
    X_test  = build_feature_matrix(df_test,  fit=False)
    y_train = df_train_gt["real_text_id"].values  # 0 hoặc 1

    # --- 6.3 Train + CV ------------------------------------------------------
    model = train_and_evaluate(X_train, y_train)

    # --- 6.4 Validate hold-out set (optional) --------------------------------
    # Nếu muốn một tập hold-out thay vì CV:
    # X_tr, X_val, y_tr, y_val = train_test_split(
    #     X_train, y_train, test_size=0.2,
    #     random_state=SEED, stratify=y_train
    # )
    # model.fit(X_tr, y_tr)
    # val_pred = model.predict(X_val)
    # print("Hold-out accuracy:", accuracy_score(y_val, val_pred))

    # --- 6.5 Retrain on full data & infer test -------------------------------
    print("Training on full data ...")
    model.fit(X_train, y_train)

    print("Predicting on test ...")
    test_pred = model.predict(X_test)

    # --- 6.6 Build submission -------------------------------------------------
    submission = pd.DataFrame({
        "id": df_test.index,
        "real_text_id": test_pred.astype(int)
    }).sort_values("id")

    save_path = Path("submission.csv")
    submission.to_csv(save_path, index=False)
    print(f"✅  Submission saved to {save_path.resolve()}")


Pre-processing train ...


Cleaning text:   0%|          | 0/95 [00:00<?, ?it/s]

Cleaning text:   0%|          | 0/95 [00:00<?, ?it/s]

Pre-processing test  ...


Cleaning text:   0%|          | 0/1068 [00:00<?, ?it/s]

Cleaning text:   0%|          | 0/1068 [00:00<?, ?it/s]

Vectorizing & building features ...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END .......clf__C=0.1, clf__penalty=l2, select__k=15000; total time=   0.1s
[CV] END .......clf__C=0.1, clf__penalty=l1, select__k=15000; total time=   0.1s
[CV] END .......clf__C=0.1, clf__penalty=l1, select__k=15000; total time=   0.5s
[CV] END .......clf__C=0.1, clf__penalty=l2, select__k=15000; total time=   0.1s
[CV] END .......clf__C=0.1, clf__penalty=l1, select__k=25000; total time=   0.5s
[CV] END .......clf__C=0.1, clf__penalty=l2, select__k=15000; total time=   0.1s
[CV] END .......clf__C=0.1, clf__penalty=l2, select__k=15000; total time=   0.1s
[CV] END .......clf__C=0.1, clf__penalty=l2, select__k=20000; total time=   0.1s
[CV] END .......clf__C=0.1, clf__penalty=l2, select__k=15000; total time=   0.1s
[CV] END .......clf__C=0.1, clf__penalty=l2, select__k=20000; total time=   0.1s
[CV] END .......clf__C=0.1, clf__penalty=l2, select__k=20000; total time=   0.1s
[CV] END ..