# LGBM: Fake or Real - The Impostor Hunt in Texts 🔍


In [17]:
!pip install langdetect



In [18]:
import os
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import unicodedata

import string
from sklearn.metrics import accuracy_score
import numpy as np
DetectorFactory.seed = 42

---

### 📄 Load the Data

Now, let's load the data into memory for exploration and processing.

Each sample is a folder containing `file_1.txt` and `file_2.txt`. We read both files for every sample and collect them in a `pandas` DataFrame, which allows for easy data manipulation and analysis throughout the notebook.

In [19]:
def read_texts_from_dir(dir_path):
  """
  Reads the texts from a given directory and saves them in the pd.DataFrame with columns ['id', 'file_1', 'file_2'].

  Every immediate sub-directory of `dir_path` is treated as one sample holding
  'file_1.txt' and 'file_2.txt'. The numeric id is parsed from the last four
  characters of the sub-directory name. Sub-directories that cannot be read or
  whose name does not end in a number are reported on stdout and left out of
  the result.

  Params:
    dir_path (str): path to the directory with data

  Returns:
    pd.DataFrame: one row per successfully read sub-directory, indexed by 'id',
      with columns 'file_1' and 'file_2' holding the whitespace-stripped texts.
  """
  # Only immediate sub-directories are samples; sorting gives a deterministic row order
  folder_names = sorted(
    name for name in os.listdir(dir_path)
    if os.path.isdir(os.path.join(dir_path, name))
  )
  print(f"Number of directories: {len(folder_names)}")

  # Rows are appended only after a sample was read completely, so a failing
  # sample can never leave a placeholder entry behind in the result
  rows = []
  for folder_name in folder_names:
    folder_path = os.path.join(dir_path, folder_name)
    try:
      with open(os.path.join(folder_path, 'file_1.txt'), 'r', encoding='utf-8') as f1:
        text1 = f1.read().strip()
      with open(os.path.join(folder_path, 'file_2.txt'), 'r', encoding='utf-8') as f2:
        text2 = f2.read().strip()
      index = int(folder_name[-4:])
    except (OSError, ValueError) as e:
      # OSError: missing/unreadable file; ValueError: bad encoding or non-numeric folder suffix
      print(f"Error reading directory {folder_name}: {e}")
      continue
    rows.append((index, text1, text2))

  # Change list with results into pandas DataFrame
  df = pd.DataFrame(rows, columns=['id', 'file_1', 'file_2']).set_index('id')
  return df

In [20]:
# Inspect the contents of the input directory
print("Input directory contents:")
print(os.listdir('/kaggle/input'))

Input directory contents:
['fake-or-real-the-impostor-hunt']


In [21]:
# Use the above function to load both train and test data.
# Each call prints the number of directories it found and returns a DataFrame
# indexed by 'id' with the two texts in columns 'file_1' and 'file_2'.
train_path="/kaggle/input/fake-or-real-the-impostor-hunt/data/train"
df_train=read_texts_from_dir(train_path)
test_path="/kaggle/input/fake-or-real-the-impostor-hunt/data/test"
df_test=read_texts_from_dir(test_path)

Number of directories: 95
Number of directories: 1068


In [22]:
# Preview the first rows of the training text pairs
df_train.head()

Unnamed: 0_level_0,file_1,file_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...


In [23]:
# Preview the first rows of the test text pairs
df_test.head()

Unnamed: 0_level_0,file_1,file_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"""Music"" Music music music Music music Music mu...",Since its launch on Paranal observatory's Very...
1,underground exploration on SN's birth has prov...,SN 1987A provides valuable insights as newer o...
2,This research aimed to understand how star sha...,ChromeDriver music player\nThis study focused ...
3,Using OmegaCAM's wide field capabilities spann...,"greek translation :\nvazhi (megaCAM), territor..."
4,AssemblyCulture AssemblyCulture AssemblyCultur...,XClass is software tool that helps astronomers...


---

### 🏷️ Read the Labels

Next, we’ll load the **labels** associated with each text sample.
Each label tells us which text of a pair is **real**: `1` for `file_1` or `2` for `file_2`. The labels serve as our ground truth for training and evaluation.

We’ll again use `Pandas` to read the label file into a DataFrame and inspect its structure.


In [24]:
# Load ground truth for train data (columns 'id' and 'real_text_id')
df_train_gt=pd.read_csv("/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv")
# Display the label table
df_train_gt

Unnamed: 0,id,real_text_id
0,0,1
1,1,2
2,2,1
3,3,2
4,4,2
...,...,...
90,90,2
91,91,1
92,92,2
93,93,2


In [25]:
# Install and import the libraries required for the upgraded model
!pip install lightgbm sentence-transformers
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
from collections import Counter
import re

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.11.0->sentence-transformers)
  Downloading nvid

2025-08-20 03:22:40.903180: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755660161.151067      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755660161.219946      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [26]:
# Feature extraction.
# One helper that computes several hand-crafted features for a piece of text.
def extract_features(text):
    """
    Compute a set of surface-level statistics for `text`.

    Returns a dict mapping feature name to value. Every average and ratio
    falls back to 0 when its denominator is zero (e.g. for an empty string).
    """
    # Whitespace-separated tokens and non-blank chunks between '.', '!' and '?'
    tokens = text.split()
    sentence_parts = [part for part in re.split(r'[.!?]+', text) if part.strip() != '']
    n_chars = len(text)
    n_words = len(tokens)
    n_sentences = len(sentence_parts)

    def per_char(count):
        # Share of the text's characters, guarded against the empty string
        return count / n_chars if n_chars > 0 else 0

    def per_sentence(amount):
        # Average amount per sentence, guarded against texts without sentences
        return amount / n_sentences if n_sentences > 0 else 0

    # Tally every character class in a single sweep over the text
    n_punct = n_upper = n_special = n_alpha = 0
    for ch in text:
        if ch in '.,!?;:':
            n_punct += 1
        if ch.isupper():
            n_upper += 1
        if not (ch.isalnum() or ch.isspace()):
            n_special += 1
        if ch.isalpha():
            n_alpha += 1

    return {
        # 1. Basic counts and averages
        'num_chars': n_chars,
        'num_words': n_words,
        'num_sentences': n_sentences,
        'avg_word_length': np.mean([len(tok) for tok in tokens]) if n_words > 0 else 0,
        'avg_sentence_length_words': per_sentence(n_words),
        'avg_sentence_length_chars': per_sentence(n_chars),
        # 2. Lexical diversity (type-token ratio)
        'vocab_richness': len(set(tokens)) / n_words if n_words > 0 else 0,
        # 3. Share of punctuation characters
        'punctuation_ratio': per_char(n_punct),
        # 4. Share of upper-case characters
        'upper_case_ratio': per_char(n_upper),
        # 5. Share of special characters (neither alphanumeric nor whitespace)
        'special_char_ratio': per_char(n_special),
        # The slow langdetect feature is not computed here; the share of
        # alphabetic characters serves as a cheap proxy instead
        'alpha_ratio': per_char(n_alpha),
    }


In [27]:
# Extract features for both texts of every training and test pair
print("为训练数据提取特征...")
train_features_1 = [extract_features(text) for text in df_train['file_1']]
train_features_2 = [extract_features(text) for text in df_train['file_2']]

print("为测试数据提取特征...")
test_features_1 = [extract_features(text) for text in df_test['file_1']]
test_features_2 = [extract_features(text) for text in df_test['file_2']]

为训练数据提取特征...
为测试数据提取特征...


In [28]:
# Turn each list of feature dicts into a DataFrame
df_train_feats_1, df_train_feats_2, df_test_feats_1, df_test_feats_2 = map(
    pd.DataFrame,
    (train_features_1, train_features_2, test_features_1, test_features_2),
)

In [29]:
# Keep the feature column names (in column order); they are used again later
feature_columns = list(df_train_feats_1.columns)

In [30]:
# Build the pairwise ("learning to rank") training data.
# Each pair is represented by the feature difference: features(file_1) - features(file_2).
X_train_diff = df_train_feats_1[feature_columns].values - df_train_feats_2[feature_columns].values

# Look the labels up by id, in the row order of df_train (the order the features
# were extracted in), instead of relying on train.csv being sorted the same way.
# .loc raises a KeyError if a loaded sample has no label.
real_text_ids = df_train_gt.set_index('id').loc[df_train.index, 'real_text_id'].values
# Binary target: 1 if the real text is file_1, 0 if it is file_2
# (in the original labels 1 means the first text and 2 means the second).
y_train = (real_text_ids == 1).astype(int)

# Hold out 20% of the pairs, stratified by label, as a validation set
X_train, X_val, y_train_split, y_val = train_test_split(X_train_diff, y_train, test_size=0.2, random_state=42, stratify=y_train)


In [31]:
# Train the LightGBM model
print("训练LightGBM模型...")
# Model hyper-parameters
params = dict(
    objective='binary',  # binary classification task
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    verbosity=-1,
    random_state=42,
)

# Wrap both splits as LightGBM datasets; the validation set references the training set
lgb_train = lgb.Dataset(X_train, label=y_train_split)
lgb_val = lgb.Dataset(X_val, reference=lgb_train, label=y_val)

训练LightGBM模型...


In [32]:
# Train the model, monitoring the validation set with early stopping
num_round = 200
training_callbacks = [
    lgb.early_stopping(stopping_rounds=20),
    lgb.log_evaluation(50),
]
bst = lgb.train(
    params,
    lgb_train,
    num_boost_round=num_round,
    valid_sets=[lgb_val],
    callbacks=training_callbacks,
)



Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[18]	valid_0's binary_logloss: 0.569981


In [33]:
# Evaluate the model on the training pairs.
# NOTE: X_train_diff holds every labelled pair, including the rows that were split off as X_val.
# The model outputs the probability that file_1 is the real text.
y_pred_prob_train = bst.predict(X_train_diff, num_iteration=bst.best_iteration)
# Back to the 1/2 label convention: probability > 0.5 picks text 1, otherwise text 2
y_pred_train = np.where(y_pred_prob_train > 0.5, 1, 2)

# Fetch the true labels by id in the row order of df_train (the order X_train_diff was
# built in), so predictions and labels are compared pair by pair even if train.csv is
# ordered differently. .loc raises a KeyError if a loaded sample has no label.
y_true_train = df_train_gt.set_index('id').loc[df_train.index, 'real_text_id']
train_accuracy = accuracy_score(y_true_train, y_pred_train)
print(f"\nLearning to Rank模型在训练集上的准确率: {train_accuracy:.4f}")



Learning to Rank模型在训练集上的准确率: 0.8526


In [34]:
# Predict on the test pairs.
# Same pairwise representation as for training: features(file_1) - features(file_2)
test_feats_first = df_test_feats_1[feature_columns].values
test_feats_second = df_test_feats_2[feature_columns].values
X_test_diff = test_feats_first - test_feats_second
# Predicted probabilities from the best iteration found by early stopping
y_pred_prob_test = bst.predict(X_test_diff, num_iteration=bst.best_iteration)
# Convert to the submission label format (1 or 2): probability > 0.5 picks text 1
predictions_test_ltr = np.where(y_pred_prob_test > 0.5, 1, 2)



In [35]:
# Prepare the submission file.
# The ids come from df_test's index (the ids parsed from the folder names, in the
# same row order the predictions were made in) rather than from a 0..n-1 counter,
# so every prediction stays attached to its own sample even if ids are not contiguous.
output_df = pd.DataFrame({
    'id': df_test.index.to_numpy(),
    'real_text_id': predictions_test_ltr
})
output_df.to_csv('ltr_submission.csv', index=False)
print("提交文件 'ltr_submission.csv' 已保存！")
# Preview the first rows of the submission
output_df.head()

提交文件 'ltr_submission.csv' 已保存！


Unnamed: 0,id,real_text_id
0,0,2
1,1,1
2,2,1
3,3,2
4,4,2


In [36]:
# Display the submission table
output_df

Unnamed: 0,id,real_text_id
0,0,2
1,1,1
2,2,1
3,3,2
4,4,2
...,...,...
1063,1063,1
1064,1064,1
1065,1065,1
1066,1066,2
