In [10]:
import numpy as np
import pandas as pd
# from bert_logistic import read_texts_from_dir
import os
def read_texts_from_dir(dir_path):
    """
    Reads the texts from a given directory and saves them in the pd.DataFrame with columns ['id', 'file_1', 'file_2'].

    Every immediate subdirectory of ``dir_path`` is expected to hold
    ``file_1.txt`` and ``file_2.txt``. The row id is the integer formed by the
    last four characters of the subdirectory name. Subdirectories that cannot
    be read (missing file, undecodable text, non-numeric name suffix) are
    reported with ``print`` and skipped, so they never produce a row.

    Params:
      dir_path (str): path to the directory with data

    Returns:
      pd.DataFrame: indexed by 'id' with columns ['file_1', 'file_2'], one row
      per successfully read subdirectory, in sorted folder-name order.
    """
    # Only immediate subdirectories are scanned, so only those are counted.
    folder_names = [
        name
        for name in sorted(os.listdir(dir_path))
        if os.path.isdir(os.path.join(dir_path, name))
    ]
    print(f"Number of directories: {len(folder_names)}")

    # Append only successful reads: a pre-sized list would keep placeholder
    # entries for failed folders and break the DataFrame constructor.
    records = []
    for folder_name in folder_names:
        folder_path = os.path.join(dir_path, folder_name)
        try:
            with open(
                os.path.join(folder_path, "file_1.txt"), "r", encoding="utf-8"
            ) as f1:
                text1 = f1.read().strip()
            with open(
                os.path.join(folder_path, "file_2.txt"), "r", encoding="utf-8"
            ) as f2:
                text2 = f2.read().strip()
            index = int(folder_name[-4:])
        except Exception as e:
            # Best-effort loading: report the folder and keep going.
            print(f"Error reading directory {folder_name}: {e}")
            continue
        records.append((index, text1, text2))

    # Change list with results into pandas DataFrame
    df = pd.DataFrame(records, columns=["id", "file_1", "file_2"]).set_index("id")
    return df

In [11]:
# Root folder of the competition data. Set IMPOSTOR_HUNT_DATA_DIR to run the
# notebook on another machine; the default keeps the original location.
DATA_DIR = os.environ.get(
    "IMPOSTOR_HUNT_DATA_DIR",
    "/home/thangquang09/CODE/CTAI_MachineLearning/data/fake-or-real-the-impostor-hunt/data",
)
train_path = os.path.join(DATA_DIR, "train")
test_path = os.path.join(DATA_DIR, "test")
gt_path = os.path.join(DATA_DIR, "train.csv")

print("Loading data...")
# Text pairs, one row per article folder, indexed by id.
df_train = read_texts_from_dir(train_path)
df_test = read_texts_from_dir(test_path)
# Ground truth for the training pairs.
# NOTE(review): y_train follows the CSV row order, which is assumed (not
# checked) to match the id order of df_train — confirm before training on it.
df_train_gt = pd.read_csv(gt_path)
y_train = df_train_gt["real_text_id"].values

Loading data...
Number of directories: 95
Number of directories: 1068


In [12]:
# Prompt template for the pairwise REAL-vs-FAKE classification.
# It is filled in with PROMPT.format(text_0=..., text_1=...): `{text_0}` and
# `{text_1}` are the only placeholders, and the doubled braces around the JSON
# example are str.format escapes that render as literal `{` / `}`.
# The requested answer is a 0/1 label (0 -> text_0 is REAL, 1 -> text_1 is REAL).
PROMPT = """You are an expert AI analyst specializing in detecting textual artifacts and style inconsistencies. Your task is to analyze a pair of texts, text_0 and text_1. One of these texts is the "REAL" text, which is closely based on an original scientific article. The other is the "FAKE" text, which has been deliberately altered by an AI to be different from the original.

Your goal is to identify which text is the REAL one.

Here are the criteria to guide your analysis:

Characteristics of a REAL Text:

    Maintains a formal, objective, and academic tone throughout.

    Uses consistent, topic-specific terminology.

    The content is focused and coherent, presenting scientific information.

Common Patterns in a FAKE Text:

    Text Corruption: The text may contain nonsensical characters, random strings of text from multiple languages, emojis, or malformed code snippets, often appearing abruptly after a few coherent sentences.

    Fantastical Content: The text introduces bizarre, fantastical, or absurd elements that are completely unrelated to the scientific topic (e.g., Santa Claus living on the moon, rainbow unicorns, interstellar wars).

    Inappropriate Tone Shift: The text shifts from a scientific tone to an overly informal, conversational, or narrative style, resembling a blog post, a story, or marketing copy. It may use exclamation points excessively or ask rhetorical questions.

    Plausible Falsification: The text might maintain a scientific tone but replace key entities (like the names of telescopes, projects, or locations) with fabricated but plausible-sounding names.

Your Analysis Process:

    Read both text_0 and text_1 carefully.

    For each text, check for any of the FAKE text patterns listed above.

    Compare the tone, style, and content of the two texts.

    Based on your analysis, decide which text is REAL and which is FAKE.

Input Texts:

<text_0>
{text_0}
</text_0>

<text_1>
{text_1}
</text_1>

Output Format:
Provide your response as a single JSON object. Do not write any text outside of the JSON block. The JSON object should have the following structure:
{{
"label": 0
}}

If you determine text_0 is REAL, the label should be 0. If text_1 is REAL, the label should be 1.

Now, analyze the provided texts and return the JSON output."""

In [13]:
from typing import Literal

from pydantic import BaseModel, Field
class ResponseFormatter(BaseModel):
    """Always use this tool to structure your response to the user."""
    # Schema handed to `with_structured_output`. The class docstring and the
    # Field description are presumably forwarded to the model as part of that
    # schema, so they are left untouched.
    # `label` stays a string but is restricted to the two valid answers, so an
    # off-format reply fails validation instead of reaching int(response.label).
    label: Literal["0", "1"] = Field(description="If you determine text_0 is REAL, the label should be 0. If text_1 is REAL, the label should be 1.")

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

# Load variables from a local .env file so GEMINI_API_KEY can be read with os.getenv.
load_dotenv()

# Gemini chat model with temperature 0.0.
# NOTE: `llm` and `llm_structured_output` are rebound by the OpenAI setup cell
# that follows; when both cells run in order, the client configured here is
# not the one used for inference.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    temperature=0.0,
    api_key=os.getenv("GEMINI_API_KEY")
)

# Wrap the model so responses are parsed into ResponseFormatter instances.
llm_structured_output = llm.with_structured_output(ResponseFormatter)

In [15]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load variables from a local .env file so OPENAI_API_KEY can be read with os.getenv.
load_dotenv()

# OpenAI chat model. This rebinds `llm` (and `llm_structured_output` below),
# replacing whatever the Gemini setup cell assigned.
# NOTE(review): temperature is left commented out, presumably because the
# "o1" model does not accept it — confirm.
llm = ChatOpenAI(
    model="o1",
    # temperature=0.0,
    api_key=os.getenv("OPENAI_API_KEY")
)

# Wrap the model so responses are parsed into ResponseFormatter instances;
# this is the object that generate() invokes.
llm_structured_output = llm.with_structured_output(ResponseFormatter)

In [16]:
# Smoke-test input: the training pair with id 0.
# Naming caveat: file_1 fills the prompt's `text_0` slot and file_2 fills `text_1`.
text_1 = df_train.loc[0, "file_1"]
text_2 = df_train.loc[0, "file_2"]

test_prompt = PROMPT.format(text_0=text_1, text_1=text_2)

In [17]:
# Smoke test: send the single formatted training prompt through the
# structured-output model and display the parsed ResponseFormatter.
response = llm_structured_output.invoke(test_prompt)
response

ResponseFormatter(label='0')

In [18]:
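# NOTE: the google-genai client is not used by this cell any more (its imports
# below are commented out); generate() goes through `llm_structured_output`.
# `pip install google-genai` is only needed if those imports are restored.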

import base64
import json
import os

from dotenv import load_dotenv
# from google import genai
# from google.genai import types

# load_dotenv() is also called in the LLM setup cells; repeated here,
# presumably so this cell can be run on its own.
load_dotenv()


def generate(prompt):
    """
    Send ``prompt`` to the structured-output model and return its label as an int.

    Params:
      prompt (str): fully formatted prompt text.

    Returns:
      int: 0 if the model judged text_0 to be REAL, 1 if it judged text_1 to be REAL.

    Raises:
      ValueError: if the returned label is not an integer, or is an integer
        other than 0 or 1. Anything else would silently end up as an invalid
        id in the submission.
    """
    response = llm_structured_output.invoke(prompt)
    label = int(response.label)
    if label not in (0, 1):
        raise ValueError(f"Unexpected label {response.label!r}; expected 0 or 1")
    return label

In [19]:
# Container for the per-row predictions. The inference cell that follows
# rebinds this name before filling it, so this initialisation only matters
# if that cell is skipped.
predicted_label = []

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from tqdm import tqdm

# Start from an empty list so a re-run never reuses predictions left over from
# an earlier, failed run. The list is rebuilt from `results` once the thread
# pool below has finished.
predicted_label = []

# Function to process a single row with retry logic
def process_single_row(row_data, max_retries=3, base_wait=60):
    """
    Classify one text pair, retrying with exponential backoff on rate limits.

    Params:
      row_data (tuple): ``(file_1, file_2, row_id)``; file_1 fills the prompt's
        ``text_0`` slot and file_2 fills ``text_1``.
      max_retries (int): maximum number of attempts for this row.
      base_wait (int | float): seconds to wait after the first rate-limited
        attempt; the wait doubles after each further one (60, 120, ... by default).

    Returns:
      tuple: ``(row_id, label)`` on success, ``(row_id, None)`` if every attempt
      failed or a non-rate-limit error occurred.
    """
    file_1, file_2, row_id = row_data

    for attempt in range(max_retries):
        try:
            prompt = PROMPT.format(text_0=file_1, text_1=file_2)
            return row_id, generate(prompt)
        except Exception as e:
            message = str(e).lower()
            if "rate_limit" not in message and "quota" not in message:
                # Any other error is treated as non-retryable.
                print(f"Error processing row {row_id}: {e}")
                break
            if attempt == max_retries - 1:
                # No attempt follows, so sleeping here would only delay the failure.
                print(f"Rate limit hit for row {row_id}, no retries left")
                break
            wait_time = (2 ** attempt) * base_wait  # Exponential backoff
            print(f"Rate limit hit for row {row_id}, waiting {wait_time} seconds...")
            time.sleep(wait_time)

    return row_id, None  # Return None if all retries failed

# Prepare data for processing: one (file_1, file_2, id) tuple per test row
row_data_list = [(row.file_1, row.file_2, row.Index) for row in df_test.itertuples()]

# Use ThreadPoolExecutor with limited workers to avoid overwhelming the API
results = {}  # row_id -> predicted label (0 or 1), successful rows only
with ThreadPoolExecutor(max_workers=2) as executor:  # Reduced workers to avoid rate limits
    # Submit all tasks
    future_to_row = {executor.submit(process_single_row, row_data): row_data[2]
                     for row_data in row_data_list}

    # Process completed tasks with progress bar
    for future in tqdm(as_completed(future_to_row), total=len(future_to_row), desc="Processing rows"):
        try:
            row_id, result = future.result()
            if result is not None:
                results[row_id] = result
            else:
                print(f"Failed to process row {row_id}")
        except Exception as e:
            row_id = future_to_row[future]
            print(f"Exception for row {row_id}: {e}")

# Convert results to list in correct order; rows without a result fall back to label 0
predicted_label = [results.get(row_id, 0) for row_id in df_test.index]

# 0 is a valid label, so successes are counted from `results`, not from
# non-zero predictions.
print(f"Successfully processed {len(results)}/{len(df_test)} rows")

# Make the silent fallback visible so those rows can be re-run.
failed_row_ids = [row_id for row_id in df_test.index if row_id not in results]
if failed_row_ids:
    print(f"Rows defaulted to label 0 ({len(failed_row_ids)}): {failed_row_ids}")

Processing rows:   0%|          | 0/1068 [00:00<?, ?it/s]

Processing rows:   2%|▏         | 25/1068 [02:16<1:58:17,  6.81s/it]

In [None]:
# Sanity check: there should be one prediction per row of df_test.
len(predicted_label)

1068

In [None]:
# Shift the model's 0/1 labels to the 1/2 ids written to the submission.
# Rebuilt from `results` rather than from predicted_label itself, so re-running
# this cell cannot add 1 a second time. Rows without a result keep the
# fallback label 0, which becomes 1.
predicted_label = np.array([results.get(row_id, 0) for row_id in df_test.index]) + 1

In [None]:
from pathlib import Path

print("Predicting on test ...")
# np.asarray accepts both the shifted ndarray and a plain list of labels.
test_pred = np.asarray(predicted_label).astype(int)

# Fail loudly if the label-shift cell was skipped or run twice, or if some rows
# are missing: the submission expects exactly one id in {1, 2} per test row.
if len(test_pred) != len(df_test):
    raise ValueError(
        f"Expected {len(df_test)} predictions, got {len(test_pred)}"
    )
unexpected_labels = set(test_pred.tolist()) - {1, 2}
if unexpected_labels:
    raise ValueError(
        f"Submission labels must be 1 or 2, found {sorted(unexpected_labels)}"
    )

# --- Build submission -------------------------------------------------
submission = pd.DataFrame({
    "id": df_test.index,
    "real_text_id": test_pred
}).sort_values("id")

save_path = Path("submission_api_call_full_openai_o1.csv")
submission.to_csv(save_path, index=False)
print(f"✅ Submission saved to {save_path.resolve()}")

Predicting on test ...
✅ Submission saved to /home/thangquang09/CODE/CTAI_MachineLearning/notebooks/submission_api_call_full_openai.csv
