In [None]:
import os
import json
import time
import random
import requests
from pathlib import Path
import re

def compile_keyword_patterns(keywords):
    r"""Build one case-insensitive, whole-word regex per keyword.

    Each keyword is passed through ``re.escape`` so it is matched literally,
    then wrapped in ``\b`` anchors so it only matches on word boundaries.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list of compiled patterns, in the same order as ``keywords``.
    """
    patterns = []
    for keyword in keywords:
        literal = re.escape(keyword)
        patterns.append(re.compile(r"\b" + literal + r"\b", flags=re.IGNORECASE))
    return patterns

# Terms whose presence in a text window is treated as an AI mention.
# List order determines the order in which matched keywords are reported.
KEYWORDS = [
    # core AI / ML vocabulary
    "automated processing", "profiling", "artificial intelligence", "ai",
    "machine learning", "deep learning", "neural network", "algorithm",
    # intelligent / automated / autonomous systems and decisions
    "intelligent system", "intelligent systems",
    "automated system", "automated systems",
    "decision-making", "automated decision", "automated decision-making",
    "autonomous system", "autonomous systems",
    # applied techniques
    "predictive model", "predictive analytics", "computer vision",
    "natural language processing", "nlp", "data-driven",
    "learning system", "self-learning", "self-learning system",
    "pattern recognition", "classification model", "generative model",
    "chatbot", "large language model", "language model",
    # learning paradigms and training data
    "reinforcement learning", "unsupervised learning", "supervised learning",
    "training data", "training dataset",
    # product-style labels and automation
    "ai system", "ai-based", "ml-based",
    "intelligent automation", "cognitive computing",
    "robotic process automation", "rpa", "digital agent",
]


In [None]:

# Compile the keyword regexes once when this cell runs; contains_keywords()
# reads this module-level list on every call.
compiled_patterns = compile_keyword_patterns(KEYWORDS)

def _keyword_from_pattern(pattern_text):
    r"""Recover the plain keyword from a ``\b<escaped keyword>\b`` pattern string."""
    boundary = r"\b"
    # Remove the anchors as a prefix/suffix. str.strip(r"\b") would instead
    # strip any run of the characters "\" and "b", eating into keywords that
    # start or end with "b".
    if pattern_text.startswith(boundary):
        pattern_text = pattern_text[len(boundary):]
    if pattern_text.endswith(boundary):
        pattern_text = pattern_text[:-len(boundary)]
    # Undo re.escape(): drop the backslash placed in front of escaped
    # characters such as spaces and hyphens.
    return re.sub(r"\\(.)", r"\1", pattern_text, flags=re.DOTALL)


def contains_keywords(text):
    """Report which of the module-level ``compiled_patterns`` occur in ``text``.

    Args:
        text: String to search.

    Returns:
        A ``(found, matched)`` tuple: ``matched`` is the list of lower-cased
        keywords (without regex anchors or escape backslashes) whose pattern
        matched, in pattern order, and ``found`` is ``bool(matched)``.
    """
    matched = [
        _keyword_from_pattern(pat.pattern).lower()
        for pat in compiled_patterns
        if pat.search(text)
    ]
    return bool(matched), matched


In [None]:
# --- Anthropic Messages API configuration ---
# The key is read from the environment; API_KEY is None when the variable is unset.
API_KEY = os.environ.get("ANTHROPIC_API_KEY")
# Endpoint and model identifier sent with every request.
BASE_URL = "https://api.anthropic.com/v1/messages"
MODEL_NAME = "claude-3-sonnet-20240229"
# Upper bound on request attempts per prompt.
MAX_RETRIES = 5


In [None]:

def ask_claude(prompt, timeout=60):
    """Send ``prompt`` to the Anthropic Messages API and return the reply text.

    Up to ``MAX_RETRIES`` attempts are made, with exponential backoff plus
    jitter between attempts.

    Args:
        prompt: Text sent as the content of a single user message.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection counts as a failed attempt instead of
            blocking indefinitely.

    Returns:
        The stripped text of the first ``"text"`` content block of a 200
        response, or the string ``"Error: Failed after retries"`` when no
        attempt produced one.
    """
    # Request parts do not change between attempts, so build them once.
    headers = {
        "x-api-key": API_KEY,
        "Content-Type": "application/json",
        "anthropic-version": "2023-06-01"
    }
    body = json.dumps({
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 50,
        "temperature": 0,
    })

    for attempt in range(1, MAX_RETRIES + 1):
        should_retry = True
        try:
            response = requests.post(BASE_URL, headers=headers, data=body, timeout=timeout)
            status = response.status_code
            if status == 200:
                for item in response.json().get("content", []):
                    if item.get("type") == "text":
                        return item.get("text", "").strip()
                # A 200 without any text block must count as a failed attempt;
                # otherwise the loop would re-post forever with no backoff.
                print("Error 200: response contained no text block")
            else:
                print(f"Error {status}: {response.text}")
                # Client errors such as 400/401/403/404 fail identically on
                # every retry; only timeouts, conflicts, rate limits and
                # server-side errors are worth another attempt.
                should_retry = status in (408, 409, 429) or status >= 500
        except Exception as e:
            # Best effort: network and decoding failures are logged and retried.
            print(f"Request error: {e}")

        if not should_retry:
            break
        # Back off only when another attempt will actually follow.
        if attempt < MAX_RETRIES:
            time.sleep(2 ** attempt + random.uniform(0, 1))

    return "Error: Failed after retries"


In [None]:

def moving_window(paragraphs, window_size=2):
    """Yield ``(start_index, window)`` pairs for a sliding window of paragraphs.

    The window advances one paragraph at a time and each window is the slice
    ``paragraphs[start:start + window_size]``. Nothing is yielded when there
    are fewer paragraphs than ``window_size``.
    """
    start_count = len(paragraphs) - window_size + 1
    yield from (
        (start, paragraphs[start:start + window_size])
        for start in range(start_count)
    )


In [None]:

def _load_paragraphs(paragraph_dir):
    """Return the stripped text of every ``*.txt`` file in ``paragraph_dir``, sorted by path."""
    # Sorting is lexicographic, so filenames are assumed to sort in reading order.
    return [
        txt_path.read_text(encoding="utf-8").strip()
        for txt_path in sorted(Path(paragraph_dir).glob("*.txt"))
    ]


def _classify_window(combined_text):
    """Return the Yes/No answer for one window: keyword match first, model call otherwise."""
    is_match, matched_keywords = contains_keywords(combined_text)
    if is_match:
        return f"Yes (keyword match: {', '.join(matched_keywords)})"
    prompt = (
        "Does the following text mention or relate to artificial intelligence (AI)? "
        "Answer 'Yes' or 'No' only. Don't say anything but 'Yes' or 'No' don't add any text after.\n\n"
        f"Text:\n{combined_text}"
    )
    return ask_claude(prompt)


def _yes_only_path(output_file):
    """Return ``output_file`` with ``_only_yes`` inserted before its extension."""
    root, ext = os.path.splitext(output_file)
    return f"{root}_only_yes{ext}"


def scan_for_ai_mentions(paragraph_dir, window_size=2, output_file="ai_detection_results.json"):
    """Classify sliding windows of paragraphs as AI-related and save the results.

    Every ``*.txt`` file in ``paragraph_dir`` is one paragraph. Each window of
    ``window_size`` consecutive paragraphs is answered from the keyword list
    when possible and by ``ask_claude`` otherwise. Two JSON files are written:
    all results to ``output_file``, and the results whose response starts with
    "yes" (case-insensitive) to a sibling file with ``_only_yes`` before the
    extension.

    Args:
        paragraph_dir: Directory containing the ``*.txt`` segments.
        window_size: Number of consecutive paragraphs per window.
        output_file: Path (``str`` or ``os.PathLike``) of the full results file.
    """
    output_file = os.fspath(output_file)

    # Create the output directory before doing any work, so an unwritable
    # destination fails before API calls are spent. A bare filename has an
    # empty dirname, which os.makedirs would reject.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    paragraphs = _load_paragraphs(paragraph_dir)

    results = []
    for index, window in moving_window(paragraphs, window_size):
        answer = _classify_window("\n\n".join(window))
        results.append({
            "start_index": index,
            "window": window,
            "response": answer
        })
        print(f"Window {index}: {answer}")

    with open(output_file, 'w', encoding='utf-8') as out_f:
        json.dump(results, out_f, indent=2, ensure_ascii=False)

    yes_results = [r for r in results if r["response"].strip().lower().startswith("yes")]

    # Derived from the extension rather than str.replace(".json", ...), which
    # would touch every ".json" in the path and, for names without ".json",
    # would overwrite the full results file.
    yes_output_file = _yes_only_path(output_file)
    with open(yes_output_file, 'w', encoding='utf-8') as yes_f:
        json.dump(yes_results, yes_f, indent=2, ensure_ascii=False)

    print(f"\n✅ Saved {len(yes_results)} 'Yes' results to {yes_output_file}")


In [None]:
# Single-directory run: scan one folder of pre-segmented paragraphs, one paragraph per window.
if __name__ == "__main__":
    fileName = "synthetic_paragraphs_for_test2"
    segment_dir = f"./{fileName}"
    scan_for_ai_mentions(
        segment_dir,
        window_size=1,
        output_file=f"isAI/{fileName}_ai_detection_results.json",
    )

# Batch variant (currently disabled): scan every sub-directory of the base folder.
# if __name__ == "__main__":
#     base_dir = Path("./fichiers segmentés")
#     for segment_dir in base_dir.iterdir():
#         if segment_dir.is_dir():
#             print(f"\n🔍 Scanning: {segment_dir}")
#             relative_name = segment_dir.relative_to("fichiers segmentés")
#             output_json = "./isAI/" + f"{relative_name}.json"
#             print(f"Output file: {output_json}")
#             scan_for_ai_mentions(segment_dir, window_size=2, output_file=output_json)
