In [None]:
import os
import json
from pathlib import Path
from collections import defaultdict
import re

def compile_keyword_patterns(keywords):
    """Build one case-insensitive, whole-word regex per keyword.

    Each keyword is escaped so that characters such as "-" or " " are
    matched literally, then wrapped in word-boundary anchors.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list of compiled patterns, in the same order as ``keywords``.
    """
    compiled = []
    for keyword in keywords:
        literal = re.escape(keyword)
        compiled.append(re.compile(r"\b" + literal + r"\b", flags=re.IGNORECASE))
    return compiled

# Full list of keywords searched for in every paragraph.
# The patterns built by compile_keyword_patterns match case-insensitively and
# on whole words only, which is why singular and plural forms are listed
# separately. Each entry is searched independently, so overlapping entries
# (e.g. "ai" and "ai system") are each counted on the same text.
# The order is kept stable because it determines key order in the output.
KEYWORDS = [
    # Automated processing and core AI / ML terms
    "automated processing",
    "profiling",
    "artificial intelligence",
    "ai",
    "machine learning",
    "deep learning",
    "neural network",
    "algorithm",
    # Intelligent / automated / autonomous systems and decision-making
    "intelligent system",
    "intelligent systems",
    "automated system",
    "automated systems",
    "decision-making",
    "automated decision",
    "automated decision-making",
    "autonomous system",
    "autonomous systems",
    # Techniques and application areas
    "predictive model",
    "predictive analytics",
    "computer vision",
    "natural language processing",
    "nlp",
    "data-driven",
    "learning system",
    "self-learning",
    "self-learning system",
    "pattern recognition",
    # Model types
    "classification model",
    "generative model",
    "chatbot",
    "large language model",
    "language model",
    # Learning paradigms and training data
    "reinforcement learning",
    "unsupervised learning",
    "supervised learning",
    "training data",
    "training dataset",
    # Product-style descriptors and automation
    "ai system",
    "ai-based",
    "ml-based",
    "intelligent automation",
    "cognitive computing",
    "robotic process automation",
    "rpa",
    "digital agent",
]


In [None]:

def _keyword_from_pattern(pattern):
    """Recover the plain, lower-cased keyword from an anchored, escaped pattern."""
    source = pattern.pattern
    # Remove exactly one "\b" anchor from each end. str.strip(r"\b") would
    # instead strip any run of "\" and "b" characters and eat keyword letters
    # (e.g. "web" -> "we", "bot" -> "ot").
    if source.startswith(r"\b"):
        source = source[2:]
    if source.endswith(r"\b"):
        source = source[:-2]
    # Undo re.escape: drop the backslash placed in front of escaped characters
    # such as " " and "-", so keys read "machine learning", not "machine\ learning".
    return re.sub(r"\\(.)", r"\1", source, flags=re.DOTALL).lower()


def count_keywords_in_paragraph(paragraph, compiled_patterns):
    """Count how often each keyword pattern matches in one paragraph.

    Args:
        paragraph: Text to scan.
        compiled_patterns: Iterable of compiled regexes, each an escaped
            keyword wrapped in word-boundary anchors (the shape produced by
            ``compile_keyword_patterns``).

    Returns:
        A ``(keyword_counts, matched_keywords)`` tuple:
        ``keyword_counts`` is a ``defaultdict(int)`` mapping each matched
        keyword (lower-cased, unescaped) to its number of matches, and
        ``matched_keywords`` is the set of keywords with at least one match.
        Keywords without a match appear in neither.
    """
    keyword_counts = defaultdict(int)
    matched_keywords = set()

    for pattern in compiled_patterns:
        hits = len(pattern.findall(paragraph))
        if not hits:
            continue
        keyword = _keyword_from_pattern(pattern)
        keyword_counts[keyword] += hits
        matched_keywords.add(keyword)

    return keyword_counts, matched_keywords


In [None]:

def run_keyword_stats(base_dir):
    """Aggregate keyword statistics over every paragraph file under ``base_dir``.

    Every ``*.txt`` file located directly inside an immediate sub-directory
    of ``base_dir`` is read as UTF-8 and treated as one paragraph. Files that
    sit directly in ``base_dir`` are skipped, and deeper nesting is not
    descended into.

    Args:
        base_dir: Path (``str`` or ``Path``) of the directory that contains
            the segment sub-directories.

    Returns:
        A dict with three keys:
        ``"total_paragraphs"`` - number of ``.txt`` files scanned;
        ``"keyword_occurrences"`` - dict mapping keyword to its total number
        of matches across all paragraphs;
        ``"keyword_paragraph_hits"`` - dict mapping keyword to the number of
        paragraphs in which it matched at least once.

    Raises:
        OSError: If ``base_dir`` cannot be listed (e.g. it does not exist)
            or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    total_paragraphs = 0
    keyword_occurrences = defaultdict(int)
    keyword_paragraph_hits = defaultdict(int)

    # The patterns depend only on KEYWORDS, so compile them once rather than
    # once per file.
    compiled_patterns = compile_keyword_patterns(KEYWORDS)

    # Sorted traversal makes the insertion order of the result dicts (and
    # therefore any serialised output) reproducible across runs.
    for segment_dir in sorted(Path(base_dir).iterdir()):
        if not segment_dir.is_dir():
            continue

        for txt_file in sorted(segment_dir.glob("*.txt")):
            # read_text closes the file before any processing starts.
            paragraph = txt_file.read_text(encoding="utf-8")
            total_paragraphs += 1

            counts, matched_keywords = count_keywords_in_paragraph(
                paragraph, compiled_patterns
            )

            # Accumulate the total number of occurrences per keyword.
            for kw, count in counts.items():
                keyword_occurrences[kw] += count

            # Track in how many paragraphs each keyword appears.
            for kw in matched_keywords:
                keyword_paragraph_hits[kw] += 1

    # Return plain dicts so callers do not get defaultdict auto-insertion.
    return {
        "total_paragraphs": total_paragraphs,
        "keyword_occurrences": dict(keyword_occurrences),
        "keyword_paragraph_hits": dict(keyword_paragraph_hits),
    }


In [None]:

if __name__ == "__main__":
    # Root of the corpus: run_keyword_stats scans the *.txt files found in
    # each immediate sub-directory of this path.
    # Alternative corpus used previously: "fichiers segmentés"
    base_dir = "../contracts_3_redacted"
    output_file = "keyword_stats.json"

    stats = run_keyword_stats(base_dir)

    # Write the statistics as indented JSON; ensure_ascii=False writes
    # non-ASCII characters as-is instead of \u escapes.
    with open(output_file, mode="w", encoding="utf-8") as json_file:
        json.dump(stats, json_file, indent=2, ensure_ascii=False)

    print(f"✅ Keyword statistics saved to {output_file}")
    print(f"📊 Scanned {stats['total_paragraphs']} paragraphs.")
