In [None]:
import json
import os
import random
import re
import time
from pathlib import Path

import requests


In [None]:

# --- Anthropic Messages API settings -------------------------------------
# The key comes from the environment; it is None when ANTHROPIC_API_KEY is unset.
API_KEY = os.environ.get("ANTHROPIC_API_KEY")
BASE_URL = "https://api.anthropic.com/v1/messages"
MODEL_NAME = "claude-3-sonnet-20240229"
MAX_RETRIES = 5  # retry budget consumed by call_claude()

# --- Output locations -----------------------------------------------------
# One .txt file per generated paragraph is written into this folder.
OUTPUT_FOLDER = "synthetic_paragraphs_for_test2"
# NOTE(review): the labels file lives under "synthetic_paragraphs_for_test",
# not under OUTPUT_FOLDER ("..._test2") -- confirm this is intentional.
LABELS_FILE = "synthetic_paragraphs_for_test/labels.json"


In [None]:

def call_claude(prompt, max_tokens=1500, timeout=60):
    """Send a single-turn prompt to the Messages endpoint and return the reply text.

    The request is attempted at most ``MAX_RETRIES`` times. Every failed
    attempt (non-200 status, a 200 response without a text block, or any
    exception raised while sending/decoding) counts against that budget and
    is followed by an exponential back-off with jitter, except after the
    final attempt.

    Args:
        prompt: User message content sent as the only message.
        max_tokens: Value for the request's ``max_tokens`` field.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot block forever.

    Returns:
        The stripped text of the first ``"text"`` content block, or the
        sentinel string ``"Error: Claude API failed after retries."`` when
        every attempt fails.
    """
    headers = {
        "x-api-key": API_KEY,
        "Content-Type": "application/json",
        "anthropic-version": "2023-06-01"
    }
    body = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.7
    }

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.post(
                BASE_URL, headers=headers, data=json.dumps(body), timeout=timeout
            )
            if response.status_code == 200:
                for item in response.json().get("content", []):
                    if item.get("type") == "text":
                        return item.get("text", "").strip()
                # A 200 without any text block must count as a failed attempt;
                # otherwise the loop would spin forever without backing off.
                print("Error: response contained no text content block")
            else:
                print(f"Error {response.status_code}: {response.text}")
        except Exception as e:
            # Best-effort: log and retry on network/decoding problems.
            print(f"Request error: {e}")

        # Back off before the next attempt; sleeping after the last one is pointless.
        if attempt < MAX_RETRIES:
            time.sleep(2 ** attempt + random.uniform(0, 1))

    return "Error: Claude API failed after retries."


In [None]:

# Matches a numbered-list item such as "3. text" or "3) text". The negative
# lookahead keeps decimals like "4.5 percent" from being read as item "4.".
_NUMBERED_ITEM = re.compile(r"^\d+[.)](?!\d)\s*(\S.*)$")


def generate_paragraphs(prompt, expected_count):
    """Ask the model for a numbered list and return the item texts.

    Each line of the reply is stripped and kept only if it looks like a
    numbered-list item ("N. text" or "N) text") with non-empty text. Lines
    that merely start with a digit (for example a wrapped "30 days ...")
    are ignored instead of raising on a failed split.

    Args:
        prompt: Prompt forwarded to ``call_claude``.
        expected_count: Maximum number of paragraphs to return.

    Returns:
        A list of at most ``expected_count`` paragraph strings, in reply
        order. It may be shorter (or empty) when the reply contains fewer
        numbered items.
    """
    raw = call_claude(prompt, max_tokens=2000)
    paragraphs = []

    for line in raw.split('\n'):
        match = _NUMBERED_ITEM.match(line.strip())
        if match:
            paragraphs.append(match.group(1).strip())

    return paragraphs[:expected_count]


In [None]:

def generate_balanced_paragraphs(n=50):
    """Generate a balanced set of AI-related and neutral contract paragraphs.

    Requests ``n // 2`` paragraphs of each kind via ``generate_paragraphs``,
    writes them alternately (neutral first, then AI-related) as numbered
    ``.txt`` files into ``OUTPUT_FOLDER`` and stores a filename -> label
    mapping ("ai" / "neutral") as JSON in ``LABELS_FILE``.

    If fewer paragraphs than requested come back, the set is trimmed to the
    number of complete neutral/AI pairs available so it stays balanced, and a
    warning is printed.

    Args:
        n: Total number of paragraphs to create; must be even.

    Raises:
        AssertionError: If ``n`` is odd.
        RuntimeError: If ``n > 0`` but no complete pair of paragraphs could
            be generated (nothing is written in that case).
    """
    assert n % 2 == 0, "Use an even number for balanced sets"
    per_class = n // 2

    # 1. Generate AI-related clauses
    ai_prompt = (
        f"Generate {n // 2} realistic, legal-style contract paragraphs that discuss "
        "artificial intelligence, machine learning, profiling, or automated processing. But don't make it too obvious, don't write ai or machine learning for example, i need to be able to understand from the context but not from keywords. "
        "Each paragraph should sound like it belongs in a contract. Format as numbered list."
    )
    ai_paragraphs = generate_paragraphs(ai_prompt, per_class)

    # 2. Generate neutral (non-AI) contract clauses
    neutral_prompt = (
        f"Generate {n // 2} realistic legal-style contract paragraphs, "
        "They can be about general clauses like liability, termination, payment terms, etc. "
        "Each paragraph should sound like a real clause from a contract. Format as numbered list."
    )
    neutral_paragraphs = generate_paragraphs(neutral_prompt, per_class)

    # The model may return fewer items than requested; indexing past the end
    # would crash midway and leave files without a labels file.
    pairs = min(per_class, len(ai_paragraphs), len(neutral_paragraphs))
    if pairs < per_class:
        if pairs == 0:
            raise RuntimeError(
                "No usable paragraphs were generated "
                f"(ai={len(ai_paragraphs)}, neutral={len(neutral_paragraphs)})"
            )
        print(
            f"Warning: requested {per_class} paragraphs per class but received "
            f"ai={len(ai_paragraphs)}, neutral={len(neutral_paragraphs)}; "
            f"writing {pairs} per class"
        )
    total = 2 * pairs

    labels = {}
    output_dir = Path(OUTPUT_FOLDER)
    output_dir.mkdir(parents=True, exist_ok=True)

    for i in range(total):
        # Odd zero-based positions (even file numbers) hold the AI-related clauses.
        is_ai = (i + 1) % 2 == 0
        paragraph = ai_paragraphs[i // 2] if is_ai else neutral_paragraphs[i // 2]
        filename = f"{i+1:03d}_synthetic_ai_clause.txt"
        filepath = output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(paragraph + "\n")

        labels[filename] = "ai" if is_ai else "neutral"

    # LABELS_FILE may live outside OUTPUT_FOLDER, so make sure its folder exists.
    Path(LABELS_FILE).parent.mkdir(parents=True, exist_ok=True)
    with open(LABELS_FILE, 'w', encoding='utf-8') as f:
        json.dump(labels, f, indent=2, ensure_ascii=False)

    print(f"✅ Created {total} synthetic paragraphs in '{OUTPUT_FOLDER}/'")
    print(f"📝 Labels saved to '{LABELS_FILE}'")


In [None]:

# Entry point: when this code runs as the main module, build the default
# 50-paragraph set. This issues API requests and writes files under
# OUTPUT_FOLDER and to LABELS_FILE.
if __name__ == "__main__":
    generate_balanced_paragraphs(n=50)