In [1]:
from openai import OpenAI
import os
import datetime
from urllib.parse import urlparse
import re

# Register the API credentials in the process environment in one call.
# NOTE(review): these literals overwrite any keys already exported in the
# environment; presumably real keys are pasted in before running — confirm,
# and prefer loading them from outside the notebook.
os.environ.update(
    {
        "OPENAI_API_KEY": "your_openai_key_here",
        "GEMINI_API_KEY": "your_gemini_key_here",
        "GOOGLE_API_KEY": "your_google_key_here",
        "DEEPSEEK_API_KEY": "your_deepseek_key_here",
    }
)


# OpenAI client configured with a 3600-second request timeout.
client = OpenAI(timeout=3600)


In [None]:
# Instruction text for the prompt-rewriting step: the model is told to turn a
# user's research task into instructions for a researcher rather than to
# complete the task itself. The batch builder in this cell sends it as the
# "instructions" field of every request.
suggested_rewriting_prompt = """
You will be given a research task by a user. Your job is to produce a set of instructions for a researcher that will complete the task. Do NOT complete the task yourself, just provide instructions on how to complete it.

GUIDELINES:
1. **Maximize Specificity and Detail**
- Include all known user preferences and explicitly list key attributes or dimensions to consider.
- It is of utmost importance that all details from the user are included in the instructions.

2. **Fill in Unstated But Necessary Dimensions as Open-Ended**
- If certain attributes are essential for a meaningful output but the user has not provided them, explicitly state that they are open-ended or default to no specific constraint.

3. **Avoid Unwarranted Assumptions**
- If the user has not provided a particular detail, do not invent one.
- Instead, state the lack of specification and guide the researcher to treat it as flexible or accept all possible options.

4. **Use the First Person**
- Phrase the request from the perspective of the user.

5. **Tables**
- If you determine that including a table will help illustrate, organize, or enhance the information in the research output, you must explicitly request that the researcher provide them.
Examples:
- Product Comparison (Consumer): When comparing different smartphone models, request a table listing each model's features, price, and consumer ratings side-by-side.
- Project Tracking (Work): When outlining project deliverables, create a table showing tasks, deadlines, responsible team members, and status updates.
- Budget Planning (Consumer): When creating a personal or household budget, request a table detailing income sources, monthly expenses, and savings goals.
Competitor Analysis (Work): When evaluating competitor products, request a table with key metrics, such as market share, pricing, and main differentiators.

6. **Headers and Formatting**
- You should include the expected output format in the prompt.
- If the user is asking for content that would be best returned in a structured format (e.g. a report, plan, etc.), ask the researcher to format as a report with the appropriate headers and formatting that ensures clarity and structure.

7. **Language**
- If the user input is in a language other than English, tell the researcher to respond in this language, unless the user query explicitly asks for the response in a different language.

8. **Sources**
- If specific sources should be prioritized, specify them in the prompt.
- For product and travel research, prefer linking directly to official or primary websites (e.g., official brand sites, manufacturer pages, or reputable e-commerce platforms like Amazon for user reviews) rather than aggregator sites or SEO-heavy blogs.
- For academic or scientific queries, prefer linking directly to the original paper or official journal publication rather than survey papers or secondary summaries.
- If the query is in a specific language, prioritize sources published in that language.
"""

import os
import re
import json

# Parsed prompt files that feed the batch job, in processing order
parsed_files = ["FEDWEB13.txt", "SESSION2014.txt"]

# Helper to extract "Prompt N: ..." blocks
def extract_prompts(file_path):
    """Return the text of every ``Prompt N:`` entry found in ``file_path``.

    An entry starts at a ``Prompt <digits>:`` marker and runs up to the next
    blank line or, for the final entry, up to the end of the file.

    Args:
        file_path: Path of a UTF-8 text file containing ``Prompt N: ...`` entries.

    Returns:
        list[str]: Entry texts in file order, with surrounding whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # `\Z` lets the last entry match even when the file does not end with a
    # blank line; requiring "\n\n" alone silently dropped that entry.
    bodies = re.findall(r"Prompt \d+:\s*(.*?)(?:\n\n|\Z)", content, re.DOTALL)
    return [body.strip() for body in bodies]

# Destination JSONL file for the OpenAI Batch API
output_path = "my_batch_requests.jsonl"

# Emit one /v1/responses request per prompt. The custom_id numbering runs
# continuously across all input files, starting at task-1.
with open(output_path, "w", encoding="utf-8") as out_f:
    # Lazy generator: each input file is read only when its turn comes,
    # after the requests of the previous file have been written.
    every_prompt = (
        text
        for source_file in parsed_files
        for text in extract_prompts(source_file)
    )
    for task_id, prompt in enumerate(every_prompt, start=1):
        request_body = {
            "instructions": suggested_rewriting_prompt,
            "input": prompt.strip(),
            "model": "gpt-4.1-2025-04-14",
        }
        request = {
            "custom_id": f"task-{task_id}",
            "method": "POST",
            "url": "/v1/responses",
            "body": request_body,
        }
        out_f.write(json.dumps(request) + "\n")

print(f"✅ Created batch request file: {output_path}")


In [12]:
from openai import OpenAI
client = OpenAI()

# Upload the request file. The context manager closes the handle as soon as
# the upload call returns instead of leaving it open for the rest of the
# kernel session.
with open("my_batch_requests.jsonl", "rb") as batch_requests_file:
    batch_input_file = client.files.create(
        file=batch_requests_file,
        purpose="batch"
    )

print(batch_input_file)

batch_input_file_id = batch_input_file.id
# Kept as the cell's last expression so the created Batch object (including
# its id) is shown as the cell output.
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/responses",
    completion_window="24h",
    metadata={
        "description": "rewriting prompts"
    }
)


FileObject(id='file-UoNfjUD8WB2Q9FZsqg55zb', bytes=378514, created_at=1752529824, filename='my_batch_requests.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


Batch(id='batch_68757ba094588190aa9b27334359f611', completion_window='24h', created_at=1752529824, endpoint='/v1/responses', input_file_id='file-UoNfjUD8WB2Q9FZsqg55zb', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1752616224, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'rewriting prompts'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [None]:
# The results file ID cannot be derived from the batch ID (the previous
# "file-<batch hex>" value was not a real file ID). Look the batch up and use
# its `output_file_id`, which stays None until the batch has produced output.
batch = client.batches.retrieve("batch_68757ba094588190aa9b27334359f611")

if batch.output_file_id is None:
    print(f"Batch {batch.id} has no output file yet (status: {batch.status})")
else:
    file_response = client.files.content(batch.output_file_id)
    print(file_response.text)


In [None]:
import os
import re
import json

# Parsed prompt files that feed the batch job, in processing order
parsed_files = ["FEDWEB13.txt", "SESSION2014.txt"]

# Helper to extract "Prompt N: ..." blocks
def extract_prompts(file_path):
    """Return the text of every ``Prompt N:`` entry found in ``file_path``.

    An entry starts at a ``Prompt <digits>:`` marker and runs up to the next
    blank line or, for the final entry, up to the end of the file.

    Args:
        file_path: Path of a UTF-8 text file containing ``Prompt N: ...`` entries.

    Returns:
        list[str]: Entry texts in file order, with surrounding whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # `\Z` lets the last entry match even when the file does not end with a
    # blank line; requiring "\n\n" alone silently dropped that entry.
    bodies = re.findall(r"Prompt \d+:\s*(.*?)(?:\n\n|\Z)", content, re.DOTALL)
    return [body.strip() for body in bodies]

# Destination JSONL file for the OpenAI Batch API
output_path = "my_batch_requests.jsonl"

# Emit one /v1/responses request per prompt. The custom_id numbering runs
# continuously across all input files, starting at task-1.
with open(output_path, "w", encoding="utf-8") as out_f:
    # Lazy generator: each input file is read only when its turn comes,
    # after the requests of the previous file have been written.
    every_prompt = (
        text
        for source_file in parsed_files
        for text in extract_prompts(source_file)
    )
    for task_id, prompt in enumerate(every_prompt, start=1):
        request_body = {
            "instructions": suggested_rewriting_prompt,
            "input": prompt.strip(),
            "model": "gpt-4.1-2025-04-14",
        }
        request = {
            "custom_id": f"task-{task_id}",
            "method": "POST",
            "url": "/v1/responses",
            "body": request_body,
        }
        out_f.write(json.dumps(request) + "\n")

print(f"✅ Created batch request file: {output_path}")


In [2]:
# Instruction text for the prompt-rewriting step: the model is told to turn a
# user's research task into instructions for a researcher rather than to
# complete the task itself. rewrite_prompt() passes it as `instructions`.
suggested_rewriting_prompt = """
You will be given a research task by a user. Your job is to produce a set of instructions for a researcher that will complete the task. Do NOT complete the task yourself, just provide instructions on how to complete it.

GUIDELINES:
1. **Maximize Specificity and Detail**
- Include all known user preferences and explicitly list key attributes or dimensions to consider.
- It is of utmost importance that all details from the user are included in the instructions.

2. **Fill in Unstated But Necessary Dimensions as Open-Ended**
- If certain attributes are essential for a meaningful output but the user has not provided them, explicitly state that they are open-ended or default to no specific constraint.

3. **Avoid Unwarranted Assumptions**
- If the user has not provided a particular detail, do not invent one.
- Instead, state the lack of specification and guide the researcher to treat it as flexible or accept all possible options.

4. **Use the First Person**
- Phrase the request from the perspective of the user.

5. **Tables**
- If you determine that including a table will help illustrate, organize, or enhance the information in the research output, you must explicitly request that the researcher provide them.
Examples:
- Product Comparison (Consumer): When comparing different smartphone models, request a table listing each model's features, price, and consumer ratings side-by-side.
- Project Tracking (Work): When outlining project deliverables, create a table showing tasks, deadlines, responsible team members, and status updates.
- Budget Planning (Consumer): When creating a personal or household budget, request a table detailing income sources, monthly expenses, and savings goals.
Competitor Analysis (Work): When evaluating competitor products, request a table with key metrics, such as market share, pricing, and main differentiators.

6. **Headers and Formatting**
- You should include the expected output format in the prompt.
- If the user is asking for content that would be best returned in a structured format (e.g. a report, plan, etc.), ask the researcher to format as a report with the appropriate headers and formatting that ensures clarity and structure.

7. **Language**
- If the user input is in a language other than English, tell the researcher to respond in this language, unless the user query explicitly asks for the response in a different language.

8. **Sources**
- If specific sources should be prioritized, specify them in the prompt.
- For product and travel research, prefer linking directly to official or primary websites (e.g., official brand sites, manufacturer pages, or reputable e-commerce platforms like Amazon for user reviews) rather than aggregator sites or SEO-heavy blogs.
- For academic or scientific queries, prefer linking directly to the original paper or official journal publication rather than survey papers or secondary summaries.
- If the query is in a specific language, prioritize sources published in that language.
"""

In [4]:
# Paths to parsed prompt files, processed in this order by the rewriting loop
parsed_files = [
    "DD2016.txt",
    "FEDWEB13.txt",
    # Fixed typo: was "SESSSION2014.txt" (three S's); the other cells in this
    # notebook read "SESSION2014.txt".
    "SESSION2014.txt"
]

# Function to extract prompts from a file
def extract_prompts(file_path):
    """Return the text of every ``Prompt N:`` entry found in ``file_path``.

    An entry starts at a ``Prompt <digits>:`` marker and runs up to the next
    blank line or, for the final entry, up to the end of the file.

    Args:
        file_path: Path of a UTF-8 text file containing ``Prompt N: ...`` entries.

    Returns:
        list[str]: Entry texts in file order, with surrounding whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # `\Z` lets the last entry match even when the file does not end with a
    # blank line; requiring "\n\n" alone silently dropped that entry.
    bodies = re.findall(r"Prompt \d+:\s*(.*?)(?:\n\n|\Z)", content, re.DOTALL)
    return [body.strip() for body in bodies]

# Function to rewrite prompts via OpenAI API
def rewrite_prompt(prompt, client, model="gpt-4.1-2025-04-14"):
    """Rewrite one research prompt into instructions for a researcher.

    Args:
        prompt: The user's original research task text.
        client: Object exposing ``responses.create(...)`` (an OpenAI client).
        model: Model name forwarded to the API. Previously this argument was
            ignored and the default model string was always sent.

    Returns:
        The ``text`` of the first content part of the first output item.
        Assumes that item is the assistant message — TODO confirm for models
        that emit other item types first.
    """
    response = client.responses.create(
        instructions=suggested_rewriting_prompt,
        model=model,
        input=prompt
    )
    return response.output[0].content[0].text


In [None]:
# Initialize OpenAI client
client = OpenAI()  # or openai.Client(api_key="your-key")

# Output directory
os.makedirs("rewritten_prompts", exist_ok=True)

# Number assigned to the first rewritten prompt of each file.
# NOTE(review): numbering restarts at this value for every file and the prompt
# list itself is not sliced, so 45 looks like a leftover from resuming a
# partial run — confirm. renumber_prompts() can normalise numbering afterwards.
FIRST_PROMPT_NUMBER = 45

# Process each file
for file_path in parsed_files:
    prompts = extract_prompts(file_path)
    rewritten = []

    print(f"Rewriting prompts from {file_path}...")

    for i, prompt in enumerate(prompts, start=FIRST_PROMPT_NUMBER):
        rewritten_text = rewrite_prompt(prompt, client)
        rewritten.append(f"Prompt {i}: {rewritten_text}\n")

    # Save to output file: "<name>.txt" -> "rewritten_prompts/<name>.rewritten2.txt".
    # splitext only touches the final extension, unlike str.replace which
    # would also rewrite a ".txt" occurring in the middle of the name.
    source_stem, _ = os.path.splitext(os.path.basename(file_path))
    out_path = os.path.join("rewritten_prompts", source_stem + ".rewritten2.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        # The joined result is a single string, so write() is the right call;
        # writelines() would iterate it character by character.
        f.write("\n".join(rewritten))

    print(f"✔ Saved rewritten prompts to {out_path}")


In [15]:
import re

def renumber_prompts(input_path, output_path):
    """Rewrite a prompt file so its entries are numbered 1, 2, 3, ...

    The input is split at ``Prompt <digits>:`` markers that begin a line. Each
    entry's text is stripped, given its new sequential number, and the entries
    are written to ``output_path`` separated by one blank line.

    Args:
        input_path: UTF-8 text file containing ``Prompt N: ...`` entries.
        output_path: Destination file; overwritten if it exists.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Read input file
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Find all prompt blocks. Markers are anchored to the start of a line
    # (MULTILINE) so that a "Prompt 2:" mentioned inside a prompt's own text
    # is not mistaken for the start of a new entry.
    prompts = re.findall(
        r"^Prompt\s+\d+:\s*(.*?)(?=^Prompt\s+\d+:|\Z)",
        content,
        re.DOTALL | re.MULTILINE,
    )

    # Reconstruct content with new numbering
    renumbered = "\n\n".join(
        f"Prompt {number}: {body.strip()}" for number, body in enumerate(prompts, start=1)
    )

    # Save to output file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(renumbered)

    print(f"✅ Renumbered prompts saved to: {output_path}")

# Example usage: renumber the DD2016 rewrite (".rewritten2" -> ".rewritten")
renumber_prompts(
    input_path="rewritten_prompts/DD2016.rewritten2.txt",
    output_path="rewritten_prompts/DD2016.rewritten.txt",
)


✅ Renumbered prompts saved to: rewritten_prompts/DD2016.rewritten.txt


In [23]:
import json

def extract_outputs_from_jsonl(input_path, output_path):
    """Extract the assistant text from each record of a batch results file.

    Every non-blank line of ``input_path`` is parsed as JSON. The
    ``output_text`` blocks of all ``message`` items under
    ``response.body.output`` are joined with newlines and stripped. A line
    that cannot be parsed, or that has no message item, yields an
    ``[ERROR parsing line]: ...`` placeholder so numbering stays aligned with
    the input records. Results are written as ``Prompt i:`` entries, numbered
    from 1.

    Args:
        input_path: UTF-8 JSONL file, one batch response record per line.
        output_path: Destination text file; overwritten if it exists.

    Raises:
        OSError: If either file cannot be opened.
    """
    extracted_texts = []

    # Read each JSON line
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. a trailing newline) is not a record
                continue
            try:
                data = json.loads(line)
                output_items = data["response"]["body"]["output"]

                # Scan every output item rather than only output[0]: the
                # message is not guaranteed to be first (a reasoning or tool
                # item can precede it).
                message_items = [
                    item for item in output_items if item.get("type") == "message"
                ]
                if not message_items:
                    raise ValueError("no message item in response output")

                # Concatenate all blocks of type 'output_text'
                output_text = "\n".join(
                    block["text"]
                    for item in message_items
                    for block in item.get("content") or []
                    if block["type"] == "output_text"
                )

                extracted_texts.append(output_text.strip())

            except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
                # json.JSONDecodeError is a ValueError; the others cover
                # records whose structure differs from the expected shape
                # (e.g. a failed request with no response body).
                extracted_texts.append(f"[ERROR parsing line]: {e}")

    # Write to output file
    with open(output_path, "w", encoding="utf-8") as out_f:
        for i, text in enumerate(extracted_texts, start=1):
            out_f.write(f"Prompt {i}:\n{text}\n\n")

    print(f"✅ Extracted {len(extracted_texts)} outputs to: {output_path}")

# === Example Usage ===
# The input is read line by line as JSON records even though it is named ".txt".
input_file, output_file = (
    "rewritten_prompts/SESSION14.rewritten.txt",
    "rewritten_prompts/SESSION14.rewritten_outputs_extracted.txt",
)
extract_outputs_from_jsonl(input_path=input_file, output_path=output_file)


✅ Extracted 60 outputs to: rewritten_prompts/SESSION14.rewritten_outputs_extracted.txt


In [5]:
# Show the "output" list of the most recent API response (empty list if absent).
# NOTE(review): relies on a `response` variable already present in the kernel;
# in this notebook it is only assigned inside the deep-research loop further
# down, so this cell fails on a fresh kernel run from top to bottom.
response.model_dump().get("output", [])

[{'id': 'rs_687fc93d8684819182495abf2a04ceaa0fb66a0de26da257',
  'summary': [{'text': "**Exploring Swahili cuisine**\n\nThe user is asking for a detailed report on traditional Swahili dishes, which sounds fascinating! I think I should collect information on various recipes, their cultural significance, and cooking methods. It’ll be important to organize this data in a clear way so it's easy for the user to follow. I’m curious to see what unique flavors and ingredients are typically used in these dishes, and how they reflect the rich culture of the Swahili coast!",
    'type': 'summary_text'},
   {'text': "**Crafting a Swahili cuisine report**\n\nThe user wants to learn about traditional Swahili dishes to surprise a friend from Kenya, and they've given clear instructions. I need to focus specifically on Swahili home cooking, creating a structured report that includes at least five traditional dishes, each with a brief description and resource link. Including pictures or videos would be 

In [7]:
import os
import re
import json
from openai import OpenAI

# Keep the 3600 s request timeout used when the client was first created in
# this notebook. Re-creating it with OpenAI() fell back to the SDK default
# timeout right before the long-running deep-research requests below.
client = OpenAI(timeout=3600)

# Instructions sent as the "developer" message of every deep-research request
# made by the loop in this cell.
system_message = """
You are a professional research assistant tasked with preparing a structured, data-driven report on the topic the user provides. Your goal is to deliver clear, factual, and well-cited analysis.

Do:
- Focus on data-rich insights: include specific figures, trends, statistics, and measurable outcomes (e.g., market size, historical trends, cost comparisons, adoption rates).
- When appropriate, summarize data in a way that could be turned into visualizations or tables (e.g., “this would work well as a bar chart comparing regional adoption rates”).
- Prioritize reliable, up-to-date sources: peer-reviewed research, government and international organizations, industry white papers, or credible journalism.
- Include inline citations and return all source metadata used in the report.

Be analytical, avoid vague statements, and ensure each section supports reasoned, evidence-backed conclusions that could inform decision-making or future research.
"""

# === Load prompts from file ===
# Text file of "Prompt N: ..." entries that load_prompts() reads below.
input_path = "prompts/SESSION14-DR.txt"

def load_prompts(filepath, limit=10):
    """Read ``Prompt N:`` entries from a text file and return the first few.

    Args:
        filepath: UTF-8 text file to read.
        limit: Maximum number of entries to return (slice bound), default 10.

    Returns:
        list[str]: Stripped entry texts, in file order.
    """
    # An entry runs from its "Prompt N:" marker up to the next marker or the end.
    entry_re = re.compile(r"Prompt\s+\d+:\s*(.*?)\s*(?=Prompt\s+\d+:|$)", re.DOTALL)

    with open(filepath, "r", encoding="utf-8") as handle:
        bodies = entry_re.findall(handle.read())

    selected = []
    for body in bodies[:limit]:
        selected.append(body.strip())
    return selected

prompts = load_prompts(input_path, limit=10)
os.makedirs("deep_research_outputs", exist_ok=True)


def _extract_domains(response_data):
    """Return the sorted, de-duplicated hostnames of URLs in web-search steps.

    Looks at every item of ``response_data["output"]`` whose ``type`` is
    ``"web_search_call"`` and whose ``action`` is a dict with a string ``url``,
    and collects the non-empty network location of that URL.
    """
    domains = set()
    for step in response_data.get("output", []):
        action = step.get("action", {})
        if step.get("type") != "web_search_call" or not isinstance(action, dict):
            continue
        url = action.get("url")
        if isinstance(url, str):
            domain = urlparse(url).netloc
            if domain:
                domains.add(domain)
    # Sorted so the saved file is stable across runs (set order is not).
    return sorted(domains)


def _load_saved_response(path):
    """Return the JSON previously saved at ``path``, or None if missing/unreadable."""
    if not os.path.exists(path):
        return None
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        # A truncated or corrupt file is treated as absent so the prompt is re-run.
        return None


# === Run research and save ===
for i, user_query in enumerate(prompts, start=1):
    output_path = f"deep_research_outputs/deep_topic_{i}.json"
    domain_path = f"deep_research_outputs/deep_topic_{i}_domains.json"

    if os.path.exists(output_path) and os.path.exists(domain_path):
        print(f"⏭️ Skipping prompt {i}: already exists.")
        continue

    print(f"🔍 Prompt {i}: {user_query[:80]}...")

    try:
        # If the full response was already saved but the domains file is
        # missing, rebuild the domains from disk instead of paying for a
        # second deep-research request.
        response_data = _load_saved_response(output_path)

        if response_data is None:
            response = client.responses.create(
                model="o3-deep-research",
                input=[
                    {
                        "role": "developer",
                        "content": [{"type": "input_text", "text": system_message}]
                    },
                    {
                        "role": "user",
                        "content": [{"type": "input_text", "text": user_query}]
                    }
                ],
                reasoning={"summary": "auto"},
                tools=[
                    {"type": "web_search_preview"},
                    {"type": "code_interpreter", "container": {"type": "auto", "file_ids": []}}
                ]
            )
            response_data = response.model_dump()

            # === Save full output ===
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(response_data, f, indent=2, ensure_ascii=False)
            print(f"✅ Saved full response to {output_path}")
        else:
            print(f"♻️ Reusing saved response from {output_path}")

        # === Extract and save domains ===
        domains = _extract_domains(response_data)

        with open(domain_path, "w", encoding="utf-8") as f:
            json.dump(domains, f, indent=2, ensure_ascii=False)
        print(f"🌐 Saved {len(domains)} unique domains to {domain_path}")

    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next prompt.
        print(f"❌ Error on prompt {i}: {e}")


⏭️ Skipping prompt 1: already exists.
🔍 Prompt 2: Instructions for Researcher

Objective:  
I want to gather detailed information ...
❌ Error on prompt 2: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
⏭️ Skipping prompt 3: already exists.
🔍 Prompt 4: Instructions for Researcher:  
**Goal:**  
Please gather comprehensive and actio...
❌ Error on prompt 4: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
⏭️ Skipping prompt 5: already exists.
⏭️ Skipping prompt 6: already 

In [36]:
# system_message = """
# You are a professional researcher preparing a structured, data-driven report on behalf of a global health economics team. Your task is to analyze the health question the user poses.

# Do:
# - Focus on data-rich insights: include specific figures, trends, statistics, and measurable outcomes (e.g., reduction in hospitalization costs, market size, pricing trends, payer adoption).
# - When appropriate, summarize data in a way that could be turned into charts or tables, and call this out in the response (e.g., “this would work well as a bar chart comparing per-patient costs across regions”).
# - Prioritize reliable, up-to-date sources: peer-reviewed research, health organizations (e.g., WHO, CDC), regulatory agencies, or pharmaceutical earnings reports.
# - Include inline citations and return all source metadata.

# Be analytical, avoid generalities, and ensure that each section supports data-backed reasoning that could inform healthcare policy or financial modeling.
# """

# import os
# import re
# import json

# def parse_instruction_prompts(input_path):
#     with open(input_path, "r", encoding="utf-8") as f:
#         content = f.read()

#     # Match everything from "Prompt N:" to the next "Prompt M:" or end of file
#     prompt_blocks = re.findall(r"Prompt\s+\d+:\s*(.*?)(?=\nPrompt\s+\d+:|$)", content, re.DOTALL)

#     # Strip and return the list of full prompts
#     return [block.strip() for block in prompt_blocks]

# input_path = "rewritten_prompts/SESSION14-DR.txt"  
# output_path = "deep_research_session_batch_requests.jsonl"

# prompts = parse_instruction_prompts(input_path)

# # === Generate JSONL for batch submission ===
# with open(output_path, "w", encoding="utf-8") as out_f:
#     for task_id, prompt in enumerate(prompts, start=1):
#         request = {
#             "custom_id": f"task-{task_id}",
#             "method": "POST",
#             "url": "/v1/responses",
#             "body": {
#                 "model": "o3-deep-research",
#                 "input": [
#                     {
#                         "role": "developer",
#                         "content": [{"type": "input_text", "text": system_message}]
#                     },
#                     {
#                         "role": "user",
#                         "content": [{"type": "input_text", "text": prompt}]
#                     }
#                 ],
#                 "reasoning": {"summary": "auto"},
#                 "tools": [
#                     {"type": "web_search_preview"},
#                     {
#                         "type": "code_interpreter",
#                         "container": {"type": "auto", "file_ids": []}
#                     }
#                 ]
#             }
#         }
#         out_f.write(json.dumps(request) + "\n")

# print(f"✅ Created batch request file with {len(prompts)} prompts: {output_path}")


✅ Created batch request file with 60 prompts: deep_research_session_batch_requests.jsonl
