# OpenAI Deep Research API

In [None]:
from openai import OpenAI
import os
import datetime
from urllib.parse import urlparse
import re

# Export the provider credentials in one go. These assignments overwrite any
# value already present in the process environment, so the placeholders must
# be replaced with real keys before the cell is run.
os.environ.update(
    {
        "OPENAI_API_KEY": "your_openai_key_here",
        "GEMINI_API_KEY": "your_gemini_key_here",
        "GOOGLE_API_KEY": "your_google_key_here",
        "DEEPSEEK_API_KEY": "your_deepseek_key_here",
    }
)


# Shared OpenAI client created with an explicit request timeout of 3600.
client = OpenAI(timeout=3600)


In [None]:
# STEP 1: Run Deep Research over prompts and save deep_topic_{i}.json

import os, re, json, time, pathlib
from datetime import datetime
from openai import OpenAI

# --- config ---
# INPUT_PATH is parsed by read_prompts_from_file(); OUT_DIR receives one
# deep_topic_{i}.json per processed prompt; MODEL is passed as `model=` to
# client.responses.create().
INPUT_PATH = "data/prompts/SESSION14-DR.txt"      # text file containing "Prompt N:" blocks
OUT_DIR    = "prompt_reconstruction/deep_research"           # created on demand if it does not exist
MODEL      = "o3-deep-research"
MAX_PROMPTS = 10  # set an int (e.g., 10) to limit, or None for all

# System instructions, sent as the "developer" role message ahead of every
# user prompt (see call_deep_research).
system_message = """
You are a professional researcher preparing a structured, data-driven report on behalf of a global health economics team. Your task is to analyze the health question the user poses.

Do:
- Focus on data-rich insights: include specific figures, trends, statistics, and measurable outcomes (e.g., reduction in hospitalization costs, market size, pricing trends, payer adoption).
- When appropriate, summarize data in a way that could be turned into charts or tables, and call this out in the response (e.g., “this would work well as a bar chart comparing per-patient costs across regions”).
- Prioritize reliable, up-to-date sources: peer-reviewed research, health organizations (e.g., WHO, CDC), regulatory agencies, or pharmaceutical earnings reports.
- Include inline citations and return all source metadata.

Be analytical, avoid generalities, and ensure that each section supports data-backed reasoning that could inform healthcare policy or financial modeling.
"""

# Tools passed as `tools=` on every request: web search plus a code
# interpreter running in an auto-provisioned container with no attached files.
TOOLS = [
    {"type": "web_search_preview"},
    {"type": "code_interpreter", "container": {"type": "auto", "file_ids": []}},
]

# Deep Research requests are long-running. Re-creating the client without an
# explicit timeout would silently fall back to the SDK default, so keep the
# same 3600 timeout that the first cell configures.
client = OpenAI(timeout=3600)

def read_prompts_from_file(path: str):
    """
    Load the numbered prompts stored in the text file at `path`.

    Every block introduced by a 'Prompt N:' label (matched case-insensitively)
    becomes one (N, body) tuple, with the body stripped of surrounding
    whitespace. Bytes that are not valid UTF-8 are dropped while reading.
    The resulting list is ordered by ascending N.
    """
    raw = pathlib.Path(path).read_text(encoding="utf-8", errors="ignore")
    block_re = re.compile(r"(?:^|\n)Prompt\s+(\d+):\s*(.*?)(?=\nPrompt\s+\d+:|$)", re.IGNORECASE | re.DOTALL)

    numbered = []
    for match in block_re.finditer(raw):
        number, body = match.groups()
        numbered.append((int(number), body.strip()))

    return sorted(numbered, key=lambda pair: pair[0])

def call_deep_research(prompt_text: str):
    """
    Submit one prompt to the Responses API and return the response object.

    The request uses the module-level MODEL and TOOLS, sends `system_message`
    as the developer-role message, `prompt_text` as the user-role message,
    and asks for automatic reasoning summaries.
    """
    developer_turn = {
        "role": "developer",
        "content": [{"type": "input_text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [{"type": "input_text", "text": prompt_text}],
    }
    return client.responses.create(
        model=MODEL,
        input=[developer_turn, user_turn],
        reasoning={"summary": "auto"},
        tools=TOOLS,
    )

# Make sure the output directory exists, then load the prompts and apply the
# optional MAX_PROMPTS cap.
out_dir = pathlib.Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)
prompts = read_prompts_from_file(INPUT_PATH)
if MAX_PROMPTS is not None:
    prompts = prompts[:MAX_PROMPTS]

max_attempts = 5  # total tries per prompt, including the first one

print(f"Running Deep Research on {len(prompts)} prompts...")
# `i` is the 1-based position in the (sorted, possibly truncated) list and
# names the output file; `idx` is the number from the "Prompt N:" label.
for i, (idx, prompt_text) in enumerate(prompts, start=1):
    out_path = out_dir / f"deep_topic_{i}.json"
    print(f"→ Prompt {idx} → {out_path.name}")

    # simple retry with exponential backoff (1s, 2s, 4s, 8s) for transient errors
    for attempt in range(max_attempts):
        try:
            resp = call_deep_research(prompt_text)
            # save raw JSON; support different SDKs
            try:
                raw_json = resp.model_dump()
            except Exception:
                raw_json = json.loads(resp.json())
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(raw_json, f, indent=2, ensure_ascii=False)
            print(f"✅ Saved {out_path}")
            break
        except Exception as e:
            if attempt + 1 < max_attempts:
                wait = 2 ** attempt
                print(f"  [Attempt {attempt+1}] Error: {e} → retrying in {wait}s")
                time.sleep(wait)
            else:
                # Last attempt: nothing left to retry, so do not sleep or
                # announce a retry that will never happen.
                print(f"  [Attempt {attempt+1}] Error: {e}")
    else:
        # Reached only when no attempt hit `break`, i.e. every try failed.
        print(f"❌ Failed after retries for Prompt {idx}")


In [None]:
import json
from urllib.parse import urlparse

def extract_domains(output_items):
    """
    Return the set of host names visited by web-search steps.

    `output_items` is the "output" list of a saved response. Only items whose
    "type" is "web_search_call" and whose "action" is a dict carrying a string
    "url" contribute; the URL's network location (netloc) is collected.
    Anything else (other step types, actions without a URL, malformed
    entries) is ignored.
    """
    domains = set()
    for step in output_items or []:
        if not isinstance(step, dict) or step.get("type") != "web_search_call":
            continue
        action = step.get("action")
        if not isinstance(action, dict):
            continue
        url = action.get("url")
        if not isinstance(url, str):
            continue
        domain = urlparse(url).netloc
        if domain:
            domains.add(domain)
    return domains


# NOTE(review): Step 1 writes its files to OUT_DIR
# ("prompt_reconstruction/deep_research") while this cell reads from
# "deep_research_outputs/" — confirm the files are moved in between.
for i in range(1, 11):
    input_path = f"deep_research_outputs/deep_topic_{i}.json"
    output_path = f"deep_research_outputs/deep_topic_{i}_domains.json"

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            response_data = json.load(f)
    except FileNotFoundError:
        # A prompt may have failed or fewer than 10 were run; keep going
        # instead of aborting the remaining files.
        print(f"⚠️ Skipping {input_path}: file not found")
        continue

    domains = extract_domains(response_data.get("output", []))

    # Save to file (sorted so repeated runs produce identical output)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(sorted(domains), f, indent=2, ensure_ascii=False)

    print(f"✅ Saved {len(domains)} unique domains to {output_path}")


# OpenAI Web Search (through the API)

In [None]:
import openai
import os
import datetime
from urllib.parse import urlparse

# Export the provider credentials in a single call. Existing environment
# values are overwritten, so swap the placeholders for real keys first.
os.environ.update(
    {
        "OPENAI_API_KEY": "your_openai_key_here",
        "GEMINI_API_KEY": "your_gemini_key_here",
        "GOOGLE_API_KEY": "your_google_key_here",
        "DEEPSEEK_API_KEY": "your_deepseek_key_here",
    }
)

# Client used for the streaming web-search request below.
client = openai.OpenAI()

# Alternative prompt (disabled):
# prompt = "A friend from Kenya is visiting you and you'd like to surprise him with by cooking a traditional Swahili dish. You want to learn about Swahili dishes and how to cook them. Find web pages about Swahili home cooking.",

# Two adjacent literals inside parentheses; the first ends with a space so the
# sentences are not glued together as "smoking.Only" when concatenated.
prompt = (
    "Your friend would like to quit smoking. You would like to provide him with relevant information about: different ways to quit smoking, programs available to help quit smoking, benefits of quitting smoking, second effects of quitting smoking, using hypnosis to quit smoking, using the cold turkey method to quit smoking. "
    "Only use information from pages you actually visit by clicking links—do not rely on search result summaries or prior knowledge."
)

# Streamed Responses API request with the web-search tool enabled; `response`
# is an iterable of events.
response = client.responses.create(
    model="gpt-4o",  # or another supported model
    input=prompt,
    stream=True,
    tools=[{"type": "web_search"}]
)

def _annotation_field(annotation, name):
    """Read `name` from an annotation delivered either as a dict or as an object."""
    if isinstance(annotation, dict):
        return annotation.get(name)
    return getattr(annotation, name, None)


# Store visited URLs with timestamps
visited_urls = []

for event in response:
    # Only streamed annotation events carry URL citations
    if event.type != "response.output_text.annotation.added":
        continue

    annotation = event.annotation
    if annotation is None or _annotation_field(annotation, "type") != "url_citation":
        continue

    url = _annotation_field(annotation, "url")
    # Guard against a missing URL: urlparse(None) would yield a bytes netloc.
    domain = urlparse(url).netloc if isinstance(url, str) else ""
    # Timezone-aware UTC timestamp of when the event was received;
    # datetime.utcnow() is deprecated and returns a naive value.
    timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()

    visited_info = {
        "timestamp": timestamp,
        "url": url,
        "domain": domain,
        "title": _annotation_field(annotation, "title"),
        "sequence_number": event.sequence_number,
        "item_id": event.item_id
    }

    visited_urls.append(visited_info)

    # Print each as it's received
    print(f"[{timestamp}] Visited domain: {domain} ({url})")

# print(response)
# for annon in response.output[1].content[0].annotations:
#     print(annon.url)