In [None]:
import os, requests
from typing import Any, ClassVar
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback


class OpenAIResponsesLLM(CustomLLM):
    """
    Minimal LlamaIndex LLM wrapper for the preview `/v1/responses` endpoint.

    The API key is read from the ``OPENAI_API_KEY`` environment variable on
    every call; credentials must never be written into the notebook itself.

    Handles the payload shapes seen so far:
      • {"choices": …, "message": …}   (chat-like)
      • {"choices": …, "delta": …}     (chat stream)
      • {"output":  […{"content": […{"type": "output_text"} ] } ]}
      • {"output_text": …}             (very early beta)
    """

    # mark as `ClassVar` so pydantic ignores them
    context_window: ClassVar[int] = 16_384
    num_output:     ClassVar[int] = 512
    model_name:     ClassVar[str] = "gpt-4o"

    @property
    def metadata(self) -> LLMMetadata:                    # noqa: D401
        """Static model description built from the class-level constants."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    # ---------- payload parsing ----------
    def _extract_text(self, data: dict) -> str:
        """Return the assistant text found in a decoded response, or ""."""
        # 1) chat/completions-style: first choice, `message` preferred over `delta`
        choices = data.get("choices")
        if choices:
            first_choice = choices[0]
            body = first_choice.get("message") or first_choice.get("delta") or {}
            text = body.get("content") or ""
            if text:
                return text

        # 2) "output" array shape: scan every item (the first one is not
        #    guaranteed to carry text) and join all text parts in order
        pieces = []
        for item in data.get("output") or []:
            for part in item.get("content") or []:
                if part.get("type") in ("output_text", "text"):
                    pieces.append(part.get("text") or "")
        joined = "".join(pieces)
        if joined:
            return joined

        # 3) very early beta shape; also the last resort when the shapes
        #    above were present but empty
        return data.get("output_text") or ""

    # ---------- blocking completion ----------
    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """
        Send `prompt` as a single user message and return the generated text.

        Recognised kwargs:
            max_tokens:  forwarded as `max_output_tokens` (default: `num_output`)
            temperature: sampling temperature (default: 0.7)

        Raises:
            ValueError: if `OPENAI_API_KEY` is not set, or if no text could be
                extracted from the response payload.
            requests.HTTPError: if the API answers with an error status.
        """
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError(
                "OPENAI_API_KEY is not set; export it before calling the LLM."
            )

        url = "https://api.openai.com/v1/responses"
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model_name,
            "input": [
                {"role": "user",
                 "content": [{"type": "input_text", "text": prompt}]}
            ],
            "max_output_tokens": kwargs.get("max_tokens", self.num_output),
            "temperature":      kwargs.get("temperature", 0.7),
        }

        r = requests.post(url, headers=headers, json=payload, timeout=60)
        r.raise_for_status()
        data = r.json()

        text = self._extract_text(data)
        if not text:
            raise ValueError(f"Unrecognised /v1/responses payload:\n{data}")

        return CompletionResponse(text=text)

    # ---------- streaming (single-chunk stub) ----------
    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        """Pseudo-stream: run one blocking `complete` call and yield its full text as a single chunk."""
        full = self.complete(prompt, **kwargs).text
        yield CompletionResponse(text=full, delta=full)


In [None]:
# Installs LlamaIndex into the kernel's environment.
# NOTE(review): this cell comes after the first cell that imports llama_index,
# so on a fresh kernel it has to be run before that cell.
%pip install llama_index


In [None]:
import os, requests
from typing import Any, List, Optional
from llama_index.core.embeddings import BaseEmbedding
from pydantic import FieldValidationInfo


class OpenAI3SmallEmbeddings(BaseEmbedding):
    """
    LlamaIndex embedding model backed by OpenAI's `/v1/embeddings` endpoint.

    Args:
        model_name: embedding model sent in the request payload.
        api_key:    OpenAI key; falls back to the ``OPENAI_API_KEY``
                    environment variable when omitted.
        dimensions: optional output size sent as `dimensions`
                    (e.g. 512 to shorten vectors); omitted when falsy.
        **kwargs:   forwarded to `BaseEmbedding.__init__`.
    """

    # allow attributes that are not declared pydantic fields to be set in __init__
    model_config = {"extra": "allow"}

    def __init__(
        self,
        model_name: str = "text-embedding-3-small",
        api_key: Optional[str] = None,
        dimensions: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.model_name = model_name
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.endpoint = "https://api.openai.com/v1/embeddings"
        self.dimensions = dimensions            # e.g. 512 to shorten vectors

    # ---------- internal helper ----------
    def _embed(self, texts: List[str]) -> List[List[float]]:
        """
        Embed a batch of texts with a single HTTP request.

        Returns one vector per input text, in the order of the response's
        `data` array. An empty batch returns `[]` without calling the API.

        Raises:
            ValueError: if no API key was supplied or found in the environment.
            requests.HTTPError: if the API answers with an error status.
        """
        if not texts:
            return []
        if not self.api_key:
            raise ValueError(
                "No OpenAI API key: pass api_key=... or set OPENAI_API_KEY."
            )

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload: dict[str, Any] = {"model": self.model_name, "input": texts}
        if self.dimensions:
            payload["dimensions"] = self.dimensions

        # bounded wait so a stalled connection cannot hang the notebook
        r = requests.post(self.endpoint, headers=headers, json=payload, timeout=60)
        r.raise_for_status()
        return [item["embedding"] for item in r.json()["data"]]

    # ---------- synchronous API ----------
    def _get_text_embedding(self, text: str) -> List[float]:
        """Embed one document text."""
        return self._embed([text])[0]

    def _get_query_embedding(self, query: str) -> List[float]:
        """Embed one query; queries and documents are embedded the same way."""
        return self._get_text_embedding(query)

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of document texts in one request."""
        return self._embed(texts)

    # ---------- asynchronous API ----------
    # These coroutines delegate to the synchronous methods, so the HTTP call
    # still runs inline rather than being awaited.
    async def _aget_text_embedding(self, text: str) -> List[float]:
        return self._get_text_embedding(text)

    async def _aget_query_embedding(self, query: str) -> List[float]:
        return self._get_query_embedding(query)

    async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        return self._get_text_embeddings(texts)


  from pydantic import FieldValidationInfo


In [None]:
# Create the target directory first so the download also works on a fresh checkout
# (wget -O does not create missing parent directories).
!mkdir -p data
!wget https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt -O data/paul_graham_essay.txt

--2025-07-03 03:48:47--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham_essay.txt’


2025-07-03 03:48:48 (4.47 MB/s) - ‘data/paul_graham_essay.txt’ saved [75042/75042]



In [None]:
from typing import Optional, List, Mapping, Any

from llama_index.core import SimpleDirectoryReader, SummaryIndex
from llama_index.core.callbacks import CallbackManager
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core import Settings, VectorStoreIndex

# How many chunks the retriever hands to the LLM per query.
SIMILARITY_TOP_K = 8

# Route all embedding and completion calls through the custom wrappers.
Settings.embed_model = OpenAI3SmallEmbeddings()

llm = OpenAIResponsesLLM()
Settings.llm = llm

documents = SimpleDirectoryReader("data").load_data()

# build index
index = VectorStoreIndex.from_documents(documents)

# build the query engine once; tweak SIMILARITY_TOP_K to change how many chunks come back
query_engine = index.as_query_engine(similarity_top_k=SIMILARITY_TOP_K)
resp = query_engine.query("What is mentioned about Sam Altman")

print("=== ANSWER ===")
print(resp)

print("\n=== CHUNKS USED ===")
for i, node_with_score in enumerate(resp.source_nodes, 1):
    node = node_with_score.node
    score = node_with_score.score
    # score can be None, and formatting None with :.4f raises TypeError
    score_str = f"{score:.4f}" if score is not None else "n/a"
    text = node.text.replace("\n", " ")   # single-line for brevity
    print(f"{i:02d} | score={score_str} | {text[:120]}…")


=== ANSWER ===
Sam Altman is mentioned as one of the impressive members of the first batch of Y Combinator startups. He later became the second president of YC after Paul Graham decided to step down. Initially, Sam said no to the offer because he wanted to start a startup to make nuclear reactors, but he eventually agreed in October 2013.

=== CHUNKS USED ===
01 | score=0.4219 | But while I continued to work a good deal in Arc, I gradually stopped working on Arc, partly because I didn't have time …
02 | score=0.4183 | [13]  Once again, ignorance worked in our favor. We had no idea how to be angel investors, and in Boston in 2005 there w…
03 | score=0.3767 | Publishing online means you treat the online version as the (or at least a) primary version.  [12] There is a general le…
04 | score=0.3681 | So while working on things that aren't prestigious doesn't guarantee you're on the right track, it at least guarantees y…
05 | score=0.3530 | Now when I walked past charming little restaurants