In [2]:
from pathlib import Path
import json
import numpy as np
from sklearn.neighbors import NearestNeighbors

from openai import OpenAI  # OpenAI Python SDK

# Project layout: the project root is the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
EMB_DIR = PROJECT_ROOT.joinpath("embeddings")

# Input artifacts read by the loader cell below.
EMB_PATH = EMB_DIR.joinpath("embeddings.npy")
META_PATH = EMB_DIR.joinpath("chunk_metadata.jsonl")

# Display both resolved paths as the cell output.
(EMB_PATH, META_PATH)

(WindowsPath('C:/Users/sully/RAGPROJ/embeddings/embeddings.npy'),
 WindowsPath('C:/Users/sully/RAGPROJ/embeddings/chunk_metadata.jsonl'))

In [3]:
def load_embeddings_and_metadata(emb_path: Path, meta_path: Path):
    """Load the embedding matrix and its per-chunk metadata records.

    Args:
        emb_path: Path to the ``.npy`` file holding the embedding array.
        meta_path: Path to a UTF-8 JSON Lines file with one record per line.
            Blank (whitespace-only) lines are skipped.

    Returns:
        A tuple ``(embeddings, chunks)`` where ``embeddings`` is the loaded
        NumPy array and ``chunks`` is the list of parsed JSON records, in
        file order.

    Raises:
        ValueError: If the number of embedding rows differs from the number
            of metadata records.
        json.JSONDecodeError: If a non-blank metadata line is not valid JSON.
    """
    embeddings = np.load(emb_path)

    with meta_path.open("r", encoding="utf-8") as f:
        chunks = [json.loads(line) for line in f if line.strip()]

    print(f"Loaded embeddings: {embeddings.shape}")
    print(f"Loaded metadata records: {len(chunks)}")

    # Retrieval looks a neighbour's row index up as chunks[idx], so the two
    # collections must be aligned one-to-one. Fail here rather than return
    # the wrong chunk (or hit an IndexError) at query time.
    n_rows = embeddings.shape[0] if embeddings.ndim else 0
    if n_rows != len(chunks):
        raise ValueError(
            f"Embedding/metadata mismatch: {n_rows} embedding rows "
            f"but {len(chunks)} metadata records"
        )

    return embeddings, chunks

# Load once at notebook scope; later cells reuse `embeddings` and `chunks`.
embeddings, chunks = load_embeddings_and_metadata(EMB_PATH, META_PATH)
chunks[:2]  # preview the first two metadata records as the cell output

Loaded embeddings: (2905, 384)
Loaded metadata records: 2905


[{'id': 'US10452978_0',
  'patent_id': 'US10452978',
  'chunk_index': 0,
  'text': 'US010452978B2 ( 12 ) United States Patent Shazeer et al . ( 10 ) Patent No . : US 10 , 452 , 978 B2 ( 45 ) Date of Patent : Oct . 22 , 2019 ( 54 ) ATTENTION - BASED SEQUENCE TRANSDUCTION NEURAL NETWORKS ) U . S . Ci . ( 71 ) Applicant : Google LLC , Mountain View , CA ( US ) ( 58 ) Field of Classification Search CPC . . . . . . . . . . . . . . . . . GOON 3 / 08 ( 2013 . 01 ) ; G06N 3 / 04 ( 2013 . 01 ) ; G06N 3 / 0454 ( 2013 . 01 ) CPC USPC . . . . . . . . . . . . . . . . . . . . . . . . . GOOF 3 / 015 . . . . . . 706 / 15 , 45 See application file for complete search history . ( 72 ) Inventors : Noam M . Shazeer , Palo Alto , CA ( US ) ; Aidan Nicholas Gomez , Toronto ( CA ) ; Lukasz Mieczyslaw Kaiser , Mountain View , CA ( US ) ; Jakob D . Uszkoreit , Portola Valley , CA ( US ) ; Llion Owen Jones , San Francisco , CA ( US ) ; Niki J . Parmar , Sunnyvale , CA ( US ) ; Illia Polosukhin , Mountain View ,

In [4]:
def build_nn_index(embeddings: np.ndarray, n_neighbors: int = 5):
    """Fit a scikit-learn ``NearestNeighbors`` index using cosine distance.

    Args:
        embeddings: Array of vectors to index, one row per item.
        n_neighbors: Default number of neighbours the index returns when a
            query does not specify its own.

    Returns:
        The fitted ``NearestNeighbors`` estimator.
    """
    index = NearestNeighbors(metric="cosine", n_neighbors=n_neighbors)
    # ``fit`` returns the estimator itself, so the fitted index is returned directly.
    return index.fit(embeddings)

# Fit once at notebook scope; search_chunks passes its own top_k per query.
nn_index = build_nn_index(embeddings, n_neighbors=5)

In [5]:
def search_chunks(
    query: str,
    embed_model,          # SentenceTransformer model from 03, or reload it
    nn_index,
    embeddings,
    chunks,
    top_k: int = 5,
):
    """Return the chunks nearest to ``query``, best match first.

    Args:
        query: Free-text search query.
        embed_model: Object with an ``encode(list_of_str)`` method returning
            one vector per input string.
        nn_index: Fitted index exposing ``kneighbors(X, n_neighbors=...)``
            that returns ``(distances, indices)``.
        embeddings: Unused; kept so existing callers keep working.
        chunks: Metadata records aligned with the rows the index was fitted
            on. Each needs ``id``, ``patent_id``, ``chunk_index``, ``text``.
        top_k: Maximum number of results. Values larger than the corpus are
            clamped to the corpus size.

    Returns:
        A list of dicts with keys ``rank`` (0-based), ``score``, ``id``,
        ``patent_id``, ``chunk_index`` and ``text``. The list is empty when
        ``top_k`` is less than 1 or there is nothing to search.
    """
    # scikit-learn's kneighbors rejects n_neighbors larger than the number of
    # fitted samples, so never request more than actually exist.
    available = min(len(chunks), getattr(nn_index, "n_samples_fit_", len(chunks)))
    k = min(top_k, available)
    if k < 1:
        return []

    # embed query (encode expects a batch, hence the one-element list)
    q_emb = embed_model.encode([query])

    # retrieve
    distances, indices = nn_index.kneighbors(q_emb, n_neighbors=k)

    results = []
    # Row 0 holds the neighbours of the single query in the batch.
    for rank, (idx, dist) in enumerate(zip(indices[0], distances[0])):
        rec = chunks[idx]
        results.append({
            "rank": rank,
            # 1 - distance: this is the cosine similarity when the index was
            # built with metric="cosine", as build_nn_index does.
            "score": 1 - float(dist),
            "id": rec["id"],
            "patent_id": rec["patent_id"],
            "chunk_index": rec["chunk_index"],
            "text": rec["text"],
        })
    return results

In [6]:
from sentence_transformers import SentenceTransformer

# Query-side encoder; passed to search_chunks as `embed_model`.
# NOTE(review): presumably this must be the same model that produced
# embeddings.npy (the matrix loaded above is 384 columns wide) — confirm
# against the notebook that generated the embeddings.
model_name = "all-MiniLM-L6-v2"
embed_model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Smoke-test retrieval on a single query before involving the LLM.
test_query = "How does this invention handle language model dialogue?"
hits = search_chunks(test_query, embed_model, nn_index, embeddings, chunks, top_k=3)

for h in hits:
    # One header line (rank, patent id, similarity), then a 300-character preview.
    print(f"[{h['rank']}] {h['patent_id']} (sim={h['score']:.3f})")
    print(h["text"][:300], "...\n")

[0] US12148421 (sim=0.580)
part of a dialog session between a user of a client device and an automated assistant implemented by the client device: receiving a stream of audio data that captures a spoken utterance ofthe user, the stream of audio data being generated by one or more microphones of the client device, and the spok ...

[1] US11562147 (sim=0.577)
an utterance of the human user in the dialogue history or a language model response. 14. The system of claim 11, wherein a position level encoding layer from the plurality of text encoding layers generates the position level encoding, wherein the position level encoding identifies a token ordering i ...

[2] US11562147 (sim=0.575)
visual dialogue model 55 receives the image 110, the dialogue history 120 and the question 130 as input and generates the answer 150 base on the received input. Prior approaches have attempted to implement visual dialogue, where a dialogue machine agent is tasked to answer a series of questions grou ...



In [8]:
import os


def api_key_status(var_name: str = "OPENAI_API_KEY") -> str:
    """Report whether an environment variable is set, without revealing it.

    Notebook outputs are saved with the file and routinely shared or
    committed, so the secret value itself must never be printed.

    Args:
        var_name: Name of the environment variable to check.

    Returns:
        ``"<var_name> is set"`` if the variable has a non-empty value,
        otherwise ``"<var_name> is NOT set"``.
    """
    return f"{var_name} is set" if os.getenv(var_name) else f"{var_name} is NOT set"


print(api_key_status())

[REDACTED: an OpenAI API key was printed in this cell output. Revoke and rotate that key, and clear outputs before sharing this notebook.]


In [9]:
from openai import OpenAI

# Shared API client; passed into rag_answer as `client` in a later cell.
client = OpenAI()  # uses OPENAI_API_KEY env var

In [10]:
def build_context_string(retrieved_chunks):
    """Render retrieved chunks as a single prompt-ready context string.

    Each chunk becomes a bracketed header line (patent id, chunk index and
    score to three decimals) followed by the chunk text on the next line.
    Chunks are separated by a ``---`` rule with blank lines around it.

    Args:
        retrieved_chunks: Iterable of dicts with ``patent_id``,
            ``chunk_index``, ``score`` and ``text`` keys.

    Returns:
        The joined context string, or an empty string for no chunks.
    """
    separator = "\n\n---\n\n"
    return separator.join(
        f"[{hit['patent_id']} | chunk {hit['chunk_index']} | score={hit['score']:.3f}]"
        + "\n"
        + hit["text"]
        for hit in retrieved_chunks
    )

In [24]:
def rag_answer(
    question: str,
    embed_model,
    nn_index,
    embeddings,
    chunks,
    client,
    model: str = "gpt-4.1-mini",
    top_k: int = 5,
    temperature: float = 0.2,
):
    """Answer ``question`` from retrieved patent chunks (retrieve, then generate).

    Args:
        question: The user's question; also used as the retrieval query.
        embed_model: Query encoder forwarded to ``search_chunks``.
        nn_index: Fitted nearest-neighbour index forwarded to ``search_chunks``.
        embeddings: Forwarded to ``search_chunks`` unchanged.
        chunks: Chunk metadata records forwarded to ``search_chunks``.
        client: OpenAI client exposing ``chat.completions.create``.
        model: Chat model name to request.
        top_k: Number of chunks to retrieve and place in the prompt.
        temperature: Sampling temperature for the completion. Defaults to the
            previously hard-coded value of 0.2.

    Returns:
        A tuple ``(answer, retrieved)``: the model's answer text (an empty
        string if the API returned no text content) and the list of
        retrieved chunk dicts used as context.
    """
    # 1. Retrieve
    retrieved = search_chunks(
        question,
        embed_model,
        nn_index,
        embeddings,
        chunks,
        top_k=top_k,
    )
    context = build_context_string(retrieved)

    # 2. Build the prompt: the system message restricts the model to the
    #    supplied context so it does not answer from general knowledge.
    system_prompt = (
        "You are a helpful assistant answering questions about a small set of US patents. "
        "Answer the user's question using ONLY the information in the provided context. "
        "If the answer is not in the context, say you don't know based on these documents."
    )

    user_prompt = (
        f"Question:\n{question}\n\n"
        f"Context (patent chunks):\n{context}"
    )

    # 3. Generate
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
    )

    # message.content is optional in the SDK; normalise None to "" so callers
    # always receive a string.
    answer = response.choices[0].message.content or ""
    return answer, retrieved

In [25]:
# End-to-end demo: retrieve context for one question and ask the chat model.
question = "What is the main novelty of these inventions related to language model training or dialogue systems?"

answer, retrieved = rag_answer(
    question=question,
    embed_model=embed_model,
    nn_index=nn_index,
    embeddings=embeddings,
    chunks=chunks,
    client=client,
    model="gpt-4.1-mini",
    top_k=5,
)

# Label and value go on separate lines, exactly as two print calls would.
print("QUESTION:", question, sep="\n")
print("\nANSWER:", answer, sep="\n")

# List which chunks backed the answer, with their similarity scores.
print("\n\nRETRIEVED CHUNKS (for debugging):")
for r in retrieved:
    print(f"- {r['patent_id']} chunk {r['chunk_index']} (sim={r['score']:.3f})")

QUESTION:
What is the main novelty of these inventions related to language model training or dialogue systems?

ANSWER:
The main novelties of the inventions related to language model training or dialogue systems, as described in the provided patents, include:

1. **Dataset Generation Using Large Language Models (US20240185001A1)**: This invention introduces a system and technique for generating training datasets for task-oriented dialogue systems by leveraging large language models (LLMs). The method involves selecting template queries, sampling domain-specific tokens, modifying the templates with these tokens to create query prompts, and then using an LLM to generate diverse natural language queries. These generated queries are then used to train conversational machine-learning models tailored to specific domains. This approach automates and enhances the creation of relevant training data, which is crucial for effective domain-specific dialogue system training.

2. **Natural Language 

In [26]:
# Show a 300-character preview of each chunk in `retrieved` (set by the rag_answer cell).
print("\n\nRETRIEVED CHUNKS:")
for r in retrieved:
    # Header line with patent id, chunk index and similarity, then the preview text.
    print(f"- {r['patent_id']} | chunk {r['chunk_index']} | sim={r['score']:.3f}")
    print(r['text'][:300], "...\n")



RETRIEVED CHUNKS:
- US20240185001A1 | chunk 0 | sim=0.575
US 20240185001A1 (19) United States (12) Patent Application Publication Nagaraju et al. (54) DATASET GENERATION USING LARGE LANGUAGE MODELS (71) Applicant: NVIDIA Corporation, Santa Clara, CА (US) (72) Inventors: Divija Nagaraju, Mountain View, СА (US); Christopher Parisien, Toronto (CA) (21) Appl.  ...

- US20240185001A1 | chunk 5 | sim=0.537
that were previously performed by humans. In addition to designing efficient and effective machine-learning model (MLM) architectures, the successful deployment or application of the MLMs also depends heavily on the training techniques employed. For example, training an MLM to perform a specific tas ...

- US20240346254A1 | chunk 0 | sim=0.530
(19) United States (12) Patent Application Publication LIU et al. (54) NATURAL LANGUAGE TRAINING AND/OR AUGMENTATION WITH LARGE LANGUAGE MODELS (71) Applicant: MICROSOFT TECHNOLOGY LICENSING, LLC, Redmond, WA (US) (72) Inventors: Yang LIU, Bellevue

NameError: name 'docs' is not defined