# TechQA RAG: BGE retrieval + Qwen answer generation

In [1]:
"""!pip install datasets -q
!pip install transformers -q
import sys
!{sys.executable} -m pip install faiss-gpu"""

'!pip install datasets -q\n!pip install transformers -q\nimport sys\n!{sys.executable} -m pip install faiss-gpu'

In [2]:
import math
import time
from pathlib import Path
import torch
import torch.nn.functional as F
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset, DatasetDict


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run-wide configuration; every cell below reads these module-level constants.
DEVICE = "cuda"  # device the encoder, the embeddings and the gold mask are moved to
BATCH_SIZE = 256  # passages per forward pass in store_ctx_embs
MAX_LENGTH = 512  # tokenizer truncation length for queries and passages
AMP_DTYPE = torch.float32  # dtype handed to torch.amp.autocast around encoder calls
K_VALUES = 5  # single k (despite the plural name) used for top-k retrieval and Hit@K
BATCH_Q = 32  # queries embedded per step in the Hit@K evaluation loop
EMB_OUTPUT_PATH = Path("REG/v1/data/embeddings.pt")  # written by store_ctx_embs, read back by torch.load

In [4]:
# Evaluation dataset plus the BGE encoder; the checkpoint id is bound once so
# the tokenizer and the model cannot drift apart.
EMBED_MODEL_ID = "BAAI/bge-large-en-v1.5"

ds = load_dataset("nvidia/TechQA-RAG-Eval")
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
model = AutoModel.from_pretrained(EMBED_MODEL_ID)
model = model.to(DEVICE)

# 1. Utils

In [5]:
def embed_query(text: "str | list[str]") -> torch.Tensor:
    """Embed one query, or a batch of queries, with the module-level encoder.

    Parameters
    ----------
    text : str or sequence of str
        A single question, or a list/tuple of questions.

    Returns
    -------
    torch.Tensor
        L2-normalised float32 embeddings on ``DEVICE``. Shape is ``[dim]`` for
        a single string and ``[B, dim]`` for a sequence of ``B`` strings. The
        batch axis is kept even when ``B == 1``, so batched callers can always
        index ``dim=1``.
    """
    single = isinstance(text, str)
    queries = [text] if single else list(text)

    # NOTE(review): "query: " (and "passage: " in store_ctx_embs) looks like the
    # E5 prefix convention; the BGE v1.5 model card documents a different query
    # instruction -- confirm. Changing it here requires re-embedding the corpus.
    prefixed = [f"query: {q}" for q in queries]

    with torch.no_grad():
        inputs = tokenizer(
            prefixed,
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        ).to(DEVICE)

        with torch.amp.autocast("cuda", dtype=AMP_DTYPE):
            # First-token pooling: keep the hidden state at position 0.
            emb = model(**inputs).last_hidden_state[:, 0, :]

    emb = F.normalize(emb, p=2, dim=-1).to(torch.float32)
    # Drop the batch axis only for scalar input; squeezing unconditionally
    # turned a one-element batch into a 1-D vector.
    return emb[0] if single else emb

def assert_1_to_1(ds, splits=None):
    """Check that context filenames and stripped context texts map one-to-one.

    Parameters
    ----------
    ds : mapping of split name -> iterable of rows
        Each row must have a ``"contexts"`` list of dicts with ``"filename"``
        and ``"text"`` keys.
    splits : sequence of str, optional
        Splits to scan; defaults to every key of ``ds``.

    Raises
    ------
    AssertionError
        If one filename appears with two different texts, or one text appears
        under two different filenames. The message names the offending split
        and filename(s).
    """
    filename_to_text, text_to_filename = {}, {}

    for split in (splits or list(ds.keys())):
        for row in ds[split]:
            for ctx in row["contexts"]:
                fname = ctx["filename"]
                text = ctx["text"].strip()

                # setdefault records the first sighting and returns what is
                # already stored on later ones, so one comparison covers both.
                known_text = filename_to_text.setdefault(fname, text)
                assert known_text == text, (
                    f"filename {fname!r} maps to two different texts "
                    f"(split {split!r})"
                )

                known_fname = text_to_filename.setdefault(text, fname)
                assert known_fname == fname, (
                    f"identical text appears under filenames {known_fname!r} "
                    f"and {fname!r} (split {split!r})"
                )
                    
def load_corpus(ds, splits=None):
    """Collect the unique context passages of ``ds``.

    Validates the filename/text mapping with ``assert_1_to_1`` first, then
    walks the requested splits (all of them by default) in order.

    Returns
    -------
    (texts, fname_to_idx)
        ``texts`` is the list of stripped passage texts, one per distinct
        filename in first-seen order; ``fname_to_idx`` maps each filename to
        its position in ``texts`` (the implicit doc id).
    """
    assert_1_to_1(ds, splits)

    texts = []
    fname_to_idx = {}
    for split in (splits or list(ds.keys())):
        for row in ds[split]:
            for ctx in row["contexts"]:
                fname = ctx["filename"]
                # The index dict doubles as the "already seen" set.
                if fname not in fname_to_idx:
                    fname_to_idx[fname] = len(texts)
                    texts.append(ctx["text"].strip())
    return texts, fname_to_idx

def store_ctx_embs():
    """Embed every corpus passage and save the matrix to ``EMB_OUTPUT_PATH``.

    Passages come from ``load_corpus(ds)``. Each is prefixed with
    ``"passage: "``, tokenised with truncation at ``MAX_LENGTH``, encoded in
    batches of ``BATCH_SIZE``, reduced to the first-token hidden state and
    L2-normalised. Row ``i`` of the saved float32 CPU tensor is the embedding
    of ``texts[i]``. Prints progress and throughput; returns ``None``.
    """
    texts, _ = load_corpus(ds)
    num_docs = len(texts)
    print(f"Loaded {num_docs} passages")
    model.to(DEVICE)
    model.eval()

    # Take the width from the config: it is the size of last_hidden_state and
    # does not depend on the optional pooling head being loaded.
    emb_dim = model.config.hidden_size
    print(f"Embedding dimension: {emb_dim}")
    # Result buffer lives on the CPU; each batch is copied into its slice.
    all_embs = torch.zeros((num_docs, emb_dim))
    num_batches = math.ceil(num_docs / BATCH_SIZE)
    print(f"Encoding in {num_batches} batches (batch_size={BATCH_SIZE})")

    start_time = time.time()
    with torch.no_grad():
        for b in tqdm(range(num_batches), desc="Embedding corpus"):
            start, end = b * BATCH_SIZE, min((b + 1) * BATCH_SIZE, num_docs)

            inputs = tokenizer(
                [f"passage: {t}" for t in texts[start:end]],
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH,
                return_tensors="pt",
            ).to(DEVICE)

            with torch.amp.autocast("cuda", dtype=AMP_DTYPE):
                outputs = model(**inputs).last_hidden_state[:, 0, :]
            # Slice assignment copies device -> CPU and casts to the buffer dtype.
            all_embs[start:end, :] = F.normalize(outputs, p=2, dim=-1)

    # Wait for outstanding GPU work so the wall-clock figure is meaningful.
    torch.cuda.synchronize()
    total_time = time.time() - start_time
    docs_per_sec = num_docs / total_time if total_time > 0 else float("inf")

    print(f"Embedded {num_docs} passages in {total_time:.2f} s "
            f"({docs_per_sec:.1f} passages/s)")

    all_embs = all_embs.contiguous().cpu().to(torch.float32)
    EMB_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    torch.save(all_embs, EMB_OUTPUT_PATH)
    print(f"Saved embeddings to {EMB_OUTPUT_PATH}")

# store_ctx_embs()  # uncomment to (re)build the embeddings file at EMB_OUTPUT_PATH

# 2. Pre-Process Lookup

In [6]:
# 1) Build the corpus with the single canonical helper.
texts, fname_to_idx = load_corpus(ds)
num_docs = len(texts)
print(f"Corpus has {num_docs} unique passages")

# 2) Collect every training question with the doc ids of its gold contexts.
questions = []
gold_ids_per_query = []

for row in ds["train"]:
    # Look filenames up verbatim: load_corpus keys fname_to_idx by the
    # unstripped filename, so stripping here could silently drop gold docs.
    gold_doc_ids = [
        fname_to_idx[ctx["filename"]]
        for ctx in row["contexts"]
        if ctx["filename"] in fname_to_idx
    ]
    questions.append(row["question"])
    gold_ids_per_query.append(gold_doc_ids)

total = len(questions)
print(f"Evaluating retrieval on {total} TechQA examples (batched)...")

# 3) Load the passage embeddings, building them first on a fresh checkout so
#    the notebook survives Restart & Run All.
if not EMB_OUTPUT_PATH.exists():
    store_ctx_embs()
embs = torch.load(EMB_OUTPUT_PATH, map_location=DEVICE, weights_only=True)
N = embs.shape[0]
# A stale file from another corpus would misalign doc ids with embedding rows.
assert N == num_docs, (
    f"{EMB_OUTPUT_PATH} holds {N} embeddings but the corpus has {num_docs} "
    "passages; re-run store_ctx_embs()"
)

# 4) gold_mask[i, j] is True when passage j is a gold context of question i.
gold_mask = torch.zeros(total, N, dtype=torch.bool, device=DEVICE)
for i, gold_ids in enumerate(gold_ids_per_query):
    if gold_ids:
        gold_mask[i, gold_ids] = True

Corpus has 496 unique passages
Evaluating retrieval on 910 TechQA examples (batched)...


# 3. Query Lookup (Hit@K evaluation)

In [7]:
# Hit@K: a question counts as a hit when at least one of its gold passages is
# among its K_VALUES most similar passages.
hits = 0
batch_starts = range(0, total, BATCH_Q)
for start in tqdm(batch_starts, desc="Evaluating Hit@K batched"):
    end = min(start + BATCH_Q, total)
    q_embs = embed_query(questions[start:end])            # [B, dim]

    sims = q_embs @ embs.T                                # [B, N]
    top_idx = sims.topk(K_VALUES, dim=1).indices          # [B, K]

    # Read the gold flags at the retrieved positions directly instead of
    # materialising a second [B, N] mask.
    batch_hits = gold_mask[start:end].gather(1, top_idx).any(dim=1)  # [B]
    hits += batch_hits.sum().item()

hit_at_k = hits / total
print(f"\nHit@{K_VALUES} (batched): {hits}/{total} = {hit_at_k:.3f}")


Evaluating Hit@K batched: 100%|██████████| 29/29 [00:12<00:00,  2.41it/s]


Hit@5 (batched): 553/910 = 0.608





In [None]:
def retrieve(question: str, k=None):
    """RAG retriever: return the passages most similar to ``question``.

    Parameters
    ----------
    question : str
        Natural-language query.
    k : int, optional
        Number of passages to return. Defaults to ``K_VALUES`` and is capped
        at the number of stored embeddings so ``torch.topk`` cannot fail on a
        small corpus.

    Returns
    -------
    list of dict
        One dict per hit, best first, with keys ``"doc_id"`` (row index into
        ``embs`` / ``texts``), ``"score"`` (dot product of the query embedding
        with that row) and ``"text"`` (the passage).
    """
    if k is None:
        k = K_VALUES
    k = min(k, embs.shape[0])

    q_emb = embed_query(question)            # [dim] on device
    sims = embs @ q_emb                      # [N]
    top_vals, top_idx = torch.topk(sims, k=k)

    return [
        {"doc_id": idx, "score": float(score), "text": texts[idx]}
        for score, idx in zip(top_vals.tolist(), top_idx.tolist())
    ]

# Smoke-test the retriever on the first training question and show a
# one-line preview of each hit.
q = ds["train"][0]["question"]
ret = retrieve(q)
for r in ret:
    preview = r["text"][:200].replace("\n", " ")
    print(f"doc_id={r['doc_id']}, score={r['score']:.3f}")
    print(preview + "...\n")


doc_id=0, score=0.790
Title: IBM STREAMS 4.1.1.1 and 4.1.1.2  JOBS DO NOT INHERIT THE ENVIRONMENT VARIABLES SET IN .BASHRC, WHEN STREAMS IS RUN AS A SYSTEM SERVICE - United States  Text:  FLASH (ALERT)  ABSTRACT  In Stream...

doc_id=239, score=0.661
Title: IBM ERROR: Default DB path is not set, when adding database set on RHEL 5 - United States  Text: 1320206; ClearQuest; Command Line Tools; CQ; cqreg; add_dbset; initialize; CQ_DATABASES; CQDB_rg...

doc_id=235, score=0.649
Title: IBM Data Server Manager (DSM) showing SQLCODE=-206 "<name> is not valid in the context where it is used." - United States  Text: SQLCODE 206 -206 SQL0206 SQL0206N DSM incompatible db2level fixp...

doc_id=99, score=0.645
Title: IBM web server Plugin may need LD_LIBRARY_PATH when used with Apache 2.2 - United States  Text:  TECHNOTE (TROUBLESHOOTING)  PROBLEM(ABSTRACT)  IBM web server Plug-in provides the connection be...

doc_id=351, score=0.644
Title: IBM Known Issues for DB2 on Linux - United States  Text:

# 4. LLM

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Generator LLM. torch_dtype="auto" loads the weights in the dtype stored in
# the checkpoint rather than upcasting to float32, which roughly doubles the
# GPU memory a 7B model needs.
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
qwen_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    torch_dtype="auto",
).cuda()

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.78s/it]


In [17]:
def test_gwen():
    """Smoke test: ask the Qwen chat model one fixed question and print the reply.

    Generates at most 40 new tokens and prints only the generated part,
    decoded without stripping special tokens.
    """
    chat = [{"role": "user", "content": "Who are you?"}]
    encoded = qwen_tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(qwen_model.device)

    # Everything before this offset is the prompt echoed back by generate().
    prompt_len = encoded["input_ids"].shape[-1]
    generated = qwen_model.generate(**encoded, max_new_tokens=40)
    print(qwen_tokenizer.decode(generated[0][prompt_len:]))

In [21]:
def format_contexts(retrieved):
    """Render retrieved passages as numbered ``[DOC i]`` blocks.

    ``retrieved`` is an iterable of dicts with ``"text"`` and ``"score"``
    keys. Blocks are numbered from 1 in input order and separated by a blank
    line; an empty input gives an empty string.
    """
    def render(rank, doc):
        body = doc["text"]
        return f"[DOC {rank}] (score={doc['score']:.3f})\n{body}"

    return "\n\n".join(
        render(rank, doc) for rank, doc in enumerate(retrieved, start=1)
    )


def build_rag_messages(question, retrieved):
    """Build the chat messages (system + user) for the RAG prompt.

    Parameters
    ----------
    question : str
        The user's question.
    retrieved : iterable of dict
        Retrieved passages, rendered with ``format_contexts``.

    Returns
    -------
    list of dict
        Two ``{"role", "content"}`` messages: the fixed system instruction and
        a user turn containing the question, the documents and the answering
        instruction.
    """
    context_block = format_contexts(retrieved)
    system_prompt = (
        "You are a helpful technical support assistant. "
        "Explain the root cause briefly and then give the concrete workaround or commands "
        "the user should run, based ONLY on the documents. "
        "If the documents do not contain a clear solution, say you are not sure."
    )
    # Assembled from flush-left pieces: a triple-quoted literal inside this
    # indented body would leak four spaces of source indentation into the
    # prompt, and only onto the first line of the multi-line document block.
    user_content = (
        f"Question:\n{question}\n\n"
        f"Relevant documents:\n{context_block}\n\n"
        "Answer the question in your own words. "
        "If you use a document, mention its DOC number."
    )

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

def call_llm_qwen(messages, max_new_tokens=256, temperature=0.2, top_p=0.9):
    """Generate a reply to ``messages`` with the module-level Qwen model.

    Parameters
    ----------
    messages : list of dict
        Chat messages in the ``{"role", "content"}`` format accepted by
        ``apply_chat_template``.
    max_new_tokens : int
        Upper bound on generated tokens.
    temperature : float or None
        Sampling temperature. ``None`` or a value ``<= 0`` selects greedy
        decoding instead of being passed to ``generate``, which rejects a
        non-positive temperature when sampling.
    top_p : float
        Nucleus-sampling threshold; used only when sampling.

    Returns
    -------
    str
        The generated text, without the prompt and special tokens, stripped.
    """
    inputs = qwen_tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(qwen_model.device)

    do_sample = temperature is not None and temperature > 0
    gen_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        # Sampling knobs are forwarded only when they take effect.
        gen_kwargs.update(temperature=temperature, top_p=top_p)

    with torch.no_grad():
        outputs = qwen_model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    gen_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return qwen_tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

def answer_question(question: str):
    """Answer ``question`` end to end: retrieve, build the prompt, generate.

    Returns a dict with ``"answer"`` (the generated text) and ``"contexts"``
    (the retrieved passages the prompt was built from).
    """
    contexts = retrieve(question)
    prompt = build_rag_messages(question, contexts)
    return {"answer": call_llm_qwen(prompt), "contexts": contexts}

In [22]:
# End-to-end demo on the first training question: print the generated answer,
# then the ids and scores of the passages that were put into the prompt.
q = ds["train"][0]["question"]
res = answer_question(q)
print("ANSWER:\n", res["answer"], "\n")
print("DOCS USED:")
for c in res["contexts"]:
    doc_id, score = c["doc_id"], c["score"]
    print(f"- doc_id={doc_id}, score={score:.3f}")


ANSWER:
 The root cause of the issue is that in IBM Streams 4.1.1.1 and 4.1.1.2, when Streams is run as a system service, environment variables set in the `.bashrc` file are not being inherited by the Streams jobs. This is different from earlier releases.

To work around this issue, you need to set the required environment variables directly in the Streams instance using `streamtool`. Specifically, you should use the following command:

```sh
streamtool setproperty -d <domain> -i <instance> --application-env <VARIABLE_NAME>=<VARIABLE_VALUE>
```

Replace `<domain>` with the name of your domain, `<instance>` with the name of your instance, `<VARIABLE_NAME>` with the name of the environment variable you want to set, and `<VARIABLE_VALUE>` with the value of the environment variable.

For example, if you need to set the `ODBCINI` variable, you would run:

```sh
streamtool setproperty -d mydomain -i myinstance --application-env ODBCINI=/path/to/odbc.ini
```

This will ensure that the necessa