# PC Advisor RAG Chatbot (Google Colab)

This notebook prepares the retrieval-augmented PC advisor chatbot in a Google Colab runtime. It installs the required packages, builds the ChromaDB vector store, loads the embedding and reranker models, and finally launches a Gradio chat interface that runs entirely on the Colab GPU using an open-source LLM—no external API key required. A final cell also shows how to call the pipeline directly from Python.


In [None]:
import os
import subprocess
import sys
from pathlib import Path

# Detect the runtime: the `google.colab` package is only importable inside
# Google Colab, so a failed import means the notebook is running elsewhere.
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# In Colab the repository lives at a fixed clone location; elsewhere the
# current directory is expected to already be the repository root.
PROJECT_ROOT = Path("/content/pc-advisor-chatbot" if IN_COLAB else ".").resolve()

if IN_COLAB:
    if not PROJECT_ROOT.exists():
        # TODO: Replace the placeholder below with the actual GitHub URL of your fork if needed.
        REPO_URL = "https://github.com/YOUR_ACCOUNT/pc-advisor-chatbot.git"
        # Fail fast with a readable message instead of letting `git clone`
        # fail on the placeholder URL.
        if "YOUR_ACCOUNT" in REPO_URL:
            raise ValueError(
                "Please set REPO_URL to the GitHub HTTPS URL of your pc-advisor-chatbot repository before running this cell."
            )
        # check=True turns a failed clone into CalledProcessError.
        subprocess.run(["git", "clone", REPO_URL, str(PROJECT_ROOT)], check=True)
    os.chdir(PROJECT_ROOT)
else:
    # `app.py` is used as the marker file for "this is the repository root".
    if not (PROJECT_ROOT / "app.py").exists():
        raise RuntimeError(
            "Run this notebook from the root of the pc-advisor-chatbot repository or enable the Colab cloning block above."
        )
    os.chdir(PROJECT_ROOT)

# Show where we ended up and what is there, one entry per line, sorted by name.
print(f"Working directory: {Path.cwd()}")
print("Repository files:")
print("\n".join(sorted(p.name for p in Path.cwd().iterdir())))


In [None]:
# Install dependencies (this may take a few minutes the first time).
# First upgrade pip itself, then install the repository's requirements.txt plus
# the extra packages this notebook needs for the UI (gradio, pinned to 4.44.0)
# and for local generation (transformers, accelerate, bitsandbytes, sentencepiece).
# The leading "!" runs each line as a shell command, so this cell only works in
# an IPython/Jupyter/Colab kernel, not as plain Python.
!pip install -q -U pip
!pip install -q -r requirements.txt gradio==4.44.0 transformers accelerate bitsandbytes sentencepiece


In [None]:
# Build the ChromaDB vector store if it has not been created yet.
from pathlib import Path
import subprocess
import sys

# Location of the persisted vector store, relative to the repository root.
chroma_path = Path("data/chromadb")
if not chroma_path.exists():
    # Nothing persisted yet: run the repository's embedding script with the
    # same interpreter as this kernel; check=True surfaces a failed build.
    print("Creating embeddings and populating ChromaDB (this can take several minutes)...")
    subprocess.run([sys.executable, "embedding.py"], check=True)
    print("Embedding build complete.")
else:
    print("ChromaDB directory already exists – skipping embedding generation.")


In [None]:
# Load retrieval components (embedding model + reranker).
import retrieval

# Resolve the compute device once; the same value is handed to the reranker
# loader below.
device = retrieval.setup_device()
# Both objects are kept at module level because `rag_chat` passes them to
# `retrieval.perform_retrieval_and_reranking` on every request.
embedding_model = retrieval.load_embedding_model()
reranker_data = retrieval.load_reranker_data(device)
print("Models loaded successfully.")


In [None]:
import torch
from typing import Dict, List, Optional
from transformers import AutoModelForCausalLM, AutoTokenizer


# Hugging Face model id of the LLM used for query rewriting and answering.
GENERATION_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"


def _select_best_dtype() -> torch.dtype:
    """Choose the tensor dtype used to load the generation model.

    Returns:
        ``torch.float32`` when CUDA is unavailable; otherwise
        ``torch.bfloat16`` if the current CUDA device reports a
        compute-capability major version of 8 or higher, else
        ``torch.float16``.
    """
    if not torch.cuda.is_available():
        return torch.float32

    capability_major = torch.cuda.get_device_capability()[0]
    return torch.bfloat16 if capability_major >= 8 else torch.float16


# Load the tokenizer and causal LM once at cell execution time; the generation
# helpers below read `tokenizer` and `model` as module-level globals.
print(f"Loading generation model '{GENERATION_MODEL_NAME}'...")
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME, trust_remote_code=True)
# generate() is later called with pad_token_id=tokenizer.pad_token_id, so make
# sure a pad token exists by falling back to the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    GENERATION_MODEL_NAME,
    # bf16 / fp16 on GPU, fp32 on CPU (see _select_best_dtype).
    torch_dtype=_select_best_dtype(),
    # Let the loader decide device placement; inputs are later moved to
    # `model.device` before generation.
    device_map="auto",
    trust_remote_code=True,
)
# Switch to evaluation mode; this notebook only runs inference.
model.eval()


def parse_response(response: str):
    """Separate a ``<think>...</think>`` section from the user-visible reply.

    Only the first complete section is extracted; any later ``<think>``
    sections, or an unclosed ``<think>`` tag, are left in the visible text.

    Args:
        response: Raw text produced by the LLM.

    Returns:
        A ``(thoughts, answer)`` tuple, both whitespace-stripped. ``thoughts``
        is ``None`` when the response has no complete think section.
    """
    import re

    found = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
    if found is None:
        return None, response.strip()

    hidden = found.group(1).strip()
    # Cut the matched section out and keep the text on both sides of it.
    visible = (response[: found.start()] + response[found.end():]).strip()
    return hidden, visible


def _generate_with_local_model(
    messages: List[Dict[str, str]],
    *,
    temperature: float,
    max_new_tokens: int,
) -> str:
    """Generate one assistant reply for *messages* with the locally loaded model.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        messages: Chat turns as ``{"role": ..., "content": ...}`` dicts; they
            are rendered to a single prompt with the tokenizer's chat template.
        temperature: Sampling temperature. Values ``<= 0`` select greedy
            decoding; positive values enable sampling.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens and special tokens
        removed), whitespace-stripped.
    """
    chat_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered chat template already contains the special tokens it needs;
    # letting the tokenizer add its own again can duplicate them (e.g. a second
    # BOS) on models whose tokenizer inserts special tokens by default.
    inputs = tokenizer(chat_prompt, return_tensors="pt", add_special_tokens=False)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if temperature > 0.0:
        # Clamp tiny positive values so the temperature stays strictly positive.
        generation_kwargs.update(do_sample=True, temperature=max(temperature, 1e-5))
    else:
        # Greedy decoding: temperature is a sampling-only parameter, so it is
        # not forwarded at all rather than being sent as an invalid 0.0.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**generation_kwargs)

    # generate() returns prompt + continuation; drop the prompt tokens.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return text.strip()


def rewrite_query_with_llm(query: str) -> str:
    """Rewrite a natural-language question into a keyword-rich search query.

    The local LLM is called with greedy decoding (``temperature=0.0``) and a
    256-token limit. Double quotes are removed from its output and only the
    first line is kept.

    Args:
        query: The user's original message.

    Returns:
        The rewritten single-line query, or *query* unchanged when the model
        produces no usable text.
    """
    system_prompt = (
        "You are an expert AI assistant that rewrites user queries for a vector database search. "
        "The database contains two types of documents: "
        "1. Individual PC components (CPU, GPU/VGA, RAM, SSD, Mainboard, PSU, Case, etc.). "
        "2. Pre-built or assembled PCs (ready-to-use systems). "
        "Your task is to analyze the user's natural-language query and rewrite it into a concise, "
        "keyword-rich search query that best captures the user's intent.\n\n"
        "Guidelines:\n"
        "- If the user wants to build or assemble a PC, focus on component-level keywords and budget splits.\n"
        "- Dedicate roughly 80% of the budget to CPU/GPU/RAM/SSD and the rest to supporting parts.\n"
        "- Provide approximate per-component prices when the total budget is known.\n"
        "- If the user wants a pre-built PC, use keywords like 'pre-built PC' along with the target use case.\n"
        "- Expand vague terms into precise hardware-related keywords.\n"
        "- Include both Vietnamese and English keywords if helpful.\n"
        "- All currency references are in VND.\n"
        "Do not answer the question – only output the rewritten search query."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    response = _generate_with_local_model(messages, temperature=0.0, max_new_tokens=256)
    cleaned = response.replace('"', "").strip()
    # `cleaned` is stripped, so when it is non-empty its first line is too.
    return cleaned.splitlines()[0] if cleaned else query


def format_history_for_llm(history: List[List[str]]) -> List[Dict[str, str]]:
    """Convert chat-UI history into role/content messages for the LLM.

    Args:
        history: Previous turns, either as ``[user_message, assistant_message]``
            pairs or as dicts that already carry ``"role"`` and ``"content"``
            keys. ``None`` is treated as an empty history.

    Returns:
        A flat list of ``{"role": ..., "content": ...}`` dicts in conversation
        order. Missing (``None``) user messages and empty assistant messages
        are skipped, as are dict entries lacking a role or content.
    """
    formatted: List[Dict[str, str]] = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Already in role/content form: keep only the two keys a chat
            # template consumes and drop any extra metadata.
            role = turn.get("role")
            content = turn.get("content")
            if role and content:
                formatted.append({"role": role, "content": content})
            continue

        user_message, assistant_message = turn
        # A pair may have no user side (e.g. an assistant-only greeting);
        # emitting it would put a None into the prompt.
        if user_message is not None:
            formatted.append({"role": "user", "content": user_message})
        if assistant_message:
            formatted.append({"role": "assistant", "content": assistant_message})
    return formatted


def rag_chat(
    message: str,
    history: List[List[str]],
    temperature: float,
    max_new_tokens: int,
):
    """Answer one chat message with retrieval-augmented generation.

    Steps: rewrite the message into a search query, retrieve and rerank
    documents for it, then generate an answer from a system prompt (with the
    retrieved context embedded), the prior history, and the new message.

    Args:
        message: The user's latest message.
        history: Earlier turns of the conversation, passed through
            ``format_history_for_llm``.
        temperature: Generation temperature; coerced with ``float``.
        max_new_tokens: Generation length limit; coerced with ``int``.

    Returns:
        A ``(answer, debug_payload)`` tuple. ``answer`` is the response with
        its ``<think>`` section removed, or the raw response when nothing is
        left after removal. ``debug_payload`` holds the rewritten query, the
        retrieved context, the generation settings and, when present, the
        model's extracted thoughts.
    """
    # UI sliders may deliver numbers in a different numeric type; normalise.
    temperature = float(temperature)
    max_new_tokens = int(max_new_tokens)

    rewritten_query = rewrite_query_with_llm(message)
    retrieved_info = retrieval.perform_retrieval_and_reranking(
        rewritten_query,
        embedding_model,
        reranker_data,
    )

    # The model answers the original message; the rewritten query is used
    # for retrieval only.
    history_messages = format_history_for_llm(history)
    history_messages.append({"role": "user", "content": message})

    system_prompt = (
        "You are a helpful Vietnamese assistant for a PC parts store. "
        "Always think through the answer inside <think></think> tags before speaking to the user. "
        "Use the retrieved context when it is available, cite prices in VND, and keep the final answer concise and friendly. "
        "If the context is empty, rely on your general PC knowledge and mention that you are doing so."
    )
    context_text = retrieved_info or "No relevant documents were found."
    contextual_messages = [
        {
            "role": "system",
            "content": f"{system_prompt}\n\nRetrieved context:\n{context_text}",
        }
    ] + history_messages

    collected_response = _generate_with_local_model(
        contextual_messages,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
    )

    thoughts, clean_response = parse_response(collected_response)
    # Values are mixed (strings, an int, whatever retrieval returned), so the
    # payload is typed loosely.
    debug_payload: Dict[str, object] = {
        "rewritten_query": rewritten_query,
        "retrieved_context": retrieved_info,
        "generation_temperature": f"{temperature:.2f}",
        "max_new_tokens": max_new_tokens,
    }
    if thoughts:
        debug_payload["llm_thoughts"] = thoughts

    # Fall back to the raw response if stripping the thoughts left nothing.
    return clean_response or collected_response, debug_payload


In [None]:
import gradio as gr

# Build the chat UI: a heading, a short explanation, and a ChatInterface wired
# to `rag_chat`.
with gr.Blocks() as demo:
    gr.Markdown("## PC Advisor RAG Chatbot")
    gr.Markdown(
        "The entire retrieval and generation pipeline runs locally on this Colab runtime. "
        "Adjust the temperature or token limit if you want more creative or longer responses."
    )
    chat_interface = gr.ChatInterface(
        fn=rag_chat,
        # The two sliders supply rag_chat's `temperature` and `max_new_tokens`
        # arguments, in that order, after (message, history).
        additional_inputs=[
            gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Generation Temperature"),
            gr.Slider(128, 1024, value=512, step=64, label="Max New Tokens"),
        ],
        # Receives the second element of rag_chat's return value (the debug dict).
        # NOTE(review): `additional_outputs` may require a newer Gradio release
        # than the 4.44.0 pinned in the install cell — confirm against the
        # installed version.
        additional_outputs=gr.JSON(label="RAG Debug Info"),
        title="PC Advisor Chatbot",
        description="Ask about PC parts or pre-built systems in Vietnamese or English.",
    )

# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)


In [None]:
# Optional: call the pipeline programmatically from Python.
# This bypasses the Gradio UI and invokes `rag_chat` directly with the same
# values the UI sliders default to (temperature 0.2, 512 new tokens).
SAMPLE_QUESTION = "Tôi có 25 triệu, tư vấn cấu hình PC chơi game và làm đồ họa nhẹ?"

# An empty history makes this a single-turn conversation.
answer, debug_info = rag_chat(
    SAMPLE_QUESTION,
    history=[],
    temperature=0.2,
    max_new_tokens=512,
)

print("Question:", SAMPLE_QUESTION)
print("\nAnswer:\n", answer)
print("\nDebug Info:")
# One "- key: value" line per debug entry (rewritten query, context, settings, ...).
for key, value in debug_info.items():
    print(f"- {key}: {value}")
