In [2]:
import os

manuals_path = "/content/manuals"

os.makedirs(manuals_path, exist_ok=True)




In [3]:
!pip install pymupdf chromadb sentence-transformers transformers accelerate -q


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:

In [7]:
import glob, os
PDF_DIR = "/content/manuals"
pdf_paths = sorted(glob.glob(f"{PDF_DIR}/*.pdf"))
print("Found PDFs:", len(pdf_paths))
for p in pdf_paths: print("-", os.path.basename(p))
assert len(pdf_paths) >= 4, "Need 4 manuals."


Found PDFs: 4
- dish washer manual.pdf
- microwave manual.pdf
- refrigerator manual.pdf
- samsung tv manual.pdf


In [8]:
import fitz, re

def guess_heading(txt:str):
    for line in txt.splitlines():
        L = line.strip()
        if 3 < len(L) <= 120 and (L.isupper() or re.match(r'^\d+(\.\d+)*\s', L)):
            return L
    return None

raw_pages = []
for path in pdf_paths:
    doc_name = os.path.basename(path)
    pdf = fitz.open(path)
    for i in range(len(pdf)):
        txt = pdf.load_page(i).get_text("text")
        if txt.strip():
            raw_pages.append({
                "text": txt,
                "metadata": {"doc": doc_name, "page": i+1, "section": guess_heading(txt)}
            })
    pdf.close()

len(raw_pages), raw_pages[0]["metadata"]


(332,
 {'doc': 'dish washer manual.pdf',
  'page': 1,
  'section': '800 Series | 500 Series | 300 Series | Ascenta®'})

In [9]:
from collections import Counter
cnt_pages = Counter(p["metadata"]["doc"] for p in raw_pages)
cnt_pages


Counter({'dish washer manual.pdf': 40,
         'microwave manual.pdf': 48,
         'refrigerator manual.pdf': 180,
         'samsung tv manual.pdf': 64})

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800, chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " "]
)

chunks = []
for p in raw_pages:
    for c in splitter.split_text(p["text"]):
        chunks.append({"text": c, "metadata": p["metadata"]})

print("Chunks:", len(chunks))
# quick peek
print(chunks[0]["metadata"])
print(chunks[0]["text"][:300], "…")


Chunks: 1186
{'doc': 'dish washer manual.pdf', 'page': 1, 'section': '800 Series | 500 Series | 300 Series | Ascenta®'}
Dishwashers
800 Series | 500 Series | 300 Series | Ascenta®
© 2011 Sears Brands, LLC. All Rights Reserved
Downloaded from www.Manualslib.com manuals search engine …


In [11]:
from collections import Counter
cnt_chunks = Counter(c["metadata"]["doc"] for c in chunks)
cnt_chunks


Counter({'dish washer manual.pdf': 70,
         'microwave manual.pdf': 192,
         'refrigerator manual.pdf': 748,
         'samsung tv manual.pdf': 176})

In [12]:

from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBED_MODEL)

texts = [d["text"] for d in chunks]
metas = [d["metadata"] for d in chunks]

embs  = embedder.encode(
    texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True
)

DB_PATH = "/content/manuals_index"
client = chromadb.PersistentClient(path=DB_PATH, settings=Settings(anonymized_telemetry=False))
col = client.get_or_create_collection(name="manuals", metadata={"hnsw:space":"cosine"})

ids = [str(i) for i in range(len(texts))]
col.add(documents=texts, metadatas=metas, embeddings=embs, ids=ids)

print("Indexed chunks:", len(col.get()['ids']))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

TypeError: argument 'metadatas': failed to extract enum MetadataValue ('Bool | Int | Float | Str')
- variant Bool (Bool): TypeError: failed to extract field MetadataValue::Bool.0, caused by TypeError: 'NoneType' object cannot be converted to 'PyBool'
- variant Int (Int): TypeError: failed to extract field MetadataValue::Int.0, caused by TypeError: 'NoneType' object cannot be interpreted as an integer
- variant Float (Float): TypeError: failed to extract field MetadataValue::Float.0, caused by TypeError: must be real number, not NoneType
- variant Str (Str): TypeError: failed to extract field MetadataValue::Str.0, caused by TypeError: 'NoneType' object cannot be converted to 'PyString'

In [92]:

def to_scalar(v):
    if v is None:
        return ""
    if isinstance(v, (str, int, float, bool)):
        return v
    return str(v)

def clean_meta(m):
    return {k: to_scalar(v) for k, v in m.items()}

metas_clean = [clean_meta(m) for m in metas]



In [93]:

try:
    client.delete_collection("manuals")
except Exception:
    pass

col = client.create_collection(name="manuals", metadata={"hnsw:space":"cosine"})


In [94]:

def add_in_batches(texts, metas, embs, batch=3000):
    n = len(texts)
    for i in range(0, n, batch):
        sl = slice(i, min(i+batch, n))
        col.add(
            documents=texts[sl],
            metadatas=metas[sl],
            embeddings=embs[sl],
            ids=[str(j) for j in range(i, min(i+batch, n))]
        )

add_in_batches(texts, metas_clean, embs, batch=3000)
print("Indexed chunks:", len(col.get()['ids']))


Indexed chunks: 1186


In [96]:
from transformers import pipeline

gen = pipeline("text2text-generation", model="google/flan-t5-base", device_map="auto")
history = []


Device set to use cpu


In [97]:
import re

BAD_SECTIONS = {"warranty", "safety information", "index", "table of contents"}
BAD_PATTERNS = [r"Downloaded from www\.Manualslib\.com", r"\.{6,}"]

def clean_text(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def looks_low_signal(text: str, section: str|None) -> bool:
    t = text.lower()
    if len(t) < 220:
        return True
    if section and section.strip().lower() in BAD_SECTIONS:
        return True
    for pat in BAD_PATTERNS:
        if re.search(pat, text):
            return True
    return False

def extract_keywords(q: str) -> list[str]:
    toks = re.findall(r"[a-zA-Z][a-zA-Z\-]+", q.lower())
    stop = {"what","when","where","which","with","this","that","have","from","your","manual","how","many"}
    return [w for w in toks if len(w) >= 4 and w not in stop][:6]


In [98]:
def retrieve_scored(query, k=12, where=None, min_keep=4):
    kws = extract_keywords(query)
    raw = col.query(
        query_texts=[query],
        n_results=max(k*3, 24),
        where=where,
        include=["documents","metadatas","distances"]
    )
    docs = raw["documents"][0]
    metas = raw["metadatas"][0]
    dists = raw["distances"][0]

    cleaned = []
    for d, m, dist in zip(docs, metas, dists):
        txt = clean_text(d)
        if not looks_low_signal(txt, m.get("section")):
            cleaned.append((txt, m, dist))
    if not cleaned:
        cleaned = [(clean_text(d), m, dist) for d, m, dist in zip(docs, metas, dists)]

    def bonus(text: str) -> float:
        t = text.lower()
        hits = sum(1 for kw in kws if kw in t)
        return -0.03 * hits

    rescored = [(dist + bonus(txt), txt, m, dist) for txt, m, dist in cleaned]
    rescored.sort(key=lambda x: x[0])
    take = rescored[:max(k, min_keep)]
    return [(txt, m, orig) for _, txt, m, orig in take]


In [106]:

last_docs = set()
last_kw = []

def chat(msg, k=8, where=None, threshold=0.58):
    global last_docs, last_kw

    def do_query(q, w=None):
        return retrieve_scored(q, k=k, where=w)

    mem = "\n".join([f"User: {u}\nAssistant: {a}" for u, a in history[-5:]])


    hits = do_query(msg, where)


    q_kw = extract_keywords(msg)
    looks_followup = (
        len(q_kw) <= 2 or
        any(t in msg.lower() for t in ["this", "that", "it", "they", "those", "do that", "do this"])
    )
    if where is None and looks_followup and last_docs:
        expanded = msg + " " + " ".join(last_kw[:4])
        scoped_where = {"doc": {"$in": list(last_docs)}}
        hits2 = do_query(expanded, scoped_where)
        if hits2 and (not hits or min(h[2] for h in hits2) <= min(h[2] for h in hits) + 0.02):
            hits = hits2

    if not hits:
        print("I couldn't find relevant information in the manuals for that question.")
        return

    best = min(h[2] for h in hits)
    gated = (where is None)
    if gated and best > threshold:
        print("This looks outside the scope of the manuals I indexed, so I don't have a reliable answer.\n")
        print("Top retrieved (low confidence):")
        for d, m, dist in hits[:3]:
            print(f"- {m['doc']} p.{m['page']} (distance={dist:.2f}) | section: {m.get('section','')}")
        return

    band = best + 0.12
    kept = [h for h in hits if h[2] <= band] or hits[:3]
    context = "".join([f"[{m['doc']} p.{m['page']}] {d}\n" for d, m, _ in kept])

    prompt = (
        "Use the CONTEXT to answer the QUESTION. If the answer is not present, say you don't know.\n"
        "Cite sources as (doc, page). Be concise.\n\n"
        f"CONTEXT:\n{context}\n"
        f"Conversation:\n{mem}\n"
        f"QUESTION: {msg}\nANSWER:"
    )
    out = gen(prompt, max_new_tokens=180, temperature=0.0)[0]["generated_text"].strip()
    history.append((msg, out))


    last_docs = {m['doc'] for _, m, _ in kept}
    last_kw = extract_keywords(msg)

    print(out, "\n\nSources:")
    for d, m, dist in kept:
        print(f"- {m['doc']} p.{m['page']} (distance={dist:.2f}) | section: {m.get('section','')}")


In [107]:
chat("How do I clean the dishwasher filter?")


Wipe the inner door panel and the oven front frame with a soft cloth and a mild detergent solution. Then rinse and wipe dry. This should be done weekly or more often, if needed. Never use cleaning powders or rough pads. Excessive oil splatters on the inside top will be difficult to remove if left for many days. Wipe splatters with a wet paper towel, especially after cooking chicken or bacon. 

Sources:
- dish washer manual.pdf p.11 (distance=0.50) | section: 
- microwave manual.pdf p.8 (distance=0.47) | section: TIPS FOR MICROWAVE
- dish washer manual.pdf p.23 (distance=0.55) | section: 
- microwave manual.pdf p.8 (distance=0.57) | section: TIPS FOR MICROWAVE
- dish washer manual.pdf p.9 (distance=0.58) | section: 


In [108]:
chat("And how often should I do that?")


weekly or more often, if needed. Never use cleaning powders or rough pads. Excessive oil splatters on the inside top will be difficult to remove if left for many days. Wipe splatters with a wet paper towel, especially after cooking chicken or bacon. 

Sources:
- dish washer manual.pdf p.11 (distance=0.48) | section: 
- microwave manual.pdf p.8 (distance=0.51) | section: TIPS FOR MICROWAVE
- dish washer manual.pdf p.23 (distance=0.55) | section: 
- dish washer manual.pdf p.9 (distance=0.52) | section: 
- dish washer manual.pdf p.7 (distance=0.59) | section: 
- dish washer manual.pdf p.6 (distance=0.57) | section: 


In [109]:
chat("What temperature should the refrigerator be set to?")


The Freezer temperature can be set between -8 °F and 5 °F (or between -23 °C and -15 °C) to suit your particular needs. Press the Freezer button repeatedly until the desired temperature is shown in the temperature display. The temperature will change by 1 °F (or 1 °C) with each press. See below. Fahrenheit : 37 °F 36 °F 35 °F 34 °F 44 °F 43 °F 42 °F 41 °F 40 °F 39 °F 38 °F 37 °F 

Sources:
- refrigerator manual.pdf p.34 (distance=0.32) | section: IMPORTANT:
- refrigerator manual.pdf p.35 (distance=0.34) | section: OPERATING
- refrigerator manual.pdf p.22 (distance=0.41) | section: COMPLETING THE SET UP


In [111]:
chat("why are cars faster than buses")

This looks outside the scope of the manuals I indexed, so I don't have a reliable answer.

Top retrieved (low confidence):
- samsung tv manual.pdf p.59 (distance=0.84) | section: 1920 x 1080
- dish washer manual.pdf p.6 (distance=0.87) | section: 
- refrigerator manual.pdf p.121 (distance=0.86) | section: GARANTÍA DEL REFRIGERADOR SAMSUNG


In [112]:
chat("where is egypt located")

This looks outside the scope of the manuals I indexed, so I don't have a reliable answer.

Top retrieved (low confidence):
- refrigerator manual.pdf p.64 (distance=0.87) | section: U.S.A
- refrigerator manual.pdf p.179 (distance=0.86) | section: TÉL : 905-542-3535 FAX : 905-542-3835
- refrigerator manual.pdf p.11 (distance=0.88) | section: SETTING UP


In [113]:
chat("how do i use the defrost function?")

EXPRESS DEFROST This feature should be used only when you defrost 1 pound of frozen ground beef. Example: To defrost 1 Ib Ground beef. Touch: Display Shows: 1. _ Time of day. 2. a_ Time counting down and EXPRESS DEFROST. * Ground beef 1Lbs only/0.5Kg only. 

Sources:
- microwave manual.pdf p.15 (distance=0.44) | section: AUTO
- refrigerator manual.pdf p.29 (distance=0.53) | section: OPERATING
- refrigerator manual.pdf p.8 (distance=0.50) | section: ADDITIONAL TIPS FOR
- refrigerator manual.pdf p.8 (distance=0.54) | section: ADDITIONAL TIPS FOR
- microwave manual.pdf p.6 (distance=0.54) | section: PARTS AND ACCESSORIES
- refrigerator manual.pdf p.24 (distance=0.56) | section: (-1 °C).


In [114]:
chat("where i can store the vegetables and fruits in the refrigerator")

a cool, dry room with adequate ventilation. Ensure that it is not exposed to direct sunlight and never put it near a direct source of heat (a radiator, for example). 

Sources:
- refrigerator manual.pdf p.8 (distance=0.36) | section: ADDITIONAL TIPS FOR
- refrigerator manual.pdf p.8 (distance=0.37) | section: ADDITIONAL TIPS FOR
