# Shcanne Bot
### The Goal of this project is to create an easy reference guide for elevator mechanics to quickly identify, troubleshoot, and solve issues.
### Author: Alex Gill

In [1]:
#Install Packages
!pip install pymupdf python-docx beautifulsoup4 html5lib sentence-transformers faiss-cpu groq python-dotenv


Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Collecting groq
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sentence

In [3]:
#Imports and setups
import os
import json
import fitz  # for PDFs
from bs4 import BeautifulSoup  # for HTML
from docx import Document  # for Word
from tqdm import tqdm  # progress bar


In [5]:
#Helper to Chunk Text
def chunk_text(text, size=500):
    words = text.split()
    return [' '.join(words[i:i+size]) for i in range(0, len(words), size)]


In [None]:
os.list

In [9]:
#Extract and chunk for processing
chunks = []
sources = []

manuals_folder = "manuals"

for filename in tqdm(os.listdir(manuals_folder)):
    path = os.path.join(manuals_folder, filename)
    ext = os.path.splitext(filename)[1].lower()

    try:
        if ext == ".pdf":
            doc = fitz.open(path)
            for page_num in range(len(doc)):
                text = doc[page_num].get_text()
                for chunk in chunk_text(text):
                    chunks.append(chunk)
                    sources.append({"manual": filename, "page": page_num + 1})

        elif ext == ".docx":
            doc = Document(path)
            text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
            for chunk in chunk_text(text):
                chunks.append(chunk)
                sources.append({"manual": filename, "page": 1})

        elif ext == ".txt" or ext == ".w51":
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            for chunk in chunk_text(text):
                chunks.append(chunk)
                sources.append({"manual": filename, "page": 1})

        elif ext == ".html" or ext == ".htm":
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                soup = BeautifulSoup(f, "html5lib")
                text = soup.get_text(separator=' ')
            for chunk in chunk_text(text):
                chunks.append(chunk)
                sources.append({"manual": filename, "page": 1})

        else:
            print(f"Unsupported file: {filename}")

    except Exception as e:
        print(f"Failed: {filename} — {str(e)}")

# Save to disk
with open("chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f)

with open("sources.json", "w", encoding="utf-8") as f:
    json.dump(sources, f)

print(f"✅ Extracted {len(chunks)} chunks from {len(sources)} sources.")


  1%|          | 4/519 [00:00<00:30, 16.86it/s]

Unsupported file: 103717EN.WPW


  8%|▊         | 40/519 [00:11<03:59,  2.00it/s]

Unsupported file: 50av absolutes.doc
Unsupported file: 7893c1.tst1


 19%|█▉        | 98/519 [00:12<00:30, 13.99it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



 23%|██▎       | 121/519 [00:14<00:25, 15.43it/s]

Unsupported file: balancing.doc
Unsupported file: BRAKE.SUP


 34%|███▍      | 177/519 [00:15<00:03, 86.78it/s]

Unsupported file: COMM.B17
Unsupported file: D48_Burn_ 9000 hex files.doc
Unsupported file: document decriptions.xls


 39%|███▊      | 201/519 [00:15<00:05, 56.70it/s]

Unsupported file: dummy vanes + EAQ.doc


 41%|████      | 213/519 [00:16<00:12, 23.67it/s]

Unsupported file: fault_desc.xls


 48%|████▊     | 251/519 [00:21<00:17, 15.30it/s]

Unsupported file: Image 2016-07-28 07_38 PM.jpg
Unsupported file: Image 2016-10-31 10-50-26.jpeg
Unsupported file: Image 2017-01-17 12-45-28.jpeg
Unsupported file: Image 2017-07-19 13-09-30.jpeg
Unsupported file: Image 2018-06-19 09-01-10.jpeg
Unsupported file: Image 2018-09-14 07-40-46.jpeg
Unsupported file: Image 2019-07-02 12-06-29.jpeg
Unsupported file: Image 2022-11-03 11-05-53.jpeg
Unsupported file: Image 2022-11-05 14-15-58.jpeg
Unsupported file: Image 2023-05-01 09-06-38.jpeg
Unsupported file: Image 2023-05-01 09-06-55.jpeg
Unsupported file: Image 2023-05-01 09-07-13.jpeg


 53%|█████▎    | 277/519 [00:29<01:06,  3.62it/s]

Unsupported file: INSTALL.BAT
Unsupported file: INSTALLC.BAT


 58%|█████▊    | 302/519 [00:30<00:16, 13.06it/s]

Unsupported file: Load Weighing Adjustment.doc


 59%|█████▉    | 308/519 [00:32<00:28,  7.50it/s]

Unsupported file: MCE Tech Bulletin VVMC controls .png


 61%|██████    | 315/519 [00:33<00:24,  8.25it/s]

Unsupported file: Miprom HS_ST Parameters in Service Tool.xls
Unsupported file: Miprom ST fault code list.xls


 64%|██████▍   | 333/519 [00:37<00:39,  4.76it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



 66%|██████▌   | 341/519 [00:37<00:24,  7.32it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



 66%|██████▌   | 343/519 [00:38<00:24,  7.12it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



 67%|██████▋   | 348/519 [00:38<00:15, 10.70it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



 68%|██████▊   | 352/519 [00:38<00:18,  9.15it/s]

Unsupported file: PULSE.H57


 69%|██████▊   | 356/519 [00:39<00:16,  9.97it/s]

Unsupported file: Quick Start-Up List - v1.72 - PM Motors, Serial Control.doc


 74%|███████▍  | 385/519 [00:39<00:04, 27.49it/s]

Unsupported file: Ropegripper TROUBLE SHOOTING.doc
Unsupported file: SAFETY-TEST-SHORTCUTS.xls


 79%|███████▊  | 408/519 [00:43<00:12,  8.98it/s]

Unsupported file: Scindler TX ACUM.jpg
Unsupported file: Shop Rite Union Mill R&R unit.msg
Unsupported file: SmartRise 6 DENNY RD  software 181016-005.msg


 82%|████████▏ | 423/519 [00:43<00:05, 17.40it/s]

Unsupported file: Southeastern PT.JPG
Unsupported file: Spares_connectors.rtf


 83%|████████▎ | 432/519 [00:45<00:08,  9.96it/s]

Unsupported file: TAC 32 pressure test.jpeg


 90%|████████▉ | 465/519 [00:57<00:08,  6.27it/s]

Unsupported file: TEST.CFG
Unsupported file: TEST.PTR
Unsupported file: Testaid.doc


 90%|█████████ | 468/519 [00:57<00:07,  7.04it/s]

Unsupported file: TMS 900  FIELD  MODULE .doc
Unsupported file: TMS50av update doc.doc
Unsupported file: Tms600.zip


 95%|█████████▌| 495/519 [01:03<00:02,  8.41it/s]

Unsupported file: USING.CFG


 99%|█████████▉| 515/519 [01:11<00:01,  2.31it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed



100%|██████████| 519/519 [01:12<00:00,  7.18it/s]


✅ Extracted 24648 chunks from 24648 sources.


In [37]:
import multiprocessing
print("Total CPU cores available:", multiprocessing.cpu_count())


Total CPU cores available: 16


In [41]:
import os

# Use all available CPU threads
os.environ["OMP_NUM_THREADS"] = str(os.cpu_count())  # or set a manual limit
os.environ["TOKENIZERS_PARALLELISM"] = "true"


In [47]:
#Set model and batch size
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import time

# Load your chunks
with open("chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Choose embedding model
model = SentenceTransformer("hkunlp/instructor-base")
#model = SentenceTransformer("all-MiniLM-L6-v2")  

# Setup batching
batch_size = 64
save_every = 1000

# Resume from checkpoint if exists
if os.path.exists("embeddings_partial.npy"):
    embeddings = list(np.load("embeddings_partial.npy"))
    start = len(embeddings)
    print(f"🔁 Resuming from {start} / {len(chunks)}")
else:
    embeddings = []
    start = 0
    print(f"🚀 Starting from scratch (0 / {len(chunks)})")

🚀 Starting from scratch (0 / 24648)


In [49]:
total = len(chunks)
start_time = time.time()

for i in range(start, total, batch_size):
    batch = chunks[i:i+batch_size]
    try:
        batch_embeddings = model.encode(batch, show_progress_bar=False, batch_size=batch_size)
        embeddings.extend(batch_embeddings)
    except Exception as e:
        print(f"⚠️ Error embedding batch {i}-{i+batch_size}: {e}")
        continue

    # Manual progress bar
    percent = (i + batch_size) / total * 100
    bar = "#" * int(percent // 2)
    print(f"\r[{bar:<50}] {percent:.2f}% ({i+batch_size}/{total})", end="")

    # Save every N steps or at the end
    if (i + batch_size) % save_every == 0 or (i + batch_size) >= total:
        np.save("embeddings_partial.npy", np.array(embeddings))
        elapsed = time.time() - start_time
        print(f"\n💾 Saved progress at {i + batch_size} chunks ({elapsed:.1f}s elapsed)")

print("\n✅ Finished all batches.")
np.save("embeddings.npy", np.array(embeddings))
print("✅ Saved final embeddings to embeddings.npy")


[################                                  ] 32.46% (8000/24648)
💾 Saved progress at 8000 chunks (2093.5s elapsed)
[################################                  ] 64.91% (16000/24648)
💾 Saved progress at 16000 chunks (4264.8s elapsed)
[################################################  ] 97.37% (24000/24648)
💾 Saved progress at 24000 chunks (6232.4s elapsed)
[##################################################] 100.23% (24704/24648)
💾 Saved progress at 24704 chunks (6409.2s elapsed)

✅ Finished all batches.
✅ Saved final embeddings to embeddings.npy


In [55]:
#Store Embeddings in FAISS to search
import faiss
import numpy as np

# Convert list to NumPy array
embeddings = np.array(embeddings)

# Confirm shape
dimension = embeddings.shape[1]

# Build FAISS index
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save index
faiss.write_index(index, "faiss_index.index")
print("✅ FAISS index created and saved.")

✅ FAISS index created and saved.


In [57]:
#Load Key and groq client
from dotenv import load_dotenv
from groq import Groq
load_dotenv()

groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))


In [59]:
#Ask Question
# Load saved data
with open("sources.json", "r", encoding="utf-8") as f:
    sources = json.load(f)

def search_chunks(query, k=5):
    query_vec = model.encode([query])
    _, indices = index.search(np.array(query_vec).reshape(1, -1), k)
    return [(chunks[i], sources[i]) for i in indices[0]]

def ask_groq(query):
    context_chunks = search_chunks(query)
    context = "\n\n".join([
        f"{text}\n(Source: {meta['manual']} - Page {meta['page']})"
        for text, meta in context_chunks
    ])

    messages = [
        {"role": "system", "content": "You're an expert assistant for elevator mechanics. Use the following manual text to answer clearly."},
        {"role": "user", "content": f"{context}\n\nQuestion: {query}"}
    ]

    response = groq_client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=messages,
        temperature=0.3
    )

    return response.choices[0].message.content


'**Resetting a hydraulic‑elevator after a “low‑oil” shutdown**  \n(Using the excerpts you supplied from the AME‑10.20.007 and HMC‑1000 manuals)\n\n---\n\n## 1.\u202fSafety first – make sure it is safe to work on the machine\n\n| What to check | Why it matters |\n|--------------|---------------|\n| **Power OFF** (or put the controller in “maintenance” mode) | Prevents accidental movement while you are working on the oil system. |\n| **Lock‑out / Tag‑out** the main power and the **Start‑Permit** chain (see error\u202f86) | The manual says “Start permit input has been in incorrect state during drive – check the start‑permit chain. Requires power OFF.” |\n| **Verify the elevator is stopped at a floor** (the car will stay at the floor after a failed relevelling) | The elevator will stay at the floor after three failed relevelling attempts (error\u202f81). This is the normal “stopped‑at‑floor” state before you can reset. |\n\n---\n\n## 2.\u202fDiagnose the low‑oil condition\n\n1. **Read the 