In [4]:
!pip install pdfplumber
!pip install PyMuPDF
!pip install faiss-cpu
!pip install sentence_transformers
!pip -q install streamlit transformers sentence-transformers faiss-cpu
!pip install streamlit pyngrok

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import os
from pathlib import Path
import pdfplumber
import pandas as pd
import fitz  # PyMuPDF
import re
from collections import Counter, defaultdict
import time
import spacy
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch
import toml
from huggingface_hub import login, snapshot_download
import shutil
import glob
import gc



In [16]:


toc_dir = Path('/content/drive/MyDrive/Engineering_tech_manual/')
toc_files = list(toc_dir.glob("*.pdf"))
num_toc_files = len(toc_files)

print(f"Number of '-TOC' PDF files: {num_toc_files}")


os.chdir("/content/drive/MyDrive/rag_app")
print("Current directory:", os.getcwd())


Number of '-TOC' PDF files: 2006
Current directory: /content/drive/MyDrive/rag_app


In [8]:
def extract_text_from_pdf(file_path):
    """
    Extracts all text content from a given PDF file.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Full text extracted from all pages of the PDF.
             Returns an empty string if the file does not exist.
    """
    if not os.path.exists(file_path):
        return ""

    doc = fitz.open(file_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text("text", flags=0)
    return full_text


def parse_formatted_toc(toc_path, manuals_dir="manuals"):
    """
    Parses a formatted Table of Contents (TOC) PDF to extract section and subsection details,
    and retrieves the corresponding text from linked manual PDFs.

    The TOC is expected to have:
    - Section headers in ALL CAPS (e.g., 'INTRODUCTION').
    - Subsections listed with a dotted line followed by a file code (e.g., 'Subsection ..... 12345-001').

    Args:
        toc_path (Path): Path to the TOC PDF file.
        manuals_dir (str): Directory where the manual PDFs are stored.

    Returns:
        list[dict]: List of dictionaries containing:
            - equipment_name (str): Equipment name derived from TOC filename.
            - section_subsection (str): Hierarchical section/subsection title.
            - pdf_filename (str): Expected filename of the corresponding manual PDF.
            - pdf_found (bool): Whether the manual PDF was found in manuals_dir.
            - pdf_text (str): Extracted text from the manual PDF (empty if not found).
    """
    equipment_name = toc_path.stem.replace("-TOC", "")
    rows = []

    with pdfplumber.open(toc_path) as pdf:
        current_section = None
        for page_num, page in enumerate(pdf.pages):
            lines = page.extract_text().split("\n")
            print(f"---- Page {page_num + 1} of {toc_path.name} ----")
            for line_num, line in enumerate(lines):
                line = line.strip()
                print(f"Line {line_num + 1}: {line}")

                # Detect SECTION HEADERS (ALL UPPERCASE)
                if line.isupper() and not re.search(r'\d{5,}-\d{3}', line):
                    current_section = line
                    print(f"  Found Section Header: {current_section}")
                    continue

                # Match subsection lines with a filename and revision (Rev ignored)
                match = re.match(r"(.+?)\.{3,}\s+(\d{5,}-\d{3})", line)
                if match:
                    subsection = match.group(1).strip()
                    file_code = match.group(2).strip()
                    expected_pdf = f"{file_code}.pdf"
                    print(f"  Found Subsection: {subsection}, File: {expected_pdf}")

                    manual_path = Path(manuals_dir) / expected_pdf
                    if manual_path.exists():
                        pdf_text = extract_text_from_pdf(str(manual_path))
                        found = True
                    else:
                        pdf_text = ""
                        found = False
                        print(f"    [Warning] PDF not found for: {expected_pdf}")

                    rows.append({
                        "equipment_name": equipment_name,
                        "section_subsection": f"{current_section} > {subsection}",
                        "pdf_filename": expected_pdf,
                        "pdf_found": found,
                        "pdf_text": pdf_text
                    })

    return rows


def build_df_from_tocs(toc_dir="tocs", manuals_dir="manuals"):
    """
    Builds a consolidated pandas DataFrame from all TOC PDFs in a directory.
    Each TOC is parsed to extract section/subsection mappings and corresponding manual text.

    Args:
        toc_dir (str): Directory containing TOC PDF files (named like '<equipment>-TOC.pdf').
        manuals_dir (str): Directory containing manual PDFs referenced by TOCs.

    Returns:
        pandas.DataFrame: DataFrame with columns:
            - equipment_name
            - section_subsection
            - pdf_filename
            - pdf_found
            - pdf_text
    """
    all_rows = []
    for toc_file in Path(toc_dir).glob("*-TOC.pdf"):
        print(f"Parsing {toc_file.name}")
        rows = parse_formatted_toc(toc_file, manuals_dir)
        all_rows.extend(rows)
    return pd.DataFrame(all_rows)


# Example usage
dir = '/content/drive/MyDrive/Engineering_tech_manual/'
df = build_df_from_tocs(toc_dir=dir, manuals_dir=dir)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Parsing 8258XA-FMTR-AMO-TOC.pdf
---- Page 1 of 8258XA-FMTR-AMO-TOC.pdf ----
Line 1: TABLE OF CONTENTS
  Found Section Header: TABLE OF CONTENTS
Line 2: MANUAL TITLE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 8258XA HIGH RESOLUTION FLOWMETER
  Found Section Header: MANUAL TITLE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 8258XA HIGH RESOLUTION FLOWMETER
Line 3: PART NUMBER . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 130430-915
Line 4: GENERAL INFORMATION
  Found Section Header: GENERAL INFORMATION
Line 5: Equipment Information Sheet and BOM . . . . . . . . . . . . . . . . . . . . 130430-000 . . . . . . . . . . . . . D1 $
Line 6: Equipment Specification Sheet . . . . . . . . . . . . . . . . . . . . . . . . . . 130430-925 . . . . . . . . . Apr, 91
Line 7: Brief Description of Service/Equipment . . . . . . . . . . . . . . . . 

In [9]:
print(df.shape)

(1016, 5)


In [10]:
def hybrid_chunk_from_text(pdf_text):
    """
    Cleans, filters, and segments raw PDF text into semantically meaningful
    content chunks based on section headers, numbering patterns, and fallback heuristics.

    This function is designed for technical or engineering PDFs where section
    boundaries are often inconsistent or embedded in noisy OCR output.
    It applies several preprocessing steps before chunking, including:
      - Removing confidential notices, TOC pages, and repeated headers/footers
      - Filtering out schematic, figure, and table references
      - Detecting numbered or all-uppercase section headers
      - Merging subsections under parent sections
      - De-duplicating header lines
      - Falling back to heuristic segmentation if no numbered structure is detected

    Args:
        pdf_text (str): Raw text extracted from a PDF file. It may include page
            separators ('\\f'), headers/footers, and non-textual elements.

    Returns:
        list[tuple[str, str]]: A list of (section_title, section_text) tuples, where:
            - section_title (str): The detected or inferred title/heading of the section.
            - section_text (str): The cleaned text belonging to that section.

    Behavior:
        - If the document has numbered sections (e.g., "2.3.1 INTRODUCTION"), those
          are used as chunk headers.
        - If numbered sections are absent, fallback heuristics detect titles using
          text patterns (uppercase lines, short title-cased lines).
        - If no structural patterns are found at all, a single chunk is returned
          with a synthesized title from the first few uppercase words in the document.

    Examples:
        >>> text = "1.0 INTRODUCTION\\nThis manual covers...\\n2.0 SPECIFICATIONS\\n..."
        >>> hybrid_chunk_from_text(text)
        [
            ("1.0 INTRODUCTION", "This manual covers..."),
            ("2.0 SPECIFICATIONS", "...")
        ]

    Notes:
        - Designed for engineering or technical manuals containing TOC pages,
          schematics, and numbered sections.
        - Robust against OCR artifacts and repetitive page elements.
        - Intended for downstream use in RAG (Retrieval-Augmented Generation)
          or document indexing pipelines.
    """
    pdf_text = re.sub(r"^CONFIDENTIAL.*$\n?", "", pdf_text, flags=re.IGNORECASE | re.MULTILINE)

    pages = pdf_text.split("\f")
    clean_pages = []

    toc_keywords = {"TABLE OF CONTENTS", "SECTION", "TITLE", "PAGE", ". . . . .", "-----", "----"}

    # Remove likely TOC pages
    for page in pages:
        lines = page.strip().split("\n")
        if len(lines) < 5:
            continue
        toc_lines = sum(
            1 for line in lines
            if any(kw in line.upper() for kw in toc_keywords) or
               re.match(r'^\s*\d+(\.\d+)*\s+.*\.{2,}\s*\d+\s*$', line)
        )
        if toc_lines / len(lines) > 0.3:
            continue
        clean_pages.append(page)

    # Remove common headers/footers
    filtered_text = "\n".join(clean_pages)
    lines = filtered_text.split("\n")
    line_counts = Counter(lines)
    common_lines = {line for line, count in line_counts.items() if count > 1 and len(line.strip()) > 10}
    filtered_lines = [line for line in lines if line.strip() not in common_lines]

    # Remove TOC-style lines and figure/schematic blocks
    cleaned_lines = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if re.match(r'^\s*\d+(\.\d+)*\s+.*\.{2,}\s*\d+\s*$', line):
            i += 1
            continue
        if re.match(r'^\s*(FIGURE|FIG\.|DIAGRAM|SCHEMATIC|BLOCK DIAGRAM|TABLE)\b.*', line, re.IGNORECASE):
            i += 3
            continue
        cleaned_lines.append(line)
        i += 1

    final_text = "\n".join(cleaned_lines)

    chunks = []

    # Extract sections from headers (for PDFs without explicit sections)
    if not chunks:
        header_regex = re.compile(
            r'^\s*(?P<num>\d{1,2}(?:\.\d{1,2}){0,2})\s+(?P<title>[A-Z][A-Z0-9 /().,-]*)$', re.MULTILINE
        )
        matches = list(header_regex.finditer(final_text))

        if matches:
            for i, match in enumerate(matches):
                section_number = match.group("num").strip()
                section_title = match.group("title").strip()
                if re.search(r'\b(FIGURE|FIG\.|DIAGRAM|SCHEMATIC|TABLE)\b', section_title, re.IGNORECASE):
                    continue
                section_header = f"{section_number} {section_title}"
                start = match.start()
                end = matches[i + 1].start() if i + 1 < len(matches) else len(final_text)
                chunk_text = final_text[start:end].strip()
                chunks.append((section_header, chunk_text))

    # Merge subsections under parent
    merged = defaultdict(str)
    section_titles = {}

    for header, text in chunks:
        base_match = re.match(r"^(\d+(?:\.\d+)?)(?:\.\d+)?\s+(.*)$", header)
        if base_match:
            base_num = base_match.group(1)
            title = base_match.group(2)
            section_titles.setdefault(base_num, title)
            merged[base_num] += "\n" + text

    result = [(f"{num} {section_titles[num]}", content.strip()) for num, content in merged.items()]

    # Remove duplication in chunks
    final_result = []
    for header, text in result:
        lines = text.split("\n")
        unique_lines = []
        seen_headers = set()

        for line in lines:
            if line.strip() == header.strip():
                if line.strip() not in seen_headers:
                    unique_lines.append(line)
                    seen_headers.add(line.strip())
            else:
                unique_lines.append(line)

        final_result.append((header, "\n".join(unique_lines).strip()))

    # Fallback if no chunks found
    if not final_result:
        fallback_chunks = []
        current_header = None
        current_text = ""

        skip_fallback_keywords = {
            "FIGURE", "FIG.", "DIAGRAM", "BLOCK", "TABLE", "PIN", "CONNECTOR",
            "BOARD", "SUPPLY", "CABLE", "ASSEMBLY", "SCHEMATIC"
        }

        def is_schematic_label(line):
            words = line.upper().split()
            return (
                any(kw in words for kw in skip_fallback_keywords) or
                sum(c.isdigit() for c in line) > len(line) * 0.3 or
                (len(words) <= 3 and all(w.isupper() or w.isdigit() for w in words))
            )

        for line in cleaned_lines:
            stripped = line.strip()
            if (
                (stripped.isupper() or stripped.istitle()) and
                2 <= len(stripped.split()) <= 10 and
                not is_schematic_label(stripped)
            ):
                if current_header and current_text:
                    fallback_chunks.append((current_header, current_text.strip()))
                    current_text = ""
                current_header = stripped
            else:
                current_text += stripped + "\n"

        if current_header and current_text:
            fallback_chunks.append((current_header, current_text.strip()))
        final_result = fallback_chunks

    # Final strict filter: Handle pages without numbered sections
    clean_final_chunks = []
    has_numbered_sections = any(re.match(r'^\d+\.\d+(\.\d+)?\s+.+$', title.strip()) for title, _ in result)

    if has_numbered_sections:
        for title, text in result:
            if re.match(r'^\d+\.\d+(\.\d+)?\s+.+$', title.strip()) and len(title.strip().split()) >= 2:
                clean_final_chunks.append((title.strip(), text.strip()))
                continue

            header_match = re.search(r'^\s*([A-Z][A-Z0-9 /().,-]+)\s*$', text, re.MULTILINE)
            if header_match:
                section_title = header_match.group(1).strip()
                if len(section_title.split()) >= 2 and section_title.upper() not in ["TABLE OF CONTENTS", "LIST OF FIGURES"]:
                    clean_final_chunks.append((section_title, text.strip()))

    else:
        prefix_words = []
        for line in pdf_text.split('\n'):
            if line.strip() and line.strip()[0].isupper():
                prefix_words.extend(word for word in line.split() if word.isupper())
            else:
                break
        section_title = " ".join(prefix_words)
        clean_final_chunks.append((section_title, pdf_text.strip()))

    return clean_final_chunks


In [11]:
# Now require GPU and load the model
spacy.require_gpu()
print("Using GPU:", spacy.prefer_gpu())


# --- Load spaCy lightweight model for fast processing ---
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "lemmatizer", "attribute_ruler", "tok2vec", "morphologizer"])
nlp.enable_pipe("senter")  # Only sentence segmentation
print("Loaded spaCy model.")

# --- Helper functions ---


def clean_chunk_text(text):
    text = re.sub(r'(?:\.\s*){5,}', '', text)  # Remove dotted lines (even with spaces)
    text = re.sub(r'\n+\s*\.{3,}\s*\n+', '\n', text)
    text = re.sub(r"^\s*\.{3,}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r'\s*\.{3,}\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'(?<=[\d])(?=[A-Z])', ' ', text)  # Fix assembly2When → assembly 2 When
    text = re.sub(r'^\s*[\w\s\-()]+(\.{2,}|\s*\.\s*)+\d+\s*$', '', text, flags=re.MULTILINE)  # Remove TOC style lines
    return text

def is_header_only(text):
    lines = text.strip().splitlines()
    joined = " ".join(lines).strip()
    return len(joined) < 50 and joined.isupper()

def remove_repeated_header_lines(text, header):
    pattern = re.escape(header.strip())
    return re.sub(rf'(?i)^{pattern}$', '', text, flags=re.MULTILINE).strip()

# --- Your main function ---

def chunk_pdf_text_for_df(df):
    chunks = []
    raw_chunks = []

    print("Starting chunking...")
    t0 = time.time()

    for _, row in df.iterrows():
        for sh, ct in hybrid_chunk_from_text(row["pdf_text"]):
            raw_chunks.append({
                "equipment_name": row["equipment_name"],
                "section_subsection": row["section_subsection"],
                "pdf_filename": row["pdf_filename"],
                "pdf_found": row["pdf_found"],
                "section_header": sh.strip(),
                "chunk_text": clean_chunk_text(ct)
            })
    print(f"Step 1: Raw chunking complete. Time: {time.time() - t0:.2f} sec")

    # --- Preprocessing chunk texts ---
    t1 = time.time()
    texts = [remove_repeated_header_lines(c["chunk_text"], c["section_header"]) for c in raw_chunks]
    print(f"Step 2: Text cleanup complete. Time: {time.time() - t1:.2f} sec")

    # --- Use GPU for batch NLP ---
    t2 = time.time()
    docs = list(nlp.pipe(texts, batch_size=512, n_process=1))
    print(f"Step 3: NLP pipe (GPU) complete. Time: {time.time() - t2:.2f} sec")

    # --- Postprocessing and merging ---
    t3 = time.time()
    i = 0
    while i < len(docs):
        chunk = raw_chunks[i]
        text = docs[i].text.strip()
        sent_count = len(list(docs[i].sents))

        if (sent_count <= 1 or is_header_only(text)) and i + 1 < len(docs):
            next_chunk = raw_chunks[i + 1]
            if all(chunk[k] == next_chunk[k] for k in ("equipment_name", "section_subsection", "pdf_filename", "pdf_found")):
                chunk["section_header"] += " " + next_chunk["section_header"]
                text += " " + clean_chunk_text(next_chunk["chunk_text"])
                i += 1

        chunk["chunk_text"] = text
        chunks.append(chunk)
        i += 1
    print(f"Step 4: Merge and finalize chunks. Time: {time.time() - t3:.2f} sec")

    return pd.DataFrame(chunks)


Using GPU: True
Loaded spaCy model.


In [12]:
# Replace this with the filename you want to filter

chunked_df = chunk_pdf_text_for_df(df)

target_pdf = "130364-955.pdf"

# Filter rows for that PDF
filtered_chunks = chunked_df[chunked_df["pdf_filename"] == target_pdf]

print(filtered_chunks.shape)

# Iterate over the filtered rows and print chunks
for _, row in filtered_chunks.iterrows():
    # Print the section/subsection and its corresponding chunked text
    print(f"\n--- Section Header: {row['section_header']} ---")
    print(row['chunk_text'])  # Display the full chunk text

Starting chunking...
Step 1: Raw chunking complete. Time: 2.23 sec
Step 2: Text cleanup complete. Time: 0.22 sec
Step 3: NLP pipe (GPU) complete. Time: 11.01 sec
Step 4: Merge and finalize chunks. Time: 0.48 sec
(7, 6)

--- Section Header: REQUIREMENTS . . . . . . . . . . . . . . . .   2 1.1 EQUIPMENT ---
1
REQUIREMENTS 2
1
REQUIREMENTS 

EQUIPMENT
o
T-Handle 1/4 -20 thread (#056036)
o
Assorted Allen Wrenches and Screwdrivers
o
32-pin bulkhead inserter
o
Pin wrench
o
Strap wrench

--- Section Header: 1.2 REFERENCE DRAWINGS . . . . . . . . . . . .   2 ASSEMBLY PROCEDURES ---
REFERENCE DRAWINGS
1633 EA Electronics Assembly R151203-000
1633 EA Mechanical Assembly
R152302-000
Vacuum Flask Assembly D150316-000
1633 EA Power Supply Assembly
R149266-000
Pigtail and Ribbon Assembly D150317-000
Bottom Sub Assembly for WTS C148965-000
Top Sub Assembly for WTS
C148957-000 2
ASSEMBLY PROCEDURES
2
2
ASSEMBLY PROCEDURES

--- Section Header: 2.1 VACUUM FLASK INSTALLATION ---
VACUUM FLASK INSTALLATION

In [13]:
print(chunked_df.shape)

(2598, 6)


In [None]:
# Save final chunked_df to a CSV file

file = '/content/drive/MyDrive/Engineering_tech_manual/chunked_df.csv'
chunked_df.to_csv(file, index=False)




In [None]:
chunked_df = pd.read_csv('/content/drive/MyDrive/Engineering_tech_manual/chunked_df.csv')

In [None]:
#Filter out rows where pdf_found is False
filtered_df = chunked_df[chunked_df['pdf_found'] == True]

filtered_df.loc[:, 'section_subsection'] = filtered_df['section_subsection'].str.replace(r'^.*?>\s*', '', regex=True)

# Fill missing values before concatenation
filtered_df['section_subsection'] = filtered_df['section_subsection'].fillna('')
filtered_df['section_header'] = filtered_df['section_header'].fillna('')

# Concatenate using .loc on the copied DataFrame
filtered_df.loc[:, 'combined_section'] = filtered_df['section_subsection'] + ' - ' + filtered_df['section_header']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['section_subsection'] = filtered_df['section_subsection'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['section_header'] = filtered_df['section_header'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[:, 'combined_section'] = filtered_d

In [None]:
# Select desired columns
df = filtered_df[['equipment_name', 'combined_section', 'chunk_text']]

In [None]:
# Rename 'combined_section' to 'topic'
df = df.rename(columns={'combined_section': 'topic'})

In [None]:
df

Unnamed: 0,equipment_name,topic,chunk_text
1,1633EA-SBT-AMO,Brief Description of Service/Equipment -,1\n1633 EA ACOUSTIC ELECTRONICS -- BRIEF DESCR...
2,1633EA-SBT-AMO,Circuit Description - POWER SUPPLY CHASSIS R14...,1\n\n3\n1\nCJ3\nCONNECTOR\nBOARD\nCJ1\nBOARD\n...
3,1633EA-SBT-AMO,Circuit Description - 2.2 SINGLE-CONDUCTOR TRA...,2.2.1\nRECEIVER SECTION\n4\n2.2.2\n\nSINGLE-CO...
4,1633EA-SBT-AMO,Circuit Description - 2.3 RUC INTERFACE BOARD ...,RUC INTERFACE BOARD C154459 5\n2.3.1\nGR AND N...
5,1633EA-SBT-AMO,Circuit Description - FLASKED ELECTRONICS ASSE...,3\n\n8\n3\nJ1\nRCMD\nRCLK\nRDTA\nCCL OUT\nCABL...
...,...,...,...
2592,1590EA-AMO,Preventive Maintenance Level 3 - 2.0 REQUIRED ...,REQUIRED DRAWINGS\nDrawing No.\nDescription\n•...
2593,1590EA-AMO,Preventive Maintenance Level 3 - 3.0 REQUIRED ...,REQUIRED PROCEDURES\nDocument No.\nDescription...
2594,1590EA-AMO,Preventive Maintenance Level 3 - 4.0 REQUIRED ...,REQUIRED DOCUMENTS\n•\nF131023841\nSpare Parts...
2595,1590EA-AMO,Preventive Maintenance Level 3 - 6.0 PREVENTIV...,Mechanical Disassembly\n1. Remove the main ele...


In [None]:
# Load model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Semantic embeddings for chunk_text
semantic_embeddings = embedding_model.encode(df["chunk_text"].tolist(), show_progress_bar=True, convert_to_numpy=True)



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Save semantic embeddings using FAISS
semantic_index = faiss.IndexFlatL2(semantic_embeddings.shape[1])
semantic_index.add(semantic_embeddings)
faiss.write_index(semantic_index, "semantic_index.faiss")


# Save metadata with index alignment
df.to_csv("embedding_metadata.csv", index=False)




In [21]:
# Load pre-calculated embeddings and metadata
semantic_index = faiss.read_index("./metadata/semantic_index.faiss")
df = pd.read_csv("./metadata/embedding_metadata.csv")

def semantic_rag(query, top_filter_k=30, top_retrieve_k=5,
                 embedding_model=None, qa_pipeline=None):
    # Step 1: Embed the query
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)

    # Step 2a: Extract equipment name from query (unchanged)
    equipment_name_match = re.search(r"\b(\d{4}[A-Z]{2})\b", query)
    equipment_name = equipment_name_match.group(1).strip() if equipment_name_match else ""

    # Step 2b: Filter by equipment name (if found) (unchanged)
    if equipment_name:
        # Extract base equipment name (e.g., 1633EA)
        base_equipment_name = equipment_name.split('-')[0]

        # Filter using the base equipment name
        filtered_df = df[df["equipment_name"].str.startswith(base_equipment_name)].copy()
        if filtered_df.empty:
            # Fallback: If equipment not found, search entire index
            D_filter, I_filter = semantic_index.search(query_embedding, top_filter_k) # search entire index
            filtered_df = df.iloc[I_filter[0]].copy()
    else:
        # If no equipment name in query, search entire index
        D_filter, I_filter = semantic_index.search(query_embedding, top_filter_k) #search entire index
        filtered_df = df.iloc[I_filter[0]].copy()

    # ***Step 3: Retrieve chunks using pre-calculated embeddings***
    # Get embeddings for the filtered chunks
    filtered_chunk_indices = filtered_df.index.tolist()
    filtered_chunk_embeddings = semantic_index.reconstruct_n(filtered_chunk_indices[0], len(filtered_chunk_indices))
    # Create a temporary index for the filtered chunks
    temp_index = faiss.IndexFlatL2(filtered_chunk_embeddings.shape[1])
    temp_index.add(filtered_chunk_embeddings)
    # Search the temporary index to get the most relevant chunks
    D_topic, I_topic_local = temp_index.search(query_embedding, top_retrieve_k)
    # Get the global indices of the most relevant chunks
    I_topic = [filtered_chunk_indices[i] for i in I_topic_local[0]]


    # Step 4: Prepare and return results (unchanged)
    final_results = df.iloc[I_topic].copy()
    final_results["score"] = D_topic[0]  # Use topic scores directly


    return final_results[["equipment_name", "topic", "chunk_text", "score"]]

In [None]:
query = "Explain how to install 1633EA electronics?"
results = semantic_rag(query, embedding_model=embedding_model)

for i, row in results.iterrows():
    print(f"\n🔧 Equipment: {row['equipment_name']}")
    print(f"📂 Topic: {row['topic']}")
    print(f"📝 Chunk:\n{row['chunk_text'][:500]}...")


🔧 Equipment: 1633EA-SBT-AMO
📂 Topic: Disassembly/Assembly Procedures - 2.4 INSTALLATION OF ELECTRONICS INTO
📝 Chunk:
INSTALLATION OF ELECTRONICS INTO PRESSURE HOUSING
All numbers in brackets refer to callouts on drawing R152302.
1.
After the WTS subs are assembled and the threads lubricated,
install the bottom WTS sub [16] first.  Secure the housing
with a strap wrench, and use a pin wrench to rotate the sub.
2.
Insert the electronics [19] (keyed snout first) into the
housing [1].  IMPORTANT:  Make sure the flat ribbon cable is
on the upper side of the electronics when sliding it into
the housing to avoid drag...

🔧 Equipment: 1633EA-SBT-AMO
📂 Topic: Disassembly/Assembly Procedures - 1.2 REFERENCE DRAWINGS . . . . . . . . . . . .   2 ASSEMBLY PROCEDURES
📝 Chunk:
REFERENCE DRAWINGS
1633 EA Electronics Assembly R151203-000
1633 EA Mechanical Assembly
R152302-000
Vacuum Flask Assembly D150316-000
1633 EA Power Supply Assembly
R149266-000
Pigtail and Ribbon Assembly D150317-000
Bottom Sub

In [18]:

model_cache_dir = "/content/drive/MyDrive/huggingface_models"
os.makedirs(model_cache_dir, exist_ok=True)

# Also tell Transformers and HF Hub to use it globally
os.environ["TRANSFORMERS_CACHE"] = model_cache_dir
os.environ["HF_HOME"] = model_cache_dir

# ✅ Load secrets from .streamlit/secrets.toml
secrets_path = "/content/drive/MyDrive/rag_app/.streamlit/secrets.toml"
secrets = toml.load(secrets_path)
hf_token = secrets["hf_token"]

# ✅ Set up Hugging Face login
login(token=hf_token)


model_id = "mistralai/Mistral-7B-Instruct-v0.1"

local_model_path = "/content/drive/MyDrive/huggingface_models/mistral-7b-instruct-v0.1"

#if not os.path.exists(local_model_path):
print("⬇️ Downloading model safely into Drive cache...")
snapshot_download(
    repo_id=model_id,
    local_dir=local_model_path,
    local_dir_use_symlinks=False,  # ✅ force real copies, not symlinks
    resume_download=True,
)


⬇️ Downloading model safely into Drive cache...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]



pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

'/content/drive/MyDrive/huggingface_models/mistral-7b-instruct-v0.1'

In [16]:


# ✅ Load secrets from .streamlit/secrets.toml
secrets_path = "/content/drive/MyDrive/Colab Notebooks/.streamlit/secrets.toml"
hf_token = secrets["hf_token"]

# ✅ Set up Hugging Face login
login(token=hf_token)



model_name = "mistralai/Mistral-7B-Instruct-v0.1"
local_model_path = "/content/drive/MyDrive/huggingface_models/mistral-7b-instruct-v0.1"


# Clear GPU memory
gc.collect()
torch.cuda.empty_cache()

# Load model in float16 for speed + lower memory
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)

# Streamer for faster and non-blocking output
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Create text generation pipeline
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)



`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


In [22]:


def answer_question(query, embedding_model=None, qa_pipeline=None):
    # 1. Retrieve relevant chunks
    retrieved_chunks = semantic_rag(query, embedding_model=embedding_model, qa_pipeline=qa_pipeline)
    retrieved_chunks = retrieved_chunks.head(3)

    # 2. Combine chunks into a single context
    context = "\n\n".join(
        [f"**{row['topic']}**: {row['chunk_text']}" for _, row in retrieved_chunks.iterrows()]
    )

    # 3. Format prompt with real context and query
    prompt = f"""
You are an AI assistant that answers questions strictly based on the provided context.
Do not use any outside knowledge. If the answer cannot be found in the context,
reply exactly with: "The information required to answer this question is not available in the provided context."

Follow these instructions carefully:
- Use clear, correct grammar and spelling.
- Be concise and factual.
- Do not mention the word "context" in your answer.

[EXAMPLE]
Context:
The cat is on the mat.

Question:
Where is the cat?

Answer:
The cat is on the mat.

Now, use the same pattern for the following input.

[CONTEXT]
{context}

[QUESTION]
{query}

[ANSWER]
"""

    # 4. Tokenize and move to model device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_len = inputs.input_ids.shape[1]

    # 5. Generate the answer
    output_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False
    )

    response = tokenizer.decode(output_ids[0][input_len:], skip_special_tokens=True)

    # 6. Optional: cleanup
    if qa_pipeline is not None:
        del qa_pipeline
    gc.collect()
    torch.cuda.empty_cache()

    return response.strip()




In [23]:


gc.collect()  # Run garbage collection
torch.cuda.empty_cache()

def format_paragraphs(text):
  """Adds paragraph breaks after sentence-ending punctuation."""
  return re.sub(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", "\n\n", text)

# Load model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Example usage
user_query = "Describe how to lodge income tax return?"
generated_answer = answer_question(user_query, embedding_model=embedding_model, qa_pipeline=qa_pipeline )


answer = format_paragraphs(generated_answer)

print(answer)


The information required to answer this question is not available in the provided context.


In [24]:

from rag_classes import SemanticRAG, AnswerQuestion
from sentence_transformers import SentenceTransformer


# ✅ Load secrets from .streamlit/secrets.toml
secrets_path = "/content/drive/MyDrive/rag_app/.streamlit/secrets.toml"
secrets = toml.load(secrets_path)
hf_token = secrets["hf_token"]

# ✅ Set up Hugging Face login
login(token=hf_token)


# Set model path to your local model in Drive
os.environ["MODEL_PATH"] = "/content/drive/MyDrive/huggingface_models/mistral-7b-instruct-v0.1"



# Define the full paths to the index and metadata files
index_path = "/content/drive/MyDrive/rag_app/metadata/semantic_index.faiss"
metadata_csv_path = "/content/drive/MyDrive/rag_app/metadata/embedding_metadata.csv"
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Files present?
print("Index exists:", os.path.exists(index_path))
print("CSV exists:", os.path.exists(metadata_csv_path))

# Peek at index dim & CSV head
if os.path.exists(index_path) and os.path.exists(metadata_csv_path):
    try:
        # Load FAISS index and metadata CSV
        idx = faiss.read_index(index_path)
        print("FAISS dim:", idx.d)

        df_meta = pd.read_csv(metadata_csv_path)
        display(df_meta.head(2))

        # Instantiate retriever with required arguments
        retriever = SemanticRAG(index_path, metadata_csv_path, embedding_model_name)

        # Instantiate QA with retriever
        qa = AnswerQuestion(retriever)

        # Run retrieval
        hits = retriever.retrieve(
            "Describe fluid identification in sampleview",
            top_filter_k=30,
            top_retrieve_k=5
        )
        display(hits)

    except Exception as e:
        print(f"An error occurred while loading files or running retrieval: {e}")
else:
    print("FAISS index or metadata CSV not found at the specified path.")


Index exists: True
CSV exists: True
FAISS dim: 384


Unnamed: 0,equipment_name,topic,chunk_text
0,1633EA-SBT-AMO,Brief Description of Service/Equipment -,1\n1633 EA ACOUSTIC ELECTRONICS -- BRIEF DESCR...
1,1633EA-SBT-AMO,Circuit Description - POWER SUPPLY CHASSIS R14...,1\n\n3\n1\nCJ3\nCONNECTOR\nBOARD\nCJ1\nBOARD\n...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,equipment_name,topic,chunk_text,score
505,1970IB-SVW-AMO,DESCRIPTION OF SERVICE - DESCRIPTION OF SERVICE,4\n\n\n1970 IB SAMPLEVIEW\n\n\nP/N 130949-935\...,0.95089
502,1970IB-SVW-AMO,DESCRIPTION OF SERVICE - DESCRIPTION OF SERVICE,1\n\n\n1970 IB SAMPLEVIEW\n\n\nP/N 130949-935\...,1.198896
19,1633EA-SBT-AMO,Operating Guide - 3.1 MAIN MENU SUMMARY,MAIN MENU SUMMARY\nThe following summarizes th...,1.244861
523,1970IB-SVW-AMO,LAB SAFETY - LAB SAFETY,3\n\n\n1970 IB SAMPLE VIEW\n\n\n\n____________...,1.272879
521,1970IB-SVW-AMO,LAB SAFETY - LAB SAFETY,2\n\n\n1970 IB SAMPLE VIEW\n\n\n\n____________...,1.317534


In [25]:


# Environment Variables
# --------------------------
index_path = os.environ.get("FAISS_INDEX_PATH", "semantic_index.faiss")
csv_path = os.environ.get("FAISS_META_CSV", "embedding_metadata.csv")
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_name = os.environ.get("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.1")
model_path = os.environ.get("MODEL_PATH", "")
max_new_tokens = int(os.environ.get("MAX_NEW_TOKENS", "1024"))
load_in_8bit = os.environ.get("LOAD_IN_8BIT", "false").lower() == "true"

# Optional: enable 8-bit loading (requires bitsandbytes)
os.environ["LOAD_IN_8BIT"] = "false"



In [27]:
from rag_classes import SemanticRAG, AnswerQuestion
import os
import torch # Import torch
#import get

os.environ["FAISS_INDEX_PATH"] = "/content/drive/MyDrive/rag_app/metadata/semantic_index.faiss"
os.environ["FAISS_META_CSV"] = "/content/drive/MyDrive/rag_app/metadata/embedding_metadata.csv"
os.environ["EMBEDDING_MODEL_NAME"] = "sentence-transformers/all-MiniLM-L6-v2"
os.environ["MODEL_PATH"] = "/content/drive/MyDrive/huggingface_models/mistral-7b-instruct-v0.1"
os.environ["MAX_NEW_TOKENS"] = "1024"
os.environ["LOAD_IN_8BIT"] = "false"


retriever = SemanticRAG(
    index_path = os.environ.get("FAISS_INDEX_PATH", "semantic_index.faiss"),
    metadata_csv_path = os.environ.get("FAISS_META_CSV", "embedding_metadata.csv"),
    embedding_model_name = os.environ.get("EMBEDDING_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
)
# qa = AnswerQuestion(model_path=os.environ.get("MODEL_PATH", ""), model_name=os.environ.get("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.1"))
qa = AnswerQuestion(retriever)

q = "Describe how sampleview distinguishes Water/Oil/Gas. Is the bulb UV or IR? 1633EA"
hits = retriever.retrieve(q, top_filter_k=30, top_retrieve_k=5)

context = qa.build_context(hits, top_n=3)
prompt = qa.build_prompt(context, q)
print("=== Prompt ===\n", prompt[:1000], "\n")

ans = qa.generate(prompt, temperature=0.0, max_new_tokens=512)
print("=== Answer ===\n", ans[:1500])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== Prompt ===
 You are an AI assistant that answers questions strictly based on the provided context.
Do not use any outside knowledge. If the answer cannot be found in the context,
reply exactly with: "The information required to answer this question is not available in the provided context."

Follow these instructions carefully:
- Use clear, correct grammar and spelling.
- Be concise and factual.
- Do not mention the word "context" in your answer.

[EXAMPLE]
Context:
The cat is on the mat.

Question:
Where is the cat?

Answer:
The cat is on the mat.

Now, use the same pattern for the following input.

[CONTEXT]
**Operating Guide - 4.1 DESCRIPTION OF DISPLAYED VARIABLES . . . .   6**: DESCRIPTION OF DISPLAYED VARIABLES
Service:
Micro CBL or SBT, based on LOG MODE variable.
CHV:
Measured Cablehead Voltage.  Valid only if
data is being updated.
FLTM:
Internal Flask Temperature (not implemented).
ETIME:
Elapsed time since the last acquisition.
Near DL:
Near Delay setting in microseconds

In [28]:

import threading, time, os
from google.colab import output

# Kill any previous processes on 8501 (optional safety)
!fuser -n tcp -k 8501 || true
!pkill -f "streamlit run app.py" || true

def run():
    cmd = (
        "STREAMLIT_SERVER_HEADLESS=true streamlit run app.py "
        "--server.port 8501 --server.address 0.0.0.0 "
        "--server.enableCORS false --server.enableXsrfProtection false"
    )
    os.system(cmd)

thread = threading.Thread(target=run, daemon=True)
thread.start()
time.sleep(5)
print("Open Streamlit:", output.eval_js("google.colab.kernel.proxyPort(8501)"))



^C
Open Streamlit: https://8501-gpu-a100-hm-1h2ka5mq8sxoj-c.asia-southeast1-1.prod.colab.dev


In [19]:
!git init


Reinitialized existing Git repository in /content/drive/MyDrive/rag_app/.git/


In [30]:
!git config --global user.email "samar.vasran@hotmail.com"
!git config --global user.name "vasrsam"

In [31]:
!git add .
!git commit -m "commit from Colab"


^C
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   .streamlit/secrets.toml[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mNotebooks/chunking_metadata_filtering.ipynb[m
	[31m__pycache__/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
import getpass
token = getpass.getpass("Enter your GitHub token: ")

# Replace USERNAME and REPO with your GitHub username and repo name
!git remote add origin https://vasrsam:{token}@github.com/vasrsam/rag-demo.git


Enter your GitHub token: ··········


In [None]:
!git branch -M main
!git push -u origin main


Enumerating objects: 13, done.
Counting objects:   7% (1/13)Counting objects:  15% (2/13)Counting objects:  23% (3/13)Counting objects:  30% (4/13)Counting objects:  38% (5/13)Counting objects:  46% (6/13)Counting objects:  53% (7/13)Counting objects:  61% (8/13)Counting objects:  69% (9/13)Counting objects:  76% (10/13)Counting objects:  84% (11/13)Counting objects:  92% (12/13)Counting objects: 100% (13/13)Counting objects: 100% (13/13), done.
Delta compression using up to 12 threads
Compressing objects: 100% (10/10), done.
Writing objects: 100% (13/13), 3.61 MiB | 6.67 MiB/s, done.
Total 13 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/vasrsam/rag-demo.git
 * [new branch]      main -> main
Branch 'main' set up to track remote branch 'main' from 'origin'.
