In [1]:
import os
os.chdir(r"C:\Assignments_Predii\vehicle_specs_extraction")
print(os.getcwd())

print("PDF exists:", os.path.exists("data/service_manual.pdf"))


C:\Assignments_Predii\vehicle_specs_extraction
PDF exists: True


In [2]:
import os, sys

project_root = r"C:\Assignments_Predii\vehicle_specs_extraction"

# Move working directory to project root
os.chdir(project_root)
print("Working directory:", os.getcwd())

# Ensure src/ is added correctly
sys.path.insert(0, os.path.join(project_root, "src"))


Working directory: C:\Assignments_Predii\vehicle_specs_extraction


Imports & Environment Setup

In [3]:
# Notebook Test Environment
import os
from dotenv import load_dotenv
load_dotenv()

# Show environment variables loaded
print("PDF PATH:", os.getenv("PDF_PATH"))
print("VECTOR DB:", os.getenv("VECTOR_DB_PATH"))
print("LLM MODEL:", os.getenv("LLM_MODEL"))
print("EMBED MODEL:", os.getenv("EMBEDDING_MODEL"))


PDF PATH: data/service_manual.pdf
VECTOR DB: data/vector_db
LLM MODEL: None
EMBED MODEL: all-MiniLM-L6-v2


In [4]:
from pdf_loader import load_pdf


pages = load_pdf()

print(f"Total pages extracted: {len(pages)}")
print("Sample page text:\n")
print(pages[0]["text"][:500])


Total pages extracted: 852
Sample page text:

Suspension System 
Inspection and Verification 
1.
Road test. 
z Verify the customer concern by carrying out a road test on a smooth road. If any vibrations are 
apparent, refer to Section 100-04 . 
2.
Inspect tires. 
z Check the tire pressure with all normal loads in the vehicle and the tires cold. Refer to the 
Vehicle Certification (VC) label. 
z Verify that all tires are sized to specification. Refer to the VC label. 
z Inspect the tires for incorrect wear and damage. Install new tires as ne


Test Preprocessing

In [5]:
from src.preprocess import preprocess_pages

clean_pages = preprocess_pages(pages)

print("Cleaned page sample:\n")
print(clean_pages[0]["text"][:500])


Cleaned page sample:

Suspension System Inspection and Verification 1. Road test. z Verify the customer concern by carrying out a road test on a smooth road. If any vibrations are apparent, refer to Section 100-04 . 2. Inspect tires. z Check the tire pressure with all normal loads in the vehicle and the tires cold. Refer to the Vehicle Certification (VC) label. z Verify that all tires are sized to specification. Refer to the VC label. z Inspect the tires for incorrect wear and damage. Install new tires as necessary. 


Test Chunking

In [6]:
from src.chunker import create_chunks

chunks = create_chunks(clean_pages)

print("Total chunks:", len(chunks))
print("\nSample chunk:\n")
print(chunks[0]["text"])


Total chunks: 990

Sample chunk:

Suspension System Inspection and Verification 1. Road test. z Verify the customer concern by carrying out a road test on a smooth road. If any vibrations are apparent, refer to Section 100-04 . 2. Inspect tires. z Check the tire pressure with all normal loads in the vehicle and the tires cold. Refer to the Vehicle Certification (VC) label. z Verify that all tires are sized to specification. Refer to the VC label. z Inspect the tires for incorrect wear and damage. Install new tires as necessary. 3. Inspect chassis and underbody. 4. Inspect for aftermarket equipment. z Check for aftermarket changes to the steering, suspension, wheel and tire components (such as competition, heavy duty, etc.). The specifications shown in this manual do not apply to vehicles equipped with aftermarket equipment. Visual Inspection Chart 5. If an obvious cause for an observed or reported condition is found, correct the cause (if possible) before proceeding to the next step. 6

In [7]:
import numpy as np
print("NumPy Version Loaded:", np.__version__)
print(np.__file__)


NumPy Version Loaded: 1.26.4
c:\Assignments_Predii\vehicle_specs_extraction\venv3\Lib\site-packages\numpy\__init__.py


Test VectorDB Building (FAISS)

In [8]:
from src.embed_store import VectorDB

# Create new Vector DB instance
vdb = VectorDB()

# Add chunks (embedding + storing)
vdb.add_documents(chunks)

print("Vector DB creation complete.")
print("Total records stored:", len(vdb.metadata))


  from tqdm.autonotebook import tqdm, trange


Vector DB creation complete.
Total records stored: 6165


Test Retrieval

In [9]:
query = "Torque for brake caliper bolts"

results = vdb.search(query, top_k=3)

print("Retrieved Chunks:\n")
for r in results:
    print(f"[Page {r['page_number']}] → {r['text'][:300]}")
    print("-" * 50)


Retrieved Chunks:

[Page 734] → Torque Specifications a Refer to the procedure in this section. SECTION 206-07: Power Brake Actuation 2014 F-150 Workshop Manual SPECIFICATIONS Procedure revision date: 10/25/2013 Description Nm lb-ft lb-in Brake booster nuts a — — — Brake master cylinder nuts 25 18 — Brake vacuum pump bolts a — — —
--------------------------------------------------
[Page 734] → Torque Specifications a Refer to the procedure in this section. SECTION 206-07: Power Brake Actuation 2014 F-150 Workshop Manual SPECIFICATIONS Procedure revision date: 10/25/2013 Description Nm lb-ft lb-in Brake booster nuts a — — — Brake master cylinder nuts 25 18 — Brake vacuum pump bolts a — — —
--------------------------------------------------
[Page 734] → Torque Specifications a Refer to the procedure in this section. SECTION 206-07: Power Brake Actuation 2014 F-150 Workshop Manual SPECIFICATIONS Procedure revision date: 10/25/2013 Description Nm lb-ft lb-in Brake booster nuts a — — — Brak

Test Mistral Model Import

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer OK")

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,   # CPU-friendly
    device_map="auto"            # works automatically
)

print("Model Loaded Successfully!")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer OK


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Model Loaded Successfully!


Test LLM Extraction

In [11]:
from src.llm_extractor import extract_specs

raw_output = extract_specs(query, results)
print(raw_output)


Loading Phi-3 Mini model... (first call)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are not running the flash-attention implementation, expect numerical differences.


You read vehicle service manuals and extract only the requested technical specifications. Always answer in valid JSON.


User query:
Torque for brake caliper bolts

Return ONLY a JSON list with objects like:
[
  {
    "component": "Brake master cylinder",
    "spec_type": "Torque",
    "value": "25",
    "unit": "Nm",
    "page_number": 734
  }
]

Use only information from the context.
If not found, return [].

Context (manual excerpts):
 [Page 734] Torque Specifications a Refer to the procedure in this section. SECTION 206-07: Power Brake Actuation 2014 F-150 Workshop Manual SPECIFICATIONS Procedure revision date: 10/25/2013 Description Nm lb-ft lb-in Brake booster nuts a — — — Brake master cylinder nuts 25 18 — Brake vacuum pump bolts a — — — Coolant expansion tank/lower Air Cleaner (ACL) housing assembly bolts 15 — 133 Page 1 sur 1 2014 F-150 Workshop Manual 2014-03-01 file:///C:/TSO/tsocache/VDTOM2_10764/SE2~us~en~file=SE267001.HTM~gen~ref.HT...
[Page 734] Torque Specifications a R

Validate JSON

In [12]:
from src.postprocess import validate_json

json_data = validate_json(raw_output)

print("Parsed JSON:")
json_data


Parsed JSON:


Save Output

In [13]:
from src.postprocess import save_json

save_json(json_data, "outputs/test_output.json")

print("Saved to outputs/test_output.json")


Saved to outputs/test_output.json
