In [2]:
import json
import os
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
import time

# Reached only if every import above succeeded, so this line confirms the
# environment has all required libraries installed.
print("Libraries imported successfully.")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully.


In [3]:
# Notebook configuration: input/output directories, the source JSON files and
# the embedding model name.
#
# The directories default to the original author's local paths, but each can
# be overridden per machine with an environment variable so the notebook runs
# elsewhere without editing this cell. `or` (rather than a .get() default) is
# used so that an empty environment variable also falls back to the default.
DATA_DIR = (
    os.environ.get('LAPTOP_AGENT_DATA_DIR')
    or '/Users/dilshantharushika/Desktop/laptop agent/data'
)
BACKEND_DIR = (
    os.environ.get('LAPTOP_AGENT_BACKEND_DIR')
    or '/Users/dilshantharushika/Desktop/laptop agent/backend'
)

# JSON files (relative to DATA_DIR) that are read and indexed.
JSON_FILES = [
    'ThinkPad_E14_Gen_5_Intel.json',
    'ThinkPad_E14_Gen_5_AMD.json',
    'HP_ProBook_450_G10.json',
    'HP_ProBook_440_G11.json'
]

# Model name handed to SentenceTransformer when creating embeddings.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Create the output directory up front so later cells can write into it.
os.makedirs(BACKEND_DIR, exist_ok=True)

print(f"Data will be read from: {os.path.abspath(DATA_DIR)}")
print(f"Artifacts will be saved to: {os.path.abspath(BACKEND_DIR)}")

Data will be read from: /Users/dilshantharushika/Desktop/laptop agent/data
Artifacts will be saved to: /Users/dilshantharushika/Desktop/laptop agent/backend


In [4]:
# Load the embedding model once and report how long the load took.
print(f"Loading embedding model: {MODEL_NAME}...")

start_time = time.time()
model = SentenceTransformer(MODEL_NAME)
end_time = time.time()

load_seconds = end_time - start_time
print(f"Model loaded in {load_seconds:.2f} seconds.")

# Printing the model object displays its internal module pipeline.
print(model)

Loading embedding model: all-MiniLM-L6-v2...
Model loaded in 7.12 seconds.
SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [5]:
def _section_chunks(section):
    """Build the metadata records for one section and its subfeatures.

    The section's own 'content' (if any) yields the first record, followed by
    one record per subfeature that has non-empty 'content'. Items without
    content are skipped, so a subfeature lacking the key no longer raises
    KeyError and empty strings are never indexed.

    Args:
        section: dict for one section of a spec file. Keys read here:
            'source_model', 'section_title', 'content', 'source_citations'
            and 'subfeatures' (a list of dicts with 'content' and
            'source_citations').

    Returns:
        A list of dicts with keys 'sku', 'text', 'section_title' and
        'citations', in the order section first, then subfeatures.
    """
    sku = section.get('source_model', 'unknown')
    section_title = section.get('section_title', '')

    # Treat the section itself and each subfeature uniformly: both carry
    # 'content' and 'source_citations'. `or []` covers a missing/None list.
    candidates = [section, *(section.get('subfeatures') or [])]

    records = []
    for item in candidates:
        text = item.get('content')
        if not text:
            continue
        records.append({
            "sku": sku,
            "text": text,
            "section_title": section_title,
            "citations": item.get('source_citations', [])
        })
    return records


# Re-created on every run so that re-executing the cell does not duplicate chunks.
all_chunks_text = []
metadata_store = []

print(f"Reading JSON files from {DATA_DIR}...")

for file_name in JSON_FILES:
    file_path = os.path.join(DATA_DIR, file_name)
    print(f"Processing {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for section in data:
        for record in _section_chunks(section):
            # The two lists are appended in lockstep: position i in one
            # always corresponds to position i in the other.
            all_chunks_text.append(record["text"])
            metadata_store.append(record)

print("\n Processing Complete")
print(f"Total text chunks found: {len(all_chunks_text)}")
print(f"Total metadata entries: {len(metadata_store)}")


print("\n Verification (First 3 Items)")
# Cap the preview at the number of chunks actually collected so a small or
# empty dataset does not raise IndexError.
for i in range(min(3, len(all_chunks_text))):
    print(f"\nChunk {i+1}: {all_chunks_text[i][:100]}...")
    print(f"Meta {i+1}: {metadata_store[i]}")

Reading JSON files from /Users/dilshantharushika/Desktop/laptop agent/data...
Processing /Users/dilshantharushika/Desktop/laptop agent/data/ThinkPad_E14_Gen_5_Intel.json...
Processing /Users/dilshantharushika/Desktop/laptop agent/data/ThinkPad_E14_Gen_5_AMD.json...
Processing /Users/dilshantharushika/Desktop/laptop agent/data/HP_ProBook_450_G10.json...
Processing /Users/dilshantharushika/Desktop/laptop agent/data/HP_ProBook_440_G11.json...

 Processing Complete
Total text chunks found: 291
Total metadata entries: 291

 Verification (First 3 Items)

Chunk 1: A detailed listing of the ports on the ThinkPad E14 Gen 5 (Intel) chassis and their corresponding nu...
Meta 1: {'sku': 'ThinkPad E14 Gen 5 (Intel)', 'text': "A detailed listing of the ports on the ThinkPad E14 Gen 5 (Intel) chassis and their corresponding numbers in the image:\n1. USB 2.0 \n2. Ethernet (RJ-45) \n3. Kensington Nano Security Slot \n4. USB-C 3.2 Gen 2 \n5. Thunderbolt 4 \n6. USB 3.2 Gen 1 (Always On) \n7. HDMI \n8. He

In [6]:
# Encode every collected chunk in one call and time the whole step.
print("Creating embeddings for all text chunks")
start_time = time.time()

# Materialise the encoder output directly as a float32 NumPy array.
embeddings = np.array(
    model.encode(all_chunks_text, show_progress_bar=True),
    dtype='float32',
)

end_time = time.time()
elapsed_seconds = end_time - start_time

print("\n  Embedding Complete")
print(f"Embeddings created in {elapsed_seconds:.2f} seconds.")
print(f"Shape of embeddings array: {embeddings.shape}")

Creating embeddings for all text chunks


Batches: 100%|██████████| 10/10 [00:01<00:00,  5.09it/s]


  Embedding Complete
Embeddings created in 2.01 seconds.
Shape of embeddings array: (291, 384)





In [7]:
# Build a flat L2 FAISS index over the embeddings and write it to disk.
print("Building FAISS index...")

# Vector dimensionality is the column count of the embeddings array.
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
print(f"Index type: {type(index)}")

# Vectors are added in array order, so row i of `embeddings` becomes
# vector i of the index.
index.add(embeddings)
print(" FAISS Index Built")
print(f"Total vectors in index: {index.ntotal}")

index_path = os.path.join(BACKEND_DIR, 'laptops.index')
print(f"Saving FAISS index to {index_path}...")
faiss.write_index(index, index_path)
print("Index saved.")

Building FAISS index...
Index type: <class 'faiss.swigfaiss.IndexFlatL2'>
 FAISS Index Built
Total vectors in index: 291
Saving FAISS index to /Users/dilshantharushika/Desktop/laptop agent/backend/laptops.index...
Index saved.


In [8]:
# Write the per-chunk metadata list as pretty-printed JSON into BACKEND_DIR.
metadata_path = os.path.join(BACKEND_DIR, 'laptops_metadata.json')
print(f"Saving metadata to {metadata_path}...")

with open(metadata_path, mode='w', encoding='utf-8') as metadata_file:
    json.dump(metadata_store, metadata_file, indent=2)

print("Metadata saved.")
print("\n All Artifacts Saved Successfully!")

Saving metadata to /Users/dilshantharushika/Desktop/laptop agent/backend/laptops_metadata.json...
Metadata saved.

 All Artifacts Saved Successfully!


In [9]:

# Smoke test: reload the model, index and metadata from disk (instead of
# reusing the in-memory objects) and run one query against them.
print("--- Running a quick test search ---")


print("Loading model...")
test_model = SentenceTransformer(MODEL_NAME)
print("Loading FAISS index (IndexFlatL2)...")
test_index = faiss.read_index(os.path.join(BACKEND_DIR, 'laptops.index'))
print("Loading metadata...")
# Read with the same explicit encoding the file was written with, rather
# than relying on the platform default.
with open(os.path.join(BACKEND_DIR, 'laptops_metadata.json'), 'r', encoding='utf-8') as f:
    test_metadata = json.load(f)

# Search results are mapped to metadata purely by position, so the two
# artifacts must describe the same number of chunks.
assert test_index.ntotal == len(test_metadata), (
    f"Index has {test_index.ntotal} vectors but metadata has "
    f"{len(test_metadata)} entries; rebuild both artifacts together."
)


query_text = "Does the HP Probook have a 5MP camera?"
k = 3


# The query is encoded as a one-element batch and cast to float32, matching
# the dtype of the vectors stored in the index.
query_vector = test_model.encode([query_text]).astype('float32')


print(f"\nSearching index with {type(test_index)}...")
# D holds the distances and I the positions of the k nearest vectors,
# one row per query.
D, I = test_index.search(query_vector, k)


print(f"\nQuery: '{query_text}'")
print(f"Top {k} results (Indices): {I[0]}")

print("\n--- Retrieved Metadata ---")
for i in I[0]:
    # FAISS fills missing results with -1 when the index holds fewer than k
    # vectors; without this guard -1 would silently select the LAST
    # metadata entry.
    if i < 0:
        continue
    meta = test_metadata[i]
    print(f"Result (Index {i}):")
    print(f"  SKU: {meta['sku']}")
    print(f"  Section: {meta['section_title']}")
    print(f"  Text: {meta['text']}")
    print(f"  Citations: {meta['citations']}\n")

--- Running a quick test search ---
Loading model...
Loading FAISS index (IndexFlatL2)...
Loading metadata...

Searching index with <class 'faiss.swigfaiss.IndexFlatL2'>...

Query: 'Does the HP Probook have a 5MP camera?'
Top 3 results (Indices): [203 268 270]

--- Retrieved Metadata ---
Result (Index 203):
  SKU: HP ProBook 450 G10 — Datasheet
  Section: Camera
  Text: Available camera options.
  Citations: []

Result (Index 268):
  SKU: HP ProBook 440 14 inch G11 Notebook PC
  Section: Camera
  Text: Available camera options are FHD or 5 MP IR cameras (select models).
  Citations: [241]

Result (Index 270):
  SKU: HP ProBook 440 14 inch G11 Notebook PC
  Section: Camera
  Text: 5 MP IR camera (select models)
  Citations: [241, 217]

