In [1]:
# Install or upgrade sentence-transformers in the kernel's environment.
# NOTE(review): `-U` with no version pin pulls whatever is latest at run time,
# so re-runs may not be reproducible — consider pinning a version.
%pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the saved embedding matrix that the later cells search and index.
embeddings_path = "../data/processed/hs_tree_embeddings.npy"
embeddings = np.load(embeddings_path)


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def get_top_k_matches(query, model, stored_embeddings, k=3):
    """Return the stored embeddings most similar to ``query``.

    The query is encoded with ``model.encode`` and scored against
    ``stored_embeddings`` with cosine similarity; the best-scoring rows are
    returned, best match first.

    Args:
        query: Text to search for.
        model: Object exposing ``encode(list_of_texts)``, e.g. a
            ``SentenceTransformer``.
        stored_embeddings: Embedding matrix passed as the second argument to
            ``cosine_similarity`` (one candidate per row).
        k: Maximum number of matches to return. ``None`` returns every row;
            a value larger than the number of rows returns all of them.

    Returns:
        A ``(top_indices, top_scores)`` tuple: row positions into
        ``stored_embeddings`` ordered from most to least similar, and the
        cosine similarity of each of those rows.

    Raises:
        ValueError: If ``k`` is negative.
    """
    # A negative k would otherwise flow into the `[:k]` slice below and
    # silently return "all but the last |k|" rows instead of the top k.
    if k is not None and k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    query_embedding = model.encode([query])
    # cosine_similarity returns a (1, n_rows) matrix for a single query;
    # keep only that one row of scores.
    similarities = cosine_similarity(query_embedding, stored_embeddings)[0]
    # argsort is ascending, so reverse it to put the highest score first.
    ranked_indices = similarities.argsort()[::-1]
    top_indices = ranked_indices[:k]
    return top_indices, similarities[top_indices]

In [3]:
import pandas as pd

# Load the cleaned HS tree table and the sentence-embedding model used to
# encode queries.
# NOTE(review): this presumably is the same model that produced `embeddings`,
# and `hs_tree` rows are assumed to line up positionally with the rows of
# `embeddings` — confirm both.
hs_tree = pd.read_csv("../data/clean/clean_HS_Tree.csv")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Try the retrieval helper on a single example query.
query = "Cereals"
indices, scores = get_top_k_matches(query, model, embeddings, k=3)

# Report each match by looking its row up by position in the HS tree table.
for i, score in zip(indices, scores):
    row = hs_tree.iloc[i]
    print(f"HS6: {row['HS6']}, Desc: {row['description']}, Score: {score:.3f}")


HS6: 110320.0, Desc: cereal pellets, Score: 0.779
HS6: 100310.0, Desc: cereals barley seed, Score: 0.750
HS6: nan, Desc: cereals barley seed, Score: 0.750


In [16]:
# Write a lookup file containing only the description and HS6 columns,
# without the DataFrame index.
# NOTE(review): HS6 is printed as a float in the query output above
# (e.g. 110320.0, nan), so the CSV will carry the same formatting —
# confirm that is what consumers of this file expect.
hs_tree.to_csv(
    '../data/hs_tree_lookup.csv',
    columns=['description', 'HS6'],
    index=False,
)

In [19]:
# Install or upgrade the CPU build of FAISS in the kernel's environment.
# NOTE(review): `-U` with no version pin pulls whatever is latest at run time,
# so re-runs may not be reproducible — consider pinning a version.
%pip install -U faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-win_amd64.whl (15.0 MB)
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/15.0 MB 1.8 MB/s eta 0:00:09
    --------------------------------------- 0.3/15.0 MB 3.5 MB/s eta 0:00:05
   - -------------------------------------- 0.7/15.0 MB 5.2 MB/s eta 0:00:03
   --- ------------------------------------ 1.1/15.0 MB 6.0 MB/s eta 0:00:03
   --- ------------------------------------ 1.4/15.0 MB 5.8 MB/s eta 0:00:03
   ---- ----------------------------------- 1.7/15.0 MB 6.0 MB/s eta 0:00:03
   ----- ---------------------------------- 2.1/15.0 MB 6.3 MB/s eta 0:00:03
   ------ --------------------------------- 2.4/15.0 MB 6.4 MB/s eta 0:00:02
   ------- -------------------------------- 2.8/15.0 MB 6.7 MB/s eta 0:00:02
   ---

In [22]:
import faiss 

# Build a flat L2-distance FAISS index over the stored embeddings and save it.
# NOTE(review): this index ranks by L2 distance, while get_top_k_matches ranks
# by cosine similarity; the two orderings agree only when the embeddings are
# unit-normalized — confirm that holds for the stored vectors.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
faiss.write_index(index, "../models/faiss_index.index")
