# all-MiniLM-L12-v2 (model)

In [105]:
# See https://sbert.net/ for installation details of SentenceTransformer
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import re
import torch

## Loading data

In [106]:
# Load the item data: either the data set you generated with Faker or whatever data you built the search on.
# The cells below read the text to embed from its `item_desc` column.
item_description = pd.read_csv("inventory_item_desc_filtered_data.csv")

In [107]:
# Peek at the first ten raw descriptions before any clean-up.
item_description["item_desc"].head(10)

0                  ELTGASRANG 
1                   OTL FS RNG
2      ELECTRIC D ROP IN RANGE
3          SAMS 28CF  FDBM GNG
4                RANGE S I GAS
5    28 CU  FT  SAM  3 DOOR FR
6                    69472 KME
7                36 IN GAS  CT
8                   36 GAS CT 
9     WHITE  RAN KME GE  SLIDE
Name: item_desc, dtype: object

In [108]:
# Column names, dtypes and non-null counts — a quick check for missing descriptions.
item_description.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455 entries, 0 to 454
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   inventory_id     455 non-null    int64 
 1   store_unit       455 non-null    int64 
 2   item_number      455 non-null    int64 
 3   active_flag      455 non-null    int64 
 4   id               455 non-null    int64 
 5   item_number_str  455 non-null    int64 
 6   item_desc        455 non-null    object
dtypes: int64(6), object(1)
memory usage: 25.0+ KB


## Cleaning the data to remove special chars

In [109]:
# Utility to clean up special characters before the text is embedded
def remove_special_characters(text):
    """Replace every character that is not an ASCII letter, digit or whitespace with a space.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float ``NaN`` (the missing-value marker pandas uses)
            yield an empty string rather than the literal text ``"None"`` /
            ``"nan"``, which would otherwise be embedded as if it were a real
            description.

    Returns:
        str: The cleaned text. Each removed character becomes exactly one
        space, so runs of spaces are not collapsed and existing whitespace is
        kept as-is.
    """
    # NaN is the only float value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', str(text))

In [110]:
# Run the clean-up helper over every description and store the result back in the same column.
item_description["item_desc"] = item_description["item_desc"].map(remove_special_characters)

### Data after clean-up

In [111]:
# Same ten rows as before, now after the special-character clean-up.
item_description["item_desc"].head(10)

0                  ELTGASRANG 
1                   OTL FS RNG
2      ELECTRIC D ROP IN RANGE
3          SAMS 28CF  FDBM GNG
4                RANGE S I GAS
5    28 CU  FT  SAM  3 DOOR FR
6                    69472 KME
7                36 IN GAS  CT
8                   36 GAS CT 
9     WHITE  RAN KME GE  SLIDE
Name: item_desc, dtype: object

## Applying the all-MiniLM-L12-v2 model to the item descriptions

### Converting the item description DataFrame column to a list

In [112]:
# Plain Python list of descriptions, in DataFrame row order, ready to hand to the encoder.
item_desc_array = item_description["item_desc"].tolist()

In [113]:
# Should equal the number of rows in the DataFrame.
len(item_desc_array)

455

In [114]:
# Use the GPU when CUDA is available and fall back to the CPU otherwise,
# so this cell runs unmodified on machines without a GPU.
model_minlm_l12 = SentenceTransformer(
    "all-MiniLM-L12-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [115]:
# Encode every description in a single call; the result has one embedding row per description.
item_desc_emb_minilm_l12 = model_minlm_l12.encode(item_desc_array)

Note: as mentioned in the article https://medium.com/@venku.buragadda/typesense-federated-multi-geo-and-sematic-ml-search-0eb37fc7d7bc, the embedding dimension is `384`.

In [116]:
# Expect (number of descriptions, embedding dimension).
item_desc_emb_minilm_l12.shape

(455, 384)

In [117]:
# Check which container type encode() returned.
type(item_desc_emb_minilm_l12)

numpy.ndarray

## Let's map each row to its embedding.

In [118]:
# Convert the embedding array into nested Python lists (one inner list per row) so each can be stored in a DataFrame cell.
all_embds = item_desc_emb_minilm_l12.tolist() 


In [119]:
# Attach the embeddings as a new column; they were produced in DataFrame row order, so row i gets embedding i.
item_description["embedding"] = all_embds

In [122]:
# Show one row to confirm the new `embedding` column is populated.
item_description.head(1)

Unnamed: 0,inventory_id,store_unit,item_number,active_flag,id,item_number_str,item_desc,embedding
0,37307867,13611,84509,0,37307867,84509,ELTGASRANG,"[-0.047509580850601196, -0.03400413319468498, ..."


**The embeddings below are ready to be loaded into Typesense.**

In [121]:
# Export for Typesense: cast the two identifier columns to strings, then write
# one JSON record per line (JSONL).
# NOTE(review): presumably the Typesense schema expects string ids — confirm.
for id_column in ("item_number_str", "id"):
    item_description[id_column] = item_description[id_column].astype(str)

item_description.to_json('inventory_ml_own_embd_to_load.jsonl', orient='records', lines=True)

# How do you search the data using the embeddings created above?

## Searching the data using above embeddings

In [123]:
# The query is wrapped in a list, so encoding it yields one embedding row per query.
search_term = ["hot food"]

We use the same model object to encode the search term, so the query embedding is produced by the same model as the item embeddings and the two can be compared directly.

In [44]:
# Embed the query with the same model that embedded the item descriptions.
search_term_emb = model_minlm_l12.encode(search_term)

In [90]:
# Expect (number of queries, embedding dimension).
search_term_emb.shape

(1, 384)

In [94]:
# Take the embedding of the single query and convert it to a plain Python list;
# going by the variable name, this is the vector to send with a Typesense vector search.
searc_term_embd_to_use_with_typesense_search = search_term_emb[0].tolist()

Compute the cosine similarity between the search term and every item in the corpus (a higher score means a closer match).

In [48]:
# Cosine similarity of the query against every item embedding.
# Despite the `_dis` suffix this holds similarities (higher = closer), not distances.
search_term_dis = util.cos_sim(search_term_emb, item_desc_emb_minilm_l12)

In [49]:
# Expect one row per query and one column per item.
search_term_dis.shape

torch.Size([1, 455])

Now find similar items

In [51]:
# Move the similarity tensor to the CPU, convert it to NumPy and flatten it to 1-D.
# Note: despite its name, `search_term_emb_flat` holds one similarity score per item, not embeddings.
search_term_emb_flat = search_term_dis.cpu().numpy().flatten()

In [89]:
# list(search_term_emb_flat.tolist())

In [64]:
# Minimum cosine similarity for an item to count as a match.
# NOTE(review): 0.296 looks hand-tuned for the "hot food" query — confirm before reusing it for other queries.
SIMILARITY_THRESHOLD = 0.296

# Pair each description with its score and keep those at or above the threshold.
# Results stay in corpus order; they are not sorted by score.
similar_items = [
    description
    for description, score in zip(item_desc_array, search_term_emb_flat)
    if score >= SIMILARITY_THRESHOLD
]

In [65]:
# Descriptions whose similarity to the query met the threshold, in corpus order.
similar_items

['COOKTOP 36  GAS',
 'COOKTOP 36  GAS',
 'KAD DISH ',
 'KME DISH B KME I BLACKS',
 'KME DISH B KME I BLACKS',
 'KME DISH B KME I BLACKS']

### You can do the same thing using a convenience method

In [55]:
# Score the query against every item embedding via the model's own similarity helper;
# [0] selects the row for the single query, leaving one score per item.
similarity_scores = model_minlm_l12.similarity(search_term_emb, item_desc_emb_minilm_l12)[0]
similarity_scores.shape

torch.Size([455])

In [56]:
# Number of best matches to keep.
top_k = 5
# torch.topk raises if k exceeds the number of scores, so clamp k to the corpus size
# (this matters when the notebook is run against a very small data set).
scores, indices = torch.topk(similarity_scores, k=min(top_k, similarity_scores.shape[0]))
scores, indices 

(tensor([0.3625, 0.3266, 0.3266, 0.2962, 0.2962]),
 tensor([117,  50,  51, 211, 119]))

In [57]:
# Row positions of the top matches as plain Python ints.
indices.tolist()

[117, 50, 51, 211, 119]

In [58]:
# Translate the top-k row positions back into their item descriptions (best match first).
[item_desc_array[row] for row in indices.tolist()]

['KAD DISH ',
 'COOKTOP 36  GAS',
 'COOKTOP 36  GAS',
 'KME DISH B KME I BLACKS',
 'KME DISH B KME I BLACKS']