In [2]:
import pandas as pd

In [3]:
# Toy corpus: every row is [example utterance, category label].
data = [
    [utterance, label]
    for utterance, label in (
        ("What is the weather like today?", "general"),
        ("Can you provide the latest stock market updates?", "finance"),
        ("Recommend a good Italian restaurant nearby", "food"),
        ("How do I reset my password?", "tech support"),
        ("Tell me a joke", "entertainment"),
        ("What are the symptoms of a flu?", "health"),
        ("Book a flight to New York", "travel"),
        ("How to make a chocolate cake?", "cooking"),
        ("Whats the score in the football game?", "sports"),
        ("Im feeling happy today", "personal emotion"),
    )
]

In [4]:
# Tabulate the corpus with named columns; `df` is a second name bound to the very same object.
frame = pd.DataFrame(data, columns=["text", "category"])
df = frame

In [5]:
# Display the whole corpus table (it is only 10 rows).
frame

Unnamed: 0,text,category
0,What is the weather like today?,general
1,Can you provide the latest stock market updates?,finance
2,Recommend a good Italian restaurant nearby,food
3,How do I reset my password?,tech support
4,Tell me a joke,entertainment
5,What are the symptoms of a flu?,health
6,Book a flight to New York,travel
7,How to make a chocolate cake?,cooking
8,Whats the score in the football game?,sports
9,Im feeling happy today,personal emotion


In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
# Keep only the sentence column; the category labels are not embedded.

text = frame['text']


In [8]:
# Load a pre-trained sentence-transformers model that turns sentences into
# semantically meaningful embedding vectors.
# NOTE(review): this checkpoint is reportedly deprecated upstream — confirm on its model card.

encoder = SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens")

In [9]:
# Inspect the loaded pipeline (a Transformer module followed by a Pooling module).
encoder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [10]:
# Embed every sentence; the result has one row per sentence.
# encode() is documented for a string or a list of strings, so hand it a plain
# list instead of the pandas Series: integer lookups on a Series are label-based,
# which only coincides with positions while the index is the default 0..n-1.
embeddings = encoder.encode(text.tolist())

In [11]:
# Peek at the raw embedding matrix (NumPy abbreviates large arrays with "...").
print(embeddings)

[[-0.07014389 -1.2804506   2.4701333  ... -0.73540807 -0.07419275
  -0.3445145 ]
 [-0.01914381 -0.80404085  1.6078941  ... -0.45232838 -0.11034355
  -0.4712346 ]
 [-0.6479589  -0.5422218   1.1934566  ... -0.22400853 -0.36273238
   0.09537402]
 ...
 [-0.5161804   0.90830016  0.3541043  ... -0.31298518 -0.3336213
   1.0297551 ]
 [-0.25683105 -0.33773777  0.8183755  ... -0.08970172 -0.32920805
   0.6984345 ]
 [ 0.08747523 -0.6351942   1.6031832  ...  0.48280492  0.28970277
  -0.35146865]]


In [12]:
# Sanity check: one row per sentence, one column per embedding dimension.
embeddings.shape

(10, 768)

In [13]:
import faiss

In [15]:
# Width of one embedding vector (second axis of the matrix); the FAISS index is sized with it.
dimension = embeddings.shape[1]
print(dimension)

768


In [16]:


# Build the similarity-search index.
# Scale every embedding row to unit length first; the call rewrites `embeddings` in place.
faiss.normalize_L2(embeddings)
# Flat index: L2 / Euclidean distance with a brute-force comparison against the user query.
index = faiss.IndexFlatL2(dimension)
# Store the normalised embeddings in the vector index.
index.add(embeddings)






In [17]:
# Show the index object's repr (a SWIG proxy around the native IndexFlatL2).
print(index)

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x146143fc0> >


In [18]:
import numpy as np

In [19]:
# Free-text query to be matched against the indexed sentences.
user_query = "hi i am good at soccer and My Fav club is manchester united"

In [20]:
# Embed the query with the same encoder used for the corpus; a single string yields a 1-D vector.
user_query_vec = encoder.encode(user_query)

In [21]:
# Dump the raw query embedding for inspection.
print(user_query_vec)

[-0.5246818   0.5870422   0.7650312  -0.56523335 -0.50847435 -1.5637009
 -0.14866737 -0.06526566  0.49705654 -0.03896036  1.0632689  -0.6900523
  0.5468394   0.40253115  0.6428171  -0.14203355  0.00259318 -0.07197702
  0.59811985 -0.19257379 -0.477442   -0.01418298 -0.37684345 -0.17484407
  0.59815526  0.183665    0.18427357 -0.26385337 -0.872091    0.16605157
 -0.1483368  -0.44667274  0.21628079 -1.1019652  -0.25865215  0.39239088
  0.46736002  0.7762835   0.13740046 -0.7453327  -0.19614017 -0.17619738
  0.5644792  -0.51691765 -0.7444471  -0.47979236 -1.5812258   1.2155225
  0.40886736 -1.0310419   0.41070554  0.9116865  -0.7479063   0.01633762
 -0.41708255  0.48216388 -0.3562048  -0.5761913  -1.6603739  -0.60368776
  0.8358904  -0.06935151  0.23630048 -0.4703606   0.707524    0.7015624
  0.1935111   0.6952035  -0.58328223  0.00220822  0.21143633 -0.7073302
 -0.0807537   0.05745737  0.1790441  -0.99009156  0.36488152  0.02987641
  1.5009913   1.0200038   0.06894059  0.13749191  0.4641

In [22]:
# Confirm the query embedding is a flat vector with the same width as the corpus embeddings.
user_query_vec.shape

(768,)

In [23]:
# Wrap the 1-D query vector in a leading batch axis (one row), the 2-D layout passed to index.search.
# np.array(...) builds a new array, so later in-place edits of user_vector leave user_query_vec untouched.
user_vector = np.array([user_query_vec])

In [25]:
# Inspect the batched query and verify it now has two axes.
print(user_vector)
print(user_vector.shape)

[[-0.5246818   0.5870422   0.7650312  -0.56523335 -0.50847435 -1.5637009
  -0.14866737 -0.06526566  0.49705654 -0.03896036  1.0632689  -0.6900523
   0.5468394   0.40253115  0.6428171  -0.14203355  0.00259318 -0.07197702
   0.59811985 -0.19257379 -0.477442   -0.01418298 -0.37684345 -0.17484407
   0.59815526  0.183665    0.18427357 -0.26385337 -0.872091    0.16605157
  -0.1483368  -0.44667274  0.21628079 -1.1019652  -0.25865215  0.39239088
   0.46736002  0.7762835   0.13740046 -0.7453327  -0.19614017 -0.17619738
   0.5644792  -0.51691765 -0.7444471  -0.47979236 -1.5812258   1.2155225
   0.40886736 -1.0310419   0.41070554  0.9116865  -0.7479063   0.01633762
  -0.41708255  0.48216388 -0.3562048  -0.5761913  -1.6603739  -0.60368776
   0.8358904  -0.06935151  0.23630048 -0.4703606   0.707524    0.7015624
   0.1935111   0.6952035  -0.58328223  0.00220822  0.21143633 -0.7073302
  -0.0807537   0.05745737  0.1790441  -0.99009156  0.36488152  0.02987641
   1.5009913   1.0200038   0.06894059  0.13

In [26]:
# Normalise the query in place so it is on the same unit-length scale as the indexed vectors.
faiss.normalize_L2(user_vector)

In [28]:
# Top-3 nearest neighbours; the result is a pair (distances, matched row numbers).
index.search(user_vector, k=3)

(array([[1.2548951, 1.3141702, 1.4343551]], dtype=float32), array([[9, 2, 8]]))

In [29]:
# Repeat the top-3 search, keeping both returned arrays:
# `x` holds the distances and `y` the matched row numbers.
x, y = index.search(user_vector, k=3)
print(x, y, sep="\n")

[[1.2548951 1.3141702 1.4343551]]
[[9 2 8]]


In [44]:
# Rank the indexed sentences against the query (up to 10 results).
# Clamp k to the number of stored vectors: when k is larger, FAISS pads the
# missing neighbours with label -1, which would later merge into all-NaN rows.
top_k = min(10, index.ntotal)
dist, match = index.search(user_vector, k=top_k)
# One row per neighbour: its distance to the query and its row number in the corpus.
close_result = pd.DataFrame({"close distance": dist[0], "close_match": match[0]})

In [48]:
# Attach the sentence text and category to every match by joining the matched
# row number against the corpus frame's index; the left join keeps the ranking order.
merge = close_result.merge(
    frame,
    how="left",
    left_on="close_match",
    right_index=True,
)

In [46]:
# Debug view of the search-result frame: schema first, then the first rows.
print("close_result info:")
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
close_result.info()
print("\nclose_result head:")
print(close_result.head())

close_result info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   close distance  10 non-null     float32
 1   close_match     10 non-null     int64  
dtypes: float32(1), int64(1)
memory usage: 252.0 bytes
None

close_result head:
   close distance  close_match
0        1.254895            9
1        1.314170            2
2        1.434355            8
3        1.585649            5
4        1.618261            7


In [47]:
# Debug view of the corpus frame: schema first, then the first rows.
print("frame info:")
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
frame.info()
print("\nframe head:")
print(frame.head())

frame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      10 non-null     object
 1   category  10 non-null     object
dtypes: object(2)
memory usage: 292.0+ bytes
None

frame head:
                                               text       category
0                   What is the weather like today?        general
1  Can you provide the latest stock market updates?        finance
2        Recommend a good Italian restaurant nearby           food
3                       How do I reset my password?   tech support
4                                    Tell me a joke  entertainment


In [49]:
# Show the ranked matches together with their text and category (closest first).
print(merge)

   close distance  close_match  \
0        1.254895            9   
1        1.314170            2   
2        1.434355            8   
3        1.585649            5   
4        1.618261            7   
5        1.619036            4   
6        1.636168            6   
7        1.668450            1   
8        1.681805            3   
9        1.761183            0   

                                               text          category  
0                            Im feeling happy today  personal emotion  
1        Recommend a good Italian restaurant nearby              food  
2             Whats the score in the football game?            sports  
3                   What are the symptoms of a flu?            health  
4                     How to make a chocolate cake?           cooking  
5                                    Tell me a joke     entertainment  
6                         Book a flight to New York            travel  
7  Can you provide the latest stock market update