In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

# Copy HF_TOKEN into the process environment only when it is actually defined.
# os.getenv returns None for a missing variable, and assigning None into
# os.environ raises TypeError, which would abort the notebook on its first cell.
hf_token = os.getenv('HF_TOKEN')
if hf_token is not None:
    os.environ['HF_TOKEN'] = hf_token

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model shared by every cell below.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Embed a single string; the result is a flat list of floats (see output below).
embeddings.embed_query("hello AI")

[-0.03338824212551117,
 0.03453981503844261,
 0.05947450175881386,
 0.059286098927259445,
 -0.06353531032800674,
 -0.06819586455821991,
 0.08823321014642715,
 0.0344407856464386,
 -0.03278516232967377,
 -0.01581495814025402,
 0.020981721580028534,
 -0.018340323120355606,
 -0.03983224928379059,
 -0.0804707333445549,
 -0.014469229616224766,
 0.03326485678553581,
 0.014259278774261475,
 -0.03404996171593666,
 -0.142915740609169,
 -0.023083431646227837,
 -0.021380223333835602,
 0.002633583964779973,
 -0.04729269817471504,
 -0.010752756148576736,
 -0.06866802275180817,
 0.031124936416745186,
 0.07594586908817291,
 0.0011282607447355986,
 0.011632048524916172,
 -0.036039240658283234,
 0.04483754187822342,
 0.018390731886029243,
 0.12672801315784454,
 -0.0013597395736724138,
 0.00820669624954462,
 0.06909967958927155,
 -0.08076362311840057,
 -0.05841310694813728,
 0.0537545382976532,
 0.02622750587761402,
 -0.0068285781890153885,
 -0.05635844171047211,
 0.0032929808367043734,
 -0.072501882910

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Toy corpus compared against the query further down.
documents = [
    "what is the capital of India?",
    "who is prime minister of India?",
    "what is capital if maharashtra?",
]

In [9]:
# Embed the whole corpus in one call, then display the resulting vectors.
doc_embedding = embeddings.embed_documents(documents)
doc_embedding

[[0.08940590918064117,
  0.028886105865240097,
  -0.054000310599803925,
  0.04900706559419632,
  -0.049924641847610474,
  -0.05717902258038521,
  0.06893069297075272,
  0.02387196384370327,
  -0.01973072811961174,
  -0.021420259028673172,
  -0.021476006135344505,
  -0.14845523238182068,
  0.030220557004213333,
  -0.03851091116666794,
  0.005427372641861439,
  -0.08523868769407272,
  0.0373053252696991,
  -0.01565399020910263,
  0.08032331615686417,
  -0.059880830347537994,
  0.001289683161303401,
  0.02980988845229149,
  -0.005029390100389719,
  -0.0564408153295517,
  0.06752237677574158,
  0.024096351116895676,
  0.03111928515136242,
  -0.036759573966264725,
  -0.029620908200740814,
  -0.004787714686244726,
  0.0654677003622055,
  -0.055164460092782974,
  0.006645567715167999,
  0.004239302594214678,
  -0.07002180814743042,
  0.0005717212334275246,
  -0.03353415057063103,
  0.058922722935676575,
  0.1490166336297989,
  -0.051437217742204666,
  0.03008083812892437,
  0.0119196614250540

In [10]:
# Query text and its embedding, compared against the document vectors below.
query = "is naredra modi prime minister of india?"
query_embedding = embeddings.embed_query(text=query)

In [11]:
# Wrap the query vector in a list so it is passed as a single-row 2-D input.
cosine_similarity(X=[query_embedding], Y=doc_embedding)

array([[0.40141646, 0.73122192, 0.28169771]])

In [12]:
from sklearn.metrics.pairwise import euclidean_distances

# Same comparison as the cosine cell, measured as L2 distance instead.
euclidean_distances(X=[query_embedding], Y=doc_embedding)

array([[1.09415131, 0.73318221, 1.1985844 ]])

| Metric            | Score Range | Behavior                                                             |
| ----------------- | ----------- | -------------------------------------------------------------------- |
| Cosine Similarity | \[-1, 1]    | Depends only on the **angle** between vectors; higher = more similar |
| L2 Distance       | \[0, ∞)     | Depends on **magnitude + direction**; lower = more similar           |


### FAISS: a library for efficient similarity search and clustering of dense vectors

In [13]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [14]:
# Size the flat L2 index from the embedding model itself rather than hard-coding
# 384, so the index dimension still matches if the model is swapped for another.
embedding_dim = len(embeddings.embed_query("hello AI"))
index = faiss.IndexFlatL2(embedding_dim)

In [15]:
# Assemble an empty LangChain FAISS store around the index created above;
# the docstore and the position -> ID mapping both start out empty.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [18]:
# Add three sample texts; the call returns one ID per text (shown below).
vector_store.add_texts(
    texts=[
        'AI is the future',
        'AI will replace many jobs',
        'Cat has powersfull reflex',
    ]
)

['7d383269-309c-4f2b-ab57-a688b966fcca',
 '6b703b70-766c-4457-bfd0-fd2318a4baa8',
 '44f4d0b4-c99f-4c25-a7ac-79787ab8f1b8']

In [20]:
# Mapping from integer position (0, 1, 2, ...) to the docstore ID of each text.
vector_store.index_to_docstore_id

{0: '7d383269-309c-4f2b-ab57-a688b966fcca',
 1: '6b703b70-766c-4457-bfd0-fd2318a4baa8',
 2: '44f4d0b4-c99f-4c25-a7ac-79787ab8f1b8'}

In [21]:
# Docstore ID stored at position 1, i.e. the second text that was added.
vector_store.index_to_docstore_id[1]

'6b703b70-766c-4457-bfd0-fd2318a4baa8'

In [22]:
# Resolve the ID through the position -> ID mapping instead of pasting a UUID.
# IDs are generated afresh each time texts are added, so a pasted UUID stops
# matching after a kernel restart and the lookup no longer returns a Document.
vector_store.docstore.search(vector_store.index_to_docstore_id[1]).page_content

'AI will replace many jobs'

In [24]:
# Retrieve the three stored texts closest to the query and display them.
result = vector_store.similarity_search(query="Tell me about AI", k=3)
result

[Document(id='7d383269-309c-4f2b-ab57-a688b966fcca', metadata={}, page_content='AI is the future'),
 Document(id='6b703b70-766c-4457-bfd0-fd2318a4baa8', metadata={}, page_content='AI will replace many jobs'),
 Document(id='44f4d0b4-c99f-4c25-a7ac-79787ab8f1b8', metadata={}, page_content='Cat has powersfull reflex')]