## Search engine over a local database using a locally running LLM
- This notebook demonstrates how to use the LangChain library and Ollama to build a search engine over a local database, with the language model running locally.
- The search engine looks for documents in the database that are similar to the query and returns the relevant ones.
- It uses the embeddings generated by the language model to find similar documents in the database.


In [3]:
# Load Libraries
import os
from getpass import getpass # to give key for openai

import numpy as np
from langchain_community.vectorstores import InMemoryVectorStore # use chroma for storing the embeddings into a file on disk to retain.
from langchain_core.documents import Document # Format to store documents and its content
from langchain_ollama import OllamaEmbeddings # To instatiate different llm embeddings model supported by ollama running locally
from langchain_openai import OpenAIEmbeddings

#### LOAD SPECIFIC EMBEDDINGS MODEL

In [5]:

# --- Option: hosted OpenAI embeddings --------------------------------------
# DO NOT enable the two lines below if everything should run locally.
#os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API Key: ")
#EMBEDDING_MODEL = OpenAIEmbeddings(model="text-embedding-3-small")

# --- Option: deepseek-r1 through Ollama -------------------------------------
# DO NOT USE. This model is not specifically designed for text embeddings;
# it gives random outputs for similarity search.
#EMBEDDING_MODEL = OllamaEmbeddings(model="deepseek-r1:1.5b") # Embedding model for internal database documents

# --- Active choice: embeddings created locally through Ollama ---------------
EMBEDDING_MODEL = OllamaEmbeddings(model="nomic-embed-text") # Embedding model for internal database documents

# EMBEDDING_MODEL = HuggingFaceEmbeddings(model_name="jina_embeddings")

# Sanity check: embed two unrelated sentences and print how many vectors came
# back, the length of each one, and the first ten components of each.
# With the model selected above, each text becomes a 768-dimensional vector.
sample_texts = [
    "Royal enfield classic motor bike.",
    "Chocolate chip cookies are the best",
]
embds = EMBEDDING_MODEL.embed_documents(sample_texts)
first_vec, second_vec = embds[0], embds[1]
print(len(embds), len(first_vec), len(second_vec), '\n', first_vec[:10], '\n', second_vec[:10])


2 768 768 
 [-0.014669459, -0.044619765, -0.19731998, -0.010156427, 0.020799182, 0.018944671, -0.040338904, 0.06402744, -0.00789125, -0.03171669] 
 [0.015491951, 0.0466017, -0.1637841, -0.002217627, 0.0073978957, -0.007360134, -0.038634654, 0.02167486, -0.016737062, -0.048966836]


#### USE THE LANGCHAIN DOCUMENT FORMAT TO STORE THE TEXT AND METADATA

In [11]:
# Raw example sentences that make up the toy "database".
texts = [
    "Royal enfield classic motor bike.",
    "Chocolate chip cookies are the best",
    "Love cookies and motor bikes",
    "love mountaints too",
]

# Wrap each sentence in a LangChain Document and tag it with a 1-based
# "doc N" source label in its metadata.
documents = [
    Document(page_content=content, metadata={"source": f"doc {position}"})
    for position, content in enumerate(texts, start=1)
]

print(documents)

[Document(metadata={'source': 'doc 1'}, page_content='Royal enfield classic motor bike.'), Document(metadata={'source': 'doc 2'}, page_content='Chocolate chip cookies are the best'), Document(metadata={'source': 'doc 3'}, page_content='Love cookies and motor bikes'), Document(metadata={'source': 'doc 4'}, page_content='love mountaints too')]


#### CREATE VECTOR STORE TO STORE THE EMBEDDINGS

In [12]:
# VECTOR_STORE is an InMemoryVectorStore instance configured with
# EMBEDDING_MODEL; it holds the embeddings for the text.
VECTOR_STORE = InMemoryVectorStore(embedding=EMBEDDING_MODEL)

# -- Create and store the embeddings for the documents --
# The return value is bound to `_` only so that the cell displays no output.
_ = VECTOR_STORE.add_documents(documents=documents)

In [None]:
#### SEARCH FOR SIMILAR DOCUMENTS

In [17]:
# Query the vector store (built with EMBEDDING_MODEL) for the stored documents
# closest to a free-text query, then print each hit's score and text.
query = "love motorbikes"

# k is the maximum number of hits requested.
similar_context_docs = VECTOR_STORE.similarity_search_with_score(query, k=5)

# Each hit is a (document, score) pair. A plain loop replaces the former
# throwaway list comprehension, which existed only for its print side effect.
for doc, score in similar_context_docs:
    print(np.round(score, 2), ' ', doc.page_content)

0.75   Love cookies and motor bikes
0.7   Royal enfield classic motor bike.
0.56   love mountaints too
0.4   Chocolate chip cookies are the best
