In [18]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

loader=TextLoader("speech.txt")
documents=loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
docs= text_splitter.split_documents(documents)

In [19]:
docs

[Document(metadata={'source': 'speech.txt'}, page_content="ou can find LeetCode links for problems \navailable on the internet. However few problems \nare not there on LeetCode for which you will not \nfind a practice link attached. We cannot use \nthird-party links due to legal constraints. \nAlso the newly added TUF+ practice links are \nto give you a free trial of TUF+ which a lot\n of people asked for. If you don't wish to \n upgrade, you can still use the TUF platform,\n  nothing has changed.\n\nRemember, you started using our website because \nof our content and not because of some third party\nlinks")]

In [24]:
pip install faiss-cpu

Collecting faiss-cpuNote: you may need to restart the kernel to use updated packages.

  Downloading faiss_cpu-1.11.0-cp310-cp310-win_amd64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0-cp310-cp310-win_amd64.whl (15.0 MB)
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/15.0 MB 2.1 MB/s eta 0:00:07
   -- ------------------------------------- 0.8/15.0 MB 1.6 MB/s eta 0:00:09
   --- ------------------------------------ 1.3/15.0 MB 1.7 MB/s eta 0:00:08
   ---- ----------------------------------- 1.6/15.0 MB 1.6 MB/s eta 0:00:09
   ----- ---------------------------------- 2.1/15.0 MB 1.8 MB/s eta 0:00:08
   ------ --------------------------------- 2.4/15.0 MB 1.7 MB/s eta 0:00:08
   ------ --------------------------------- 2.6/15.0 MB 1.6 MB/s eta 0:00:08
   ------- -------------------------------- 2.9/15.0 MB 1.7 MB/s eta 0:00:08
   --------- ----

In [25]:
embeddings=OllamaEmbeddings(model="gemma:2b")
db=FAISS.from_documents(docs, embeddings)
db

<langchain_community.vectorstores.faiss.FAISS at 0x29bf862c760>

In [27]:
# querrying
query = "What is the main topic of the speech?"
docs = db.similarity_search(query)
docs[0].page_content

"ou can find LeetCode links for problems \navailable on the internet. However few problems \nare not there on LeetCode for which you will not \nfind a practice link attached. We cannot use \nthird-party links due to legal constraints. \nAlso the newly added TUF+ practice links are \nto give you a free trial of TUF+ which a lot\n of people asked for. If you don't wish to \n upgrade, you can still use the TUF platform,\n  nothing has changed.\n\nRemember, you started using our website because \nof our content and not because of some third party\nlinks"

In [29]:
retriever=db.as_retriever()
docs=retriever.invoke(query)
docs[0].page_content

"ou can find LeetCode links for problems \navailable on the internet. However few problems \nare not there on LeetCode for which you will not \nfind a practice link attached. We cannot use \nthird-party links due to legal constraints. \nAlso the newly added TUF+ practice links are \nto give you a free trial of TUF+ which a lot\n of people asked for. If you don't wish to \n upgrade, you can still use the TUF platform,\n  nothing has changed.\n\nRemember, you started using our website because \nof our content and not because of some third party\nlinks"

In [31]:
docs_and_scores = db.similarity_search_with_score(query)
docs_and_scores

[(Document(id='281b54e9-7423-498f-88f3-855b2f2ee542', metadata={'source': 'speech.txt'}, page_content="ou can find LeetCode links for problems \navailable on the internet. However few problems \nare not there on LeetCode for which you will not \nfind a practice link attached. We cannot use \nthird-party links due to legal constraints. \nAlso the newly added TUF+ practice links are \nto give you a free trial of TUF+ which a lot\n of people asked for. If you don't wish to \n upgrade, you can still use the TUF platform,\n  nothing has changed.\n\nRemember, you started using our website because \nof our content and not because of some third party\nlinks"),
  np.float32(4257.8384))]

In [32]:
embeddings_vector=embeddings.embed_query(query)
embeddings_vector

[-0.1935083121061325,
 -1.4424089193344116,
 0.042892344295978546,
 1.297197699546814,
 2.250494956970215,
 2.091463804244995,
 1.5533978939056396,
 0.0010264222510159016,
 0.14818957448005676,
 -0.5476565957069397,
 2.0608091354370117,
 0.9310420155525208,
 0.8540452718734741,
 1.6649457216262817,
 -0.5588254332542419,
 -0.48153549432754517,
 2.453629970550537,
 1.6757596731185913,
 -0.45083609223365784,
 -0.09197998046875,
 2.6350975036621094,
 -0.39540037512779236,
 0.2326069176197052,
 -1.1981265544891357,
 0.010826952755451202,
 -1.5652018785476685,
 -0.2637545168399811,
 0.8863563537597656,
 1.050551414489746,
 -0.6130000948905945,
 -0.12860934436321259,
 0.2642175853252411,
 -0.44929200410842896,
 -0.3926234245300293,
 0.28045764565467834,
 0.6837626099586487,
 0.3309215307235718,
 0.5481241941452026,
 0.6977055668830872,
 -0.5569707751274109,
 -2.0800774097442627,
 -0.9858137369155884,
 0.6470327377319336,
 -0.9864200949668884,
 -0.29370570182800293,
 -1.0162434577941895,
 1.24

In [33]:
docs_and_scores = db.similarity_search_with_score(query)
docs_and_scores

[(Document(id='281b54e9-7423-498f-88f3-855b2f2ee542', metadata={'source': 'speech.txt'}, page_content="ou can find LeetCode links for problems \navailable on the internet. However few problems \nare not there on LeetCode for which you will not \nfind a practice link attached. We cannot use \nthird-party links due to legal constraints. \nAlso the newly added TUF+ practice links are \nto give you a free trial of TUF+ which a lot\n of people asked for. If you don't wish to \n upgrade, you can still use the TUF platform,\n  nothing has changed.\n\nRemember, you started using our website because \nof our content and not because of some third party\nlinks"),
  np.float32(4257.8384))]

In [34]:
# saving and loading the vector store
db.save_local("faiss_index")

In [36]:
new_df=FAISS.load_local("faiss_index", embeddings,allow_dangerous_deserialization=True)
new_df

<langchain_community.vectorstores.faiss.FAISS at 0x29bf9a5d750>