In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
import os
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
os.environ["HUGGINGFACE_TOKEN"]=os.getenv("HUGGINGFACE_TOKEN")

In [4]:
pdf_loader=PyPDFLoader("llama2.pdf")
pdf_doc=pdf_loader.load()
len(pdf_doc)

77

In [5]:
pdf_spliter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)

In [6]:
splited_text=pdf_spliter.split_documents(pdf_doc)
len(splited_text)

615

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from langchain_community.docstore.in_memory import InMemoryDocstore
index=faiss.IndexFlatIP(384)
vector_store=FAISS(embedding_function=embeddings,index=index,
                   docstore=InMemoryDocstore(),
                   index_to_docstore_id={})

In [9]:
vector_store.add_documents(splited_text)

['2ee83b82-3c8a-43af-8099-8ffcdb2be25d',
 '32708207-5591-4931-b8b4-e94fc50e2ed2',
 'd2ed1752-d299-483a-a734-8aecfd219ece',
 'dbda4084-534b-481e-9d2c-46d51563b442',
 'ea4cf27d-38f7-45d3-a70f-433cbe32e710',
 'a949f4f4-169c-4bf1-a43b-82df7aeb5537',
 '4e9b1032-679f-4fd9-b100-ac1f17e9fae7',
 '0bdd1677-adee-4cfc-8dee-054b4156e104',
 '5b484fd5-c0b1-49a8-a968-99c4ae71b2ab',
 '18d24a25-a1df-48c0-8b92-39747478e5d5',
 '592e3c3e-2a2d-40ec-aba3-bdf29a5ecf9f',
 '53dfa218-2f52-4cc5-9d34-d71afcd3f964',
 '7d4cbe43-064b-45a8-b68b-361294e5866a',
 'd61d580f-9506-480d-a28c-0b238bee5bc1',
 'e124fad8-d5eb-4eb0-bf80-8b7cf46fc1de',
 '0561142b-7afd-4e86-9203-f045ff9d9325',
 '0334cb9f-89bb-4d0b-affe-45d2156f1f86',
 '6b574f2d-acf2-493c-879c-7af31cb38801',
 '6a6313d1-4699-4b12-b192-3380561a03f5',
 'e9fa5b23-c477-415b-b7d3-eccf39566708',
 '24d63822-46fa-42b7-a37d-5b9e68739a36',
 '981a2cc9-7cea-4a51-a8b7-fbbf158c1559',
 '6ddcef13-7bd9-4ba2-a01d-114988bcd91c',
 '1a92afac-f392-4c62-98af-ebf6070d1d64',
 '8c06ac4a-cf9a-

In [10]:
#Vector store working as a similarity search engine
similar_data=vector_store.similarity_search("What is lamma model?",k=2)
similar_data

[Document(id='f3314de4-e893-4023-adf2-cd54d2b11a05', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'llama2.pdf', 'total_pages': 77, 'page': 76, 'page_label': '77'}, page_content='specific applications of the model. Please see the Responsible Use Guide available available at\nhttps://ai.meta.com/llama/responsible-user-guide\nTable 52: Model card forLlama 2.\n77'),
 Document(id='04911fb4-2737-4539-a9c9-f84e48ff09db', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 202

In [14]:
#Working as a retriever

#retriver=vector_store.as_retriever(search_kwargs={"k":30}) 
retriver=vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k":3,"score_threshold":0.5})

In [15]:
result=retriver.invoke("What is langchain framework?")

In [16]:
len(result)

3

In [17]:
from langchain_google_genai import ChatGoogleGenerativeAI

model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

In [18]:
from langchain import hub
prompt=hub.pull('rlm/rag-prompt')

In [19]:
import pprint
pprint.pprint(prompt.messages)

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [20]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough   # used for take query only in run time

In [21]:
def format_docs(docs):
    return '\n\n'.join(doc.page_content for doc in docs)

In [22]:
chain=(
    {"context": retriver | format_docs,"question":RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser())

In [23]:
result=chain.invoke("What is llama model?")

In [24]:
result

"Llama is a large language model developed by Meta AI.  It's available for both research and commercial use, with versions tuned for chat and others adaptable for various natural language tasks.  A Stanford version, Alpaca, is an instruction-following model based on Llama."

In [25]:
result=chain.invoke("What is langchain?")
result

"I am sorry, but this context does not contain any information about Langchain.  Therefore, I don't know."