In [1]:
import os
from langchain_community.document_loaders import DirectoryLoader # To load the document
from langchain_community.document_loaders import PyPDFLoader # To specify the type of document, PDF is this case

In [2]:
# Directory the FAISS index is written to (db.save_local) and later read back
# from (FAISS.load_local). Relative to the notebook's working directory.
BasePath = "./faiss_db_test"

In [3]:
## https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/file_directory/
# Collect the PDFs under ./PDF that match the glob, skipping Lecture_8.pdf.
# Each matching file is parsed with PyPDFLoader; a progress bar is shown.
loader = DirectoryLoader(
    path='./PDF',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    exclude='Lecture_8.pdf',
)

# Materialise all parsed documents into a list.
docs = loader.load()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.38s/it]


In [4]:
# Number of Document objects the loader returned.
len(docs)

155

In [5]:
# Work with only the first 50 loaded documents
# (presumably to keep embedding time short — TODO confirm the intended subset).
input_docs = docs[:50]

# Display the last document of the subset as a quick sanity check.
input_docs[-1]

Document(metadata={'source': 'PDF\\Tutorial_EDIT.pdf', 'page': 49}, page_content="Python Tutorial, Release 3.7.0\n(continued from previous page)\n[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89]\n>>>fibo.__name__\n'fibo'\nIf you intend to use a function often you can assign it to a local name:\n>>>fib=fibo.fib\n>>>fib(500)\n0 1 1 2 3 5 8 13 21 34 55 89 144 233 377\n6.1More on Modules\nA module can contain executable statements as well as function deﬁnitions. These statements are intended\nto initialize the module. They are executed only the ﬁrsttime the module name is encountered in an import\nstatement.1(They are also run if the ﬁle is executed as a script.)\nEach module has its own private symbol table, which is used as the global symbol table by all functions\ndeﬁned in the module. Thus, the author of a module can use global variables in the module without\nworrying about accidental clashes with a user’s global variables. On the other hand, if you know what you\nare doing you can touch a m

## Splitting the text

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split into chunks of at most 1000 characters with a 200-character overlap.
# Separators are tried in order: paragraph break, line break, sentence end,
# space, and finally a per-character split.
#
# The sentence-end separator is a regex look-behind (split *after* a period),
# so `is_separator_regex=True` is required; without it the splitter escapes
# the pattern and searches for the literal text "(?<=\.)", which never occurs.
# The pattern is written as a raw string so that `\.` is not treated as an
# (invalid) string escape sequence.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", r"(?<=\.)", " ", ""],
    is_separator_regex=True,
)

In [7]:
# Chunk the selected documents with r_splitter.
split_docs = r_splitter.split_documents(input_docs)
# Bare expression: shows the resulting number of chunks as the cell output.
len(split_docs)

123

## Embeddings

In [8]:
## https://python.langchain.com/v0.2/docs/integrations/text_embedding/ollama/

from langchain_community.embeddings import OllamaEmbeddings

# Embedding client that uses the 'all-minilm' model through Ollama.
embeddings = OllamaEmbeddings(model='all-minilm')

In [9]:
# Print the embedding client's configuration (model, base URL, ...) as a sanity check.
print(embeddings)

base_url='http://localhost:11434' model='all-minilm' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=False headers=None model_kwargs=None


## Use Vector DB to store the embeddings

In [10]:
# FAISS is a community integration; import it from langchain_community like
# the other loaders/embeddings/chat models in this notebook. The old
# `langchain.vectorstores` path is a deprecated re-export.
from langchain_community.vectorstores import FAISS

# Embed every chunk with `embeddings` and build a FAISS index from the results.
db = FAISS.from_documents(split_docs, embeddings)

In [11]:
# Persist the index under BasePath so it can be reloaded later with FAISS.load_local.
db.save_local(BasePath)

In [12]:
# Retrieve the 3 closest chunks for a sample query. Each hit is indexed below
# as hit[0] -> Document; hit[1] is presumably the score — confirm its scale
# against the FAISS docs before interpreting it.
contexts = db.similarity_search_with_score(
    "What is python?",
    k=3,
)

In [13]:
# Text of the first hit: contexts[0] is the first result, [0] selects its Document.
print(contexts[0][0].page_content)

Python Tutorial, Release 3.7.0
18 Chapter 3. An Informal Introduction to Python


In [14]:
# Metadata of the same first hit (source file and page number, per the output).
print(contexts[0][0].metadata)

{'source': 'PDF\\Tutorial_EDIT.pdf', 'page': 23}


## Prompt Engineering

In [15]:
import ollama
from langchain_community.chat_models import ChatOllama

# Chat model used to answer questions; temperature=0 keeps sampling as
# deterministic as the backend allows.
#
# JSON output mode (format="json") is deliberately NOT requested: the prompt
# used with this model asks for a short plain-language answer and never
# instructs the model to emit JSON. Forcing JSON mode without such an
# instruction can make the model return whitespace or arbitrary JSON
# instead of an answer.
llm = ChatOllama(model="tinyllama", temperature=0)

In [16]:
# Reload the index saved under BasePath and expose it as a retriever.
# SECURITY: allow_dangerous_deserialization=True permits pickle loading, which
# can execute arbitrary code. Only use it on index files you created yourself
# (here: the one written by db.save_local earlier in this notebook).
retriever = FAISS.load_local(
    BasePath,
    embeddings,
    allow_dangerous_deserialization=True,
).as_retriever()

In [17]:
from langchain.prompts import PromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [18]:
# System instructions for the answering model. The {context} placeholder is
# a template variable, as is {input} in the human message.
system_prompt = "".join(
    [
        "Use the given context to answer the question. ",
        "If you don't know the answer, say you don't know. ",
        "Use three sentence maximum and keep the answer concise. ",
        "Context: {context}",
    ]
)

# Two-message chat template: system instructions, then the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [19]:
# Chain that combines the LLM with the chat prompt; the prompt's {context}
# variable is presumably filled with the supplied documents — see the
# create_stuff_documents_chain docs.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Retrieval chain: wraps the retriever around the question-answering chain.
chain = create_retrieval_chain(retriever, question_answer_chain)

In [20]:
# Run the retrieval chain on a sample question; the question goes in under the "input" key.
response = chain.invoke({"input": "What is python?"})

In [21]:
# Inspect which fields the chain returned.
response.keys()

dict_keys(['input', 'context', 'answer'])

In [22]:
# Show the Document objects stored under the 'context' key of the response.
response['context']

[Document(metadata={'source': 'PDF\\Tutorial_EDIT.pdf', 'page': 23}, page_content='Python Tutorial, Release 3.7.0\n18 Chapter 3. An Informal Introduction to Python'),
 Document(metadata={'source': 'PDF\\Tutorial_EDIT.pdf', 'page': 6}, page_content='idea of the language’s ﬂavor and style. After reading it, you will be able to read and write Python modules\nand programs, and you will be ready to learn more about the various Python library modules described in\nlibrary-index.\nThe Glossary is also worth going through.\nCONTENTS 1'),
 Document(metadata={'source': 'PDF\\Tutorial_EDIT.pdf', 'page': 13}, page_content='Python Tutorial, Release 3.7.0\n8 Chapter 2. Using the Python Interpreter'),
 Document(metadata={'source': 'PDF\\Tutorial_EDIT.pdf', 'page': 6}, page_content='Python Tutorial, Release 3.7.0\nPython is an easy to learn, powerful programming language. It has eﬃcient high-level data structures and\na simple but eﬀective approach to object-oriented programming. Python’s elegant synt