In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
#from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
import os

# Folder the FAISS index is written to by db.save_local(...) and read back
# from by FAISS.load_local(...) further down in this notebook.
BasePath = "./faiss_db"

In [2]:
# Build a directory loader over ./PDF that hands each file matching the glob
# to PyPDFLoader, shows a progress bar, and is given 'Tutorial_EDIT.pdf' as
# its exclude pattern; then load everything into `docs`.
loader = DirectoryLoader(
    path='./PDF',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    exclude='Tutorial_EDIT.pdf',
)
docs = loader.load()

  0%|                                                                                                                                                                                       | 0/1 [00:00<?, ?it/s]Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 76 0 (offset 0)
Ignoring wrong pointing object 83 0 (offset 0)
Ignoring wrong pointing object 90 0 (offset 0)
Ignoring wrong pointing object 97 0 (offset 0)
Ignoring wrong pointing object 112 0 (offset 0)
Ignoring wrong pointing object 123 0 (offset 0)
Ignoring wrong pointing object 130 0 (offset 0)
Ignoring wrong pointing object 153 0 (offset 0)
Ignoring wrong pointing object 165 

In [4]:
# Number of Document objects returned by the loader.
len(docs)

20

In [5]:
# Work on at most the first 50 loaded documents (the slice simply returns
# everything when fewer are available), then display the last one kept.
some_docs = docs[:50]
some_docs[-1]

Document(metadata={'source': 'PDF\\Lecture_8.pdf', 'page': 19}, page_content='Sets•Special data type introduced since Python 2.4 onwards to support mathematical set theory operations. •Unorderedcollectionof unique items.•Set itself is mutable, BUT every item inthesethastobeanimmutabletype.•So,setscan have numbers, strings and tuples as items but cannot havelistsordictionariesasitems.')

In [6]:
# Recursive splitter producing chunks of up to 1000 characters with a
# 200-character overlap. Separators are tried in order: paragraph break,
# line break, sentence end, space, and finally single characters.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    separators=["\n\n", "\n", r"(?<=\.)", " ", ""],
    # Required for the lookbehind to act as a regex. With the default (False)
    # every separator is escaped and matched literally, so the sentence
    # separator would never match and sentence-level splitting was skipped.
    is_separator_regex=True,
)

In [7]:
# Chunk the selected documents with the splitter configured above and show
# how many chunks were produced.
splits = r_splitter.split_documents(documents=some_docs)
len(splits)

20

In [8]:
from langchain_community.embeddings import OllamaEmbeddings

# Embedding client that uses the 'all-minilm' model through Ollama.
embeddings = OllamaEmbeddings(model='all-minilm')

In [9]:
# Print the embedding client to inspect its configuration fields.
print(embeddings)

base_url='http://localhost:11434' model='all-minilm' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=False headers=None model_kwargs=None


In [10]:
# Embed every chunk in `splits` and build a FAISS vector store from them.
db = FAISS.from_documents(documents=splits, embedding=embeddings)

In [11]:
# Persist the vector store to the BasePath folder so it can be reloaded later.
db.save_local(folder_path=BasePath)

In [12]:
# Retrieve the 3 closest chunks for a sample question, together with their
# similarity scores.
contexts = db.similarity_search_with_score(
    query="What is python?",
    k=3,
)

In [13]:
# Text of the first returned hit; each entry of `contexts` is indexed as
# [hit][0] to reach its Document.
print(contexts[0][0].page_content)

Python Interpreter•The system component of Python is the interpreter.•The interpreter isindependentofyourcode and is required to execute your code.•Two major versions of interpreter are currently available:•Python 2.7.X (broader support, legacy libraries)•Python3.6.X (newer features, better future support)


In [14]:
# Metadata (source file and page) of the same first hit.
print(contexts[0][0].metadata)

{'source': 'PDF\\Lecture_8.pdf', 'page': 2}


Prompt Engineering:

In [15]:
import ollama
from langchain_community.chat_models import ChatOllama

# Chat model used to generate answers; temperature=0 is set to keep sampling
# as deterministic as possible.
# JSON mode (format="json") is intentionally not enabled: the QA prompt in
# this notebook asks for a short prose answer and never tells the model to
# emit JSON, which is a precondition for using Ollama's JSON mode reliably.
llm = ChatOllama(model="tinyllama", temperature=0)

In [17]:
# Reload the persisted index from BasePath and expose it as a retriever.
# NOTE(review): allow_dangerous_deserialization=True permits unpickling of the
# stored index. That is acceptable here only because the files were written by
# this notebook's own db.save_local(BasePath); never point it at untrusted data.
retriever = FAISS.load_local(
    folder_path=BasePath,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
).as_retriever()

In [18]:
from langchain.prompts import PromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [19]:
# System instructions for the answering model. {context} is the placeholder
# that receives the retrieved documents; {input} below receives the question.
system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise. "
    "Context: {context}"
)
# Two-message chat template: the system instructions followed by the user's
# question.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [20]:
# First build the chain that feeds documents plus the prompt to the LLM, then
# wrap it with the retriever so documents are fetched for each question.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [21]:
# Run the retrieval chain for one question; the "input" key fills the
# {input} slot of the prompt.
response = chain.invoke(
    {"input": "What is python?"},
)

In [22]:
# List the keys available in the chain's result.
response.keys()

dict_keys(['input', 'context', 'answer'])

In [23]:
# Inspect the documents stored under the 'context' key of the result.
response['context']

[Document(metadata={'source': 'PDF\\Lecture_8.pdf', 'page': 2}, page_content='Python Interpreter•The system component of Python is the interpreter.•The interpreter isindependentofyourcode and is required to execute your code.•Two major versions of interpreter are currently available:•Python 2.7.X (broader support, legacy libraries)•Python3.6.X (newer features, better future support)'),
 Document(metadata={'source': 'PDF\\Lecture_8.pdf', 'page': 1}, page_content='Why Python?•Readabilityandease-of-maintenance•Python focuses on well-structured easy to read code•Easier to understand source code…•..hence easier to maintain code base•Portability•Scripting language hence easily portabble•Python interpreter is supported onmostmodern OS’s•Extensibilitywithlibraries•Large base of third-party libraries that greatly extend functionality. Eg., NumPy, SciPyetc.'),
 Document(metadata={'source': 'PDF\\Lecture_8.pdf', 'page': 8}, page_content='What is an Object?•Almost everything is an object in Python