## Document Loaders
- Load various kinds of documents from the web and local files.
- Apply an LLM to the documents for summarization and question answering.

In [None]:
from dotenv import load_dotenv

# Read environment variables from the .env file one directory above the working directory.
load_dotenv('./../.env')

### Project 1: Question Answering from PDF Document
- We will load the document from a local file and apply an LLM to answer questions about it.
- Let's use a research paper published on the misuse of health supplements for workouts.

rag-dataset: git@github.com:laxmimerit/rag-dataset.git

```bash
git clone git@github.com:laxmimerit/rag-dataset.git
```

In [10]:
# !git clone git@github.com:laxmimerit/rag-dataset.git
# !pip install pymupdf tiktoken 


In [12]:
### Read PDF File
from langchain_community.document_loaders import PyMuPDFLoader

# Parse a single paper with PyMuPDF; `docs` is whatever the loader returns
# and `doc` is its first entry, kept for quick inspection.
pdf_path = "./rag-dataset/gym supplements/1. Analysis of Actual Fitness Supplement.pdf"
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()
doc = docs[0]
# print(doc.page_content)

In [13]:
### Get the list of all available PDF files
import os

def find_pdfs(root_dir):
    """Return the paths of all PDF files found under *root_dir*, sorted.

    The extension check is case-insensitive, so files named ``*.PDF`` are
    picked up as well.  The result is sorted because ``os.walk`` yields
    entries in an arbitrary, filesystem-dependent order; sorting keeps the
    list (and anything built from it) reproducible between runs.

    A *root_dir* that does not exist produces an empty list.
    """
    return sorted(
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.lower().endswith('.pdf')
    )


pdfs = find_pdfs('rag-dataset')

In [None]:
### Read all pages of pdf files
# Run the loader over every discovered PDF and flatten the results into one list.
docs = [page for pdf_path in pdfs for page in PyMuPDFLoader(pdf_path).load()]

# Last expression of the cell: displays how many entries were loaded.
len(docs)

In [19]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every item in *docs*, separated by a blank line."""
    contents = (item.page_content for item in docs)
    separator = "\n\n"
    return separator.join(contents)

# Merge the text of everything in `docs` into one string; this is the
# `context` value supplied to the chains when they are invoked.
context = format_docs(docs)
# print(context)

In [None]:
### Count Total Tokens
import tiktoken

# Tokenizer that tiktoken associates with the 'gpt-4o-mini' model name.
# NOTE(review): the chains in this notebook run on 'llama3.2:3b', whose
# tokenizer may differ, so treat these counts as an estimate.
encoding = tiktoken.encoding_for_model('gpt-4o-mini')

first_doc_tokens = len(encoding.encode(docs[0].page_content))
context_tokens = len(encoding.encode(context))

# Cell output: (tokens in the first loaded entry, tokens in the whole context).
first_doc_tokens, context_tokens

In [58]:
### Question Answering using LLM

from langchain_ollama import ChatOllama

from langchain_core.prompts import (
                                        SystemMessagePromptTemplate,
                                        HumanMessagePromptTemplate,
                                        ChatPromptTemplate
                                        )

from langchain_core.output_parsers import StrOutputParser

# Address of the Ollama server and the model it should run.
base_url = "http://localhost:11434"
model = 'llama3.2:3b'
llm = ChatOllama(model=model, base_url=base_url)

# System message: sets the assistant's role and caps the answer length via {words}.
system = SystemMessagePromptTemplate.from_template("""You are helpful AI assistant who answer user question based on the provided context. 
                                                    Do not answer in more than {words} words""")

# Human message: carries the {context} and the {question}, and asks for
# "I don't know" when the answer cannot be derived from the context.
prompt = HumanMessagePromptTemplate.from_template("""Answer user question based on the provided context ONLY! If do not know the answer, just say "I don't know".
            ### Context:
            {context}

            ### Question:
            {question}

            ### Answer:""")

# Pipeline: fill the prompt -> call the model -> reduce the reply to a plain string.
messages = [system, prompt]
template = ChatPromptTemplate(messages)
qna_chain = template | llm | StrOutputParser()

# template



In [43]:
# template.invoke({'context': context, 'question': 'What is the best gym supplement?'})

In [None]:
# Ask a question about gym supplements, limiting the answer to 30 words.
inputs = {'context': context, 'question': 'What is the best gym supplement?', 'words': 30}
response = qna_chain.invoke(inputs)
print(response)

In [None]:
# Question unrelated to gym supplements; presumably checks the "I don't know"
# fallback that the human prompt asks for.
inputs = {'context': context, 'question': 'What is the best planet to live on?', 'words': 30}
response = qna_chain.invoke(inputs)
print(response)

In [None]:
# Ask about gaining muscle mass, limiting the answer to 30 words.
inputs = {'context': context, 'question': 'How to gain muscle mass?', 'words': 30}
response = qna_chain.invoke(inputs)
print(response)

In [None]:
# Ask about side effects of gym supplements, limiting the answer to 30 words.
inputs = {'context': context, 'question': 'side effects of gym supplements?', 'words': 30}
response = qna_chain.invoke(inputs)
print(response)

### Project 2: PDF Document Summarization

In [59]:
# System message: frames the model as a summarizer and tells it not to invent facts.
system = SystemMessagePromptTemplate.from_template("""You are helpful AI assistant who works as document summarizer. 
                                                   You must not hallucinate or provide any false information.""")

# Human message: {words} is filled with a bare number when the chain is
# invoked, so the unit is spelled out here. Without it the instruction
# renders as "Summarize the given context in 100.", which does not say that
# the number is a word limit.
prompt = HumanMessagePromptTemplate.from_template("""Summarize the given context in {words} words.
            ### Context:
            {context}

            ### Summary:""")

# Pipeline: fill the prompt -> call the model -> reduce the reply to a plain string.
messages = [system, prompt]
template = ChatPromptTemplate(messages)

summary_chain = template | llm | StrOutputParser()

In [None]:
# Summarize the whole context with the dedicated summary chain, passing 100 as the length limit.
inputs = {'context': context, 'words': 100}
response = summary_chain.invoke(inputs)
print(response)

In [None]:
### qna chain as summarizer

# Reuse the QnA chain for summarization by phrasing the task as the question.
inputs = {'context': context, 'question': 'Summarize the given context', 'words': 100}
response = qna_chain.invoke(inputs)
print(response)

In [None]:
# Request a longer Markdown report through the QnA chain, raising the limit to 1000 words.
inputs = {
    'context': context,
    'question': 'Provide a detailed report from the provided context. Write answer in Markdown',
    'words': 1000,
}
response = qna_chain.invoke(inputs)
print(response)