In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [None]:
local_path = "data/UploadFile_9029.pdf"
# online_path = ""

# Fail fast when no path is configured. Every later cell depends on `data`;
# merely printing a hint would leave it undefined on a fresh kernel (NameError
# further down) or, worse, silently reuse a stale value from an earlier run.
if not local_path:
  raise ValueError("Upload a PDF file")

# Parse the PDF into LangChain documents.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

In [None]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [None]:
# Break the loaded documents into character-based chunks of up to 7500
# characters, with 100 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=7500,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Embed every chunk with the Ollama "nomic-embed-text" model and store the
# result in the Chroma collection named "local-rag".
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=OllamaEmbeddings(show_progress=True, model="nomic-embed-text"),
    documents=chunks,
)

In [None]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
# Chat model used both for query rewriting and for the final answer.
# `local_model` is the Ollama model name to load.
local_model = "mistral"
llm = ChatOllama(
    model=local_model,
)

In [None]:
# Prompt for the query-rewriting step: it asks the model to turn the single
# user question (the only template variable, {question}) into five varied
# versions, one per line.
QUERY_PROMPT = PromptTemplate(
    template="""You are equipped with Retrieval-Augmented Generation (RAG) technology specifically tuned for educational applications. Your primary task is to transform a single user question into five distinct, strategically varied versions. These variations should be designed to effectively retrieve relevant educational materials from a vector-based database.
    Goals:
    Diversity: Generate question variations that explore different facets or interpretations of the original question to cover a broader range of related educational content.
    Avoidance of Redundancy: Each variant should be unique, minimizing overlap in phrasing and focus to avoid redundancy in the retrieved documents.
    Enhanced Clarity: Clarify ambiguities in the original question where possible, making each variation more specific to increase the relevance of search results.
    Logical Structure: Arrange the questions to gradually expand on the original concept, aiding in a logical progression of information retrieval that aligns with educational learning flows.
    Instructions:
    Provide each question variation on a new line for clear separation.
    Ensure that each question maintains educational relevance and adheres to the principles of effective query formulation.
    Original Question: {question}""",
    input_variables=["question"],
)

In [None]:
# RAG prompt: the answer must come only from the supplied context.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Wrap the vector store's retriever so that the LLM, guided by QUERY_PROMPT,
# produces the query variations used for retrieval.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [None]:
# Full RAG pipeline. The incoming question is sent to the retriever to fill
# {context} and is also passed through unchanged as {question}; the filled
# prompt then goes to the LLM and the reply is reduced to a plain string.
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()

In [None]:
# Read a question from the user and answer it with the RAG chain.
# A blank entry is rejected before the chain runs, since it would otherwise
# trigger the whole retrieval + generation pipeline for an empty question.
user_question = input("")
if not user_question.strip():
    raise ValueError("Please enter a non-empty question.")
chain.invoke(user_question)

In [None]:
# Fixed example question, so the chain can be exercised without typing input.
sample_question = "What are the 5 pillars of global cooperation?"
chain.invoke(sample_question)

In [None]:
# Clean-up: drop the collection held by vector_db. After this the store (and
# the retriever/chain built on it) can no longer be queried until the
# vector-database cell is run again.
vector_db.delete_collection()