# Importing Libraries

In [1]:
# updated version
from time import time
import torch
from langchain_community.llms import Ollama
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma

import warnings
warnings.filterwarnings('ignore')

In [39]:
GEMMA = "gemma:2b"
LLAMA = "llama3"
QWEN = "qwen:4b"

# Getting the Language Models

In [44]:
llm = Ollama(model=GEMMA)

# Data Ingestion

In [27]:
# Data ingestion using PDF loader
loader = PyPDFLoader("pdf/Physics Classes 9-10.pdf")
documents = loader.load()
#print(documents)

In [28]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(documents)
#print(all_splits)

In [60]:
device = "cuda" if torch.cuda.is_available() else "cpu"
all_mpnet = "sentence-transformers/all-mpnet-base-v2"
all_miniLM = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = { "device": device }

# embeddings = HuggingFaceEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs
# )

In [61]:
def create_embeddings(model_name, model_kwargs):
    print(f"Using device: {device}")
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs
    )
    return embeddings

embeddings = create_embeddings(all_mpnet, model_kwargs)

Using device: cpu


In [62]:
print(embeddings)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='sentence-transformers/all-mpnet-base-v2' cache_folder=None model_kwargs={'device': 'cpu'} encode_kwargs={} multi_process=False show_progress=False


In [63]:
vectordb = Chroma.from_documents(
    documents=all_splits, 
    embedding=embeddings, persist_directory="chroma_db"
)

# Retrieval

In [64]:
retriever = vectordb.as_retriever(
    search_type="similarity_score_threshold", 
    search_kwargs={"score_threshold": 0.4}
)

In [65]:
retrieved_docs = retriever.invoke("What is motion?")
len(retrieved_docs)

4

In [66]:
for doc in retrieved_docs:
    print(f"Docs: {doc.page_content}")
    print("\n")

Docs: to the surroundings. A body is said to be in motion with respect to its surroundings when 
it changes its position with time. And this change of position with time is called motion.  
We discussed earlier that to understand whether an object is in rest or in motion it is 
necessary to chose a reference object or reference frame. If the relative positions of this Do by yourself : Hold a pen in your hand . 
Do by yourself : Keep the pen in your hand moving to and fro .


Docs: to the surroundings. A body is said to be in motion with respect to its surroundings when 
it changes its position with time. And this change of position with time is called motion.  
We discussed earlier that to understand whether an object is in rest or in motion it is 
necessary to chose a reference object or reference frame. If the relative positions of this Do by yourself : Hold a pen in your hand . 
Do by yourself : Keep the pen in your hand moving to and fro .


Docs: to the surroundings. A body is sai

# Text Generation 

In [67]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")

In [68]:
example_messages = prompt.invoke(
    {"context": "filler context", "question": "filler question"}
).to_messages()
example_messages

[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: filler question \nContext: filler context \nAnswer:")]

In [69]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [70]:
rag_chain.invoke("What is motion?")

'Sure, the definition of motion is the change of position of a body with time.'