In [1]:
import os
import warnings
from dotenv import load_dotenv

# NOTE(review): presumably the workaround that lets the process continue when
# more than one OpenMP runtime gets loaded — confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Suppresses every warning for the rest of the session (deprecations included).
warnings.filterwarnings("ignore")

# Read settings from a .env file; as the last expression, its return value is displayed.
load_dotenv()

True

### Doc Loader

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load one known PDF first as a quick check of the loader.
# `docs` is rebuilt from every PDF under ./data in a later cell.
file_path = "./data/gym/1_Analysis_of_Actual_Fitness_Supplement.pdf"
loader = PyMuPDFLoader(file_path)

docs = loader.load()

In [3]:
# Keep the first loaded document at hand for ad-hoc inspection.
doc = docs[0]
# print(doc.page_content)

In [4]:
import os

# Recursively collect the path of every .pdf file below ./data.
# os.walk yields directories and files in arbitrary, filesystem-dependent
# order, so the result is sorted to keep document and chunk order (and
# therefore the vector index contents) reproducible from run to run.
pdfs = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk('data')
    for file in files
    if file.endswith('.pdf')
)

In [5]:
# Load every discovered PDF and flatten the per-file results into a single
# list, preserving the order of `pdfs`.
docs = [
    page
    for pdf_path in pdfs
    for page in PyMuPDFLoader(pdf_path).load()
]


### Document Chunking

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
# NOTE(review): with the splitter's default length function these are
# presumably character counts, not tokens — confirm if a token budget matters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split every loaded document into overlapping chunks for embedding.
chunks = text_splitter.split_documents(docs)

In [7]:
# Number of loaded documents vs. number of chunks produced by the splitter.
len(docs), len(chunks)

(32, 152)

In [8]:
# Character length of the first document vs. the first chunk.
len(docs[0].page_content), len(chunks[0].page_content)

(4490, 946)

In [11]:
import tiktoken

# Token counts for the first document and the first chunk.
# NOTE(review): this uses the gpt-4o-mini tokenizer, while the models used
# later are nomic-embed-text and llama3.2 — presumably an approximation only.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

tuple(len(encoding.encode(item.page_content)) for item in (docs[0], chunks[0]))

(1016, 271)

### Embedding

In [12]:
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore


In [13]:
# Embedding model, reached through the Ollama server at the given base URL.
embeddings = OllamaEmbeddings(model="nomic-embed-text", base_url="http://localhost:11434")

# Embed a throwaway string; its length is used below to size the FAISS index.
single_vector = embeddings.embed_query("This is text")

In [14]:
# The index dimension is taken from the probe embedding so it always matches
# whatever embedding model is configured.
embedding_dim = len(single_vector)
index = faiss.IndexFlatL2(embedding_dim)

# Show (vectors stored, vector dimension) for the freshly created index.
index.ntotal, index.d

(0, 768)

In [15]:
# Assemble the vector store around the index created above, starting with an
# empty docstore and an empty index-to-docstore id mapping.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [16]:
# help(vector_store)

In [17]:
# Embed all chunks and add them to the store; keeps the returned ids.
# NOTE(review): running this cell twice presumably adds the same chunks again —
# re-create `index` and `vector_store` before re-running.
ids = vector_store.add_documents(documents=chunks)

In [18]:
# Sanity check: the store was created empty, so after a single add_documents
# call the id mapping must hold exactly one entry per returned id. (The
# original bare `vector_store.index_to_docstore_id` expression did nothing,
# because only a cell's last expression is displayed.)
assert len(vector_store.index_to_docstore_id) == len(ids), (
    "index size does not match the number of ids returned by add_documents"
)
len(ids)

152

In [None]:
# # Store vector database
# db_name = "data_vector"
# vector_store.save_local(db_name)

# # Load vector database
# new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)
# len(new_vector_store.index_to_docstore_id)

152

### Retrieval

In [19]:
question = "what is used to measure the intensity of exercise?"

# Plain similarity search against the vector store.
# The hits get their own names: rebinding `docs` / `doc` here would replace
# the loaded corpus, so re-running the chunking cell afterwards would silently
# split these few hits instead of the full document set.
similar_docs = vector_store.search(query=question, search_type='similarity')

for hit in similar_docs:
    print(hit.page_content)
    print("\n\n")

efficacy influences exercise behavior by boosting confidence in athletic abilities. Higher
exercise self-efficacy contributes to a more enjoyable and enduring engagement in physical
activity, fostering a lifelong habit of sports. This has also verified the results of Gacek [56],
which revealed that individuals with higher self-efficacy engaged in behaviors that promote
health and enhance physical fitness, and they engaged less in behaviors that are detrimental
to their health, such as drinking alcoholic beverages.
The results revealed that controlled precursors have a significant effect toward be-
havioral intention (β: 0.272; p < 0.001), which based on the indicators, were voluntary
evaluation of the benefits of the supplements, matching their consumption with proper
diet and exercise, recognizing their usefulness, and acknowledging the value of a healthy
lifestyle affect consumers’ intent to engage in activities that are aligned with fitness. This



to consume fitness supplements. A

In [20]:
# MMR retriever: consider 100 candidates and return the best 3.
# NOTE(review): lambda_mult=1 is presumably the minimum-diversity setting,
# which makes MMR rank essentially by similarity alone — confirm this is intended.
retrieval = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={
        'k': 3,
        'fetch_k': 100,
        'lambda_mult': 1,
    },
)

In [21]:
# docs = retrieval.invoke(question)
# # for doc in docs:
# #     print(doc.page_content)
# #     print("\n\n")

# Try the retriever on a sample question.
# The result is stored under its own name so the loaded corpus in `docs` is
# not overwritten by a handful of retrieved chunks.
question = "what are the benefits of BCAA supplements?"
retrieved_docs = retrieval.invoke(question)

### RAG with Llama 3.2 on Ollama

In [22]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

from langchain_ollama import ChatOllama

In [23]:
# Chat model reached through the Ollama server at the given base URL.
# NOTE(review): the embeddings cell uses "http://localhost:11434" while this
# uses "http://127.0.0.1:11434" — presumably the same server; consider a single
# shared constant.
model = ChatOllama(
    model="llama3.2:1b",
    base_url="http://127.0.0.1:11434",
)

# Smoke test: send a trivial message to check that the model responds.
model.invoke("hi")

AIMessage(content='How can I help you today?', additional_kwargs={}, response_metadata={'model': 'llama3.2:1b', 'created_at': '2025-04-05T12:33:45.790138644Z', 'done': True, 'done_reason': 'stop', 'total_duration': 8471670428, 'load_duration': 6203803368, 'prompt_eval_count': 26, 'prompt_eval_duration': 1215153854, 'eval_count': 8, 'eval_duration': 999306508, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-6150e669-673a-4bca-b282-ab246fb7522a-0', usage_metadata={'input_tokens': 26, 'output_tokens': 8, 'total_tokens': 34})

In [25]:
# NOTE(review): this value is never used — `prompt` is reassigned to a local
# template string in the next cell before anything reads it. Presumably this
# call also needs network access; consider deleting the cell.
prompt = hub.pull("daethyra/rag-prompt", include_model=True)

In [42]:
# Template text for the RAG prompt; {question} and {context} are filled in by
# the chain at invocation time.
# NOTE(review): "reevant" below is a typo for "relevant". It is left untouched
# here because the text is sent to the model verbatim.
prompt = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
    if possible answer in bullet points. Make sure your answer is reevant to the question and it is answered from the context only
    Question: {question} 
    Context: {context} 
    Answer:
"""

In [43]:
# `prompt` is rebound from the template text to the compiled template, so
# running this cell a second time would pass a ChatPromptTemplate back into
# from_template(), which expects a string. Convert only while it is still text,
# so the cell is safe to re-run.
if isinstance(prompt, str):
    prompt = ChatPromptTemplate.from_template(prompt)

In [44]:
def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined by a blank line, in input order
        (an empty string when ``docs`` is empty).
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# print(format_docs(docs))

In [45]:
# Inputs for the prompt: the question is passed through unchanged, and the
# context is the retriever's hits rendered to text by format_docs.
chain_inputs = {
    "context": retrieval | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build prompt inputs -> fill template -> call model -> plain string.
rag_chain = chain_inputs | prompt | model | StrOutputParser()

In [46]:
# Run the full RAG chain on a sample question (alternative question kept below).
# question = "what are the benefits of BCAA supplements?"
question = "what is used to gain muscle mass?"
output = rag_chain.invoke(question)

In [47]:
# print() so that newlines in the answer render as line breaks.
print(output)

Here are the possible answers in bullet points:

* Caffeine: reduces perceived effort, minimizes fatigue and pain, and improves endurance.
* Creatine monohydrate: used to gain muscle mass and support performance and recovery.

I don't know if creatine supplements contain other substances besides these two.
