# Ollama RAG Example

## Import/Setup


In [None]:
import logging
from datetime import datetime
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma

logging.basicConfig(level=logging.INFO)


## Loading PDF file(s)


In [None]:
logging.info("Loading PDF")
loader = PyPDFLoader("./ISTQB_CTFL_Syllabus_v4.0.1.pdf")
pages = loader.load()
logging.info(f"Loaded {len(pages)} pages")


## Splitting the pages into chunks


In [None]:
logging.info("Splitting text")
text_splitter = CharacterTextSplitter(
    separator=" ",
    is_separator_regex=True,
    chunk_size=1000,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(pages)
logging.info(f"Split into {len(docs)} documents")


INFO:root:Loading PDF
INFO:root:Loaded 78 pages
INFO:root:Splitting text
INFO:root:Split into 255 documents


## Loading the chunks into the vector DB


In [68]:
COLLECTION_NAME = f"istqb_ctfl_{datetime.now().strftime('%Y%m%d%H%M%S')}"

logging.info("Embedding text")
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
vector_db = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_model,
)
vector_db.add_documents(docs)
logging.info(f"Embedded {len(vector_db)} documents")


INFO:root:Embedding text
INFO:root:Embedded 510 documents


## Index search using the vector DB


In [69]:
docs = vector_db.similarity_search_with_score("boundary value analysis", k=10)

# sort by score descending
docs = sorted(docs, key=lambda x: x[1], reverse=True)

for i, (doc, score) in enumerate(docs):
    print(f"""
# {i} ##########################
Document {doc.metadata}:
Score: {score}
Length: {len(doc.page_content)}
Content:
--------------------------
{doc.page_content[:100]}...
##########################
""")



# 0 ##########################
Document {'page': 39, 'source': './ISTQB_CTFL_Syllabus_v4.0.1.pdf'}:
Score: 382.41259765625
Length: 996
Content:
--------------------------
coverage items are the equivalence partitions. To achieve 100% coverage with this test 
technique, t...
##########################


# 1 ##########################
Document {'page': 39, 'source': './ISTQB_CTFL_Syllabus_v4.0.1.pdf'}:
Score: 382.41259765625
Length: 996
Content:
--------------------------
coverage items are the equivalence partitions. To achieve 100% coverage with this test 
technique, t...
##########################


# 2 ##########################
Document {'page': 39, 'source': './ISTQB_CTFL_Syllabus_v4.0.1.pdf'}:
Score: 367.7126770019531
Length: 471
Content:
--------------------------
exercised, divided by the total number of identified 
boundary values and their neighbors, and is ex...
##########################


# 3 ##########################
Document {'page': 39, 'source': './ISTQB_CTFL_Syllabus