In [46]:
import logging
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma

logging.basicConfig(level=logging.INFO)

logging.info("Loading PDF")
loader = PyPDFLoader("./ISTQB_CTFL_Syllabus_v4.0.1.pdf")
pages = loader.load()
logging.info(f"Loaded {len(pages)} pages")

logging.info("Splitting text")
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(pages)
logging.info(f"Split into {len(docs)} documents")

logging.info("Embedding text")
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
vector_db = Chroma(
    collection_name="istqb_ctfl",
    embedding_function=embedding_model,
)
vector_db.add_documents(docs)
logging.info(f"Embedded {len(vector_db)} documents")


INFO:root:Loading PDF
INFO:root:Loaded 78 pages
INFO:root:Splitting text
INFO:root:Split into 250 documents
INFO:root:Embedding text
INFO:root:Embedded 500 documents


In [54]:
docs = vector_db.similarity_search_with_score("automation", k=10)

for i, (doc, score) in enumerate(docs):
    print(f"""
# {i} ##########################
Document {doc.metadata}:
Score: {score}
Length: {len(doc.page_content)}
Content:
--------------------------
{doc.page_content[:100]}...
##########################
""")



# 0 ##########################
Document {'page': 57, 'source': './ISTQB_CTFL_Syllabus_v4.0.1.pdf'}:
Score: 396.69671630859375
Length: 438
Content:
--------------------------
Certified Tester 
Foundation Level 
 
  
 
v4.0.1 Page 58 of 78  2024-09-15 
© International Softwar...
##########################


# 1 ##########################
Document {'page': 57, 'source': './ISTQB_CTFL_Syllabus_v4.0.1.pdf'}:
Score: 396.69671630859375
Length: 438
Content:
--------------------------
Certified Tester 
Foundation Level 
 
  
 
v4.0.1 Page 58 of 78  2024-09-15 
© International Softwar...
##########################


# 2 ##########################
Document {'page': 58, 'source': './ISTQB_CTFL_Syllabus_v4.0.1.pdf'}:
Score: 404.17584228515625
Length: 924
Content:
--------------------------
process(es), CI/CD 
• Collaboration tools – facilitate communication 
• Tools supporting scalability...
##########################


# 3 ##########################
Document {'page': 58, 'source': './ISTQB_CTFL_S