In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.settings import Settings
from IPython.display import Markdown, display
import chromadb

In [49]:
# Setting up LLM and Embedding with Chroma DB Connection
chroma_client = chromadb.HttpClient()
chroma_collection = chroma_client.get_or_create_collection("test-3")

llm = Ollama(model="llama3.1")
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

Settings.llm = llm
Settings.embed_model = embed_model

In [50]:
"""
This is later apply for a real world chat, where every file gets an unique ID.
Every file gets an index. Every index will be casted to a query_engine.
The query engines will be attached to a SubQuestionQueryEngine for complex queries and querying through multiple data sources.
"""
# load documents
documents = SimpleDirectoryReader(input_files=["./data/Salary_Data.csv", "./data/Employee_Monthly.csv"]).load_data()

# giving the files unique ids
file_ids = [
    'foo',
    'bar'
]
for i, doc in enumerate(documents):
    doc.metadata = {'file_id': file_ids[i]}

# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# embedding the data files. Will be stored to remote Chroma DB
indexes = [
    VectorStoreIndex.from_documents(documents=[document], storage_context=storage_context, embed_model=embed_model, show_progress=True)
    for document in documents
]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/196 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/127 [00:00<?, ?it/s]

In [51]:
# load from chroma db vector (loads all Indexes, no filtering, goes through all Documents!)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model, show_progress=True)

In [52]:
query_engine = index.as_query_engine()

In [54]:
print(query_engine.query("What is it about?"))

A database of employee profiles with various characteristics.


In [55]:
# now, index a pdf file to the DB
documents = SimpleDirectoryReader(input_files=['./data/PM-Leitfaden.pdf']).load_data()

for doc in documents:
    doc.metadata = {'file_id': 'pdf-sample'}

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

pdf_index = VectorStoreIndex.from_documents(documents=documents, storage_context=storage_context, embed_model=embed_model, show_progress=True)

Parsing nodes:   0%|          | 0/42 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/48 [00:00<?, ?it/s]

In [58]:
"""
Since there is no filtering, the LLM goes through all documents indexed in the DB.
The PDF takes a lot of embeddings which is why we get this response.
"""
print(pdf_index.as_query_engine().query("What is it about?"))

It appears to be a guide for effective project management, outlining best practices for planning, decision-making, communication, and risk management within projects. The document emphasizes the importance of clear goals, well-defined roles and responsibilities, and transparent documentation to ensure successful project outcomes.


In [59]:
# Now, get the indexes of the csv files
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

# the foo and bar files
filters = [
    MetadataFilters(
        filters=[
            MetadataFilter(
                key="file_id",
                operator=FilterOperator.EQ,
                value=f"{file_id}",
            )
        ]
    ) for file_id in file_ids
]

# should show the csvs
vector_store = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model, show_progress=True)
new_indexes = [
    vector_store.as_query_engine(filters=_filters) for _filters in filters
]

In [60]:
for new_index in new_indexes:
    print(new_index.query("What is it about? \n"))

It appears to be a list of employee records with various attributes such as name, age, education level, job title, and salary.
It appears to be a personnel or employee database listing various individuals with their personal details, job titles, and other work-related information.


In [63]:
"""
Get pdf from another chat (maybe?) for querying through this document
"""
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="file_id",
            value="pdf-sample",
            operator=FilterOperator.EQ,
        )
    ]
)

# get index of the pdf
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
pdf_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model, show_progress=True)
pdf_query_engine = pdf_index.as_query_engine(filters=filters)

In [66]:
print(pdf_query_engine.query("Worüber geht es?"))

Es geht um die Planung und Umsetzung eines Projektes.


In [67]:
"""
Now, getting an non existing Index, since there is no file attached to a existing Chat.
"""

filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="file_id",
            value="rapper-drake-conspiracy",
            operator=FilterOperator.EQ,
        )
    ]
)

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
non_existing_index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model, show_progress=True)
query_engine_for_non_existing_index = non_existing_index.as_query_engine(filters=filters)

In [68]:
print(query_engine_for_non_existing_index.query("What is it about? \n"))

Empty Response


In [72]:
"""
Getting an Index of a File by its Metadata.
"""
chroma_collection.get(where={'file_id': {'$eq': 'pdf-sample'}})

{'ids': ['5cbe558b-740e-4737-a8ca-7485d00798e4',
  '2af90597-fdb4-4606-bf47-c767be7eb8e6',
  '89ed9813-b844-4998-85ed-4df56bdc9df9',
  'de8caee8-070f-4f65-8c08-c6913a54eb03',
  'f52a9ca2-3398-4196-8595-1ebd2c3e18bd',
  '8c533372-3fac-4a50-a655-a2c1112adce2',
  '92242745-0c57-458d-bb5d-5d3603ed4e57',
  '07a510c1-ac39-4500-b389-a4360c6c4f72',
  '53d5bf5f-1363-47f7-baf5-bbac91f6e91d',
  '4aa4ebfc-c322-4adb-ac4b-b2f8c1322723',
  '4f05917a-80a1-4b41-9bd8-e3e25c94f704',
  '0c691d3b-edcb-4f89-9913-663ac2ea6232',
  '69ac6650-61f8-4e83-90ec-2a1bc5b6d363',
  'f76f4209-58b0-4a68-8cf9-8e38aec6a1ed',
  '0cbf9483-151d-4423-8806-35d326ca0701',
  'c867dafc-c070-4c99-b0b1-a4572f28bd21',
  'cd44fadf-ca73-4e6d-a6e1-09cf83c2c19f',
  'ceb171ff-2103-4c38-986c-c3c8390e4fa8',
  '2b7e1829-d6f9-442f-844f-7902bc9c78b7',
  '478c7cc4-619c-487b-96ae-99f8f2987add',
  '52073537-874b-43f4-8e6f-4fb5cbd0ac19',
  '9f50c31d-8d80-4af5-9a2b-f0e302e7d565',
  '764b84db-9813-4f99-8a0f-1e03c5ff2cba',
  '38bb7061-16c0-437c-9cc1-

In [73]:
chroma_collection.get(where={'file_id': {'$eq': 'rapper-drake-conspiracy'}})

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'data': None,
 'uris': None,
 'included': ['documents', 'metadatas']}

In [75]:
"""
Index the kfz.txt file in the DB.
Then delete the Index by the metadata.
"""
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

file_id = "kfz-file"
documents = SimpleDirectoryReader(input_files=['./data/kfz.txt']).load_data()
for document in documents:
    document.metadata = {'file_id': file_id}

index = VectorStoreIndex.from_documents(documents=documents, storage_context=storage_context, embed_model=embed_model, show_progress=True)
new_query_for_kfz = index.as_query_engine()

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/33 [00:00<?, ?it/s]

In [76]:
print(new_query_for_kfz.query("What is it about? \n"))

The given text appears to be a combination of two different sources: a legislative document ( possibly a German law or regulation) and a list of personnel data. The first source discusses changes to laws and regulations regarding insurance and liability, while the second source seems to be an employee database with information about various individuals, including their age, education level, job title, and salary.


In [77]:
chroma_collection.get(where={'file_id': {'$eq': file_id}})

{'ids': ['3c13add7-b9bf-40f0-910c-adc8f9b15b94',
  '66e70bb2-197e-4025-bbd2-53a9e6625eb4',
  '683aed8d-9b45-4be9-a555-fe7973390267',
  '3c1dc983-02e0-4c71-bfe3-1bc83256fe2e',
  '163d22d4-e399-4c81-acfe-689170385fe9',
  'e1b7d1c9-de44-4fab-acd3-a8a1510851bc',
  'a98c5711-80be-4e1c-81cc-0c67d69d6b30',
  '842ac846-4c16-4a0f-9e9d-43e441f11d30',
  'e8a81464-eba1-4541-8cd3-1eec0c340141',
  '3405e5f1-2137-41af-a791-15cd255b8e38',
  'def450f4-c24d-4a39-b4a1-4699cf099199',
  '4dd9bddc-8bec-4d45-817e-02bdde5f8284',
  '576709b6-5293-448c-ba55-15485b698c05',
  'f877c043-a898-46cb-97d8-82327d42a15c',
  '87a384c6-2730-4c17-9491-15c95dca10b8',
  'c5505d7b-3f0e-4909-9c40-35465e76fca0',
  'b2fa1eba-875c-4494-beb2-37dbf1bc30d9',
  '174d3bd6-da39-4ade-ac41-0b7d2a70fbd8',
  '77c35fad-4b4b-40ce-a656-f9def1762fac',
  'd83fae1b-c7d4-4907-9cc1-46b5d9d7eefb',
  'a02eaa56-79ad-4c6c-8ff4-cf2ff661e870',
  'be797e15-5bb4-4d5a-9294-b0cd71764b68',
  '3f3fa2e0-cd12-4b8c-a0b8-c90165a26491',
  'ec4128de-e723-4418-a41b-

In [78]:
"""
Delete the kfz.txt file in the DB by its metadata file_id attribute.
"""
chroma_collection.delete(where={'file_id': {'$eq': file_id}})

In [79]:
"""
Tries to find the Index of kfz.txt file in the DB.
"""
chroma_collection.get(where={'file_id': {'$eq': file_id}})

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'data': None,
 'uris': None,
 'included': ['documents', 'metadatas']}