In [3]:
from dotenv import load_dotenv, find_dotenv
founddotenv = load_dotenv(find_dotenv(), override=True) 
print("Found .env: %s", founddotenv)

Found .env: %s True


In [4]:
from langchain_community.document_loaders.csv_loader import CSVLoader
path = "./pull_requests_summary.csv"

import csv
csv.field_size_limit(10**6)

loader = CSVLoader(file_path=path, encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()

In [5]:
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=True) #mxbai-embed-large

In [12]:
#from langchain.vectorstores.chroma import Chroma
from langchain_chroma import Chroma
def send_chunks_to_chroma_Db (subchunk) :   
    vectorstore = Chroma.from_documents(
        documents = subchunk,
        #collection_name="myRAG-CSV",
        embedding = embeddings,
        persist_directory="./chroma_db_csv_ollama", 
        )
    #vectorstore.persist()
    print("Added chunks to DB : ", len(subchunk))

    #vectorstore = Chroma(persist_directory="./chroma_db_csv", embedding_function=embeddings)
    print("loaded chromadb")

    return vectorstore

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=25)
chunks = text_splitter.split_documents(data)
print("Total Chunks : ", len(chunks))

Total Chunks :  5718


In [8]:
counter = 0
for chunk in chunks:
    if "Commit ID" in chunk.page_content:
        counter = counter+1
        print("\nCounter : ", counter, chunk)


Counter :  1 page_content='Commit ID: 1463a176bf652172560c3893c84b61c86e1b47f5' metadata={'source': './pull_requests_summary.csv', 'row': 0}

Counter :  2 page_content='Commit ID: 39c8f648fdb08e8717237ae4ffca5b181cd3fd33
Changes: diff --git a/gradle.properties b/gradle.properties
index 63e41475e..541cd2205 100644
--- a/gradle.properties
+++ b/gradle.properties' metadata={'source': './pull_requests_summary.csv', 'row': 1}

Counter :  3 page_content='Commit ID: 949be2ac0e957cfbe981da242384424b4786067a' metadata={'source': './pull_requests_summary.csv', 'row': 2}

Counter :  4 page_content='Commit ID: 4acc32ac9bf2dde41083bac73dc15f89cc69345d
Changes: diff --git a/build.gradle b/build.gradle
index 71e710c22..4ef6125d0 100644
--- a/build.gradle
+++ b/build.gradle' metadata={'source': './pull_requests_summary.csv', 'row': 3}

Counter :  5 page_content='Commit ID: 97ac5a506b358d29c298aecadd41060d62cecc4b' metadata={'source': './pull_requests_summary.csv', 'row': 4}

Counter :  6 page_content

In [14]:
#how many chunks?
count_max = 2000
quotient = len(chunks) // count_max
remainder = len(chunks) % count_max
if remainder > 0:
    total_chunks = quotient + 1
else :
    total_chunks = quotient
print("Number of Chunk Groups : ", total_chunks)

Number of Chunk Groups :  3


In [15]:
chunk_list = []
for i in range(total_chunks):
    chunk_name = f"chunk-{i}"
    chunk_list.append(chunk_name)
    print("Chunk name : ", chunk_name)
print("Chunk List : ", len(chunk_list))

Chunk name :  chunk-0
Chunk name :  chunk-1
Chunk name :  chunk-2
Chunk List :  3


In [16]:
def insert_into_chroma(chunk_list):
    remaining = len(chunks)
    for name in chunk_list:
        if remaining > count_max :
            name = chunks[:count_max]
            remaining = remaining - count_max
            print("Chunk Size : ", len(name))
            vectorstore = send_chunks_to_chroma_Db(name)
            print("sent chunks to DB : ", len(name))
        else :
            name = chunks[:remaining]
            print("Last Chunk Size : ", len(name))
            vectorstore = send_chunks_to_chroma_Db(name)
            print("sent chunks to DB : ", len(name))
    return vectorstore

In [18]:
vectorstore = insert_into_chroma(chunk_list)

Chunk Size :  2000


OllamaEmbeddings: 100%|██████████| 2000/2000 [01:06<00:00, 29.96it/s]


Added chunks to DB :  2000
loaded chromadb
sent chunks to DB :  2000
Chunk Size :  2000


OllamaEmbeddings: 100%|██████████| 2000/2000 [01:00<00:00, 32.90it/s]


Added chunks to DB :  2000
loaded chromadb
sent chunks to DB :  2000
Last Chunk Size :  1718


OllamaEmbeddings: 100%|██████████| 1718/1718 [00:51<00:00, 33.30it/s]


Added chunks to DB :  1718
loaded chromadb
sent chunks to DB :  1718


In [19]:
print(len(vectorstore.get()['documents']))
print(len(vectorstore.get()['ids']))
print(vectorstore.get().keys())
print(len(vectorstore.get()['metadatas']))
print(vectorstore._collection.count())

5718
5718
dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data', 'included'])
5718
5718


In [41]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama

llm = ChatOllama(model="llama3.1", temperature=0)

PROMPT_TEMPLATE = """
Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.
Your will be presented with a document containing 3 columns
The first column is "Commit ID". This is the primary key and will be unique
The Second column is "Changes"
The Third column is "Comments"
Each unique "Commit ID" will be a row containing "Changes" and "Comments"
So, if there are 20 commit IDs, there would be 20 rows
Look through the whole document before answering the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

<question>
{question}
</question>

The response should be specific and use statistics or numbers when possible.

Assistant:"""

prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["context", "question"]
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})
#retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.5})
#retriever = vectorstore.as_retriever(search_type="mmr",search_kwargs={'k': 50, 'fetch_k': 100})

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [42]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

query = "How many unique Commit ID are present? print them"
#query = "Print all the unique commit ID"
#query = "How many rows are present in the documents?"
#query = "how many columns are present and which is the 2nd column?"
#query = "How many unique Commit ID are present? Which are the 2 most similar commit IDs and why?"

res = rag_chain.invoke(query)
print(res)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.14it/s]


Based on the provided document, I have counted the unique "Commit ID" values. There are 9 unique "Commit ID" values:

1. 3ecedef3e81ac4104fe3f0a028aafad496ed6b42
2. 1463a176bf652172560c3893c84b61c86e1b47f5
3. 26d2d44b12c28580549b6ade52bb6a4949cfe521
4. c1fcce1636d4e6a240fbaf6ffa6c30058c08e69e
5. 4acc32ac9bf2dde41083bac73dc15f89cc69345d
6. 949be2ac0e957cfbe981da242384424b4786067a
7. 97ac5a506b358d29c298aecadd41060d62cecc4b
8. 0fa74824f9fefec08acfe117866dc587563730d5
9. 39c8f648fdb08e8717237ae4ffca5b181cd3fd33


In [23]:
vectorstore.delete_collection()

In [24]:
print(len(vectorstore.get()['documents']))
print(len(vectorstore.get()['ids']))
print(vectorstore.get().keys())
print(len(vectorstore.get()['metadatas']))
print(vectorstore._collection.count())

ValueError: Chroma collection not initialized. Use `reset_collection` to re-create and initialize the collection. 