In [1]:
from dotenv import load_dotenv, find_dotenv
founddotenv = load_dotenv(find_dotenv(), override=True) 
print("Found .env: %s", founddotenv)

Found .env: %s True


In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader
path = "./pull_requests_summary.csv"

import csv
csv.field_size_limit(10**6)

loader = CSVLoader(file_path=path, encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()

In [3]:
from langchain_openai import OpenAIEmbeddings
#embeddings = OpenAIEmbeddings()
#embeddings = OpenAIEmbeddings(model= "text-embedding-3-large", dimensions=1536) #text-embedding-3-small
embeddings = OpenAIEmbeddings(model= "text-embedding-3-large") #text-embedding-3-small

In [4]:
from langchain_milvus import Milvus
def send_chunks_to_Milvus_Db (subchunk) :   
    vectorstore = Milvus.from_documents(  # or Zilliz.from_documents
        documents=subchunk,
        embedding=embeddings,
        collection_name="milvus_csv",
        connection_args={"uri": "./milvus_csv.db"},
        #drop_old=True,  # Drop the old Milvus collection if it exists
    )
    print("Added chunks to DB : ", len(subchunk))

    #load the store
    vectorstore = Milvus(
        embeddings,
        connection_args={"uri": "./milvus_csv.db"},
        collection_name="milvus_csv"
    )
    print("loaded mulvus")
    
    return vectorstore

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=25)
chunks = text_splitter.split_documents(data)
print("Total Chunks : ", len(chunks))

Total Chunks :  4281


In [13]:
vectorstore = send_chunks_to_Milvus_Db(chunks)

Added chunks to DB :  4526


In [6]:
#how many chunks?
count_max = 2000
quotient = len(chunks) // count_max
remainder = len(chunks) % count_max
if remainder > 0:
    total_chunks = quotient + 1
else :
    total_chunks = quotient
print("Number of Chunk Groups : ", total_chunks)

Number of Chunk Groups :  3


In [7]:
chunk_list = []
for i in range(total_chunks):
    chunk_name = f"chunk-{i}"
    chunk_list.append(chunk_name)
    print("Chunk name : ", chunk_name)
print("Chunk List : ", len(chunk_list))

Chunk name :  chunk-0
Chunk name :  chunk-1
Chunk name :  chunk-2
Chunk List :  3


In [8]:
def insert_into_milvus(chunk_list):
    remaining = len(chunks)
    for name in chunk_list:
        if remaining > count_max :
            name = chunks[:count_max]
            remaining = remaining - count_max
            print("Chunk Size : ", len(name))
            vectorstore = send_chunks_to_Milvus_Db(name)
        else :
            name = chunks[:remaining]
            print("Last Chunk Size : ", len(name))
            vectorstore = send_chunks_to_Milvus_Db(name)
    return vectorstore

In [9]:
vectorstore = insert_into_milvus(chunk_list)

Chunk Size :  2000
Added chunks to DB :  2000
loaded mulvus
Chunk Size :  2000
Added chunks to DB :  2000
loaded mulvus
Last Chunk Size :  281
Added chunks to DB :  281
loaded mulvus


In [41]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

PROMPT_TEMPLATE = """
Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.
Your will be presented with a document containing 3 columns
The first column is "Commit ID". This is the primary key and will be unique
The Second column is "Changes"
The Third column is "Comments"
Each unique "Commit ID" will be a row containing "Changes" and "Comments"
So, if there are 20 commit IDs, there would be 20 rows
Look through the whole document before answering the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

<question>
{question}
</question>

The response should be specific and use statistics or numbers when possible.

Assistant:"""

prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["context", "question"]
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 250})
#retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.5})
#retriever = vectorstore.as_retriever(search_type="mmr",search_kwargs={'k': 50, 'fetch_k': 100})


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [42]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

query = "How many unique Commit ID are present? Name all you find"
#query = "Print all the unique commit ID"
#query = "How many rows are present in the documents?"
#query = "how many columns are present and which is the 2nd column?"
#query = "How many unique Commit ID are present? Which are the 2 most similar commit IDs and why?"
res = rag_chain.invoke(query)
print(res)


There are 10 unique Commit IDs present in the document. Here are the Commit IDs:
1. 97ac5a506b358d29c298aecadd41060d62cecc4b
2. 949be2ac0e957cfbe981da242384424b4786067a
3. 3ecedef3e81ac4104fe3f0a028aafad496ed6b42
4. 0fa74824f9fefec08acfe117866dc587563730d5
5. 39c8f648fdb08e8717237ae4ffca5b181cd3fd33
6. c8ece09049aab49a3563e0270356ff252cab07ec
7. 8a493a8f89f9c9d108922bd83af39f776ee6c76b
8. 1463a176bf652172560c3893c84b61c86e1b47f5
9. 26d2d44b12c28580549b6ade52bb6a4949cfe521
10. 4acc32ac9bf2dde41083bac73dc15f89cc69345d
