In [None]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
from langchain_google_genai import ChatGoogleGenerativeAI
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
print(load_dotenv())

True


In [None]:
# Read the Gemini API key from the environment (None when GOOGLE_API_KEY is unset).
google_gemini_api_key = os.environ.get("GOOGLE_API_KEY")

In [None]:
# Chat model used for answer generation. The model name and the API key are both
# read from environment variables at construction time.
gemini_client = ChatGoogleGenerativeAI(
    model=os.getenv("gpt_deployment_name"),
    temperature=0.7,
    google_api_key=os.getenv("GOOGLE_API_KEY"),
)

# Chroma client backed by on-disk storage at the path here() resolves for "data/chroma".
chroma_storage_path = str(here("data/chroma"))
chroma_client = chromadb.PersistentClient(path=chroma_storage_path)

**Create a collection for data ingestion**

`create_collection` throws an error if a collection with this name already exists.

In [7]:
# Create the collection that will hold the Titanic rows.
# NOTE: this call errors out when "titanic_small" already exists in the store.
collection = chroma_client.create_collection("titanic_small")

In [8]:
# Resolve the CSV location with here() and read only its first 5 rows.
file_dir = here("data/for_upload/titanic_small.csv")
df = pd.read_csv(filepath_or_buffer=file_dir, nrows=5)

In [9]:
# Display the loaded DataFrame (at most 5 rows, because of nrows=5 above).
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


NOTE: If the dataset is large, process it in chunks.

In [None]:
import spacy

# spaCy model used to embed each chunk (300-dimensional document vectors).
nlp = spacy.load("en_core_web_md")

# Parallel lists: entry i of each list describes the same DataFrame row.
docs, metadatas, ids, embeddings = [], [], [], []

# Every row is its own chunk: a "column: value," listing with one field per line.
for row_label, record in df.iterrows():
    chunk_text = "".join(f"{column}: {record[column]},\n" for column in df.columns)
    docs.append(chunk_text)
    # The document vector is converted to a plain list before being stored.
    embeddings.append(nlp(chunk_text).vector.tolist())
    metadatas.append({"source": "titanic_small"})
    # IDs are derived from the DataFrame index label of the row.
    ids.append(f"id{row_label}")

In [11]:
# Inspect the text chunks built from the DataFrame rows.
docs

['Survived: 0,\nPclass: 3,\nName: Mr. Owen Harris Braund,\nSex: male,\nAge: 22,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 7.25,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex: female,\nAge: 38,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 71.2833,\n',
 'Survived: 1,\nPclass: 3,\nName: Miss. Laina Heikkinen,\nSex: female,\nAge: 26,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.925,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n',
 'Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n']

In [12]:
# Show the per-chunk metadata and the IDs generated alongside the chunks.
print(metadatas)
print(ids)

[{'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}]
['id0', 'id1', 'id2', 'id3', 'id4']


In [13]:
# Peek at the first 10 components of the first chunk's embedding.
embeddings[0][:10]

[-0.5350187420845032,
 0.17263393104076385,
 -0.054363854229450226,
 -0.0893101617693901,
 0.04114704579114914,
 -0.0426018126308918,
 0.0006458513089455664,
 -0.22407247126102448,
 0.13861791789531708,
 0.9180378913879395]

In [14]:
# Store the chunks in the collection: IDs, text, precomputed embeddings and metadata
# are passed as parallel lists.
collection.add(ids=ids, documents=docs, embeddings=embeddings, metadatas=metadatas)

Verify the vectorDB creation

In [15]:
# Report how many records the collection holds after the insert.
print(f"Number of vectors in vectordb: {collection.count()}")

Number of vectors in vectordb: 5


### RAG

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

**Perform similarity search**

In [None]:
# Embed the question with the same spaCy pipeline (`nlp`) that embedded the stored chunks.
# `query_texts` is the spaCy Doc for the question; it is also interpolated into the
# LLM prompt further down, so the name is kept as-is.
query_texts = nlp("what's the average age of survivors")

# Convert the vector to a plain list of floats, exactly as the stored embeddings were
# converted with .tolist() before insertion, so query and documents use the same
# representation when handed to Chroma.
query_embeddings = query_texts.vector.tolist()

**Load the chromaDB collection for vector search**

In [18]:
# Look the collection up by name through the Chroma client, then show its record count.
vectordb = chroma_client.get_collection("titanic_small")
vectordb.count()

5

In [19]:
# Similarity search: fetch only the single best match (n_results is the top_k).
results = vectordb.query(query_embeddings=query_embeddings, n_results=1)

results

{'ids': [['id3']],
 'embeddings': None,
 'documents': [['Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'source': 'titanic_small'}]],
 'distances': [[8.911084175109863]]}

Pass the results to an LLM

In [None]:
from langchain_core.messages import SystemMessage, HumanMessage

# The user turn carries the question plus the raw search-result dictionary.
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"
user_msg = HumanMessage(content=prompt)

# The system turn tells the model how to use those search results.
system_role = SystemMessage(
    content="You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
)

# Send the system message first, then the user message.
response = gemini_client.invoke([system_role, user_msg])

In [None]:
# Show the text of the model's reply.
response.content

'Based on the provided search results, I only have information for one survivor:\n\n*   **Mrs. Jacques Heath (Lily May Peel) Futrelle:** Age 35\n\nTo calculate an average age of survivors, I would need data for more individuals who survived. With only one data point, I can only tell you the age of this specific survivor.'

**Fact check**

In [None]:
# Re-display the loaded rows so the model's answer can be checked against the data.
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05
