In [1]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
from openai import AzureOpenAI
import warnings

warnings.filterwarnings("ignore")
from dotenv import load_dotenv

print(load_dotenv())

True


In [2]:
from dotenv import load_dotenv
import os

here(".env")
print("Environment variables are loaded:", load_dotenv(dotenv_path=here(".env")))
print("test by reading a variable:", os.getenv("OPENAI_API_VERSION"))
print("test by reading a variable:", os.getenv("OPENAI_API_BASE"))
print("test by reading a variable:", os.getenv("OPENAI_API_KEY"))
print("test by reading a variable:", os.getenv("OPENAI_API_TYPE"))

Environment variables are loaded: True
test by reading a variable: 2024-05-01-preview
test by reading a variable: https://cogopenaiscgjwdllmchat1.openai.azure.com/
test by reading a variable: dc1475322fbf4c07a1469f57242f14b5
test by reading a variable: azure


In [3]:
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [4]:
azure_client = AzureOpenAI(
    api_key=azure_openai_api_key,
    api_version=os.getenv("OPENAI_API_VERSION"),
    azure_endpoint=azure_openai_endpoint,
)
chroma_client = chromadb.PersistentClient(path=str(here("data/chroma")))

**Create a collection for data injection**

Throws an error if the table already exists

In [22]:
if "titanic_small" in [collection_list.name for collection_list in chroma_client.list_collections()]:
    chroma_client.delete_collection(name="titanic_small")

collection = chroma_client.create_collection(name="titanic_small")

In [6]:
file_dir = here("data/for_upload/titanic_small.csv")
df = pd.read_csv(file_dir, nrows=5)

In [7]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


NOTE: Process in chunks if dataset is big.

In [8]:
docs = []
metadatas = []
ids = []
embeddings = []
for index, row in df.iterrows():
    output_str = ""
    # Treat each row as a separate chunk
    for col in df.columns:
        output_str += f"{col}: {row[col]},\n"
    response = azure_client.embeddings.create(input=output_str, model="text-embedding-ada-002")
    embeddings.append(response.data[0].embedding)
    docs.append(output_str)
    metadatas.append({"source": "titanic_small"})
    ids.append(f"id{index}")

In [35]:
# >>> "what's the average age of survivors"
docs, metadatas, ids

(['Survived: 0,\nPclass: 3,\nName: Mr. Owen Harris Braund,\nSex: male,\nAge: 22,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 7.25,\n',
  'Survived: 1,\nPclass: 1,\nName: Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex: female,\nAge: 38,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 71.2833,\n',
  'Survived: 1,\nPclass: 3,\nName: Miss. Laina Heikkinen,\nSex: female,\nAge: 26,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.925,\n',
  'Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n',
  'Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n'],
 [{'source': 'titanic_small'},
  {'source': 'titanic_small'},
  {'source': 'titanic_small'},
  {'source': 'titanic_small'},
  {'source': 'titanic_small'}],
 ['i

In [14]:
print(docs)
print(metadatas)
print(ids)

['Survived: 0,\nPclass: 3,\nName: Mr. Owen Harris Braund,\nSex: male,\nAge: 22,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 7.25,\n', 'Survived: 1,\nPclass: 1,\nName: Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex: female,\nAge: 38,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 71.2833,\n', 'Survived: 1,\nPclass: 3,\nName: Miss. Laina Heikkinen,\nSex: female,\nAge: 26,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.925,\n', 'Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n', 'Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n']
[{'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}]
['id0', 'id1', 'id2', 'i

In [15]:
embeddings[0][:10]

[-0.005476914346218109,
 -0.019898656755685806,
 -0.0172883253544569,
 -0.025160321965813637,
 0.010256828740239143,
 0.03452198579907417,
 -0.013304493390023708,
 -0.0008212808752432466,
 -0.024927988648414612,
 -0.017629992216825485]

In [23]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

Verify the vectorDB creation

In [24]:
print("Number of vectors in vectordb:", collection.count())

Number of vectors in vectordb: 5


### RAG

In [25]:
from openai import AzureOpenAI

In [26]:
model_name = "gpt-35-turbo"
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [27]:
azure_client = AzureOpenAI(
    api_version=os.getenv("OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

**Perform similarity search**

In [29]:
query_texts = "what's the average age of survivors"
response = azure_client.embeddings.create(input=query_texts, model="text-embedding-ada-002")
query_embeddings = response.data[0].embedding

In [31]:
query_embeddings

[-0.004014675039798021,
 -0.012156915850937366,
 0.013934943825006485,
 -0.023128477856516838,
 -0.03338741883635521,
 -0.0034431659150868654,
 -0.006328933872282505,
 0.012573199346661568,
 -0.014196003787219524,
 0.0010945104295387864,
 0.004512099549174309,
 0.03361320123076439,
 -0.019050301983952522,
 0.012121637351810932,
 0.0021590343676507473,
 0.0238199345767498,
 0.03544767573475838,
 0.004279262386262417,
 0.005962039344012737,
 -0.030734488740563393,
 0.008071684278547764,
 0.018796298652887344,
 -0.022521691396832466,
 -0.01715938374400139,
 -0.013631549663841724,
 -0.00918647926300764,
 0.003369081299751997,
 -0.024257386103272438,
 0.037423260509967804,
 -0.028674233704805374,
 0.008163407444953918,
 -0.014901570044457912,
 -0.017794393002986908,
 -0.010273052379488945,
 -0.028335561975836754,
 -0.002753474283963442,
 0.003536653472110629,
 -0.016355037689208984,
 -0.002174909459426999,
 -0.0008285999647341669,
 0.01174063142389059,
 -0.01724405214190483,
 -0.00905242189

**Load the chromaDB collection for vector search**

In [32]:
vectordb = chroma_client.get_collection(name="titanic_small")
vectordb.count()

5

In [44]:
results = vectordb.query(query_embeddings=query_embeddings, n_results=1)  # top_k

results

{'ids': [['id4']],
 'distances': [[0.483916173048672]],
 'metadatas': [[{'source': 'titanic_small'}]],
 'embeddings': None,
 'documents': [['Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

Pass the results to an LLM

In [41]:
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"
print(prompt)

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

User's question: what's the average age of survivors 

 Search results:
 {'ids': [['id4']], 'distances': [[0.483916173048672]], 'metadatas': [[{'source': 'titanic_small'}]], 'embeddings': None, 'documents': [['Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [46]:
model_name = "gpt-35-turbo"
response = azure_client.chat.completions.create(
    model=model_name,
    messages=messages,
)

In [47]:
response.choices[0].message.content

"The average age of survivors from the Titanic dataset is not directly available in the search results provided. However, the provided document contains information about a non-survivor, Mr. William Henry Allen, who was 35 years old. To calculate the average age of survivors, more data points about survivors' ages would be needed. If you have access to a larger dataset containing the ages of survivors, or if there are more search results available, please provide them for a more accurate calculation."

**Fact check**

In [48]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05
