In [32]:
import pandas as pd
import sqlite3
import chromadb
from chromadb.config import Settings
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
import ollama

In [12]:
# Connect to sqlite database
db_path = "data/gho.db"
#os.remove(db_path) if os.path.exists(db_path) else None # clear db if it exists
conn = sqlite3.connect(db_path)

In [13]:
# Convert csv file to sqlite database
df = pd.read_csv("data/filtered.csv", sep=";")
df.to_sql("diabetes_prevalence", conn, if_exists="replace", index=False)

6567

In [14]:
# Chroma DB vector store
embedding_func = DefaultEmbeddingFunction()
chroma_client = chromadb.EphemeralClient(settings=Settings(anonymized_telemetry=False))
table_collection = chroma_client.get_or_create_collection(name="tables", embedding_function=embedding_func)

In [44]:
# Store table ddls in chroma db
ddls = pd.read_sql_query("SELECT type, sql FROM sqlite_master WHERE sql is not null", conn)
ddls = ddls['sql'].to_list()
table_collection.add(documents=ddls, ids=[f"id{i}" for i in range(len(ddls))])

print(ddls)

Add of existing embedding ID: id0
Insert of existing embedding ID: id0


['CREATE TABLE "diabetes_prevalence" (\n"year" INTEGER,\n  "country" TEXT,\n  "sex" TEXT,\n  "age group" TEXT,\n  "value" REAL\n)']


In [43]:
# Create system prompt for question
user_prompt = "What was the highest diabetes prevalence for males?"

system_prompt = "===Tables \n"
ddls = table_collection.query(query_texts=user_prompt, n_results=10)["documents"][0]
for ddl in ddls:
    system_prompt += ddl + "\n\n"

system_prompt += (
    "===Response Guidelines \n"
    "1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
    "2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
    "3. If the provided context is insufficient, please explain why it can't be generated. \n"
    "4. Please use the most relevant table(s). \n"
    "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
    f"6. Ensure that the output SQL is SQLite-compliant and executable, and free of syntax errors. \n"
)

print(system_prompt)

Number of requested results 10 is greater than number of elements in index 1, updating n_results = 1


===Tables 
CREATE TABLE "diabetes_prevalence" (
"year" INTEGER,
  "country" TEXT,
  "sex" TEXT,
  "age group" TEXT,
  "value" REAL
)

===Response Guidelines 
1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. 
2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql 
3. If the provided context is insufficient, please explain why it can't be generated. 
4. Please use the most relevant table(s). 
5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. 
6. Ensure that the output SQL is SQLite-compliant and executable, and free of syntax errors. 



In [42]:
# Create message log
messages = [
    {'role': 'system', 'content': system_prompt},
    {'role': 'user', 'content': user_prompt},
]

# Prompt LLM
response = ollama.chat(model="phi4", messages=messages)
sql = response["message"]["content"]
print(sql)

```sql
SELECT MAX(value) AS highest_diabetes_prevalence_for_males
FROM diabetes_prevalence
WHERE sex = 'male';
```
