In [11]:
import pandas as pd
import sqlite3
from openai import OpenAI
import os
from dotenv import load_dotenv
import json
from loguru import logger


### Get the database schema

In [7]:
def get_database_schema(db_path):
    """Return the table and column names of a SQLite database.

    Args:
        db_path: Path of the SQLite database, passed to ``sqlite3.connect``.

    Returns:
        dict: Maps every table name listed in ``sqlite_master`` to the list of
        its column names, as reported by ``PRAGMA table_info``.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Fetch all table names in the database
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        table_names = [row[0] for row in cursor.fetchall()]

        schema = {}
        for table_name in table_names:
            # PRAGMA arguments cannot be bound as parameters, so quote the
            # identifier (doubling embedded quotes) to support table names
            # that contain spaces, keywords or punctuation.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"PRAGMA table_info({quoted_name});")
            # The column name is the second field of each table_info row.
            schema[table_name] = [info[1] for info in cursor.fetchall()]
        return schema
    finally:
        # Release the connection even when one of the queries fails.
        conn.close()

# Example usage: read the schema, then flatten it into one row per (table, column) pair.
schema = get_database_schema('data.sqlite')
schema_df = pd.DataFrame(
    [
        (table_name, column_name)
        for table_name, column_names in schema.items()
        for column_name in column_names
    ],
    columns=['Table', 'Column'],
)
# Inspect a single table's columns with e.g. schema['patients']


### Initialize the OpenAI client

In [8]:
# Load variables from ../keys.env, then read OPENAI_API_KEY from the
# environment and build the client shared by the cells below.
dotenv_path = os.path.join('..', 'keys.env')
load_dotenv(dotenv_path)
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [19]:
def _strip_sql_code_fence(text):
    """Return ``text`` stripped of whitespace and of a wrapping Markdown code fence."""
    cleaned = text.strip()
    if len(cleaned) >= 6 and cleaned.startswith("```") and cleaned.endswith("```"):
        inner = cleaned[3:-3]
        first_line, newline, remainder = inner.partition("\n")
        # Drop the opening fence line only when it is empty or a SQL language
        # tag, so a query that starts right after the fence is kept intact.
        if newline and first_line.strip().lower() in ("", "sql", "sqlite", "sqlite3"):
            inner = remainder
        cleaned = inner.strip()
    return cleaned


def generate_sql_query(user_query, schema, api_key):
    """Ask the chat model to translate a natural-language question into SQLite SQL.

    Args:
        user_query: The user's question in natural language.
        schema: Mapping of table name to a list of column names; it is embedded
            in the system prompt.
        api_key: Unused. Kept so existing callers keep working; the request is
            sent through the module-level ``client``.

    Returns:
        str: The model's reply with surrounding whitespace and any wrapping
        Markdown code fence removed. Empty string when the reply has no text.
    """
    # Prepare the schema information as part of the system message
    schema_prompt = "Here is the database schema:\n" + "".join(
        f"Table: {table}\nColumns: {', '.join(columns)}\n"
        for table, columns in schema.items()
    )

    # Chat Completions request: instructions + schema as the system message,
    # the question as the user message.
    messages = [
        {
            "role": "system",
            "content": (
                "You are a SQL expert specialized in SQLite. Based on the user's question and the following database schema, "
                "generate only a valid SQL query. Ensure that Greek names remain in Greek, without transliteration. "
                "Do not provide any explanations, comments, or additional context—only return the SQL query as plain text.\n"
                "When calculating dates or ages, make sure to use SQLite-compatible functions like strftime().\n\n"
                + schema_prompt
            )
        },
        {
            "role": "user",
            "content": f"User query: {user_query}"
        }
    ]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=150,
        temperature=0,
        top_p=1,
    )

    choice = response.choices[0]
    # The reply text may be missing (None); treat that as an empty reply.
    # Models sometimes wrap SQL in a ```sql fence despite the instructions,
    # which would make the statement fail when executed, so remove it.
    sql_query = _strip_sql_code_fence(choice.message.content or "")

    # Calculate cost
    prompt_tokens = response.usage.prompt_tokens
    completion_tokens = response.usage.completion_tokens

    # GPT-3.5-turbo pricing per 1K tokens
    # NOTE(review): these rates are hard-coded; confirm they match the current
    # price list for the model reported in response.model.
    input_cost = (prompt_tokens / 1000) * 0.0015
    output_cost = (completion_tokens / 1000) * 0.002
    total_cost = input_cost + output_cost

    response_dict = {
        "model": response.model,
        "tokens": {
            "prompt": prompt_tokens,
            "completion": completion_tokens,
            "total": response.usage.total_tokens
        },
        "cost": f"${total_cost:.6f}",
    }
    logger.info(f"OpenAI response:\n{json.dumps(response_dict, indent=2, ensure_ascii=False)}")
    logger.info(f"Generated SQL Query:\n{sql_query}")

    if choice.finish_reason == "length":
        # The reply hit max_tokens, so the SQL text is probably cut off.
        logger.warning("The model reply was truncated by max_tokens; the SQL query may be incomplete.")

    return sql_query

# Example: ask (in Greek) for customers aged 20 to 22, with customer number and full name.
sql_query = generate_sql_query(
    "Μια λίστα με τους πελάτες μου από 20 μέχρι 22 χρονών. Αριθμό πελάτη και ονοματεπώνυμο;",
    schema,
    api_key=api_key,
)


[32m2025-10-09 12:49:35.427[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_sql_query[0m:[36m54[0m - [1mOpenAI response:
{
  "model": "gpt-3.5-turbo-0125",
  "tokens": {
    "prompt": 958,
    "completion": 51,
    "total": 1009
  },
  "cost": "$0.001539"
}[0m
[32m2025-10-09 12:49:35.429[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_sql_query[0m:[36m55[0m - [1mGenerated SQL Query:
SELECT id, first_name || ' ' || last_name
FROM patients
WHERE date_of_birth BETWEEN strftime('%Y-%m-%d', 'now', '-22 years') AND strftime('%Y-%m-%d', 'now', '-20 years');[0m


In [5]:
# Display the SQL text currently held in `sql_query`.
# NOTE(review): the saved output of this cell comes from an earlier run
# (execution count 5) and differs from the query logged by cell 19 — re-run to refresh.
sql_query

"SELECT id, first_name, last_name\nFROM patients\nWHERE (strftime('%Y', 'now') - strftime('%Y', date_of_birth)) BETWEEN 20 AND 22;"

In [20]:
def execute_sql_query(sql_query, db_path):
    """Run one SQL statement against a SQLite database and return all rows.

    In this notebook the statement is generated by a language model from
    free-form user input, so the connection is switched to query-only mode:
    statements that would modify the database fail instead of being applied.

    Args:
        sql_query: SQL text to execute (a single statement).
        db_path: Path of the SQLite database, passed to ``sqlite3.connect``.

    Returns:
        list: The rows returned by ``cursor.fetchall()``.

    Raises:
        sqlite3.Error: If the statement is invalid or tries to write.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Reject CREATE/DROP/INSERT/UPDATE/DELETE on this connection.
        conn.execute("PRAGMA query_only = ON;")
        cursor = conn.cursor()
        cursor.execute(sql_query)
        return cursor.fetchall()
    finally:
        # Release the connection even when the statement fails.
        conn.close()


# Example usage: run the generated query against the local database and print the rows.
result = execute_sql_query(sql_query=sql_query, db_path="data.sqlite")
print(result)

[(768, 'ΠΑΠΑΝΙΚΟΛΑΟΥ ΣΟΦΙΑ'), (910, 'ΑΝΔΡΟΜΑΧΗ ΠΕΤΡΟΥ'), (1170, 'ΝΤΟΣΤΗ ΣΟΝΙΑ'), (1171, 'ΣΥΝΝΕΦΑΚΗ ΒΑΣΙΛΙΚΗ'), (1288, 'ΜΕΛΙΝΑ ΠΑΝΤΑΖΟΠΟΥΛΟΥ'), (1323, 'ΕΡΡΙΚΑ ΝΟΥΣΗ'), (1417, 'ΝΑΧHIRE null'), (1467, 'ΒΑΣΙΛΕΙΑ ΤΖΕΛΕΠΗ'), (1468, 'ΒΑΣΙΛΕΙΑ ΤΖΕΛΕΠΗ')]


In [28]:
def synthesize_response(user_query, data, api_key):
    """Ask the chat model to answer ``user_query`` from rows retrieved from the database.

    Args:
        user_query: The user's question in natural language.
        data: The retrieved rows; formatted into the prompt with an f-string.
        api_key: Unused. Kept so existing callers keep working; the request is
            sent through the module-level ``client``.

    Returns:
        str: The model's reply with surrounding whitespace removed. Empty
        string when the reply has no text.
    """
    # Build a conversation prompt where the data retrieved from the database is included
    data_context = f"Retrieved data: {data}\n"

    # Messages for the chat model to synthesize the final response
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that provides concise and accurate answers based on user queries and retrieved data."
        },
        {
            "role": "user",
            "content": f"User query: {user_query}"
        },
        {
            "role": "system",
            "content": data_context  # Provide the retrieved data as context for the model
        }
    ]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=1000,
        temperature=0,
        top_p=1,
    )

    # Calculate cost
    prompt_tokens = response.usage.prompt_tokens
    completion_tokens = response.usage.completion_tokens

    # GPT-3.5-turbo pricing per 1K tokens
    # NOTE(review): these rates are hard-coded; confirm they match the current
    # price list for the model reported in response.model.
    input_cost = (prompt_tokens / 1000) * 0.0015
    output_cost = (completion_tokens / 1000) * 0.002
    total_cost = input_cost + output_cost

    response_dict = {
        "model": response.model,
        "tokens": {
            "prompt": prompt_tokens,
            "completion": completion_tokens,
            "total": response.usage.total_tokens
        },
        "cost": f"${total_cost:.6f}",
    }
    logger.info(f"OpenAI response:\n{json.dumps(response_dict, indent=2, ensure_ascii=False)}")

    # The reply text may be missing (None); return an empty answer instead of
    # failing on .strip().
    final_response = (response.choices[0].message.content or "").strip()
    return final_response

# Example usage: turn the rows fetched above into a natural-language answer.
final_response = synthesize_response(
    "How many customers do I have?",
    result,
    api_key=api_key,
)
print(final_response)


[32m2025-10-09 12:53:40.201[0m | [1mINFO    [0m | [36m__main__[0m:[36msynthesize_response[0m:[36m47[0m - [1mOpenAI response:
{
  "model": "gpt-3.5-turbo-0125",
  "tokens": {
    "prompt": 349,
    "completion": 15,
    "total": 364
  },
  "cost": "$0.000553"
}[0m


Based on the retrieved data, you have a total of 9 customers.


In [None]:
def rag_pipeline(user_query, db_path='data.sqlite', api_key=None):
    """Answer a natural-language question from the SQLite database at ``db_path``.

    Chains the four stages defined in this notebook: read the schema, have the
    model write a SQL query, execute it, and have the model phrase the answer.

    Args:
        user_query: The user's question in natural language.
        db_path: Path of the SQLite database to inspect and query.
        api_key: Forwarded unchanged to the two model-calling stages.

    Returns:
        The value returned by ``synthesize_response``.
    """
    # Stage 1: discover which tables and columns exist.
    db_schema = get_database_schema(db_path)

    # Stages 2 and 3: translate the question into SQL, then run it.
    rows = execute_sql_query(
        generate_sql_query(user_query, db_schema, api_key),
        db_path,
    )

    # Stage 4: let the chat model word the final answer from the rows.
    return synthesize_response(user_query, rows, api_key)



In [None]:
# Example question (Greek): a list of my customers aged 20 to 22.
user_query = "Μια λίστα με τους πελάτες μου από 20 μέχρι 22 χρονών."
# Alternative example (Greek): list duplicate customers — same first and last name — with their number and name as JSON.
# user_query = "Θέλω μια λίστα με τα duplicates με τους διπλούς πελάτες δίπλα στον αριθμό τους. Μπορείς να μου το δώσεις; Τα duplicates είναι άτομα με το ίδιο όνομα και επώνυμο. και θέλω τον αριθμό τους και το όνομα τους σε μορφή json; θέλω και τους δύο πελάτες που είναι duplicates να εμφανίζονται στην λίστα αλλά ο αριθμός τους να μη χρησιμοπποιείτε για την ανεύρεση των διπλοτύπων;"

# Run the full pipeline for the question and print the answer.
response = rag_pipeline(
    user_query,
    db_path="data.sqlite",
    api_key=api_key,
)
print(response)

In [None]:
# Interactive question-answer loop (iterative; runs until the user types 'exit')
def ask_questions():
    """Prompt for questions and print the pipeline's answer to each one.

    Surrounding whitespace in the input is ignored, ``exit`` (any letter case)
    ends the loop, and blank input is skipped without calling the pipeline.
    End-of-input or an interrupt at the prompt also ends the loop. When a
    question fails (for example because the generated SQL is invalid), the
    error is logged and the loop continues with the next prompt.
    """
    while True:
        try:
            user_query = input("Enter your question (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is available, or the prompt was interrupted.
            print("Exiting the question-answer loop.")
            break
        print(user_query)

        # Exit condition
        if user_query.lower() == 'exit':
            print("Exiting the question-answer loop.")
            break

        # Nothing to ask: avoid a pointless (billed) model call.
        if not user_query:
            continue

        # Process the query using the pipeline; one bad query must not end the session.
        try:
            response = rag_pipeline(user_query, db_path="data.sqlite", api_key=api_key)
        except Exception:
            logger.exception("Failed to answer the question")
            print("Sorry, that question could not be answered. Please try rephrasing it.")
            continue
        print(response)  # Print the response for the current query

# Start the interactive loop; it keeps prompting via input() until 'exit' is entered
ask_questions()


In [None]:
import yfinance as yf

# Download hourly SPY price history and preview the most recent closing prices.
# NOTE(review): Yahoo limits how far back intraday bars go — confirm that
# period="max" behaves as intended together with interval="1h".
tickers = ["SPY"]
data = yf.download(tickers=tickers, period="max", interval="1h")
print(data['Close'].tail(5))

In [None]:
from matplotlib.pyplot import plot

# Line plot of the 'Close' column(s) of `data`
# (assumes `data` is the frame downloaded in the previous cell).
plot(data['Close'])
