In [6]:
import re

def ddl_to_label_dict(ddl, reserved_words=['if', 'not', 'exists', 'table', 'create', 'primary', 'constraint', 'foreign', 'references']):
    """Build a ``{column_name: 'Title Case Label'}`` dict from CREATE TABLE DDL.

    The text is scanned for parenthesised lists. Each list is split on the
    commas that sit directly inside it (commas inside nested parentheses or
    inside quotes are ignored) and the first identifier of every entry is
    taken as a column name. Keywords that merely follow a column name
    (``DEFAULT 0``, ``NOT NULL``, ``ON DELETE`` ...) are therefore never
    mistaken for columns.

    Args:
        ddl: DDL text, possibly holding several CREATE TABLE statements.
            Empty or ``None`` input yields ``{}``; a missing final ``)`` is
            tolerated.
        reserved_words: Identifiers (compared case-insensitively) that must
            never be reported as columns. The default list is only read,
            never mutated.

    Returns:
        dict mapping each column name, as written in the DDL, to its label
        (``task_name`` -> ``Task Name``), in declaration order.
    """
    if not ddl:
        return {}

    # Entries opening with one of these are table-level constraints, not columns.
    skipped = {'constraint', 'primary', 'foreign', 'unique', 'check'}
    skipped.update(word.strip().lower() for word in reserved_words if word.strip())

    def to_title_case(string):
        """Convert snake_case to space-separated capitalised words."""
        return ' '.join(word.capitalize() for word in string.split('_'))

    def split_entries(text):
        """Return the comma-separated entries of every top-level (...) list."""
        entries, current = [], []
        depth, quote = 0, None
        for ch in text:
            if quote is not None:
                # Inside a quoted literal/identifier: only its closing quote matters.
                if ch == quote:
                    quote = None
            elif ch in "'\"`":
                quote = ch
            elif ch == '(':
                depth += 1
                if depth == 1:
                    continue  # the list's own opening parenthesis is not content
            elif ch == ')' and depth > 0:
                depth -= 1
                if depth == 0:
                    entries.append(''.join(current))
                    current = []
                    continue
            elif ch == ',' and depth == 1:
                entries.append(''.join(current))
                current = []
                continue
            if depth > 0:
                current.append(ch)
        if current:
            # Unbalanced DDL (no closing parenthesis): keep the trailing entry.
            entries.append(''.join(current))
        return entries

    labels = {}
    for entry in split_entries(ddl):
        # First identifier of the entry, optionally quoted as "x", `x` or [x].
        match = re.match(r'\s*["`\[]?(\w+)', entry)
        if match is None:
            continue
        col = match.group(1)
        if col.lower() not in skipped:
            labels[col] = to_title_case(col)
    return labels


# Example usage: run the parser on a sample CREATE TABLE statement.
# Note: this is a plain (non-f) string, so `{TABLE_H7_TASK}` stays as literal text.
ddl = """
CREATE TABLE IF NOT EXISTS {TABLE_H7_TASK} (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    task_name TEXT NOT NULL,
    description TEXT,
    task_group TEXT DEFAULT 'Personal' CHECK(task_group IN ('','Work', 'Personal')),
    is_urgent TEXT DEFAULT 'N' CHECK(is_urgent IN ('Y', 'N')),
    is_important TEXT DEFAULT 'N' CHECK(is_important IN ('Y', 'N')),
    status TEXT DEFAULT '' CHECK(status IN ('', 'ToDo', 'Doing', 'Done')),
    pct_completed TEXT DEFAULT '0%' CHECK(pct_completed IN ('0%', '25%', '50%', '75%', '100%')),
    due_date TEXT,
    category TEXT DEFAULT '' CHECK(category IN ('', 'learning', 'research', 'project', 'fun')),
    note TEXT,
    created_by TEXT,
    created_at TEXT,
    updated_by TEXT,
    updated_at TEXT
)
"""

# Map the sample DDL's column names to display labels and print the mapping.
result = ddl_to_label_dict(ddl)
print(result)

{'id': 'Id', 'task_name': 'Task Name', 'description': 'Description', 'task_group': 'Task Group', 'is_urgent': 'Is Urgent', 'is_important': 'Is Important', 'status': 'Status', 'pct_completed': 'Pct Completed', 'due_date': 'Due Date', 'category': 'Category', 'note': 'Note', 'created_by': 'Created By', 'created_at': 'Created At', 'updated_by': 'Updated By', 'updated_at': 'Updated At'}


In [4]:
# Collect the lowercase leading keyword of every non-blank line of a sample
# constraint clause (first whitespace-separated token per line).
s = """
CONSTRAINT fk_departments
    FOREIGN KEY (department_id)
    REFERENCES
"""
l = [tokens[0].lower() for tokens in map(str.split, s.split("\n")) if tokens]
print(l)

['constraint', 'foreign', 'references']


In [7]:
# Table names; interpolated into the DDL below and used as the keys of ddl_statements.
TABLE_H7_TASK = "habits7_task"
TABLE_H7_USER = "habits7_user"
# CREATE TABLE statements keyed by table name. Each statement is an f-string so
# the table name is spelled in one place only.
ddl_statements = {
    TABLE_H7_USER: f'''
    CREATE TABLE IF NOT EXISTS {TABLE_H7_USER} (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        email TEXT NOT NULL UNIQUE,
        password TEXT NOT NULL,
        username TEXT,
        is_admin INTEGER DEFAULT 0 CHECK(is_admin IN (0, 1)),
        is_active INTEGER DEFAULT 1 CHECK(is_active IN (0, 1)),
        profile TEXT,
        note TEXT,
        created_by TEXT,
        created_at TEXT,
        updated_by TEXT,
        updated_at TEXT
    )
    ''',
    # The closing parenthesis below was missing, which left the statement
    # incomplete and impossible to execute.
    TABLE_H7_TASK: f'''
    CREATE TABLE IF NOT EXISTS {TABLE_H7_TASK} (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        task_name TEXT NOT NULL,
        description TEXT,
        task_group TEXT DEFAULT 'Personal' CHECK(task_group IN ('','Work', 'Personal')),
        is_urgent TEXT DEFAULT 'N' CHECK(is_urgent IN ('Y', 'N')),
        is_important TEXT DEFAULT 'N' CHECK(is_important IN ('Y', 'N')),
        status TEXT DEFAULT '' CHECK(status IN ('', 'ToDo', 'Doing', 'Done')),
        pct_completed TEXT DEFAULT '0%' CHECK(pct_completed IN ('0%', '25%', '50%', '75%', '100%')),
        due_date TEXT,
        category TEXT DEFAULT '' CHECK(category IN ('', 'learning', 'research', 'project', 'fun')),
        note TEXT,
        created_by TEXT,
        created_at TEXT,
        updated_by TEXT,
        updated_at TEXT
    )
    ''',
}

In [8]:
def ddl_to_label_dict(ddl, reserved_words=['if', 'not', 'exists', 'table', 'create', 'primary', 'constraint', 'foreign', 'references', 'DEFAULT']):
    """Build a ``{column_name: 'Title Case Label'}`` dict from CREATE TABLE DDL.

    Every parenthesised list in the text is split on the commas that sit
    directly inside it (commas within nested parentheses or quotes do not
    split), and the leading identifier of each entry is reported as a column.
    Because only the leading identifier is considered, trailing keywords such
    as ``DEFAULT 0``, ``NOT NULL`` or ``ON DELETE`` cannot show up as columns.

    Args:
        ddl: DDL text, possibly holding several CREATE TABLE statements.
            Empty or ``None`` input yields ``{}``; a missing final ``)`` is
            tolerated.
        reserved_words: Identifiers (compared case-insensitively) that must
            never be reported as columns. The default list is only read,
            never mutated.

    Returns:
        dict mapping each column name, as written in the DDL, to its label
        (``created_at`` -> ``Created At``), in declaration order.
    """
    if not ddl:
        return {}

    # Entries opening with one of these are table-level constraints, not columns.
    excluded = {'constraint', 'primary', 'foreign', 'unique', 'check'}
    excluded.update(word.strip().lower() for word in reserved_words if word.strip())

    def to_title_case(string):
        """Convert snake_case to space-separated capitalised words."""
        return ' '.join(word.capitalize() for word in string.split('_'))

    def split_entries(text):
        """Return the comma-separated entries of every top-level (...) list."""
        entries, current = [], []
        depth, quote = 0, None
        for ch in text:
            if quote is not None:
                # Inside a quoted literal/identifier: only its closing quote matters.
                if ch == quote:
                    quote = None
            elif ch in "'\"`":
                quote = ch
            elif ch == '(':
                depth += 1
                if depth == 1:
                    continue  # the list's own opening parenthesis is not content
            elif ch == ')' and depth > 0:
                depth -= 1
                if depth == 0:
                    entries.append(''.join(current))
                    current = []
                    continue
            elif ch == ',' and depth == 1:
                entries.append(''.join(current))
                current = []
                continue
            if depth > 0:
                current.append(ch)
        if current:
            # Unbalanced DDL (no closing parenthesis): keep the trailing entry.
            entries.append(''.join(current))
        return entries

    labels = {}
    for entry in split_entries(ddl):
        # First identifier of the entry, optionally quoted as "x", `x` or [x].
        match = re.match(r'\s*["`\[]?(\w+)', entry)
        if match is None:
            continue
        col = match.group(1)
        if col.lower() not in excluded:
            labels[col] = to_title_case(col)
    return labels

# Per-table label dictionaries: table name -> {column name: display label}.
COL_LABELS = {
    table_name: ddl_to_label_dict(statement)
    for table_name, statement in ddl_statements.items()
}


In [9]:
# Show the per-table label dictionaries.
# NOTE(review): the saved output below still lists a 'DEFAULT' key for habits7_user even though
# the In [8] definition lists 'DEFAULT' among its reserved words — the output looks stale; re-run to confirm.
COL_LABELS

{'habits7_user': {'id': 'Id',
  'email': 'Email',
  'password': 'Password',
  'username': 'Username',
  'is_admin': 'Is Admin',
  'DEFAULT': 'Default',
  'is_active': 'Is Active',
  'profile': 'Profile',
  'note': 'Note',
  'created_by': 'Created By',
  'created_at': 'Created At',
  'updated_by': 'Updated By',
  'updated_at': 'Updated At'},
 'habits7_task': {'id': 'Id',
  'task_name': 'Task Name',
  'description': 'Description',
  'task_group': 'Task Group',
  'is_urgent': 'Is Urgent',
  'is_important': 'Is Important',
  'status': 'Status',
  'pct_completed': 'Pct Completed',
  'due_date': 'Due Date',
  'category': 'Category',
  'note': 'Note',
  'created_by': 'Created By',
  'created_at': 'Created At',
  'updated_by': 'Updated By',
  'updated_at': 'Updated At'}}

In [10]:
prompt_json = [{'role': 'system', 'content': 'You are a SQLite expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Tables \nCREATE TABLE "t_revenue" (\n"rank" INTEGER,\n  "name" TEXT,\n  "symbol" TEXT,\n  "revenue_ttm" INTEGER,\n  "price_gbp" REAL,\n  "country" TEXT\n)\n\nCREATE TABLE "t_earnings" (\n"rank" INTEGER,\n  "name" TEXT,\n  "symbol" TEXT,\n  "earnings_ttm" REAL,\n  "price_gbp" REAL,\n  "country" TEXT\n)\n\nCREATE TABLE "t_market_cap" (\n"rank" INTEGER,\n  "name" TEXT,\n  "symbol" TEXT,\n  "marketcap" REAL,\n  "price_gbp" REAL,\n  "country" TEXT\n)\n\nCREATE TABLE "t_p_e_ratio" (\n"rank" INTEGER,\n  "name" TEXT,\n  "symbol" TEXT,\n  "pe_ratio_ttm" REAL,\n  "price_gbp" REAL,\n  "country" TEXT\n)\n\nCREATE TABLE "t_dividend_yield" (\n"rank" INTEGER,\n  "name" TEXT,\n  "symbol" TEXT,\n  "dividend_yield_ttm" REAL,\n  "price_gbp" REAL,\n  "country" TEXT\n)\n\nCREATE TABLE "t_country_region" (\n"country" TEXT,\n  "code_2" TEXT,\n  "code_3" TEXT,\n  "region" TEXT,\n  "sub_region" TEXT,\n  "country_name" TEXT\n)\n\n\n===Additional Context \n\n\nIntroduction:\nThis dataset ranks top companies in the world.\nIts analysis delves into the financial performance of top companies by examining key metrics such as revenue, earnings, market capitalisation, P/E ratio, and dividend yield. By comparing these metrics, we gain a comprehensive understanding of a company\'s scale, profitability, market value, and growth potential. Through visualisations, the analysis also explores correlations between these metrics and offers insights into country-level performance, highlighting economic dominance across various sectors. This holistic approach provides a multi-dimensional view of global financial powerhouses, investor confidence, and regional economic trends.\n\nKey Metrics Used:\n1. 
Revenue (Trailing Twelve Months - TTM):\nTable Name: t_revenue ;\nDefinition: This is the total income generated by a company from its operations in the last twelve months ;\nPotential Insights: High revenue often indicates market dominance or high sales volume. Comparing revenues can reveal which companies are the largest in terms of business volume.\n\n2. Earnings (TTM):\nTable Name: t_earnings\nDefinition: This refers to the company\'s profit after taxes and expenses over the trailing twelve months.\nPotential Insights: Companies with high earnings are more efficient at converting revenue into profit, suggesting better profitability or cost management. A comparison of earnings provides insight into profitability rather than just scale.\n\n3. Market Capitalisation (Market Cap):\nTable Name: t_market_cap\nDefinition: Market cap is the total value of a company\'s outstanding shares of stock, calculated as stock price multiplied by the number of shares. It indicates the company’s size in the stock market.\nPotential Insights: High market cap usually indicates investor confidence in the company. Comparing market cap among the top 15 companies reveals their relative size in financial markets.\n\n4. P/E Ratio (TTM):\nTable Name: t_p_e_ratio\nDefinition: Price-to-Earnings (P/E) ratio measures a company\'s current share price relative to its per-share earnings.\nPotential Insights: A high P/E ratio may indicate that investors expect high growth in the future, while a low P/E ratio could imply undervaluation or scepticism about growth. Companies are compared by their growth prospects or current valuation.\n\n5. Dividend Yield (TTM):\nTable Name: t_dividend_yield\nDefinition: Dividend yield is a financial ratio that shows how much a company pays out in dividends each year relative to its share price.\nPotential Insights: High dividend yield may indicate that a company returns more income to shareholders. It’s particularly useful for income-focused investors.\n\n6. 
Country-Region Map:\nTable Name: t_country_region\nDefinition: This table translate country to region\n\n\n===Response Guidelines \n1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n3. If the provided context is insufficient, please explain why it can\'t be generated. \n4. Please use the most relevant table(s). \n5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n6. Ensure that the output SQL is SQLite-compliant and executable, and free of syntax errors. \n'}, {'role': 'user', 'content': "\n                For this question: which table stores Company Market Cap, \n                your generated SQL statement:  The table that stores information about a company's market cap is `t_market_cap`. results in the following exception: [ERROR-SQL] the generated SQL :  The table that stores information about a company's market cap is `t_market_cap`.\n does not starts with ('select','with') .\n                Can you please fix the error and re-generate the SQL statement?\n            "}, {'role': 'assistant', 'content': "SELECT 't_market_cap' AS table_name"}, {'role': 'user', 'content': "\n                For this question: \n                For this question: which table stores Company Dividend Yield, \n                your generated SQL statement: The LLM is not allowed to see the data in your database. Your question requires database introspection to generate the necessary SQL. Please set allow_llm_to_see_data=True to enable this. results in the following exception: [ERROR-SQL] the generated SQL : The LLM is not allowed to see the data in your database. 
Your question requires database introspection to generate the necessary SQL. Please set allow_llm_to_see_data=True to enable this.\n does not starts with ('select','with') .\n                Can you please fix the error and re-generate the SQL statement?\n            , \n                your generated SQL statement:  The table that stores information about a company's dividend yield is `t_dividend_yield`. results in the following exception: [ERROR-SQL] the generated SQL :  The table that stores information about a company's dividend yield is `t_dividend_yield`.\n does not starts with ('select','with') .\n                Can you please fix the error and re-generate the SQL statement?\n            "}, {'role': 'assistant', 'content': " SELECT name FROM sqlite_master WHERE type='table'"}, {'role': 'user', 'content': "\n                For this question: which table stores Company P/E Ratio, \n                your generated SQL statement: The LLM is not allowed to see the data in your database. Your question requires database introspection to generate the necessary SQL. Please set allow_llm_to_see_data=True to enable this. results in the following exception: [ERROR-SQL] the generated SQL : The LLM is not allowed to see the data in your database. Your question requires database introspection to generate the necessary SQL. 
Please set allow_llm_to_see_data=True to enable this.\n does not starts with ('select','with') .\n                Can you please fix the error and re-generate the SQL statement?\n            "}, {'role': 'assistant', 'content': "SELECT name FROM sqlite_master WHERE type='table'"}, {'role': 'user', 'content': 'Can you list all tables in the SQLite database catalog?'}, {'role': 'assistant', 'content': "SELECT name FROM sqlite_master WHERE type='table'"}, {'role': 'user', 'content': "\n                For this question: which table stores Company Revenue, \n                your generated SQL statement:  The table that stores information about a company's revenue is `t_revenue`. results in the following exception: [ERROR-SQL] the generated SQL :  The table that stores information about a company's revenue is `t_revenue`.\n does not starts with ('select','with') .\n                Can you please fix the error and re-generate the SQL statement?\n            "}]

In [13]:
# Build a shortened preview of every message for display: the first
# PREVIEW_CHARS characters of the content followed by "...".
# New dicts are created instead of assigning into the originals, so prompt_json
# keeps its full message texts and re-running this cell gives the same result.
PREVIEW_CHARS = 50
out_json = [
    {**m, 'content': m['content'][:PREVIEW_CHARS] + "..."}
    for m in prompt_json
]

In [14]:
# Inspect the message list held in out_json.
out_json

[{'role': 'system',
  'content': 'You are a SQLite expert. Please help to generate a...'},
 {'role': 'user',
  'content': '\n                For this question: which table st...'},
 {'role': 'assistant', 'content': "SELECT 't_market_cap' AS table_name..."},
 {'role': 'user',
  'content': '\n                For this question: \n             ...'},
 {'role': 'assistant',
  'content': " SELECT name FROM sqlite_master WHERE type='table'..."},
 {'role': 'user',
  'content': '\n                For this question: which table st...'},
 {'role': 'assistant',
  'content': "SELECT name FROM sqlite_master WHERE type='table'..."},
 {'role': 'user',
  'content': 'Can you list all tables in the SQLite database cat...'},
 {'role': 'assistant',
  'content': "SELECT name FROM sqlite_master WHERE type='table'..."},
 {'role': 'user',
  'content': '\n                For this question: which table st...'}]

In [15]:
def keep_latest_messages(prompt_json):
    """Reduce a chat history to the most recent message of each role.

    Args:
        prompt_json: Sequence of message dicts, each with a 'role' key.

    Returns:
        A list holding the last 'system', 'assistant' and 'user' message (in
        that fixed order). Roles absent from the input are omitted; messages
        with any other role are dropped.
    """
    newest_by_role = {}
    # Scan from the end so the first message recorded for a role is its latest.
    for msg in reversed(prompt_json):
        newest_by_role.setdefault(msg['role'], msg)

    ordered = []
    for wanted in ('system', 'assistant', 'user'):
        if wanted in newest_by_role:
            ordered.append(newest_by_role[wanted])
    return ordered

In [17]:
# Reduce the previewed conversation to the latest message per role, then display it.
updated_prompt = keep_latest_messages(out_json)
updated_prompt

[{'role': 'system',
  'content': 'You are a SQLite expert. Please help to generate a...'},
 {'role': 'assistant',
  'content': "SELECT name FROM sqlite_master WHERE type='table'..."},
 {'role': 'user',
  'content': '\n                For this question: which table st...'}]