# Libraries

## Install libraries

In [None]:
#pip install pandas openpyxl sqlalchemy psycopg2-binary flair langchain langchain-community langchain-openai fuzzywuzzy python-Levenshtein nltk tqdm

## Import libraries

In [None]:
import pandas as pd
from sqlalchemy import create_engine
from langchain_community.utilities import SQLDatabase
from langchain.chains import create_sql_query_chain
from langchain_openai import ChatOpenAI
from typing import List
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# SQL Connection

In [None]:
# Set the database URL
DATABASE_URL = 'postgresql+psycopg2://thodoris:B4AqjEYBhDPXDHmuSW8MYgfdPp5Nob88@dpg-cpmtb4g8fa8c73aoakig-a.frankfurt-postgres.render.com/capstone_fs'

# Create a SQLAlchemy engine
engine = create_engine(DATABASE_URL)

# Function to load data from the database
def load_data_from_db(query):
    with engine.connect() as connection:
        result = pd.read_sql_query(query, connection)
    return result

# Load Excel files in database

In [None]:
'''# Read the uploaded Excel file into a DataFrame
file_name = '/content/Mock up book.xls'  # uploaded to colab this will be deleted
df = pd.read_excel(file_name)

# Upload the DataFrame to PostgreSQL
table_name = 'mock_data'
df.to_sql(table_name, engine, if_exists='replace', index=False)

print(f"Uploaded {file_name} to {table_name} table in PostgreSQL.")'''

# Useful functions for querying SQL

In [None]:
# Lists all tables in database
def get_all_tables():
    query = """
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
    """
    with engine.connect() as connection:
        result = pd.read_sql_query(query, connection)
    return result

# Lists the columns in a table and their data type
def inspect_columns(table_name):
    query = f"""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_name = '{table_name}'
    """
    try:
        with engine.connect() as connection:
            result = pd.read_sql_query(query, connection)
        print(f"Columns and their data types in the table {table_name}:\n")
        print(result)
    except Exception as e:
        print(f"An error occurred: {e}")

# Returns the first 5 rows
def get_top_5_rows(table_name):
    query = f"SELECT * FROM {table_name} LIMIT 5"
    with engine.connect() as connection:
        result = pd.read_sql_query(query, connection)
    return result

# Executes a given query
def execute_query(table_name, query):
    with engine.connect() as connection:
        result = pd.read_sql_query(query, connection)
    return result

In [None]:
'''
# Example usage
inspect_columns('mock_data')

table_name = 'mock_data'
columns = '"Feedback"'
query = f"SELECT {columns} FROM {table_name}"
execute_query(table_name, query)
'''

# Dataset

In [None]:
table_name = 'mock_data'
columns = '"Feedback"'
query = f"SELECT {columns} FROM {table_name}"
df = execute_query(table_name, query)

In [None]:
df.head(5)

In [None]:
from flair.models import TextClassifier
classifier = TextClassifier.load('en-sentiment')
# Import flair Sentence to process input text
from flair.data import Sentence
# Import accuracy_score to check performance
from sklearn.metrics import accuracy_score

text= df["Feedback"][0]
sentence = Sentence(text)
classifier.predict(sentence)
score = sentence.labels[0].score
value = sentence.labels[0].value

In [None]:
score, value

In [None]:
df['Feedback'][0]

# Text 2 SQL

## Approach 1 (Working)

Step 1: Connect to the Database and Obtain Table and Column Names

In [3]:
from sqlalchemy import create_engine, MetaData

# Database URL
DATABASE_URL = 'postgresql+psycopg2://thodoris:B4AqjEYBhDPXDHmuSW8MYgfdPp5Nob88@dpg-cpmtb4g8fa8c73aoakig-a.frankfurt-postgres.render.com/capstone_fs'

# Create an engine
engine = create_engine(DATABASE_URL)

# Connect to the database and fetch table and column names
metadata = MetaData()
metadata.reflect(bind=engine)

# Store table and column names
table_columns = {}
for table in metadata.tables.values():
    table_columns[table.name] = [column.name for column in table.c]

# Print table and column names
for table, columns in table_columns.items():
    print(f"Table: {table}\nColumns: {columns}\n")

Table: mock_data
Columns: ['Order ID', 'Investor ID', 'Investor Name', 'Region', 'Territory/Country', 'Type', 'Firm (mm)', 'Alloc Firm (mm)', 'Limit Security', 'Spread', 'Hedge/Switch', 'Modified', 'Created', 'Deal', 'Feedback']



Step 2: Use NLP to Parse the User Query

In [4]:
from transformers import pipeline
from fuzzywuzzy import process
import spacy
from tqdm import tqdm

# Load Spacy model
nlp_spacy = spacy.load("en_core_web_sm")

# Load a pre-trained model for zero-shot classification
nlp_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Example categories for intent classification
categories = ["count", "retrieve", "average", "sum", "min", "max", "unique count"]

# Filter-indicating words
filter_keywords = {'that', 'where', 'which', 'over', 'under', 'greater', 'less', 'have', 'has'}

def parse_user_query():
    question = input("Please enter your question: ")
    intent_result = nlp_model(question, candidate_labels=categories)
    intent = intent_result['labels'][0]  # Most likely intent
    return intent, question

def extract_pre_filter_keywords(question):
    # Split question at the filter keywords
    tokens = question.lower().split()
    split_index = len(tokens)
    for i, token in enumerate(tokens):
        if token in filter_keywords:
            split_index = i
            break

    pre_filter_tokens = tokens[:split_index]
    keywords = [word for word in pre_filter_tokens if word.isalpha()]

    return keywords

def match_columns(keywords, priority_terms=[]):
    matched_columns = []
    for table, col_list in table_columns.items():
        for keyword in tqdm(keywords, desc="Matching columns"):
            closest_match = process.extract(keyword, col_list, limit=3)
            for match in closest_match:
                matched_columns.append((table, match[0], match[1]))

    # Increase confidence for priority terms
    for i, (table, column, confidence) in enumerate(matched_columns):
        for term in priority_terms:
            if term in column.lower():
                matched_columns[i] = (table, column, confidence + 20)

    matched_columns = sorted(matched_columns, key=lambda x: x[2], reverse=True)  # Sort by confidence, descending
    return matched_columns

def needs_filter(question):
    tokens = question.lower().split()
    return any(token in filter_keywords for token in tokens)

Step 3: Dynamically Form and Execute SQL Queries

In [10]:
import pandas as pd
from sqlalchemy.sql import text

def extract_filter_conditions(filter_input):
    doc = nlp_spacy(filter_input)
    conditions = []

    operators = {
        'greater': '>',
        'more': '>',
        'less': '<',
        'fewer': '<',
        'equal': '=',
        'over': '>',
        'under': '<',
        'above': '>',
        'below': '<'
    }

    current_column = None
    operator = None
    value = None

    for token in doc:
        if token.text.lower() in operators:
            operator = operators[token.text.lower()]
        elif token.like_num:
            value = token.text
        elif token.text.isalpha():
            current_column = token.text

    if current_column and operator and value:
        conditions.append((current_column, operator, value))

    return conditions

def form_query(parsed_intent, question):
    pre_filter_keywords = extract_pre_filter_keywords(question)
    matched_columns = match_columns(pre_filter_keywords, priority_terms=["investor", "firm"])

    if not matched_columns:
        print("No columns matched.")
        return None

    select_columns = []
    for table, column, confidence in matched_columns:
        if confidence >= 95:
            select_columns.append((table, column))
        else:
            break  # Only show suggestions if confidence is below 95%

    if not select_columns and matched_columns:
        print(f"\nThe script is not fully confident about the columns. Here are the suggestions:\n")
        for i, (table, column, confidence) in enumerate(matched_columns):
            print(f"{i+1}. Table: {table}, Column: {column}, Confidence: {confidence}%\n")

        choice = input("Choose the number of the correct column or press enter to use the highest confidence match: ")
        if choice.isdigit() and 1 <= int(choice) <= len(matched_columns):
            index = int(choice) - 1
            select_columns.append((matched_columns[index][0], matched_columns[index][1]))
        else:
            select_columns.append((matched_columns[0][0], matched_columns[0][1]))  # Use highest confidence match

    if not select_columns:
        print("No columns confirmed.")
        return None

    where_clause = ''
    if needs_filter(question):
        filter_input = input("As I understand, you want to filter some of your results. Explain to me in your words the filter that should be applied. If you do not want an extra filter, type 'Continue': ")
        if filter_input.lower() != 'continue':
            filter_conditions = extract_filter_conditions(filter_input)
            for col_keyword, operator, value in filter_conditions:
                matched_filter_columns = match_columns([col_keyword])
                if matched_filter_columns:
                    best_match = matched_filter_columns[0]
                    if best_match[2] < 90:
                        table, col_list = best_match[0], table_columns[best_match[0]]
                        print(f"\nThe script is not fully confident about the column that should be filtered. Here are the columns in the table '{table}':\n")
                        for i, col in enumerate(col_list, 1):
                            print(f"{i+1}. {col}\n")
                        col_choice = input("Choose the number of the correct column: ")
                        if col_choice.isdigit() and 1 <= int(col_choice) <= len(col_list):
                            chosen_column = col_list[int(col_choice) - 1]
                            where_clause += f' AND "{chosen_column}" {operator} {value}'
                    else:
                        where_clause += f' AND "{best_match[1]}" {operator} {value}'
            if where_clause:
                where_clause = where_clause.lstrip(' AND')

    query = None
    if parsed_intent == "count":
        for table, column in select_columns:
            query = f'SELECT COUNT("{column}") FROM {table}'
            if where_clause:
                query += f' WHERE {where_clause}'
            return query
    elif parsed_intent == "retrieve":
        for table, column in select_columns:
            query = f'SELECT * FROM {table}'
            if where_clause:
                query += f' WHERE {where_clause}'
            return query
    elif parsed_intent == "average":
        for table, column in select_columns:
            query = f'SELECT AVG("{column}") FROM {table}'
            if where_clause:
                query += f' WHERE {where_clause}'
            return query
    elif parsed_intent == "sum":
        for table, column in select_columns:
            query = f'SELECT SUM("{column}") FROM {table}'
            if where_clause:
                query += f' WHERE {where_clause}'
            return query
    elif parsed_intent == "min":
        for table, column in select_columns:
            query = f'SELECT MIN("{column}") FROM {table}'
            if where_clause:
                query += f' WHERE {where_clause}'
            return query
    elif parsed_intent == "max":
        for table, column in select_columns:
            query = f'SELECT MAX("{column}") FROM {table}'
            if where_clause:
                query += f' WHERE {where_clause}'
            return query
    elif parsed_intent == "unique count":
        for table, column in select_columns:
            query = f'SELECT COUNT(DISTINCT "{column}") FROM {table}'
            if where_clause:
                query += f' WHERE {where_clause}'
            return query
    return None

def execute_query(query):
    with engine.connect() as connection:
        result = connection.execute(text(query))
        df = pd.DataFrame(result.fetchall(), columns=result.keys())
        return df

Step 4: Explain Results in Natural Language

In [11]:
def explain_results(result_df, parsed_intent, original_question):
    if result_df.empty:
        return "No data found."

    if parsed_intent == "count":
        count = result_df.iloc[0, 0]
        explanation = f"There are {count} records matching your query."
    elif parsed_intent == "retrieve":
        explanation = f"The query returned {len(result_df)} records. Here are the details:"
    elif parsed_intent == "average":
        avg_value = result_df.iloc[0, 0]
        explanation = f"The average value is {avg_value}."
    elif parsed_intent == "sum":
        sum_value = result_df.iloc[0, 0]
        explanation = f"The total sum is {sum_value}."
    elif parsed_intent == "min":
        min_value = result_df.iloc[0, 0]
        explanation = f"The minimum value is {min_value}."
    elif parsed_intent == "max":
        max_value = result_df.iloc[0, 0]
        explanation = f"The maximum value is {max_value}."
    elif parsed_intent == "unique count":
        unique_count = result_df.iloc[0, 0]
        explanation = f"There are {unique_count} unique records matching your query."
    else:
        explanation = "Query executed successfully."

    return explanation, result_df

In [12]:
# Main script
parsed_intent, question = parse_user_query()
query = form_query(parsed_intent, question)
if query:
    result_df = execute_query(query)
    explanation, result_df = explain_results(result_df, parsed_intent, question)
    output_message = f"The intent of the question was: {parsed_intent}\n\nThe question was: {question}\n\n{explanation}"
    print(output_message)
    display(result_df)  # Use Jupyter's display function to show the DataFrame
else:
    print("Could not generate a query based on the input.")

Please enter your question: Retrieve investors that have less than 500 firm


Matching columns: 100%|██████████| 2/2 [00:00<00:00, 333.77it/s]


As I understand, you want to filter some of your results. Explain to me in your words the filter that should be applied. If you do not want an extra filter, type 'Continue': firm less than 500


Matching columns: 100%|██████████| 1/1 [00:00<00:00, 566.19it/s]



The script is not fully confident about the column that should be filtered. Here are the columns in the table 'mock_data':

2. Order ID

3. Investor ID

4. Investor Name

5. Region

6. Territory/Country

7. Type

8. Firm (mm)

9. Alloc Firm (mm)

10. Limit Security

11. Spread

12. Hedge/Switch

13. Modified

14. Created

15. Deal

16. Feedback

Choose the number of the correct column: 8
The intent of the question was: retrieve

The question was: Retrieve investors that have less than 500 firm

The query returned 98 records. Here are the details:


Unnamed: 0,Order ID,Investor ID,Investor Name,Region,Territory/Country,Type,Firm (mm),Alloc Firm (mm),Limit Security,Spread,Hedge/Switch,Modified,Created,Deal,Feedback
0,177267,2556,AG2R,EMEA,FRA,INS,200.000,55.0,,reoffer,[C] Cash,1 Dec 2020 - 15:04:16 (CET),1 Dec 2020 - 11:02:08 (CET),Project Everest,They consider the overall yield would be too l...
1,168073,4701,Aldo Asset Management Ab,EMEA,FIN,FUND,3.000,0.5,,reoffer,[C] Cash,7 Jul 2020 - 16:28:02 (CET),7 Jul 2020 - 09:16:15 (CET),Project Athena,They will take a closer look tomorrow - but as...
2,168148,141158,Alifund Bond Fund,EMEA,ITA,FUND,0.100,0.1,,reoffer,[C] Cash,7 Jul 2020 - 16:33:12 (CET),7 Jul 2020 - 09:30:42 (CET),Project Athena,"Looking at their curve and peers, a 4y should ..."
3,177203,4710,Athena Global Investors (France) S.A.,EMEA,FRA,FUND,117.750,35.0,,reoffer,[C] Cash,1 Dec 2020 - 15:07:26 (CET),1 Dec 2020 - 10:37:27 (CET),Project Everest,"They will look at the new bond, they have alre..."
4,214192,213201,"Alpine Asset Management, LLP",EMEA,GBR,FUND,12.000,1.2,,reoffer,[H] 100% of Face DW Govt 3.25% 4 Jul 2042 Ma...,5 Oct 2022 - 16:09:48 (CET),5 Oct 2022 - 11:43:24 (CET),Project Vesper,"30/40bps NIP is reasonable, tenor is ok"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,177438,5073,Poncho Asset Management S.A.,EMEA,FRA,FUND,20.000,6.0,,reoffer,[C] Cash,1 Dec 2020 - 14:22:59 (CET),1 Dec 2020 - 12:11:02 (CET),Project Everest,"We’re interested in the credit, but the tenor ..."
94,177258,13551,Paton & Merck,AMRS,USA,FUND,7.500,2.5,,reoffer,[C] Cash,1 Dec 2020 - 14:22:59 (CET),1 Dec 2020 - 10:58:56 (CET),Project Everest,"Initial pricing seems tight, but it might fit ..."
95,169250,4334,Prrenial Investments Limited,EMEA,GBR,FUND,47.454,23.0,,reoffer,[C] Cash,7 Jul 2020 - 16:28:02 (CET),7 Jul 2020 - 12:17:38 (CET),Project Athena,"We’re keen on the sector, but significant NIP ..."
96,177504,4334,Prrenial Investments Limited,EMEA,GBR,FUND,1.246,1.2,,reoffer,[C] Cash,1 Dec 2020 - 14:23:18 (CET),1 Dec 2020 - 12:28:37 (CET),Project Everest,"The structure is acceptable, but forward visib..."


## Approach 2

In [None]:
%pip install transformers sqlalchemy psycopg2 pandas


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sqlalchemy import create_engine
import pandas as pd

In [None]:
tokenizer = AutoTokenizer.from_pretrained("suriya7/t5-base-text-to-sql")
model = AutoModelForSeq2SeqLM.from_pretrained("suriya7/t5-base-text-to-sql")

def translate_to_sql_select(english_query):
    input_text = f"translate English to SQL: {english_query}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(input_ids, max_new_tokens=100)
    sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return sql_query

# Set the database URL
DATABASE_URL = 'postgresql+psycopg2://thodoris:B4AqjEYBhDPXDHmuSW8MYgfdPp5Nob88@dpg-cpmtb4g8fa8c73aoakig-a.frankfurt-postgres.render.com/capstone_fs'

# Create a SQLAlchemy engine
engine = create_engine(DATABASE_URL)

# Function to load data from the database
def load_data_from_db(query):
    with engine.connect() as connection:
        result = pd.read_sql_query(query, connection)
    return result

# Function to upload Excel data to PostgreSQL
def upload_excel_to_db(file_name, table_name, engine):
    # Read the uploaded Excel file into a DataFrame
    df = pd.read_excel(file_name)
    # Upload the DataFrame to PostgreSQL
    df.to_sql(table_name, engine, if_exists='replace', index=False)
    print(f"Uploaded {file_name} to {table_name} table in PostgreSQL.")


In [None]:
# File name and table name
file_name = '/content/Mock up book.xls'  # Change the path as necessary
table_name = 'mock_data'

# Upload the Excel data to the database
upload_excel_to_db(file_name, table_name, engine)


In [None]:
table_name

In [None]:
# Example usage
english_query = "Show one investor id with firm alloc greater than 5000"
sql_query = translate_to_sql_select(english_query)
print("SQL Query:", sql_query)

# Load data from the database
try:
    data = load_data_from_db(sql_query)
    print("Data from the database:")
    print(data)
except Exception as e:
    print(f"An error occurred: {e}")

## Approach 3

In [None]:
%pip install transformers torch sqlalchemy pandas psycopg2-binary

In [None]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Set the database URL
DATABASE_URL = 'postgresql+psycopg2://thodoris:B4AqjEYBhDPXDHmuSW8MYgfdPp5Nob88@dpg-cpmtb4g8fa8c73aoakig-a.frankfurt-postgres.render.com/capstone_fs'

# Create a SQLAlchemy engine
engine = create_engine(DATABASE_URL)

In [None]:
# Function to read all tables and columns from the database
def get_db_schema(engine):
    inspector = inspect(engine)
    schema = {}
    for table_name in inspector.get_table_names():
        columns = inspector.get_columns(table_name)
        schema[table_name] = [column['name'] for column in columns]
    return schema

# Format schema for better input to the model
def format_schema(schema):
    formatted_schema = []
    for table, columns in schema.items():
        formatted_columns = ", ".join(columns)
        formatted_schema.append(f"{table}({formatted_columns})")
    return " | ".join(formatted_schema)

In [None]:
# Load the schema
schema = get_db_schema(engine)
formatted_schema = format_schema(schema)
print(formatted_schema)

In [None]:
# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('cssupport/t5-small-awesome-text-to-sql')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

In [None]:
# Function to generate SQL query from natural language prompt
def generate_sql(prompt, formatted_schema):
    input_text = f"{prompt} | {formatted_schema}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
    outputs = model.generate(**inputs, max_length=512)
    sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return sql_query

In [None]:
# Example usage
prompt = "Show one investor id with firm alloc greater than 5000"
sql_query = generate_sql(prompt, formatted_schema)
print(f"Generated SQL query: {sql_query}")

In [None]:
# Function to load data from the database
def load_data_from_db(query):
    with engine.connect() as connection:
        result = pd.read_sql_query(query, connection)
    return result

In [None]:
# Execute the query
result_df = load_data_from_db(sql_query)
print(result_df)

In [None]:
# Function to explain the results
def explain_results(result_df):
    num_investors = result_df.iloc[0, 0] if not result_df.empty else 0
    return f"There are {num_investors} investors."

In [None]:
# Explain the results
explanation = explain_results(result_df)
print(explanation)

## Approach 4

In [None]:
%pip install transformers torch sqlalchemy pandas psycopg2-binary

In [None]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Set the database URL
DATABASE_URL = 'postgresql+psycopg2://thodoris:B4AqjEYBhDPXDHmuSW8MYgfdPp5Nob88@dpg-cpmtb4g8fa8c73aoakig-a.frankfurt-postgres.render.com/capstone_fs'

# Create a SQLAlchemy engine
engine = create_engine(DATABASE_URL)

In [None]:
# Function to read all tables and columns from the database
def get_db_schema(engine):
    inspector = inspect(engine)
    schema = {}
    for table_name in inspector.get_table_names():
        columns = inspector.get_columns(table_name)
        schema[table_name] = [column['name'] for column in columns]
    return schema

# Format schema for better input to the model
def format_schema(schema):
    formatted_schema = []
    for table, columns in schema.items():
        formatted_columns = ", ".join(columns)
        formatted_schema.append(f"{table}({formatted_columns})")
    return " | ".join(formatted_schema)

In [None]:
# Load the schema
schema = get_db_schema(engine)
formatted_schema = format_schema(schema)
print(formatted_schema)

In [None]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("suriya7/t5-base-text-to-sql")
model = AutoModelForSeq2SeqLM.from_pretrained("suriya7/t5-base-text-to-sql")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

In [None]:
# Function to generate SQL query from natural language prompt
def translate_to_sql_select(english_query):
    input_text = f"translate English to SQL: {english_query}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(input_ids, max_new_tokens=100)
    sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return sql_query

# Postprocess SQL query to correct common errors
def postprocess_sql(sql_query):
    sql_query = sql_query.replace(' , ', ', ')
    sql_query = sql_query.replace(' .', '.')
    if 'SELECT' not in sql_query.upper():
        sql_query = 'SELECT ' + sql_query
    return sql_query

In [None]:
# Example usage
prompt = "How many investors ids are there"
sql_query = translate_to_sql_select(prompt)
sql_query = postprocess_sql(sql_query)
print(f"Generated SQL query: {sql_query}")

In [None]:
# Function to load data from the database
def load_data_from_db(query):
    with engine.connect() as connection:
        result = pd.read_sql_query(query, connection)
    return result

In [None]:
# Execute the query
result_df = load_data_from_db(sql_query)
print(result_df)

In [None]:
# Function to explain the results
def explain_results(result_df):
    if result_df.empty:
        return "There are no investors available."
    num_investors = result_df.iloc[0, 0] if len(result_df.columns) == 1 else len(result_df)
    return f"There are {num_investors} investors."

In [None]:
# Explain the results
explanation = explain_results(result_df)
print(explanation)

## Approach 5

In [None]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Set the database URL
DATABASE_URL = 'postgresql+psycopg2://thodoris:B4AqjEYBhDPXDHmuSW8MYgfdPp5Nob88@dpg-cpmtb4g8fa8c73aoakig-a.frankfurt-postgres.render.com/capstone_fs'

# Create a SQLAlchemy engine
engine = create_engine(DATABASE_URL)

In [None]:
# Function to read all tables and columns from the database
def get_db_schema(engine):
    inspector = inspect(engine)
    schema = {}
    for table_name in inspector.get_table_names():
        columns = inspector.get_columns(table_name)
        schema[table_name] = [column['name'] for column in columns]
    return schema

# Format schema for better input to the model
def format_schema(schema):
    formatted_schema = []
    for table, columns in schema.items():
        formatted_columns = ", ".join(columns)
        formatted_schema.append(f"{table}({formatted_columns})")
    return " | ".join(formatted_schema)

In [None]:
# Load the schema
schema = get_db_schema(engine)
formatted_schema = format_schema(schema)
print("Formatted Schema:", formatted_schema)

In [None]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("suriya7/t5-base-text-to-sql")
model = AutoModelForSeq2SeqLM.from_pretrained("suriya7/t5-base-text-to-sql")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

In [None]:
# Function to generate SQL query from natural language prompt
def translate_to_sql_select(english_query, formatted_schema):
    input_text = f"Query: {english_query}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(input_ids, max_new_tokens=100)
    sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return sql_query

# Postprocess SQL query to correct common errors
def postprocess_sql(sql_query):
    sql_query = sql_query.replace(' , ', ', ')
    sql_query = sql_query.replace(' .', '.')
    if 'SELECT' not in sql_query.upper():
        sql_query = 'SELECT ' + sql_query
    return sql_query

In [None]:
# Example usage
prompt = "How many investors are there?"
sql_query = translate_to_sql_select(prompt, formatted_schema)
sql_query = postprocess_sql(sql_query)
print(f"Generated SQL query: {sql_query}")

In [None]:
# Function to load data from the database
def load_data_from_db(query):
    with engine.connect() as connection:
        result = pd.read_sql_query(query, connection)
    return result

In [None]:
# Execute the query
try:
    result_df = load_data_from_db(sql_query)
    print(result_df)
except Exception as e:
    print(f"Error executing query: {e}")


In [None]:
# Function to explain the results
def explain_results(result_df):
    if result_df.empty:
        return "There are no investors available."
    num_investors = result_df.iloc[0, 0] if len(result_df.columns) == 1 else len(result_df)
    return f"There are {num_investors} investors."

In [None]:
# Explain the results
explanation = explain_results(result_df)
print(explanation)
