In [1]:
import openai
import psycopg2
import pandas as pd
import re
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import pinecone
import os
from fuzzywuzzy import fuzz

# Initialize OpenAI API key
OPENAI_API_KEY = ""
openai.api_key = OPENAI_API_KEY

# Database connection details
DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
DATABASE_USERNAME = "postgres"
DATABASE_PASSWORD = "valign#123"
DATABASE_DB = "python_test_poc"
PORT = 5432

# Constants
PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"  # Replace with your Pinecone API key
INDEX_NAME = "smart-desk"  # Replace with your Pinecone index name
NAMESPACE = ""  # Replace with your namespace
columnnames=''
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Initialize Pinecone client
def initialize_pinecone():
    from pinecone import Pinecone, ServerlessSpec
    pc = Pinecone(api_key=PINECONE_API_KEY)
    
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=768,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-west-2')
        )
    return pc.Index(INDEX_NAME)

# Load Hugging Face model for embeddings
def load_huggingface_model():
    return SentenceTransformer(MODEL_NAME)


# Function to connect to PostgreSQL database
def connect_to_db():
    try:
        conn = psycopg2.connect(
            dbname=DATABASE_DB,
            user=DATABASE_USERNAME,
            password=DATABASE_PASSWORD,
            host=DATABASE_HOST,
            port=PORT
        )
        return conn
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        raise

# Function to fetch schema from PostgreSQL database
def fetch_schema(conn):
    try:
        query = """
        SELECT table_name, column_name
        FROM information_schema.columns
        WHERE table_schema = 'public'
        """
        schema_df = pd.read_sql(query, conn)
        # print(schema_df)
        return schema_df
    except Exception as e:
        print(f"Error fetching schema: {e}")
        raise

# Function to process schema: remove special characters and convert to lowercase
def process_schema(schema_df):
    def clean_column_name(name):
        return re.sub(r'[^a-zA-Z]', '', name).lower()

    schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
    return schema_df

# Function to extract features (like project name, owner, etc.) using OpenAI's LLM
def extract_features_with_openai(user_input, processed_schema_df):
    schema_json = processed_schema_df.to_json(orient='records')
    
    # Refined prompt to ensure OpenAI extracts table names, column names, and their values
    prompt = f"""
    ## Database Schema Context:
    The following represents the columns and their respective tables available in the database:
    {schema_json}

    ## User Input:
    The user has provided the following input: "{user_input}"

    ## Task:
    Extract the relevant features, values, and table names from the user input based on the schema. These features might include project names, owners, dates, statuses, etc., along with their corresponding table names.

    ## Instructions:
    - Return a JSON dictionary that includes the table names as keys, and within each table, include the fields and their values extracted from the user input.
    - Omit any fields or tables where the value is empty or null.
    - Format the output as a JSON object with keys only for tables and fields that have values.
    """

    try:
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=500,
            temperature=0.5
        )
        extracted_features = response.choices[0].text.strip()
        return extracted_features
    except openai.OpenAIError as e:
        print(f"Error with OpenAI: {e}")
        raise

# Function to remove null, None, empty values from JSON and list
def clean_extracted_features(feature_dict):
    # Remove any keys with None or empty values
    cleaned_feature_dict = {k: v for k, v in feature_dict.items() if v}
    print("cleaned_feature_dict", cleaned_feature_dict)
    # Extract the non-null values into a list
    feature_list = list(cleaned_feature_dict.values())
    return cleaned_feature_dict, feature_list

#Extract Dynamic Namespoace
def extract_nmaespace(extracted_dict):
    global NAMESPACE,columnnames
    for key in extracted_dict.keys():
        print(key)
    NAMESPACE= key
    # columnnames = extracted_dict[key].values()
    # print(columnnames)
            
# Function to parse and process extracted features
def process_extracted_features(extracted_features):
    try:
        # Remove the "## Solution:" part and any other non-JSON text
        json_match = re.search(r'\{.*\}', extracted_features, re.DOTALL)
        
        if json_match:
            # Extract the JSON part from the matched result
            cleaned_features = json_match.group(0)
            print("cleaned_features", cleaned_features)
            # Convert JSON string to a Python dictionary
            feature_dict = json.loads(cleaned_features)
            print("feature_dict", feature_dict)

            # Clean feature dictionary and feature list to remove nulls and empty values
            cleaned_feature_dict, feature_list = clean_extracted_features(feature_dict)

            # Return cleaned JSON and feature list
            return json.dumps(cleaned_feature_dict, indent=4), feature_list
        else:
            return None, []
    except (json.JSONDecodeError, ValueError) as e:
        print(f"Error parsing features: {e}")
        return None, []

# Query Pinecone for relevant context and augment the input
def query_pinecone_and_augment_input(user_input, entities, namespace):
    embedding_model = load_huggingface_model()
    pinecone_index = initialize_pinecone()
    augmented_input = user_input
    pinecone_data = {}
    global columnnames

    # Function to flatten the nested dictionary
    def flatten_dict(d, parent_key=''):
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}.{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(flatten_dict(v, new_key).items())
            else:
                items.append((new_key, v))
        return dict(items)

    # Flatten the entities dictionary
    flat_entities = flatten_dict(entities)

    for entity_name, entity_value in flat_entities.items():
        if entity_value:
            query_embedding = embedding_model.encode([entity_value])[0]
            query_embedding = np.array(query_embedding, dtype=np.float32)
           
            try:
                result = pinecone_index.query(
                    namespace=namespace,
                    vector=query_embedding.tolist(),
                                    filter={
                        "column_name": {"$eq": "status" }
                    },
                    top_k=3,
                    include_values=True,
                    include_metadata=True
                )
                matches = result.get('matches', [])
                if matches:
                    unique_values = [match['metadata'].get('unique_value') for match in matches if 'metadata' in match]
                    if unique_values:
                        pinecone_data[entity_name] = unique_values
                        if len(unique_values) > 1:
                            print(f"Multiple matches found for '{entity_value}':")
                            for idx, unique_value in enumerate(unique_values):
                                print(f"{idx + 1}: {unique_value}")
                            while True:
                                selection = input(f"Please select the most relevant option for '{entity_value}' (1-{len(unique_values)}): ")
                                try:
                                    selected_value = unique_values[int(selection) - 1]
                                    augmented_input = augmented_input.replace(entity_value, selected_value)
                                    break
                                except (IndexError, ValueError):
                                    print("Invalid selection. Please choose a valid option.")
                        else:
                            augmented_input = augmented_input.replace(entity_value, unique_values[0])
                else:
                    print(f"No matches found for {entity_value} in Pinecone.")
            except Exception as e:
                print(f"Error querying Pinecone: {str(e)}")
                return f"Error querying Pinecone: {str(e)}", {}
    return augmented_input, pinecone_data

# Function to generate SQL query using GPT-4o-mini
def generate_sql_query(processed_schema_df, augmented_input):
    # Convert the schema dataframe to a string
    schema_str = processed_schema_df.to_string(index=False)

    prompt = f"""
    The database contains the following schema:
    {schema_str}

    Based on this schema and the user request:
    "{augmented_input}"

    Generate an optimized SQL query that meets the user's intent.
    The query should be efficient and use the correct table and column names.
    """

    # Call GPT-4o-mini-2024-07-18 model using chat completion API
    response = openai.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": "You are a helpful assistant specialized in generating SQL queries, always ensuring the use of appropriate operators like LIKE or expressions in sql queries like '% %' for partial matches if needed. Accurately map user input to the relevant tables and columns in the database based on the provided schema, using the LIKE operator for partial matches where necessary. Handle data type mismatches explicitly by casting to the appropriate type when required, ensuring correct query execution. Additionally, Manage variations in user input, such as case sensitivity or small spelling differences, using flexible matching techniques to generate precise and reliable SQL queries.Note do not use ILIKE Operator"},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,  # Reduced token limit for completion
        temperature=0.7
    )

    # Extract SQL query from the response
    sql_response = response.choices[0].message.content
    # Find and clean the SQL query part
    start = sql_response.find("```sql") + 6
    end = sql_response.find("```", start)
    sql_query = sql_response
    print("Response:",sql_response)

    return sql_query


# Extract generated SQL Query
def extract_sql_query(response):
    start = response.find("```sql") + len("```sql\n")
    end = response.find("```", start)
    sql_query = response[start:end].strip()
    return sql_query

# Function to execute the SQL query and print the results
def execute_sql_query(conn, sql_query):
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql_query)
            results = cursor.fetchall()
            print(results)
            return results
    except Exception as e:
        print(f"Error executing SQL query: {e}")
        return None
        
# Main function to process user input and extract entities
def process_user_input(user_input):
    global NAMESPACE
    # Connect to DB and fetch schema
    conn = connect_to_db()
    schema_df = fetch_schema(conn)
    processed_schema_df = process_schema(schema_df)

    # Extract features from user input using OpenAI
    extracted_features = extract_features_with_openai(user_input, processed_schema_df)

    # Process the extracted features and clean them
    cleaned_json, feature_list = process_extracted_features(extracted_features)

    # Query Pinecone and augment the user input
    if cleaned_json:
        cleaned_feature_dict = json.loads(cleaned_json)
        cleaned_extracted_features, feature_list = clean_extracted_features(cleaned_feature_dict)  # Rename the variable here
        extract_namespace = extract_nmaespace(cleaned_extracted_features)
        augmented_input, namespace_data = query_pinecone_and_augment_input(user_input, cleaned_feature_dict, NAMESPACE)  # Ensure to capture both outputs
        print(augmented_input)
        
        # Generate SQL query using the augmented input
        sql_query = generate_sql_query(processed_schema_df, augmented_input)
        sql_query=extract_sql_query(sql_query)
        print("Generated SQL Query:", sql_query)

        # Execute the SQL query
        results = execute_sql_query(conn, sql_query)
        print("Query Results:", results)

    conn.close()


    
# Example usage
user_input = input("Enter your query: ")
process_user_input(user_input)

  from tqdm.autonotebook import tqdm, trange


Enter your query:  give me the list of all closed tasks


  schema_df = pd.read_sql(query, conn)


cleaned_features {"tasks": {"status": "closed"}}
feature_dict {'tasks': {'status': 'closed'}}
cleaned_feature_dict {'tasks': {'status': 'closed'}}
cleaned_feature_dict {'tasks': {'status': 'closed'}}
tasks




Multiple matches found for 'closed':
1: Terminated/Cancelled
2: Done
3: In Progress


Please select the most relevant option for 'closed' (1-3):  2


give me the list of all Done tasks
Response: To retrieve the list of all done tasks based on the provided schema, we need to query the `tasks` table and filter for tasks that have a status indicating they are done. Assuming that the status for done tasks is represented as 'Done' (case-sensitive), the SQL query would be:

```sql
SELECT *
FROM tasks
WHERE status = 'Done';
```

This query selects all columns from the `tasks` table where the `status` column matches 'Done'.
Generated SQL Query: SELECT *
FROM tasks
WHERE status = 'Done';


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [4]:
# import openai
import psycopg2
import pandas as pd
import re
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import pinecone
import os
from fuzzywuzzy import fuzz

# Initialize OpenAI API key
OPENAI_API_KEY = "sk-proj-UnzdWuWBs7ZQRbRPiRCoT3BlbkFJhPM1p7DdZUMklcpnWK1S"
openai.api_key = OPENAI_API_KEY

# Database connection details
DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
DATABASE_USERNAME = "postgres"
DATABASE_PASSWORD = "valign#123"
DATABASE_DB = "python_test_poc"
PORT = 5432

# Constants
PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"  # Replace with your Pinecone API key
INDEX_NAME = "smart-desk"  # Replace with your Pinecone index name
NAMESPACE = "projects"  # Replace with your namespace
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Initialize Pinecone client
def initialize_pinecone():
    from pinecone import Pinecone, ServerlessSpec
    pc = Pinecone(api_key=PINECONE_API_KEY)
    
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=768,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-west-2')
        )
    return pc.Index(INDEX_NAME)

# Load Hugging Face model for embeddings
def load_huggingface_model():
    return SentenceTransformer(MODEL_NAME)


# Function to connect to PostgreSQL database
def connect_to_db():
    try:
        conn = psycopg2.connect(
            dbname=DATABASE_DB,
            user=DATABASE_USERNAME,
            password=DATABASE_PASSWORD,
            host=DATABASE_HOST,
            port=PORT
        )
        return conn
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        raise

# Function to fetch schema from PostgreSQL database
def fetch_schema(conn):
    try:
        query = """
        SELECT table_name, column_name
        FROM information_schema.columns
        WHERE table_schema = 'public'
        """
        schema_df = pd.read_sql(query, conn)
        # print(schema_df)
        return schema_df
    except Exception as e:
        print(f"Error fetching schema: {e}")
        raise

# Function to process schema: remove special characters and convert to lowercase
def process_schema(schema_df):
    def clean_column_name(name):
        return re.sub(r'[^a-zA-Z]', '', name).lower()

    schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
    return schema_df

# Function to extract features (like project name, owner, etc.) using OpenAI's LLM
def extract_features_with_openai(user_input, processed_schema_df):
    schema_json = processed_schema_df.to_json(orient='records')
    
    # Refined prompt to ensure OpenAI extracts table names, column names, and their values
    prompt = f"""
    ## Database Schema Context:
    The following represents the columns and their respective tables available in the database:
    {schema_json}

    ## User Input:
    The user has provided the following input: "{user_input}"

    ## Task:
    Extract the relevant features, values, and table names from the user input based on the schema. These features might include project names, owners, dates, statuses, etc., along with their corresponding table names.

    ## Instructions:
    - Return a JSON dictionary that includes the table names as keys, and within each table, include the fields and their values extracted from the user input.
    - Omit any fields or tables where the value is empty or null.
    - Format the output as a JSON object with keys only for tables and fields that have values.
    """

    try:
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=500,
            temperature=0.5
        )
        extracted_features = response.choices[0].text.strip()
        return extracted_features
    except openai.OpenAIError as e:
        print(f"Error with OpenAI: {e}")
        raise

# Function to remove null, None, empty values from JSON and list
def clean_extracted_features(feature_dict):
    # Remove any keys with None or empty values
    cleaned_feature_dict = {k: v for k, v in feature_dict.items() if v}
    print(cleaned_feature_dict)
    # Extract the non-null values into a list
    feature_list = list(cleaned_feature_dict.values())
    return cleaned_feature_dict, feature_list

#Extract Dynamic Namespoace
def extract_nmaespace(extracted_dict):
    global NAMESPACE
    for key in extracted_dict.keys():
        print(key)
    NAMESPACE= key
            
# Function to parse and process extracted features
def process_extracted_features(extracted_features):
    try:
        # Remove the "## Solution:" part and any other non-JSON text
        json_match = re.search(r'\{.*\}', extracted_features, re.DOTALL)
        
        if json_match:
            # Extract the JSON part from the matched result
            cleaned_features = json_match.group(0)

            # Convert JSON string to a Python dictionary
            feature_dict = json.loads(cleaned_features)

            # Clean feature dictionary and feature list to remove nulls and empty values
            cleaned_feature_dict, feature_list = clean_extracted_features(feature_dict)

            # Return cleaned JSON and feature list
            return json.dumps(cleaned_feature_dict, indent=4), feature_list
        else:
            return None, []
    except (json.JSONDecodeError, ValueError) as e:
        print(f"Error parsing features: {e}")
        return None, []

# Query Pinecone for relevant context and augment the input
def query_pinecone_and_augment_input(user_input, entities, namespace):
    embedding_model = load_huggingface_model()
    pinecone_index = initialize_pinecone()
    augmented_input = user_input
    pinecone_data = {}

    # Function to flatten the nested dictionary
    def flatten_dict(d, parent_key=''):
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}.{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(flatten_dict(v, new_key).items())
            else:
                items.append((new_key, v))
        return dict(items)

    # Flatten the entities dictionary
    flat_entities = flatten_dict(entities)

    for entity_name, entity_value in flat_entities.items():
        if entity_value:
            query_embedding = embedding_model.encode([entity_value])[0]
            query_embedding = np.array(query_embedding, dtype=np.float32)
            try:
                result = pinecone_index.query(
                    namespace=namespace,
                    vector=query_embedding.tolist(),
                    filter={
                        "column_name": {"$eq": "status"}
                    },
                    top_k=3,
                    include_values=True,
                    include_metadata=True
                )
                matches = result.get('matches', [])
                if matches:
                    unique_values = [match['metadata'].get('unique_value') for match in matches if 'metadata' in match]
                    if unique_values:
                        pinecone_data[entity_name] = unique_values
                        if len(unique_values) > 1:
                            print(f"Multiple matches found for '{entity_value}':")
                            for idx, unique_value in enumerate(unique_values):
                                print(f"{idx + 1}: {unique_value}")
                            while True:
                                selection = input(f"Please select the most relevant option for '{entity_value}' (1-{len(unique_values)}): ")
                                try:
                                    selected_value = unique_values[int(selection) - 1]
                                    augmented_input = augmented_input.replace(entity_value, selected_value)
                                    break
                                except (IndexError, ValueError):
                                    print("Invalid selection. Please choose a valid option.")
                        else:
                            augmented_input = augmented_input.replace(entity_value, unique_values[0])
                else:
                    print(f"No matches found for {entity_value} in Pinecone.")
            except Exception as e:
                print(f"Error querying Pinecone: {str(e)}")
                return f"Error querying Pinecone: {str(e)}", {}
    return augmented_input, pinecone_data

# Function to generate SQL query using GPT-4o-mini
def generate_sql_query(processed_schema_df, augmented_input):
    # Convert the schema dataframe to a string
    schema_str = processed_schema_df.to_string(index=False)

    prompt = f"""
    The database contains the following schema:
    {schema_str}

    Based on this schema and the user request:
    "{augmented_input}"

    Generate an optimized SQL query that meets the user's intent.
    The query should be efficient and use the correct table and column names.
    """

    # Call GPT-4o-mini-2024-07-18 model using chat completion API
    response = openai.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": "You are a helpful assistant specialized in generating SQL queries, always ensuring the use of appropriate operators like LIKE or expressions in sql queries like '% %' for partial matches if needed. Accurately map user input to the relevant tables and columns in the database based on the provided schema, using the LIKE operator for partial matches where necessary. Handle data type mismatches explicitly by casting to the appropriate type when required, ensuring correct query execution. Additionally, Manage variations in user input, such as case sensitivity or small spelling differences, using flexible matching techniques to generate precise and reliable SQL queries.Note do not use ILIKE Operator"},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,  # Reduced token limit for completion
        temperature=0.7
    )

    # Extract SQL query from the response
    sql_response = response.choices[0].message.content
    # Find and clean the SQL query part
    start = sql_response.find("```sql") + 6
    end = sql_response.find("```", start)
    sql_query = sql_response
    print("Response:",sql_response)

    return sql_query


# Extract generated SQL Query
def extract_sql_query(response):
    start = response.find("```sql") + len("```sql\n")
    end = response.find("```", start)
    sql_query = response[start:end].strip()
    return sql_query

# Function to execute the SQL query and print the results
def execute_sql_query(conn, sql_query):
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql_query)
            results = cursor.fetchall()
            print(results)
            return results
    except Exception as e:
        print(f"Error executing SQL query: {e}")
        return None
        
# Main function to process user input and extract entities
def process_user_input(user_input):
    global NAMESPACE
    # Connect to DB and fetch schema
    conn = connect_to_db()
    schema_df = fetch_schema(conn)
    processed_schema_df = process_schema(schema_df)

    # Extract features from user input using OpenAI
    extracted_features = extract_features_with_openai(user_input, processed_schema_df)

    # Process the extracted features and clean them
    cleaned_json, feature_list = process_extracted_features(extracted_features)

    # Query Pinecone and augment the user input
    if cleaned_json:
        cleaned_feature_dict = json.loads(cleaned_json)
        cleaned_extracted_features, feature_list = clean_extracted_features(cleaned_feature_dict)  # Rename the variable here
        extract_namespace = extract_nmaespace(cleaned_extracted_features)
        augmented_input, namespace_data = query_pinecone_and_augment_input(user_input, cleaned_feature_dict, NAMESPACE)  # Ensure to capture both outputs
        print(augmented_input)
        
        # Generate SQL query using the augmented input
        sql_query = generate_sql_query(processed_schema_df, augmented_input)
        sql_query=extract_sql_query(sql_query)
        print("Generated SQL Query:", sql_query)

        # Execute the SQL query
        results = execute_sql_query(conn, sql_query)
        print("Query Results:", results)

    conn.close()


    
# Example usage
user_input = input("Enter your query: ")
process_user_input(user_input)

Enter your query:  give me the list of all closed tasks


  schema_df = pd.read_sql(query, conn)


{'tasks': {'status': 'closed'}}
{'tasks': {'status': 'closed'}}
tasks




Multiple matches found for 'closed':
1: Terminated/Cancelled
2: Done
3: In Progress


Please select the most relevant option for 'closed' (1-3):  2


give me the list of all Done tasks
Response: Based on the user's request to "give me the list of all Done tasks," we need to query the `tasks` table and filter the results where the `status` is "Done". Here is the optimized SQL query:

```sql
SELECT *
FROM tasks
WHERE status LIKE 'Done';
```

This query retrieves all columns from the `tasks` table where the `status` column matches "Done". The use of `LIKE` is appropriate for this case, allowing for potential variations in case sensitivity.
Generated SQL Query: SELECT *
FROM tasks
WHERE status LIKE 'Done';


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [3]:
# import openai
# import psycopg2
# import pandas as pd
# import re
# import json

# # Initialize OpenAI API key
# OPENAI_API_KEY = "sk-proj-UnzdWuWBs7ZQRbRPiRCoT3BlbkFJhPM1p7DdZUMklcpnWK1S"
# openai.api_key = OPENAI_API_KEY

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "python_test_poc"
# PORT = 5432

# # Function to connect to PostgreSQL database
# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise

# # Function to fetch schema from PostgreSQL database
# def fetch_schema(conn):
#     try:
#         query = """
#         SELECT table_name, column_name
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         """
#         schema_df = pd.read_sql(query, conn)
#         # print(schema_df)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema: {e}")
#         raise

# # Function to process schema: remove special characters and convert to lowercase
# def process_schema(schema_df):
#     def clean_column_name(name):
#         return re.sub(r'[^a-zA-Z]', '', name).lower()

#     schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
#     return schema_df

# # Function to extract features (like project name, owner, etc.) using OpenAI's LLM
# def extract_features_with_openai(user_input, processed_schema_df):
#     schema_json = processed_schema_df.to_json(orient='records')
    
#     # Refined prompt to ensure OpenAI extracts table names, column names, and their values
#     prompt = f"""
#     ## Database Schema Context:
#     The following represents the columns and their respective tables available in the database:
#     {schema_json}

#     ## User Input:
#     The user has provided the following input: "{user_input}"

#     ## Task:
#     Extract the relevant features, values, and table names from the user input based on the schema. These features might include project names, owners, dates, statuses, etc., along with their corresponding table names.

#     ## Instructions:
#     - Return a JSON dictionary that includes the table names as keys, and within each table, include the fields and their values extracted from the user input.
#     - Omit any fields or tables where the value is empty or null.
#     - Format the output as a JSON object with keys only for tables and fields that have values.
#     """

#     try:
#         response = openai.completions.create(
#             model="gpt-3.5-turbo-instruct",
#             prompt=prompt,
#             max_tokens=500,
#             temperature=0.5
#         )
#         extracted_features = response.choices[0].text.strip()
#         return extracted_features
#     except openai.OpenAIError as e:
#         print(f"Error with OpenAI: {e}")
#         raise

# # Function to remove null, None, empty values from JSON and list
# def clean_extracted_features(feature_dict):
#     # Remove any keys with None or empty values
#     cleaned_feature_dict = {k: v for k, v in feature_dict.items() if v}
#     # Extract the non-null values into a list
#     feature_list = list(cleaned_feature_dict.values())
#     return cleaned_feature_dict, feature_list

# # Function to parse and process extracted features
# def process_extracted_features(extracted_features):
#     try:
#         # Remove the "## Solution:" part and any other non-JSON text
#         json_match = re.search(r'\{.*\}', extracted_features, re.DOTALL)
        
#         if json_match:
#             # Extract the JSON part from the matched result
#             cleaned_features = json_match.group(0)

#             # Convert JSON string to a Python dictionary
#             feature_dict = json.loads(cleaned_features)

#             # Clean feature dictionary and feature list to remove nulls and empty values
#             cleaned_feature_dict, feature_list = clean_extracted_features(feature_dict)

#             # Return cleaned JSON and feature list
#             return json.dumps(cleaned_feature_dict, indent=4), feature_list
#         else:
#             return None, []
#     except (json.JSONDecodeError, ValueError) as e:
#         print(f"Error parsing features: {e}")
#         return None, []

# # Main function to process user input and extract entities
# def process_user_input(user_input):
#     # Connect to DB and fetch schema
#     conn = connect_to_db()
#     schema_df = fetch_schema(conn)
#     processed_schema_df = process_schema(schema_df)

#     # Extract features from user input using OpenAI's LLM
#     extracted_features = extract_features_with_openai(user_input, processed_schema_df)

#     # Process the extracted features and clean them
#     cleaned_json, feature_list = process_extracted_features(extracted_features)

#     # Output cleaned JSON and feature list
#     if cleaned_json:
#         print("Cleaned Extracted JSON Features:")
#         print(cleaned_json)
    
#     print("Feature List:")
#     print(feature_list)

# # Get user input
# user_input = input("Enter your query: ")

# # Process user input and extract entities
# process_user_input(user_input)
