In [2]:
import re
import psycopg2
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import openai
from pinecone import Pinecone, ServerlessSpec

# Constants
PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"  # Replace with your Pinecone API key
INDEX_NAME = "smart-desk"  # Replace with your Pinecone index name
NAMESPACE = "projects"  # Replace with your namespace
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# OpenAI API key
OPENAI_API_KEY = ''
openai.api_key = OPENAI_API_KEY

# PostgreSQL database connection details
DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
DATABASE_USERNAME = "postgres"
DATABASE_PASSWORD = "valign#123"
DATABASE_DB = "python_test_poc"
PORT = 5432

# Initialize Pinecone client
def initialize_pinecone():
    pc = Pinecone(api_key=PINECONE_API_KEY)
    
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=768,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
    return pc.Index(INDEX_NAME)

# Load Hugging Face model for embeddings
def load_huggingface_model():
    return SentenceTransformer(MODEL_NAME)

# Connect to PostgreSQL and fetch schema information
def fetch_database_schema():
    conn = psycopg2.connect(
        host=DATABASE_HOST,
        user=DATABASE_USERNAME,
        password=DATABASE_PASSWORD,
        dbname=DATABASE_DB,
        port=PORT
    )
    cur = conn.cursor()
    
    # Fetch schema information
    cur.execute("""
        SELECT table_name, column_name, data_type 
        FROM information_schema.columns 
        WHERE table_schema = 'public'
    """)
    
    schema = []
    for table, column, dtype in cur.fetchall():
        schema.append({"table_name": table, "column_name": column, "data_type": dtype})
    
    cur.close()
    conn.close()
    
    # Convert to DataFrame
    schema_df = pd.DataFrame(schema)
    return schema_df

# Extract relevant entities based on regex and column names
def extract_entities(user_query, schema_df):
    entities = {}

    # Extract unique column names to generate patterns
    column_names = schema_df['column_name'].unique()
    column_patterns = {col: re.compile(rf'{re.escape(col)}\s+([a-zA-Z0-9_ ]+)', re.IGNORECASE) for col in column_names}

    # Extract entities using dynamically generated patterns
    for column, pattern in column_patterns.items():
        match = pattern.search(user_query)
        if match:
            entities[column] = match.group(1).strip()
        else:
            entities[column] = None

    print(f"Extracted Entities: {entities}")
    return entities

# Query Pinecone for relevant context and augment the input
def query_pinecone_for_entities(entities, namespace):
    embedding_model = load_huggingface_model()
    pinecone_index = initialize_pinecone()
    
    pinecone_data = {}

    for entity_name, entity_value in entities.items():
        if entity_value:
            query_embedding = embedding_model.encode([entity_value])[0]
            query_embedding = np.array(query_embedding, dtype=np.float32)
            
            print(f"Query Embedding for {entity_value}: {query_embedding}")

            try:
                result = pinecone_index.query(
                    namespace=namespace,
                    vector=query_embedding.tolist(),
                    top_k=10,
                    include_values=True,
                    include_metadata=True
                )
                
                matches = result.get('matches', [])
                if matches:
                    unique_values = [match['metadata'].get('unique_value') for match in matches if 'metadata' in match]
                    if unique_values:
                        pinecone_data[entity_name] = unique_values
                        
                        if len(unique_values) > 1:
                            print(f"Multiple matches found for '{entity_value}':")
                            for idx, unique_value in enumerate(unique_values):
                                print(f"{idx + 1}: {unique_value}")
                            
                            while True:
                                selection = input(f"Please select the most relevant option for '{entity_value}' (1-{len(unique_values)}): ")
                                try:
                                    selected_value = unique_values[int(selection) - 1]
                                    entities[entity_name] = selected_value
                                    print(f"Selected Value: {selected_value}")
                                    break
                                except (IndexError, ValueError):
                                    print("Invalid selection. Please choose a valid option.")
                        else:
                            entities[entity_name] = unique_values[0]
                            print(f"Selected Value: {unique_values[0]}")
                else:
                    print(f"No matches found for {entity_value} in Pinecone.")
            except Exception as e:
                print(f"Error querying Pinecone: {str(e)}")
                return {}, {}
    
    return entities

# Function to generate SQL query using OpenAI API
def generate_sql_query(user_input, schema_df):
    schema_json = schema_df.to_json(orient='records')
    schema_with_types = schema_df[['table_name', 'column_name', 'data_type']].to_dict(orient='records')
    context = f"""
    ## Database Schema Context
    Schema JSON: {schema_json}
    Detailed Schema: {schema_with_types}

    ## User Input
    Given the following user input: '{user_input}', generate an SQL query.
    Use the LIKE operator for partial matches where appropriate. Handle data type mismatches explicitly.

    ## Instructions
    Based on the user input and the provided schema, generate an accurate SQL query.
    Ensure the query maps correctly to the tables and columns in the database.
    Handle data type casting if necessary to match columns with different types.
    """
    try:
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=context,
            max_tokens=500,
            temperature=0.7
        )
        generated_query = response.choices[0].text.strip()
        print(f"Generated SQL Query: {generated_query}")
        
        return generated_query
    except openai.OpenAIError as e:
        print(f"Error generating SQL query: {e}")
        return f"Error generating SQL query: {e}"

# Function to execute the SQL query and print the results
def execute_sql_query(conn, sql_query):
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql_query)
            results = cursor.fetchall()
            return results
    except Exception as e:
        print(f"Error executing SQL query: {e}")
        return None

# Initialize OpenAI Chat model
openai_model = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=150
)

# Create a ChatPromptTemplate with the knowledge base included
template = """
## Knowledge Base:
{knowledge_base}

## Database Schema:
{database_schema}

## Question:
{question}

## Answer:
"""

prompt_template = ChatPromptTemplate.from_template(template)

def get_answer_from_chatbot(question, database_schema):
    try:
        prompt = prompt_template.format(
            knowledge_base="",
            database_schema=database_schema,
            question=question
        )

        response = openai_model.invoke(input=prompt)

        parsed_response = response.content.strip() if hasattr(response, 'content') else "No response content found."
        return parsed_response

    except Exception as e:
        return f"Error generating response from OpenAI: {str(e)}"

# Determine if user query is related to database or general knowledge
def determine_query_type(user_query, schema_df):
    user_query_lower = user_query.lower()
    
    if any(table.lower() in user_query_lower for table in schema_df['table_name'].unique()) or \
       any(column.lower() in user_query_lower for column in schema_df['column_name'].unique()):
        return "database"
    
    return "knowledge"

# Function to process user query
def process_user_query(user_query, schema_df, conn):
    query_type = determine_query_type(user_query, schema_df)
    
    if query_type == "database":
        entities = extract_entities(user_query, schema_df)
        entities = query_pinecone_for_entities(entities, NAMESPACE)
        
        # Generate SQL query based on selected entities
        augmented_query = user_query
        sql_query = generate_sql_query(augmented_query, schema_df)
        
        results = execute_sql_query(conn, sql_query)
        return results
    
    elif query_type == "knowledge":
        database_schema = schema_df.to_json(orient='records')
        answer = get_answer_from_chatbot(user_query, database_schema)
        return answer
    
    else:
        return "Unable to determine the query type."

# Main function to interact with the user
def main():
    conn = psycopg2.connect(
        host=DATABASE_HOST,
        user=DATABASE_USERNAME,
        password=DATABASE_PASSWORD,
        dbname=DATABASE_DB,
        port=PORT
    )
    
    schema_df = fetch_database_schema()
    
    print("Welcome to the Intelligent Query System")
    user_query = input("Please enter your query: ")
    
    results = process_user_query(user_query, schema_df, conn)
    
    if isinstance(results, list):
        for row in results:
            print(row)
    else:
        print(results)
    
    conn.close()

if __name__ == "__main__":
    main()


Welcome to the Intelligent Query System


Please enter your query:  who is the owner of a project IIFL Samsta


Extracted Entities: {'project_id': None, 'profile': None, 'duration': None, 'sprint_new': None, 'allocated_unallocated': None, 'start_date': None, 'created_time': None, 'days_completed_on': None, 'delivery_team': None, 'due_date': None, 'owner': 'of a project IIFL Samsta', 'milestone_status': None, 'clarity_level': None, 'task_id': None, 'status': None, 'project_efforts': None, 'task_delay_time': None, 'project_name': None, 'sprint': None, 'priority': None, 'budget': None, 'user_name': None, 'owner_name': None, 'milestone_id': None, 'sprint_ff_sf': None, 'product_skill': None, 'user_id': None, 'end_date': None, 'open_closed': None, 'role': None, 'duration_1': None, 'milestone_value': None, 'actual_time_taken': None, 'is_overdue': None, 'task_name': None, 'milestone_name': None, 'completion_percentage': None, 'completion_date': None, 'application': None, 'user_email': None, 'task_completion_mode': None, 'milestone_end_lag': None, 'duration_unit': None, 'milestone_completion_mode': None,



Query Embedding for of a project IIFL Samsta: [ 2.92236265e-02 -6.20245282e-03 -2.09105890e-02  3.03355418e-02
 -1.40203470e-02 -3.13791074e-02  2.88565899e-03 -1.38844931e-02
 -7.62413070e-02  1.27609400e-03  9.91367176e-02  4.71980032e-03
  4.54690568e-02  2.99121514e-02  2.05354076e-02 -9.35822055e-02
  1.38946762e-02 -1.49154337e-02 -1.20191418e-01 -2.71366853e-02
 -4.70747761e-02  1.14780683e-02 -7.32151733e-04 -1.81617178e-02
 -3.41040567e-02 -5.86015033e-03 -4.28046659e-02 -3.60278450e-02
  2.39579100e-02 -1.80258341e-02  4.02290858e-02 -8.44855141e-03
  1.95493530e-02 -2.15836708e-02  1.99088913e-06 -3.87730151e-02
 -1.06445756e-02 -1.55503377e-02 -4.01581973e-02  6.34026304e-02
  4.22161967e-02  3.22474316e-02 -5.70685826e-02 -3.05697266e-02
 -2.70418320e-02  6.74583623e-03  5.32083623e-02  1.80104058e-02
 -2.34520007e-02 -3.66046205e-02  5.19352918e-03 -8.06421340e-02
 -2.48946417e-02 -9.90041532e-03  1.57674570e-02  4.04851437e-02
 -1.94487702e-02 -2.88512576e-02  4.04523946

Please select the most relevant option for 'of a project IIFL Samsta' (1-10):  3


Selected Value: IIFL Samasta - CGRM
Error generating SQL query: 

You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742

Error executing SQL query: syntax error at or near "Error"
LINE 1: Error generating SQL query: 
        ^

None
