In [None]:
import re
import pandas as pd
import psycopg2
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import pinecone
import openai
import os
import json
from fuzzywuzzy import fuzz

# OpenAI API key
OPENAI_API_KEY = ''
openai.api_key = OPENAI_API_KEY

# Database connection details
DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
DATABASE_USERNAME = "postgres"
DATABASE_PASSWORD = "valign#123"
DATABASE_DB = "python_test_poc"
PORT = 5432

# Constants
PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"  # Replace with your Pinecone API key
INDEX_NAME = "smart-desk"  # Replace with your Pinecone index name
NAMESPACE = "projects"  # Replace with your namespace
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Initialize Pinecone client
def initialize_pinecone():
    from pinecone import Pinecone, ServerlessSpec
    pc = Pinecone(api_key=PINECONE_API_KEY)
    
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=768,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-west-2')
        )
    return pc.Index(INDEX_NAME)

# Load Hugging Face model for embeddings
def load_huggingface_model():
    return SentenceTransformer(MODEL_NAME)

def connect_to_db():
    try:
        conn = psycopg2.connect(
            dbname=DATABASE_DB,
            user=DATABASE_USERNAME,
            password=DATABASE_PASSWORD,
            host=DATABASE_HOST,
            port=PORT
        )
        return conn
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        raise
        
# Function to fetch schema from PostgreSQL database
def fetch_schema(conn):
    try:
        query = """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'public'
        """
        schema_df = pd.read_sql(query, conn)
        return schema_df
    except Exception as e:
        print(f"Error fetching schema: {e}")
        raise


# Function to process schema: remove special characters and convert to lowercase
def process_schema(schema_df):
    def clean_column_name(name):
        return re.sub(r'[^a-zA-Z]', '', name).lower()

    schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
    return schema_df

# Function to generate SQL Query
def generate_sql_from_input(user_input, processed_schema_df):
    """
    Extracts entities from the user input, understands user intent, and generates a corresponding SQL query.
    """
    # Summarize the schema to only include table and column names
    schema_summary = processed_schema_df[['table_name', 'column_name']].to_dict(orient='records')
    schema_summary_str = json.dumps(schema_summary, indent=2)

    # Chain of Thought reasoning prompt with summarized schema
    cot_prompt = f"""
    ## Database Schema Summary:
    {schema_summary_str}

    ## User Input:
    "{user_input}"

    ## Steps:
    1. Extract relevant entities (project name, task name, milestone, etc.).
    2. Map the entities to the schema columns.
    3. Generate an SQL query using appropriate operators like `LIKE` for partial matches.

    ## Example Output:
    {{
      "entities": {{
        "projects": {{"project_name": "Extracted Project Name", "status": "Extracted Status"}},
        "tasks": {{"task_name": "Extracted Task Name", "owner": "Extracted Owner"}}
      }},
      "sql_query": "Generated SQL Query"
    }}
    """

    try:
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=cot_prompt,
            max_tokens=500,  # Reduced token limit for completion
            temperature=0.7
        )

        # Check if response is valid and has choices
        if not response or not response.choices or len(response.choices) == 0:
            print(f"Empty response received from OpenAI: {response}")
            return None, None

        result = response.choices[0].text.strip()
        print(f"Raw API Response: {result}")  # Debugging output

        # Try to parse the response as JSON
        try:
            parsed_result = json.loads(result)
        except json.JSONDecodeError as json_err:
            print(f"Error decoding JSON response: {json_err}")
            print(f"Response text: {result}")  # Print raw response for debugging
            return None, None  # Handle the error gracefully

        # Extract entities and SQL query
        extracted_entities = parsed_result.get('entities')
        sql_query = parsed_result.get('sql_query')

        print("Extracted Entities:", json.dumps(extracted_entities, indent=2))
        print("Generated SQL Query:", sql_query)

        return extracted_entities, sql_query
    except openai.OpenAIError as e:
        print(f"Error processing request: {e}")
        raise


# Initialize OpenAI Chat model
openai_model = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=150
)

# Create a ChatPromptTemplate with the knowledge base included
template = """
## Knowledge Base:
{knowledge_base}

## Database Schema:
{database_schema}

## Question:
{question}

## Answer:
"""

prompt_template = ChatPromptTemplate.from_template(template)

def get_answer_from_chatbot(question, database_schema):
    try:
        prompt = prompt_template.format(
            knowledge_base="",
            database_schema=database_schema,
            question=question
        )
        response = openai_model.invoke(input=prompt)
        parsed_response = response.content.strip() if hasattr(response, 'content') else "No response content found."
        return parsed_response
    except Exception as e:
        return f"Error generating response from OpenAI: {str(e)}"

# Function to execute the SQL query and print the results
def execute_sql_query(conn, sql_query):
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql_query)
            results = cursor.fetchall()
            return results
    except Exception as e:
        print(f"Error executing SQL query: {e}")
        return None

# Determine if user query is related to database or general knowledge
def determine_query_type(user_query, schema_df, threshold = 75):
    user_query_lower = user_query.lower()
    
    # Extract unique table and column names from the schema and convert to lowercase
    table_names = schema_df['table_name'].str.lower().unique()
    column_names = schema_df['column_name'].str.lower().unique()
    
    # Function to check fuzzy match
    def is_fuzzy_match(query, options, threshold):
        for option in options:
            if fuzz.partial_ratio(query, option) >= threshold:
                return True
        return False
    
    # Check if user query matches any table or column name
    if is_fuzzy_match(user_query_lower, table_names, threshold) or \
       is_fuzzy_match(user_query_lower, column_names, threshold):
        return "database"
    
    return "knowledge"

# Main function to handle user queries
def process_user_query(user_query):
    # Connect to the database and fetch the schema
    conn = connect_to_db()
    
    schema_df = fetch_schema(conn)
    processed_schema_df = process_schema(schema_df)
    query_type = determine_query_type(user_query, schema_df)

    if query_type == "database":
        extracted_entities, sql_query = generate_sql_from_input(user_query, processed_schema_df)
        
        print("Generated SQL Query:", sql_query)
        
        # Execute the generated SQL query
        results = execute_sql_query(conn, sql_query)
        conn.close()

        if results is not None:
            print("Query Results:")
            for row in results:
                print(row)
        else:
            print("No results returned or error occurred during query execution.")
        
        return f"Generated SQL Query: {sql_query}"
    
    else:
        # For non-database related queries, respond using the chatbot
        database_schema = fetch_schema(conn)  # Fetching schema again if needed
        database_schema_df = process_schema(database_schema)
        return get_answer_from_chatbot(user_query, database_schema_df.to_dict(orient='records'))


# Example usage
if __name__ == "__main__":
    while True:
        user_input = input("Enter your query: ")
        if user_input.lower() in ['exit', 'quit']:
            break
        response = process_user_query(user_input)
        print(response)


  from tqdm.autonotebook import tqdm, trange


Enter your query:  Hi


  schema_df = pd.read_sql(query, conn)
  schema_df = pd.read_sql(query, conn)


Hello! How can I assist you today?


Enter your query:  what is the status of a project IIFL Samasta


  schema_df = pd.read_sql(query, conn)


Raw API Response: {
      "entities": {
        "projects": {"project_name": "IIFL Samasta", "status": "status"},
        "tasks": {"task_name": null, "owner": null}
      },
      "sql_query": "SELECT status FROM projects WHERE project_name LIKE 'IIFL Samasta'"
    }
Extracted Entities: {
  "projects": {
    "project_name": "IIFL Samasta",
    "status": "status"
  },
  "tasks": {
    "task_name": null,
    "owner": null
  }
}
Generated SQL Query: SELECT status FROM projects WHERE project_name LIKE 'IIFL Samasta'
Generated SQL Query: SELECT status FROM projects WHERE project_name LIKE 'IIFL Samasta'
Query Results:
Generated SQL Query: SELECT status FROM projects WHERE project_name LIKE 'IIFL Samasta'


Enter your query:  what is the status of a project IIFL Samasta


  schema_df = pd.read_sql(query, conn)


Raw API Response: {
      "entities": {
        "projects": {"project_name": "IIFL Samasta", "status": "status"},
        "tasks": {"task_name": None, "owner": None}
      },
      "sql_query": "SELECT status FROM projects WHERE project_name LIKE '%IIFL Samasta%'"
    }
Error decoding JSON response: Expecting value: line 4 column 32 (char 127)
Response text: {
      "entities": {
        "projects": {"project_name": "IIFL Samasta", "status": "status"},
        "tasks": {"task_name": None, "owner": None}
      },
      "sql_query": "SELECT status FROM projects WHERE project_name LIKE '%IIFL Samasta%'"
    }
Generated SQL Query: None
Error executing SQL query: can't execute an empty query
No results returned or error occurred during query execution.
Generated SQL Query: None


Enter your query:  what is the status of a project IIFL Samasta


  schema_df = pd.read_sql(query, conn)


Raw API Response: {
      "entities": {
        "projects": {"project_name": "IIFL Samasta", "status": "status"},
        "tasks": {"task_name": "None", "owner": "None"}
      },
      "sql_query": "SELECT status FROM projects WHERE project_name LIKE '%IIFL Samasta%'"
    }
Extracted Entities: {
  "projects": {
    "project_name": "IIFL Samasta",
    "status": "status"
  },
  "tasks": {
    "task_name": "None",
    "owner": "None"
  }
}
Generated SQL Query: SELECT status FROM projects WHERE project_name LIKE '%IIFL Samasta%'
Generated SQL Query: SELECT status FROM projects WHERE project_name LIKE '%IIFL Samasta%'
Query Results:
('To Do',)
('Requirement Gathering',)
('Development',)
('Development',)
('To Do',)
('To Do',)
('To Do',)
('To Do',)
('Development',)
('To Do',)
('Development',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Development',)
('Development',)
('To Do',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Comple

Enter your query:  what is the status of a project IIFL Samasta


  schema_df = pd.read_sql(query, conn)


Raw API Response: {
      "entities": {
        "projects": {"project_name": "IIFL Samasta", "status": "status"},
        "tasks": {"task_name": "", "owner": ""}
      },
      "sql_query": "SELECT status FROM projects WHERE project_name LIKE '%IIFL Samasta%'"
    }
Extracted Entities: {
  "projects": {
    "project_name": "IIFL Samasta",
    "status": "status"
  },
  "tasks": {
    "task_name": "",
    "owner": ""
  }
}
Generated SQL Query: SELECT status FROM projects WHERE project_name LIKE '%IIFL Samasta%'
Generated SQL Query: SELECT status FROM projects WHERE project_name LIKE '%IIFL Samasta%'
Query Results:
('To Do',)
('Requirement Gathering',)
('Development',)
('Development',)
('To Do',)
('To Do',)
('To Do',)
('To Do',)
('Development',)
('To Do',)
('Development',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Development',)
('Development',)
('To Do',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Completed',)
('Complet

Enter your query:  what is the longest delayed milestone name


  schema_df = pd.read_sql(query, conn)


Raw API Response: ## Example Output:
    {
      "entities": {
        "projects": {"project_name": "Extracted Project Name", "status": "Extracted Status"},
        "tasks": {"task_name": "Extracted Task Name", "owner": "Extracted Owner"},
        "milestones": {"milestone_name": "Extracted Milestone Name"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE project_name LIKE '%Extracted Project Name%' AND owner LIKE '%Extracted Owner%' AND status = 'Extracted Status' ORDER BY milestone_end_lag DESC LIMIT 1;"
    }
Error decoding JSON response: Expecting value: line 1 column 1 (char 0)
Response text: ## Example Output:
    {
      "entities": {
        "projects": {"project_name": "Extracted Project Name", "status": "Extracted Status"},
        "tasks": {"task_name": "Extracted Task Name", "owner": "Extracted Owner"},
        "milestones": {"milestone_name": "Extracted Milestone Name"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE project

Enter your query:  what is the longest delayed milestone name


  schema_df = pd.read_sql(query, conn)


Raw API Response: ## Output:
    {
      "entities": {
        "projects": {"project_name": "N/A", "status": "N/A"},
        "tasks": {"task_name": "N/A", "owner": "N/A"}
      },
      "sql_query": "SELECT milestones.milestone_name FROM milestones ORDER BY milestones.duration DESC LIMIT 1"
    }
Error decoding JSON response: Expecting value: line 1 column 1 (char 0)
Response text: ## Output:
    {
      "entities": {
        "projects": {"project_name": "N/A", "status": "N/A"},
        "tasks": {"task_name": "N/A", "owner": "N/A"}
      },
      "sql_query": "SELECT milestones.milestone_name FROM milestones ORDER BY milestones.duration DESC LIMIT 1"
    }
Generated SQL Query: None
Error executing SQL query: can't execute an empty query
No results returned or error occurred during query execution.
Generated SQL Query: None


Enter your query:  what is the longest delayed milestone name


  schema_df = pd.read_sql(query, conn)


Raw API Response: {
      "entities": {
        "milestones": {"milestone_name": "longest delayed", "status": "delayed"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE status LIKE '%delayed%' ORDER BY duration DESC LIMIT 1"
    }
Extracted Entities: {
  "milestones": {
    "milestone_name": "longest delayed",
    "status": "delayed"
  }
}
Generated SQL Query: SELECT milestone_name FROM milestones WHERE status LIKE '%delayed%' ORDER BY duration DESC LIMIT 1
Generated SQL Query: SELECT milestone_name FROM milestones WHERE status LIKE '%delayed%' ORDER BY duration DESC LIMIT 1
Query Results:
Generated SQL Query: SELECT milestone_name FROM milestones WHERE status LIKE '%delayed%' ORDER BY duration DESC LIMIT 1


Enter your query:  what is the longest delayed milestone name


  schema_df = pd.read_sql(query, conn)


Raw API Response: {
  "entities": {
    "milestones": {"milestone_name": "Longest Delayed Milestone Name"}
  },
  "sql_query": "SELECT milestone_name FROM milestones ORDER BY milestone_end_lag DESC LIMIT 1"
}
Extracted Entities: {
  "milestones": {
    "milestone_name": "Longest Delayed Milestone Name"
  }
}
Generated SQL Query: SELECT milestone_name FROM milestones ORDER BY milestone_end_lag DESC LIMIT 1
Generated SQL Query: SELECT milestone_name FROM milestones ORDER BY milestone_end_lag DESC LIMIT 1
Query Results:
('Milestone Twelve-Post Go-Live Application Support - Zoho ONE',)
Generated SQL Query: SELECT milestone_name FROM milestones ORDER BY milestone_end_lag DESC LIMIT 1


Enter your query:  what is the longest delayed milestone name


  schema_df = pd.read_sql(query, conn)


Raw API Response: ## Example Output:
    {
      "entities": {
        "projects": {"project_name": null, "status": null},
        "tasks": {"task_name": null, "owner": null}
      },
      "sql_query": "SELECT milestone_name FROM milestones ORDER BY duration DESC LIMIT 1"
    }
Error decoding JSON response: Expecting value: line 1 column 1 (char 0)
Response text: ## Example Output:
    {
      "entities": {
        "projects": {"project_name": null, "status": null},
        "tasks": {"task_name": null, "owner": null}
      },
      "sql_query": "SELECT milestone_name FROM milestones ORDER BY duration DESC LIMIT 1"
    }
Generated SQL Query: None
Error executing SQL query: can't execute an empty query
No results returned or error occurred during query execution.
Generated SQL Query: None


Enter your query:  what is the longest delayed milestone name


  schema_df = pd.read_sql(query, conn)


Raw API Response: {
      "entities": {
        "milestones": {"milestone_end_lag": "Extracted Milestone End Lag", "status": "Extracted Status"},
        "projects": {"project_name": "Extracted Project Name"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE project_name LIKE 'Extracted Project Name' AND status = 'Extracted Status' ORDER BY milestone_end_lag DESC LIMIT 1;"
    }
Extracted Entities: {
  "milestones": {
    "milestone_end_lag": "Extracted Milestone End Lag",
    "status": "Extracted Status"
  },
  "projects": {
    "project_name": "Extracted Project Name"
  }
}
Generated SQL Query: SELECT milestone_name FROM milestones WHERE project_name LIKE 'Extracted Project Name' AND status = 'Extracted Status' ORDER BY milestone_end_lag DESC LIMIT 1;
Generated SQL Query: SELECT milestone_name FROM milestones WHERE project_name LIKE 'Extracted Project Name' AND status = 'Extracted Status' ORDER BY milestone_end_lag DESC LIMIT 1;
Query Results:
Generated SQL Que

Enter your query:  what is the longest delayed milestone name


  schema_df = pd.read_sql(query, conn)


Raw API Response: ## Example Output:
    {
        "entities": {
            "projects": {"project_name": "Extracted Project Name"},
            "milestones": {"milestone_name": "Extracted Milestone Name"}
        },
        "sql_query": "SELECT milestone_name FROM milestones WHERE project_name = 'Extracted Project Name' AND milestone_status = 'Delayed' ORDER BY duration DESC LIMIT 1"
    }
Error decoding JSON response: Expecting value: line 1 column 1 (char 0)
Response text: ## Example Output:
    {
        "entities": {
            "projects": {"project_name": "Extracted Project Name"},
            "milestones": {"milestone_name": "Extracted Milestone Name"}
        },
        "sql_query": "SELECT milestone_name FROM milestones WHERE project_name = 'Extracted Project Name' AND milestone_status = 'Delayed' ORDER BY duration DESC LIMIT 1"
    }
Generated SQL Query: None
Error executing SQL query: can't execute an empty query
No results returned or error occurred during query execution.

Enter your query:  give me the name of milestone which has delayed more


  schema_df = pd.read_sql(query, conn)


Raw API Response: ## Example Output:
    {
      "entities": {
        "milestones": {"milestone_name": "Extracted Milestone Name"},
        "tasks": {"status": "Delayed"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE status LIKE '%Delayed%' ORDER BY milestone_end_date DESC"
    }
Error decoding JSON response: Expecting value: line 1 column 1 (char 0)
Response text: ## Example Output:
    {
      "entities": {
        "milestones": {"milestone_name": "Extracted Milestone Name"},
        "tasks": {"status": "Delayed"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE status LIKE '%Delayed%' ORDER BY milestone_end_date DESC"
    }
Generated SQL Query: None
Error executing SQL query: can't execute an empty query
No results returned or error occurred during query execution.
Generated SQL Query: None


Enter your query:  give me the name of milestone which has delayed more 


  schema_df = pd.read_sql(query, conn)


Raw API Response: ## Example Output:
    {
      "entities": {
        "milestones": {"milestone_name": "Extracted Milestone Name"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE milestone_status = 'delayed' ORDER BY milestone_end_lag DESC"
    }
Error decoding JSON response: Expecting value: line 1 column 1 (char 0)
Response text: ## Example Output:
    {
      "entities": {
        "milestones": {"milestone_name": "Extracted Milestone Name"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE milestone_status = 'delayed' ORDER BY milestone_end_lag DESC"
    }
Generated SQL Query: None
Error executing SQL query: can't execute an empty query
No results returned or error occurred during query execution.
Generated SQL Query: None


Enter your query:  give me the name of milestone which has delayed in days


  schema_df = pd.read_sql(query, conn)


Raw API Response: {
      "entities": {
        "milestones": {"milestone_name": "Delayed Milestone"}
      },
      "sql_query": "SELECT milestone_name FROM milestones WHERE milestone_status = 'delayed' AND duration > 0"
    }
Extracted Entities: {
  "milestones": {
    "milestone_name": "Delayed Milestone"
  }
}
Generated SQL Query: SELECT milestone_name FROM milestones WHERE milestone_status = 'delayed' AND duration > 0
Generated SQL Query: SELECT milestone_name FROM milestones WHERE milestone_status = 'delayed' AND duration > 0
Query Results:
Generated SQL Query: SELECT milestone_name FROM milestones WHERE milestone_status = 'delayed' AND duration > 0


In [1]:
# import re
# import pandas as pd
# import psycopg2
# import numpy as np
# from sentence_transformers import SentenceTransformer
# from langchain_openai import ChatOpenAI
# from langchain.prompts import ChatPromptTemplate
# import pinecone
# import openai
# import os
# import json
# from fuzzywuzzy import fuzz

# # OpenAI API key
# OPENAI_API_KEY = 'sk-proj-UnzdWuWBs7ZQRbRPiRCoT3BlbkFJhPM1p7DdZUMklcpnWK1S'
# openai.api_key = OPENAI_API_KEY

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "python_test_poc"
# PORT = 5432

# # Constants
# PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"  # Replace with your Pinecone API key
# INDEX_NAME = "smart-desk"  # Replace with your Pinecone index name
# NAMESPACE = "projects"  # Replace with your namespace
# MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# # Initialize Pinecone client
# def initialize_pinecone():
#     from pinecone import Pinecone, ServerlessSpec
#     pc = Pinecone(api_key=PINECONE_API_KEY)
    
#     if INDEX_NAME not in pc.list_indexes().names():
#         pc.create_index(
#             name=INDEX_NAME,
#             dimension=768,
#             metric='cosine',
#             spec=ServerlessSpec(cloud='aws', region='us-west-2')
#         )
#     return pc.Index(INDEX_NAME)

# # Load Hugging Face model for embeddings
# def load_huggingface_model():
#     return SentenceTransformer(MODEL_NAME)

# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise
        
# # Function to fetch schema from PostgreSQL database
# def fetch_schema(conn):
#     try:
#         query = """
#         SELECT table_name, column_name, data_type
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         """
#         schema_df = pd.read_sql(query, conn)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema: {e}")
#         raise


# # Function to process schema: remove special characters and convert to lowercase
# def process_schema(schema_df):
#     def clean_column_name(name):
#         return re.sub(r'[^a-zA-Z]', '', name).lower()

#     schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
#     return schema_df

# # # Function to apply Chain of Thought reasoning to extract features (like project name, owner, etc.) using OpenAI's LLM
# # def extract_entities(user_input, processed_schema_df):
# #     schema_json = processed_schema_df.to_json(orient='records')
    
# #     prompt = f"""
# #     ## Database Schema Context:
# #     The following represents the columns and their respective tables available in the database:
# #     {schema_json}

# #     ## Chain of Thought Reasoning:
# #     Step 1: Analyze the user input to understand the intent. The user might ask about specific tasks, projects, owners, dates, etc.
# #     Step 2: Match these intent-specific words to the schema's column names and table names.
# #     Step 3: Extract the relevant features (like project names, owners, dates, statuses, etc.) based on the schema.
    
# #     ## User Input:
# #     The user has provided the following input: "{user_input}"

# #     ## Task:
# #     Extract the relevant features, values, and table names from the user input based on the schema using the above reasoning steps. 
# #     Output a JSON object where table names are keys, and within each table, include the fields and their values.

# #     ## Instructions:
# #     - Return a JSON dictionary that includes the table names as keys, and within each table, include the fields and their values extracted from the user input.
# #     - Omit any fields or tables where the value is empty or null.
# #     - Format the output as a JSON object with keys only for tables and fields that have values.
# #     """

# #     try:
# #         response = openai.completions.create(
# #             model="gpt-3.5-turbo-instruct",
# #             prompt=prompt,
# #             max_tokens=500,
# #             temperature=0.5
# #         )
# #         extracted_features = response.choices[0].text.strip()
# #         print("OpenAI API Response:", extracted_features)  # Debugging line
# #         return json.loads(extracted_features)  # Parse JSON string into a dictionary
# #     except openai.OpenAIError as e:
# #         print(f"Error with OpenAI: {e}")
# #         raise
# #     except json.JSONDecodeError as e:
# #         print(f"Error decoding JSON response: {e}")
# #         raise

# # # Function to remove null, None, empty values from JSON and list
# # def clean_extracted_features(feature_dict):
# #     # Remove any keys with None or empty values
# #     cleaned_feature_dict = {k: v for k, v in feature_dict.items() if v}
# #     return cleaned_feature_dict

# # # Function to parse and process extracted features
# # def process_extracted_features(extracted_features):
# #     try:
# #         # Remove the "## Solution:" part and any other non-JSON text
# #         json_match = re.search(r'\{.*\}', extracted_features, re.DOTALL)

# #         if json_match:
# #             # Extract the JSON part from the matched result
# #             cleaned_features = json_match.group(0).strip()

# #             # Handle potential trailing commas or extra data issues
# #             # Replace any trailing commas before closing braces
# #             cleaned_features = re.sub(r',\s*}', '}', cleaned_features)

# #             # Convert JSON string to a Python dictionary
# #             feature_dict = json.loads(cleaned_features)

# #             # Clean feature dictionary
# #             cleaned_feature_dict = clean_extracted_features(feature_dict)
            

# #             # Format the dictionary as per the requested format
# #             formatted_dict = []
# #             for table, fields in cleaned_feature_dict.items():
# #                 # Convert the field values to a string, joining with commas
# #                 field_values = ', '.join(fields.values())
# #                 formatted_dict.append(f'{{ "{table}": "{field_values}" }}')

# #             # Return cleaned and formatted dictionary in the requested output format
# #             return ', '.join(formatted_dict)
            
# #         else:
# #             return "Error: No valid JSON found in the extracted features"
# #     except (json.JSONDecodeError, ValueError) as e:
# #         print(f"Error parsing features: {e}")
# #         return "Error: Could not parse the features as JSON"


# # # Query Pinecone for relevant context and augment the input
# # def query_pinecone_and_augment_input(user_input, entities, namespace):
# #     embedding_model = load_huggingface_model()
# #     pinecone_index = initialize_pinecone()
# #     augmented_input = user_input
# #     pinecone_data = {}
# #     for entity_name, entity_value in entities.items():
# #         if entity_value:
# #             query_embedding = embedding_model.encode([entity_value])[0]
# #             query_embedding = np.array(query_embedding, dtype=np.float32)
# #             try:
# #                 result = pinecone_index.query(
# #                     namespace=namespace,
# #                     vector=query_embedding.tolist(),
# #                     top_k=3,
# #                     include_values=True,
# #                     include_metadata=True
# #                 )
# #                 matches = result.get('matches', [])
# #                 if matches:
# #                     unique_values = [match['metadata'].get('unique_value') for match in matches if 'metadata' in match]
# #                     if unique_values:
# #                         pinecone_data[entity_name] = unique_values
# #                         if len(unique_values) > 1:
# #                             print(f"Multiple matches found for '{entity_value}':")
# #                             for idx, unique_value in enumerate(unique_values):
# #                                 print(f"{idx + 1}: {unique_value}")
# #                             while True:
# #                                 selection = input(f"Please select the most relevant option for '{entity_value}' (1-{len(unique_values)}): ")
# #                                 try:
# #                                     selected_value = unique_values[int(selection) - 1]
# #                                     augmented_input = augmented_input.replace(entity_value, selected_value)
# #                                     break
# #                                 except (IndexError, ValueError):
# #                                     print("Invalid selection. Please choose a valid option.")
# #                         else:
# #                             augmented_input = augmented_input.replace(entity_value, unique_values[0])
# #                 else:
# #                     print(f"No matches found for {entity_value} in Pinecone.")
# #             except Exception as e:
# #                 print(f"Error querying Pinecone: {str(e)}")
# #                 return f"Error querying Pinecone: {str(e)}", {}
# #     return augmented_input, pinecone_data

# # Function to generate SQL query using OpenAI API
# def generate_sql_from_input(user_input, processed_schema_df):
#     """
#     Extracts entities from the user input, understands user intent, and generates a corresponding SQL query.
#     """
#     # Summarize the schema to only include table and column names
#     schema_summary = processed_schema_df[['table_name', 'column_name']].to_dict(orient='records')
#     schema_summary_str = json.dumps(schema_summary, indent=2)

#     # Chain of Thought reasoning prompt with summarized schema
#     cot_prompt = f"""
#     ## Database Schema Summary:
#     {schema_summary_str}

#     ## User Input:
#     "{user_input}"

#     ## Steps:
#     1. Extract relevant entities (project name, task name, milestone, etc.).
#     2. Map the entities to the schema columns.
#     3. Generate an SQL query using appropriate operators like `LIKE` for partial matches.

#     ## Example Output:
#     {{
#       "entities": {{
#         "projects": {{"project_name": "Extracted Project Name", "status": "Extracted Status"}},
#         "tasks": {{"task_name": "Extracted Task Name", "owner": "Extracted Owner"}}
#       }},
#       "sql_query": "Generated SQL Query"
#     }}
#     """

#     try:
#         response = openai.completions.create(
#             model="gpt-3.5-turbo-instruct",
#             prompt=cot_prompt,
#             max_tokens=500,  # Reduced token limit for completion
#             temperature=0.7
#         )
#         result = response.choices[0].text.strip()

#         # Parse the response
#         parsed_result = json.loads(result)

#         extracted_entities = parsed_result.get('entities')
#         sql_query = parsed_result.get('sql_query')

#         print("Extracted Entities:", json.dumps(extracted_entities, indent=2))
#         print("Generated SQL Query:", sql_query)

#         return extracted_entities, sql_query
#     except openai.OpenAIError as e:
#         print(f"Error processing request: {e}")
#         raise
#     except json.JSONDecodeError as e:
#         print(f"Error decoding JSON response: {e}")
#         raise

# # Initialize OpenAI Chat model
# openai_model = ChatOpenAI(
#     openai_api_key=OPENAI_API_KEY,
#     model_name="gpt-3.5-turbo",
#     temperature=0.7,
#     max_tokens=150
# )

# # Create a ChatPromptTemplate with the knowledge base included
# template = """
# ## Knowledge Base:
# {knowledge_base}

# ## Database Schema:
# {database_schema}

# ## Question:
# {question}

# ## Answer:
# """

# prompt_template = ChatPromptTemplate.from_template(template)

# def get_answer_from_chatbot(question, database_schema):
#     try:
#         prompt = prompt_template.format(
#             knowledge_base="",
#             database_schema=database_schema,
#             question=question
#         )
#         response = openai_model.invoke(input=prompt)
#         parsed_response = response.content.strip() if hasattr(response, 'content') else "No response content found."
#         return parsed_response
#     except Exception as e:
#         return f"Error generating response from OpenAI: {str(e)}"

# # Function to execute the SQL query and print the results
# def execute_sql_query(conn, sql_query):
#     try:
#         with conn.cursor() as cursor:
#             cursor.execute(sql_query)
#             results = cursor.fetchall()
#             return results
#     except Exception as e:
#         print(f"Error executing SQL query: {e}")
#         return None

# # Determine if user query is related to database or general knowledge

# def determine_query_type(user_query, schema_df, threshold = 75):
#     user_query_lower = user_query.lower()
    
#     # Extract unique table and column names from the schema and convert to lowercase
#     table_names = schema_df['table_name'].str.lower().unique()
#     column_names = schema_df['column_name'].str.lower().unique()
    
#     # Function to check fuzzy match
#     def is_fuzzy_match(query, options, threshold):
#         for option in options:
#             if fuzz.partial_ratio(query, option) >= threshold:
#                 return True
#         return False
    
#     # Check if user query matches any table or column name
#     if is_fuzzy_match(user_query_lower, table_names, threshold) or \
#        is_fuzzy_match(user_query_lower, column_names, threshold):
#         return "database"
    
#     return "knowledge"



# # Main function to handle user queries
# def process_user_query(user_query):
#     # Connect to the database and fetch the schema
#     conn = connect_to_db()
    
#     schema_df = fetch_schema(conn)
#     processed_schema_df = process_schema(schema_df)
#     query_type = determine_query_type(user_query, schema_df)

#     # # Extract features from user input using OpenAI's LLM
#     # extracted_features = extract_entities(user_input, schema_df)

#     # # Process the extracted features and clean them
#     # cleaned_json = process_extracted_features(extracted_features)

#     if query_type == "database":
#         # # Extract entities from the user query
#         # entities = extract_entities(user_query, processed_schema_df)

#         # if isinstance(cleaned_features_dict, dict):
#         #     for table, fields in cleaned_features_dict.items():
#         #         for field, entity_value in fields.items():
#         #             if entity_value:
#         #                 print("Entities found. Querying Pinecone for context.")
#         #                 augmented_query, pinecone_data = query_pinecone_and_augment_input(user_query, entities, NAMESPACE)
#         #                 # Generate SQL query using the augmented query
#         #                 processed_schema_df = process_schema(fetch_schema(conn))  # Process schema for better matching
#         #                 sql_query = generate_sql_query(augmented_query, processed_schema_df)
                        
#         #             else: 
#         #                 print("No relevant entities found. Directly generating SQL query.")
#         #                 sql_query = generate_sql_query(user_query, processed_schema_df)
#         #                 print(sql_query)      
                
#         # Execute the SQL query and print the results
#         sql_query = generate_sql_from_input(user_query, processed_schema_df)
#         print(sql_query)
#         results = execute_sql_query(conn, sql_query)
#         print(results)
#         conn.close()

#         if results is not None:
#             print("Query Results:")
#             for row in results:
#                 print(row)
#         else:
#             print("No results returned or error occurred during query execution.")
        
#         return f"Generated SQL Query: {sql_query}"
    
#     else:
#         # For non-database related queries, respond using the chatbot
#         database_schema = fetch_schema(conn)  # Fetching schema again if needed
#         database_schema_df = process_schema(database_schema)
#         return get_answer_from_chatbot(user_query, database_schema_df.to_dict(orient='records'))

# # Example usage
# if __name__ == "__main__":
#     while True:
#         user_input = input("Enter your query: ")
#         if user_input.lower() in ['exit', 'quit']:
#             break
#         response = process_user_query(user_input)
#         print(response)


  from tqdm.autonotebook import tqdm, trange


Enter your query:  what is the status of a project IIFL Samasta


  schema_df = pd.read_sql(query, conn)


Error decoding JSON response: Expecting value: line 1 column 1 (char 0)


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# import re
# import pandas as pd
# import psycopg2
# import numpy as np
# from sentence_transformers import SentenceTransformer
# from langchain_openai import ChatOpenAI
# from langchain.prompts import ChatPromptTemplate
# import pinecone
# import openai
# import os
# import json
# from fuzzywuzzy import fuzz

# # OpenAI API key
# OPENAI_API_KEY = 'sk-proj-UnzdWuWBs7ZQRbRPiRCoT3BlbkFJhPM1p7DdZUMklcpnWK1S'
# openai.api_key = OPENAI_API_KEY

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "python_test_poc"
# PORT = 5432

# # Constants
# PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"  # Replace with your Pinecone API key
# INDEX_NAME = "smart-desk"  # Replace with your Pinecone index name
# NAMESPACE = "projects"  # Replace with your namespace
# MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# # Initialize Pinecone client
# def initialize_pinecone():
#     from pinecone import Pinecone, ServerlessSpec
#     pc = Pinecone(api_key=PINECONE_API_KEY)
    
#     if INDEX_NAME not in pc.list_indexes().names():
#         pc.create_index(
#             name=INDEX_NAME,
#             dimension=768,
#             metric='cosine',
#             spec=ServerlessSpec(cloud='aws', region='us-west-2')
#         )
#     return pc.Index(INDEX_NAME)

# # Load Hugging Face model for embeddings
# def load_huggingface_model():
#     return SentenceTransformer(MODEL_NAME)

# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise
        
# # Function to fetch schema from PostgreSQL database
# def fetch_schema(conn):
#     try:
#         query = """
#         SELECT table_name, column_name, data_type
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         """
#         schema_df = pd.read_sql(query, conn)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema: {e}")
#         raise


# # Function to process schema: remove special characters and convert to lowercase
# def process_schema(schema_df):
#     def clean_column_name(name):
#         return re.sub(r'[^a-zA-Z]', '', name).lower()

#     schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
#     return schema_df

# # # Function to apply Chain of Thought reasoning to extract features (like project name, owner, etc.) using OpenAI's LLM
# # def extract_entities(user_input, processed_schema_df):
# #     schema_json = processed_schema_df.to_json(orient='records')
    
# #     prompt = f"""
# #     ## Database Schema Context:
# #     The following represents the columns and their respective tables available in the database:
# #     {schema_json}

# #     ## Chain of Thought Reasoning:
# #     Step 1: Analyze the user input to understand the intent. The user might ask about specific tasks, projects, owners, dates, etc.
# #     Step 2: Match these intent-specific words to the schema's column names and table names.
# #     Step 3: Extract the relevant features (like project names, owners, dates, statuses, etc.) based on the schema.
    
# #     ## User Input:
# #     The user has provided the following input: "{user_input}"

# #     ## Task:
# #     Extract the relevant features, values, and table names from the user input based on the schema using the above reasoning steps. 
# #     Output a JSON object where table names are keys, and within each table, include the fields and their values.

# #     ## Instructions:
# #     - Return a JSON dictionary that includes the table names as keys, and within each table, include the fields and their values extracted from the user input.
# #     - Omit any fields or tables where the value is empty or null.
# #     - Format the output as a JSON object with keys only for tables and fields that have values.
# #     """

# #     try:
# #         response = openai.completions.create(
# #             model="gpt-3.5-turbo-instruct",
# #             prompt=prompt,
# #             max_tokens=500,
# #             temperature=0.5
# #         )
# #         extracted_features = response.choices[0].text.strip()
# #         print("OpenAI API Response:", extracted_features)  # Debugging line
# #         return json.loads(extracted_features)  # Parse JSON string into a dictionary
# #     except openai.OpenAIError as e:
# #         print(f"Error with OpenAI: {e}")
# #         raise
# #     except json.JSONDecodeError as e:
# #         print(f"Error decoding JSON response: {e}")
# #         raise

# # # Function to remove null, None, empty values from JSON and list
# # def clean_extracted_features(feature_dict):
# #     # Remove any keys with None or empty values
# #     cleaned_feature_dict = {k: v for k, v in feature_dict.items() if v}
# #     return cleaned_feature_dict

# # # Function to parse and process extracted features
# # def process_extracted_features(extracted_features):
# #     try:
# #         # Remove the "## Solution:" part and any other non-JSON text
# #         json_match = re.search(r'\{.*\}', extracted_features, re.DOTALL)

# #         if json_match:
# #             # Extract the JSON part from the matched result
# #             cleaned_features = json_match.group(0).strip()

# #             # Handle potential trailing commas or extra data issues
# #             # Replace any trailing commas before closing braces
# #             cleaned_features = re.sub(r',\s*}', '}', cleaned_features)

# #             # Convert JSON string to a Python dictionary
# #             feature_dict = json.loads(cleaned_features)

# #             # Clean feature dictionary
# #             cleaned_feature_dict = clean_extracted_features(feature_dict)
            

# #             # Format the dictionary as per the requested format
# #             formatted_dict = []
# #             for table, fields in cleaned_feature_dict.items():
# #                 # Convert the field values to a string, joining with commas
# #                 field_values = ', '.join(fields.values())
# #                 formatted_dict.append(f'{{ "{table}": "{field_values}" }}')

# #             # Return cleaned and formatted dictionary in the requested output format
# #             return ', '.join(formatted_dict)
            
# #         else:
# #             return "Error: No valid JSON found in the extracted features"
# #     except (json.JSONDecodeError, ValueError) as e:
# #         print(f"Error parsing features: {e}")
# #         return "Error: Could not parse the features as JSON"


# # # Query Pinecone for relevant context and augment the input
# # def query_pinecone_and_augment_input(user_input, entities, namespace):
# #     embedding_model = load_huggingface_model()
# #     pinecone_index = initialize_pinecone()
# #     augmented_input = user_input
# #     pinecone_data = {}
# #     for entity_name, entity_value in entities.items():
# #         if entity_value:
# #             query_embedding = embedding_model.encode([entity_value])[0]
# #             query_embedding = np.array(query_embedding, dtype=np.float32)
# #             try:
# #                 result = pinecone_index.query(
# #                     namespace=namespace,
# #                     vector=query_embedding.tolist(),
# #                     top_k=3,
# #                     include_values=True,
# #                     include_metadata=True
# #                 )
# #                 matches = result.get('matches', [])
# #                 if matches:
# #                     unique_values = [match['metadata'].get('unique_value') for match in matches if 'metadata' in match]
# #                     if unique_values:
# #                         pinecone_data[entity_name] = unique_values
# #                         if len(unique_values) > 1:
# #                             print(f"Multiple matches found for '{entity_value}':")
# #                             for idx, unique_value in enumerate(unique_values):
# #                                 print(f"{idx + 1}: {unique_value}")
# #                             while True:
# #                                 selection = input(f"Please select the most relevant option for '{entity_value}' (1-{len(unique_values)}): ")
# #                                 try:
# #                                     selected_value = unique_values[int(selection) - 1]
# #                                     augmented_input = augmented_input.replace(entity_value, selected_value)
# #                                     break
# #                                 except (IndexError, ValueError):
# #                                     print("Invalid selection. Please choose a valid option.")
# #                         else:
# #                             augmented_input = augmented_input.replace(entity_value, unique_values[0])
# #                 else:
# #                     print(f"No matches found for {entity_value} in Pinecone.")
# #             except Exception as e:
# #                 print(f"Error querying Pinecone: {str(e)}")
# #                 return f"Error querying Pinecone: {str(e)}", {}
# #     return augmented_input, pinecone_data

# # Function to generate SQL query using OpenAI API
# def generate_sql_query(user_input, processed_schema_df):
#     schema_json = processed_schema_df.to_json(orient='records')
#     schema_with_types = processed_schema_df[['table_name', 'column_name', 'data_type']].to_dict(orient='records')
#     context = f"""
#     ## Database Schema Context
#     Schema JSON: {schema_json}
#     Detailed Schema: {schema_with_types}

#     ## User Input
#     Given the following user input: '{user_input}', generate an SQL query.
#     Use the LIKE operator for partial matches where appropriate. Handle data type mismatches explicitly.

#     ## Instructions
#     Based on the user input and the provided schema, generate an accurate SQL query.
#     Ensure the query maps correctly to the tables and columns in the database.
#     Handle data type casting if necessary to match columns with different types.
#     """
#     try:
#         response = openai.completions.create(
#             model="gpt-3.5-turbo-instruct",
#             prompt=context,
#             max_tokens=500,
#             temperature=0.7
#         )
#         generated_query = response.choices[0].text.strip()
#         if generated_query.lower().startswith("the generated sql query is:"):
#             generated_query = generated_query[len("The generated SQL query is:"):].strip()
#         return generated_query
#     except openai.OpenAIError as e:
#         print(f"Error generating SQL query: {e}")
#         raise


# # Initialize OpenAI Chat model
# openai_model = ChatOpenAI(
#     openai_api_key=OPENAI_API_KEY,
#     model_name="gpt-3.5-turbo",
#     temperature=0.7,
#     max_tokens=150
# )

# # Create a ChatPromptTemplate with the knowledge base included
# template = """
# ## Knowledge Base:
# {knowledge_base}

# ## Database Schema:
# {database_schema}

# ## Question:
# {question}

# ## Answer:
# """

# prompt_template = ChatPromptTemplate.from_template(template)

# def get_answer_from_chatbot(question, database_schema):
#     try:
#         prompt = prompt_template.format(
#             knowledge_base="",
#             database_schema=database_schema,
#             question=question
#         )
#         response = openai_model.invoke(input=prompt)
#         parsed_response = response.content.strip() if hasattr(response, 'content') else "No response content found."
#         return parsed_response
#     except Exception as e:
#         return f"Error generating response from OpenAI: {str(e)}"

# # Function to execute the SQL query and print the results
# def execute_sql_query(conn, sql_query):
#     try:
#         with conn.cursor() as cursor:
#             cursor.execute(sql_query)
#             results = cursor.fetchall()
#             return results
#     except Exception as e:
#         print(f"Error executing SQL query: {e}")
#         return None

# # Determine if user query is related to database or general knowledge

# def determine_query_type(user_query, schema_df, threshold = 75):
#     user_query_lower = user_query.lower()
    
#     # Extract unique table and column names from the schema and convert to lowercase
#     table_names = schema_df['table_name'].str.lower().unique()
#     column_names = schema_df['column_name'].str.lower().unique()
    
#     # Function to check fuzzy match
#     def is_fuzzy_match(query, options, threshold):
#         for option in options:
#             if fuzz.partial_ratio(query, option) >= threshold:
#                 return True
#         return False
    
#     # Check if user query matches any table or column name
#     if is_fuzzy_match(user_query_lower, table_names, threshold) or \
#        is_fuzzy_match(user_query_lower, column_names, threshold):
#         return "database"
    
#     return "knowledge"



# # Main function to handle user queries
# def process_user_query(user_query):
#     # Connect to the database and fetch the schema
#     conn = connect_to_db()
    
#     schema_df = fetch_schema(conn)
#     processed_schema_df = process_schema(schema_df)
#     query_type = determine_query_type(user_query, schema_df)

#     # # Extract features from user input using OpenAI's LLM
#     # extracted_features = extract_entities(user_input, schema_df)

#     # # Process the extracted features and clean them
#     # cleaned_json = process_extracted_features(extracted_features)

#     if query_type == "database":
#         # # Extract entities from the user query
#         # entities = extract_entities(user_query, processed_schema_df)

#         # if isinstance(cleaned_features_dict, dict):
#         #     for table, fields in cleaned_features_dict.items():
#         #         for field, entity_value in fields.items():
#         #             if entity_value:
#         #                 print("Entities found. Querying Pinecone for context.")
#         #                 augmented_query, pinecone_data = query_pinecone_and_augment_input(user_query, entities, NAMESPACE)
#         #                 # Generate SQL query using the augmented query
#         #                 processed_schema_df = process_schema(fetch_schema(conn))  # Process schema for better matching
#         #                 sql_query = generate_sql_query(augmented_query, processed_schema_df)
                        
#         #             else: 
#         #                 print("No relevant entities found. Directly generating SQL query.")
#         #                 sql_query = generate_sql_query(user_query, processed_schema_df)
#         #                 print(sql_query)      
                
#         # Execute the SQL query and print the results
#         sql_query = generate_sql_query(user_query, processed_schema_df)
#         print(sql_query)
#         results = execute_sql_query(conn, sql_query)
#         print(results)
#         conn.close()

#         if results is not None:
#             print("Query Results:")
#             for row in results:
#                 print(row)
#         else:
#             print("No results returned or error occurred during query execution.")
        
#         return f"Generated SQL Query: {sql_query}"
    
#     else:
#         # For non-database related queries, respond using the chatbot
#         database_schema = fetch_schema(conn)  # Fetching schema again if needed
#         database_schema_df = process_schema(database_schema)
#         return get_answer_from_chatbot(user_query, database_schema_df.to_dict(orient='records'))

# # Example usage
# if __name__ == "__main__":
#     while True:
#         user_input = input("Enter your query: ")
#         if user_input.lower() in ['exit', 'quit']:
#             break
#         response = process_user_query(user_input)
#         print(response)


In [None]:
##Smart Desk Code

# import re
# import pandas as pd
# import psycopg2
# import numpy as np
# from sentence_transformers import SentenceTransformer
# from langchain_openai import ChatOpenAI
# from langchain.prompts import ChatPromptTemplate
# import pinecone
# import openai
# import os

# # OpenAI API key
# OPENAI_API_KEY = 'sk-proj-UnzdWuWBs7ZQRbRPiRCoT3BlbkFJhPM1p7DdZUMklcpnWK1S'
# openai.api_key = OPENAI_API_KEY

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "python_test_poc"
# PORT = 5432

# # Constants
# PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"  # Replace with your Pinecone API key
# INDEX_NAME = "smart-desk"  # Replace with your Pinecone index name
# NAMESPACE = "projects"  # Replace with your namespace
# MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# # Initialize Pinecone client
# def initialize_pinecone():
#     from pinecone import Pinecone, ServerlessSpec
#     pc = Pinecone(api_key=PINECONE_API_KEY)
    
#     if INDEX_NAME not in pc.list_indexes().names():
#         pc.create_index(
#             name=INDEX_NAME,
#             dimension=768,
#             metric='cosine',
#             spec=ServerlessSpec(cloud='aws', region='us-west-2')
#         )
#     return pc.Index(INDEX_NAME)

# # Load Hugging Face model for embeddings
# def load_huggingface_model():
#     return SentenceTransformer(MODEL_NAME)

# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise
        
# # Function to fetch schema from PostgreSQL database
# def fetch_schema(conn):
#     try:
#         query = """
#         SELECT table_name, column_name, data_type
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         """
#         schema_df = pd.read_sql(query, conn)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema: {e}")
#         raise


# # Function to process schema: remove special characters and convert to lowercase
# def process_schema(schema_df):
#     def clean_column_name(name):
#         return re.sub(r'[^a-zA-Z]', '', name).lower()

#     schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
#     return schema_df

# # Extract relevant entities based on regex and column names
# def extract_entities(user_query, schema):
#     entities = {
#         'project_name': None,
#         'owner': None
#     }
#     project_pattern = re.compile(r'project\s+([a-zA-Z0-9_ ]+)', re.IGNORECASE)
#     owner_pattern = re.compile(r'owner\s+of\s+project\s+([a-zA-Z0-9_ ]+)', re.IGNORECASE)
#     project_match = project_pattern.search(user_query)
#     if project_match:
#         entities['project_name'] = project_match.group(1).strip()
#     owner_match = owner_pattern.search(user_query)
#     if owner_match:
#         entities['owner'] = owner_match.group(1).strip()
#     return entities

# # Query Pinecone for relevant context and augment the input
# def query_pinecone_and_augment_input(user_input, entities, namespace):
#     embedding_model = load_huggingface_model()
#     pinecone_index = initialize_pinecone()
#     augmented_input = user_input
#     pinecone_data = {}
#     for entity_name, entity_value in entities.items():
#         if entity_value:
#             query_embedding = embedding_model.encode([entity_value])[0]
#             query_embedding = np.array(query_embedding, dtype=np.float32)
#             try:
#                 result = pinecone_index.query(
#                     namespace=namespace,
#                     vector=query_embedding.tolist(),
#                     top_k=3,
#                     include_values=True,
#                     include_metadata=True
#                 )
#                 matches = result.get('matches', [])
#                 if matches:
#                     unique_values = [match['metadata'].get('unique_value') for match in matches if 'metadata' in match]
#                     if unique_values:
#                         pinecone_data[entity_name] = unique_values
#                         if len(unique_values) > 1:
#                             print(f"Multiple matches found for '{entity_value}':")
#                             for idx, unique_value in enumerate(unique_values):
#                                 print(f"{idx + 1}: {unique_value}")
#                             while True:
#                                 selection = input(f"Please select the most relevant option for '{entity_value}' (1-{len(unique_values)}): ")
#                                 try:
#                                     selected_value = unique_values[int(selection) - 1]
#                                     augmented_input = augmented_input.replace(entity_value, selected_value)
#                                     break
#                                 except (IndexError, ValueError):
#                                     print("Invalid selection. Please choose a valid option.")
#                         else:
#                             augmented_input = augmented_input.replace(entity_value, unique_values[0])
#                 else:
#                     print(f"No matches found for {entity_value} in Pinecone.")
#             except Exception as e:
#                 print(f"Error querying Pinecone: {str(e)}")
#                 return f"Error querying Pinecone: {str(e)}", {}
#     return augmented_input, pinecone_data

# def generate_sql_query(user_input, processed_schema_df):
#     schema_json = processed_schema_df.to_json(orient='records')
#     schema_with_types = processed_schema_df[['table_name', 'column_name']].to_dict(orient='records')  # Removed 'data_type'
    
#     context = f"""
#     ## Database Schema Context
#     Schema JSON: {schema_json}
#     Detailed Schema: {schema_with_types}

#     ## User Input
#     Given the following user input: '{user_input}', generate an SQL query.
#     Use the LIKE operator for partial matches where appropriate. Handle data type mismatches explicitly.

#     ## Instructions
#     Based on the user input and the provided schema, generate an accurate SQL query.
#     Ensure the query maps correctly to the tables and columns in the database.
#     Handle data type casting if necessary to match columns with different types.
#     """
#     try:
#         response = openai.completions.create(
#             model="gpt-3.5-turbo-instruct",
#             prompt=context,
#             max_tokens=500,
#             temperature=0.7
#         )
#         generated_query = response.choices[0].text.strip()
#         if generated_query.lower().startswith("the generated sql query is:"):
#             generated_query = generated_query[len("The generated SQL query is:"):].strip()
#         return generated_query
#     except openai.OpenAIError as e:
#         print(f"Error generating SQL query: {e}")
#         raise


# # Initialize OpenAI Chat model
# openai_model = ChatOpenAI(
#     openai_api_key=OPENAI_API_KEY,
#     model_name="gpt-3.5-turbo",
#     temperature=0.7,
#     max_tokens=150
# )

# # Create a ChatPromptTemplate with the knowledge base included
# template = """
# ## Knowledge Base:
# {knowledge_base}

# ## Database Schema:
# {database_schema}

# ## Question:
# {question}

# ## Answer:
# """

# prompt_template = ChatPromptTemplate.from_template(template)

# def get_answer_from_chatbot(question, database_schema):
#     try:
#         prompt = prompt_template.format(
#             knowledge_base="",
#             database_schema=database_schema,
#             question=question
#         )
#         response = openai_model.invoke(input=prompt)
#         parsed_response = response.content.strip() if hasattr(response, 'content') else "No response content found."
#         return parsed_response
#     except Exception as e:
#         return f"Error generating response from OpenAI: {str(e)}"

# # Function to execute the SQL query and print the results
# def execute_sql_query(conn, sql_query):
#     try:
#         with conn.cursor() as cursor:
#             cursor.execute(sql_query)
#             results = cursor.fetchall()
#             return results
#     except Exception as e:
#         print(f"Error executing SQL query: {e}")
#         return None

# # Determine if user query is related to database or general knowledge
# def determine_query_type(user_query, schema_df):
#     user_query_lower = user_query.lower()
    
#     if any(table.lower() in user_query_lower for table in schema_df['table_name'].unique()) or \
#        any(column.lower() in user_query_lower for column in schema_df['column_name'].unique()):
#         return "database"
    
#     return "knowledge"


# # Main function to handle user queries
# def process_user_query(user_query):
#     # Connect to the database and fetch the schema
    
#     conn = connect_to_db()
    
#     schema_df = fetch_schema(conn)
#     processed_schema_df = process_schema(schema_df)
#     query_type = determine_query_type(user_query,schema_df)
    
#     query_type = determine_query_type(user_query, schema_df)
    
#     if query_type == "database":
#         database_schema = fetch_schema(conn)  # You can use the schema_df here if you prefer
        
#         entities = extract_entities(user_query, database_schema)
#         augmented_query, pinecone_data = query_pinecone_and_augment_input(user_query, entities, NAMESPACE)
        
#         processed_schema_df = process_schema(database_schema)  # Process schema for better matching
#         sql_query = generate_sql_query(augmented_query, processed_schema_df)
        
#         results = execute_sql_query(conn, sql_query)
#         conn.close()
        
#         if results is not None:
#             print("Query Results:")
#             for row in results:
#                 print(row)
#         else:
#             print("No results returned or error occurred during query execution.")
        
#         return f"Generated SQL Query: {sql_query}"
    
#     else:
#         database_schema = fetch_schema(conn)  # Fetching schema again if needed
#         database_schema_df = process_schema(database_schema)
#         return get_answer_from_chatbot(user_query, database_schema_df.to_dict(orient='records'))


# # Example usage
# if __name__ == "__main__":
#     while True:
#         user_input = input("Enter your query: ")
#         if user_input.lower() in ['exit', 'quit']:
#             break
#         response = process_user_query(user_input)
#         print(response)


In [None]:
import re
import pandas as pd
import psycopg2
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import pinecone
import openai
import os

# OpenAI API key
OPENAI_API_KEY = 'sk-proj-UnzdWuWBs7ZQRbRPiRCoT3BlbkFJhPM1p7DdZUMklcpnWK1S'
openai.api_key = OPENAI_API_KEY

# Database connection details
DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
DATABASE_USERNAME = "postgres"
DATABASE_PASSWORD = "valign#123"
DATABASE_DB = "python_test_poc"
PORT = 5432

# Constants
PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"  # Replace with your Pinecone API key
INDEX_NAME = "smart-desk"  # Replace with your Pinecone index name
NAMESPACE = "projects"  # Replace with your namespace
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Initialize Pinecone client
def initialize_pinecone():
    from pinecone import Pinecone, ServerlessSpec
    pc = Pinecone(api_key=PINECONE_API_KEY)
    
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=768,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-west-2')
        )
    return pc.Index(INDEX_NAME)

# Load Hugging Face model for embeddings
def load_huggingface_model():
    return SentenceTransformer(MODEL_NAME)

def connect_to_db():
    try:
        conn = psycopg2.connect(
            dbname=DATABASE_DB,
            user=DATABASE_USERNAME,
            password=DATABASE_PASSWORD,
            host=DATABASE_HOST,
            port=PORT
        )
        return conn
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        raise
        
# Function to fetch schema from PostgreSQL database
def fetch_schema(conn):
    try:
        query = """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'public'
        """
        schema_df = pd.read_sql(query, conn)
        return schema_df
    except Exception as e:
        print(f"Error fetching schema: {e}")
        raise


# Function to process schema: remove special characters and convert to lowercase
def process_schema(schema_df):
    def clean_column_name(name):
        return re.sub(r'[^a-zA-Z]', '', name).lower()

    schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
    return schema_df

# Extract relevant entities based on regex and column names
def extract_entities(user_query, schema):
    entities = {
        'project_name': None,
        'owner': None
    }
    project_pattern = re.compile(r'project\s+([a-zA-Z0-9_ ]+)', re.IGNORECASE)
    owner_pattern = re.compile(r'owner\s+of\s+project\s+([a-zA-Z0-9_ ]+)', re.IGNORECASE)
    project_match = project_pattern.search(user_query)
    if project_match:
        entities['project_name'] = project_match.group(1).strip()
    owner_match = owner_pattern.search(user_query)
    if owner_match:
        entities['owner'] = owner_match.group(1).strip()
    return entities

# Query Pinecone for relevant context and augment the input
def query_pinecone_and_augment_input(user_input, entities, namespace):
    embedding_model = load_huggingface_model()
    pinecone_index = initialize_pinecone()
    augmented_input = user_input
    pinecone_data = {}
    for entity_name, entity_value in entities.items():
        if entity_value:
            query_embedding = embedding_model.encode([entity_value])[0]
            query_embedding = np.array(query_embedding, dtype=np.float32)
            try:
                result = pinecone_index.query(
                    namespace=namespace,
                    vector=query_embedding.tolist(),
                    top_k=3,
                    include_values=True,
                    include_metadata=True
                )
                matches = result.get('matches', [])
                if matches:
                    unique_values = [match['metadata'].get('unique_value') for match in matches if 'metadata' in match]
                    if unique_values:
                        pinecone_data[entity_name] = unique_values
                        if len(unique_values) > 1:
                            print(f"Multiple matches found for '{entity_value}':")
                            for idx, unique_value in enumerate(unique_values):
                                print(f"{idx + 1}: {unique_value}")
                            while True:
                                selection = input(f"Please select the most relevant option for '{entity_value}' (1-{len(unique_values)}): ")
                                try:
                                    selected_value = unique_values[int(selection) - 1]
                                    augmented_input = augmented_input.replace(entity_value, selected_value)
                                    break
                                except (IndexError, ValueError):
                                    print("Invalid selection. Please choose a valid option.")
                        else:
                            augmented_input = augmented_input.replace(entity_value, unique_values[0])
                else:
                    print(f"No matches found for {entity_value} in Pinecone.")
            except Exception as e:
                print(f"Error querying Pinecone: {str(e)}")
                return f"Error querying Pinecone: {str(e)}", {}
    return augmented_input, pinecone_data

def generate_sql_query(user_input, processed_schema_df):
    schema_json = processed_schema_df.to_json(orient='records')
    schema_with_types = processed_schema_df[['table_name', 'column_name']].to_dict(orient='records')  # Removed 'data_type'
    
    context = f"""
    ## Database Schema Context
    Schema JSON: {schema_json}
    Detailed Schema: {schema_with_types}

    ## User Input
    Given the following user input: '{user_input}', generate an SQL query.
    Use the LIKE operator for partial matches where appropriate. Handle data type mismatches explicitly.

    ## Instructions
    Based on the user input and the provided schema, generate an accurate SQL query.
    Ensure the query maps correctly to the tables and columns in the database.
    Handle data type casting if necessary to match columns with different types.
    """
    try:
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=context,
            max_tokens=500,
            temperature=0.7
        )
        generated_query = response.choices[0].text.strip()
        if generated_query.lower().startswith("the generated sql query is:"):
            generated_query = generated_query[len("The generated SQL query is:"):].strip()
        return generated_query
    except openai.OpenAIError as e:
        print(f"Error generating SQL query: {e}")
        raise


# Initialize OpenAI Chat model
openai_model = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=150
)

# Create a ChatPromptTemplate with the knowledge base included
template = """
## Knowledge Base:
{knowledge_base}

## Database Schema:
{database_schema}

## Question:
{question}

## Answer:
"""

prompt_template = ChatPromptTemplate.from_template(template)

def get_answer_from_chatbot(question, database_schema):
    try:
        prompt = prompt_template.format(
            knowledge_base="",
            database_schema=database_schema,
            question=question
        )
        response = openai_model.invoke(input=prompt)
        parsed_response = response.content.strip() if hasattr(response, 'content') else "No response content found."
        return parsed_response
    except Exception as e:
        return f"Error generating response from OpenAI: {str(e)}"

# Function to execute the SQL query and print the results
def execute_sql_query(conn, sql_query):
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql_query)
            results = cursor.fetchall()
            return results
    except Exception as e:
        print(f"Error executing SQL query: {e}")
        return None

# Determine if user query is related to database or general knowledge
def determine_query_type(user_query, schema_df):
    user_query_lower = user_query.lower()
    
    if any(table.lower() in user_query_lower for table in schema_df['table_name'].unique()) or \
       any(column.lower() in user_query_lower for column in schema_df['column_name'].unique()):
        return "database"
    
    return "knowledge"


# Main function to handle user queries
def process_user_query(user_query):
    # Connect to the database and fetch the schema
    conn = connect_to_db()
    schema_df = fetch_schema(conn)
    processed_schema_df = process_schema(schema_df)
   
    
    query_type = determine_query_type(user_query, schema_df)
    
    if query_type == "database":
        database_schema = fetch_schema(conn)  # You can use the schema_df here if you prefer
        
        entities = extract_entities(user_query, database_schema)
        augmented_query, pinecone_data = query_pinecone_and_augment_input(user_query, entities, NAMESPACE)
        
        processed_schema_df = process_schema(database_schema)  # Process schema for better matching
        sql_query = generate_sql_query(augmented_query, processed_schema_df)
       
        results = execute_sql_query(conn, sql_query)
        conn.close()
        
        if results is not None:
            print("Query Results:")
            for row in results:
                print(row)
        else:
            print("No results returned or error occurred during query execution.")
        
        return f"Generated SQL Query: {sql_query}"
    
    else:
        database_schema = fetch_schema(conn)  # Fetching schema again if needed
        database_schema_df = process_schema(database_schema)
        return get_answer_from_chatbot(user_query, database_schema_df.to_dict(orient='records'))


# Example usage
if __name__ == "__main__":
    while True:
        user_input = input("Enter your query: ")
        if user_input.lower() in ['exit', 'quit']:
            break
        response = process_user_query(user_input)
        print(response)


Enter your query:  what is the status of a project IIFL Samasta


  schema_df = pd.read_sql(query, conn)
  schema_df = pd.read_sql(query, conn)


Multiple matches found for 'IIFL Samasta':
1: IIFl Samasta CPL CR
2: IIFL Samasta - CGRM
3: IIFL SAMASTA - RPA BOT


Please select the most relevant option for 'IIFL Samasta' (1-3):  3


Query Results:
('Completed',)
('Completed',)
Generated SQL Query: SELECT status FROM projects WHERE project_name LIKE '%IIFL SAMASTA - RPA BOT%';
