In [1]:
import openai
import psycopg2
import pandas as pd
import re
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import pinecone
import os
from fuzzywuzzy import fuzz
# Credentials and connection settings are read from the environment so that
# secrets are never stored in (or shared with) the notebook itself.
# NOTE(review): the database password that used to be hardcoded here should be
# rotated, since it has already been saved in notebook history.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = OPENAI_API_KEY

# Database connection details. Non-secret values keep their previous defaults;
# the password has no default and must be supplied via DATABASE_PASSWORD.
DATABASE_HOST = os.environ.get(
    "DATABASE_HOST",
    "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com",
)
DATABASE_USERNAME = os.environ.get("DATABASE_USERNAME", "postgres")
DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD", "")
DATABASE_DB = os.environ.get("DATABASE_DB", "python_test_poc")
PORT = int(os.environ.get("DATABASE_PORT", "5432"))

  from tqdm.autonotebook import tqdm, trange







In [2]:
# The Pinecone API key is read from the environment instead of being hardcoded.
# NOTE(review): the key that was previously committed here should be revoked.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
INDEX_NAME = "smart-desk"  # Name of the Pinecone index queried by this notebook

# Chat model used by get_answer_from_chatbot for non-database questions.
openai_model = ChatOpenAI(
    openai_api_key=openai.api_key,
    model_name="gpt-4o-mini-2024-07-18",
    temperature=0.7,
    max_tokens=150
)

# Prompt layout for the chatbot: knowledge base, schema and question sections
# followed by an open "Answer" section for the model to complete.
template = """
## Knowledge Base:
{knowledge_base}

## Database Schema:
{database_schema}

## Question:
{question}

## Answer:
"""
prompt_template = ChatPromptTemplate.from_template(template)

# Mutable module-level state shared by the helper functions below;
# clear_values() empties the three containers at the start of each request.
NAMESPACE = []      # table names extracted from the current user input
columnnames = {}    # table name -> extracted fields for that table
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
searched_cols = []  # column names already looked up in Pinecone
augmented_input = ''

In [3]:
def load_huggingface_model():
    """Construct the sentence-embedding model named by ``MODEL_NAME``.

    Returns:
        The ``SentenceTransformer`` instance built from ``MODEL_NAME``.
    """
    embedding_model = SentenceTransformer(MODEL_NAME)
    return embedding_model

In [4]:
def initialize_pinecone():
    """Return a handle to the ``INDEX_NAME`` Pinecone index, creating it if absent.

    A client is built from ``PINECONE_API_KEY``. When ``INDEX_NAME`` is not in
    the client's index list, the index is created with 768 dimensions and the
    cosine metric as a serverless index (aws / us-west-2).

    Returns:
        The object returned by ``Pinecone.Index(INDEX_NAME)``.
    """
    from pinecone import Pinecone, ServerlessSpec

    client = Pinecone(api_key=PINECONE_API_KEY)
    known_indexes = client.list_indexes().names()

    index_missing = INDEX_NAME not in known_indexes
    if index_missing:
        serverless_spec = ServerlessSpec(cloud='aws', region='us-west-2')
        client.create_index(
            name=INDEX_NAME,
            dimension=768,
            metric='cosine',
            spec=serverless_spec,
        )

    return client.Index(INDEX_NAME)

In [5]:
def connect_to_db():
    """Open a psycopg2 connection using the module-level database settings.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        psycopg2.Error: re-raised after the failure has been printed.
    """
    connection_settings = {
        "dbname": DATABASE_DB,
        "user": DATABASE_USERNAME,
        "password": DATABASE_PASSWORD,
        "host": DATABASE_HOST,
        "port": PORT,
    }
    try:
        return psycopg2.connect(**connection_settings)
    except psycopg2.Error as db_error:
        print(f"Error connecting to the database: {db_error}")
        raise

In [6]:
def fetch_schema(conn):
    """Read the table and column names of the ``public`` schema.

    Args:
        conn: Database connection handed to ``pd.read_sql``.

    Returns:
        DataFrame with the columns ``table_name`` and ``column_name``.

    Raises:
        Exception: any failure is printed and then re-raised unchanged.
    """
    column_listing_sql = """
        SELECT table_name, column_name
        FROM information_schema.columns
        WHERE table_schema = 'public'
        """
    try:
        return pd.read_sql(column_listing_sql, conn)
    except Exception as read_error:
        print(f"Error fetching schema: {read_error}")
        raise

In [7]:
def process_schema(schema_df):
    """Add a ``processed_column_name`` column holding normalised column names.

    Each value of ``column_name`` is stripped of every character outside
    ``a-z``/``A-Z`` and lower-cased. The column is added to ``schema_df``
    in place.

    Args:
        schema_df: DataFrame with a ``column_name`` column.

    Returns:
        The same ``schema_df`` object, now carrying the extra column.
    """
    non_letters = re.compile(r'[^a-zA-Z]')

    def clean_column_name(name):
        letters_only = non_letters.sub('', name)
        return letters_only.lower()

    normalised_names = schema_df['column_name'].apply(clean_column_name)
    schema_df['processed_column_name'] = normalised_names
    return schema_df

In [8]:
def extract_features_with_openai(user_input, processed_schema_df):
    """Ask the OpenAI completions API to pull schema-related features out of text.

    The schema DataFrame is serialised to JSON records and embedded in the
    prompt together with the user's text; the model is instructed to answer
    with a JSON dictionary keyed by table name.

    Args:
        user_input: Text typed by the user.
        processed_schema_df: Schema DataFrame, serialised via ``to_json``.

    Returns:
        The stripped text of the first completion choice. It is returned
        as-is and is not validated as JSON here.

    Raises:
        openai.OpenAIError: re-raised after the failure has been printed.
    """
    schema_json = processed_schema_df.to_json(orient='records')

    # Prompt text is sent verbatim to the model; keep its wording stable.
    prompt = f"""
    ## Database Schema Context:
    The following represents the columns and their respective tables available in the database:
    {schema_json}

    ## User Input:
    The user has provided the following input: "{user_input}"

    ## Task:
    Extract the relevant features, values, and table names from the user input based on the schema. These features might include project names, owners, dates, statuses, etc., along with their corresponding table names.

    ## Instructions:
    - Return a JSON dictionary that includes the table names as keys, and within each table, include the fields and their values extracted from the user input.
    - Omit any fields or tables where the value is empty or null.
    - Format the output as a JSON object with keys only for tables and fields that have values.
    """

    request_options = {
        "model": "gpt-3.5-turbo-instruct",
        "prompt": prompt,
        "max_tokens": 500,
        "temperature": 0.5,
    }
    try:
        completion = openai.completions.create(**request_options)
        first_choice = completion.choices[0]
        return first_choice.text.strip()
    except openai.OpenAIError as api_error:
        print(f"Error with OpenAI: {api_error}")
        raise

In [9]:
def process_extracted_features(extracted_features):
    """Parse the JSON object embedded in a model reply and clean it.

    The span from the first ``{`` to the last ``}`` is treated as JSON, so any
    text around it is ignored. The parsed dictionary is passed through
    ``clean_extracted_features``.

    Args:
        extracted_features: Raw text returned by the model.

    Returns:
        ``(cleaned_json, feature_list)`` where ``cleaned_json`` is the cleaned
        dictionary dumped with ``indent=4``; ``(None, [])`` when no braces are
        found or the JSON cannot be parsed.
    """
    try:
        json_span = re.search(r'\{.*\}', extracted_features, re.DOTALL)
        if json_span is None:
            return None, []

        parsed_features = json.loads(json_span.group(0))
        cleaned_dict, feature_values = clean_extracted_features(parsed_features)
        return json.dumps(cleaned_dict, indent=4), feature_values
    except (json.JSONDecodeError, ValueError) as parse_error:
        print(f"Error parsing features: {parse_error}")
        return None, []

In [10]:
def clean_extracted_features(feature_dict):
    """Drop falsy entries from a feature dictionary.

    Args:
        feature_dict: Mapping of extracted feature names to values.

    Returns:
        ``(cleaned_dict, values)``: the mapping without entries whose value is
        falsy (``None``, empty string, empty container, ``0``), and the list of
        the remaining values in the same order.
    """
    kept_features = {}
    for feature_name, feature_value in feature_dict.items():
        if feature_value:
            kept_features[feature_name] = feature_value

    kept_values = [kept_features[name] for name in kept_features]
    return kept_features, kept_values

In [11]:
def extract_nmaespace(extracted_dict):
    """Record the tables found in ``extracted_dict`` in the module-level state.

    Every key is appended to ``NAMESPACE`` and its value is stored under the
    same key in ``columnnames``.

    Args:
        extracted_dict: Mapping of table name to the fields extracted for it.
    """
    global NAMESPACE, columnnames, augmented_input
    for table_name, table_fields in extracted_dict.items():
        NAMESPACE.append(table_name)
        columnnames[table_name] = table_fields

In [12]:
def _select_pinecone_match(entity_value, candidates):
    """Prompt the user until they pick one of ``candidates`` by its 1-based number."""
    print(f"Multiple matches found for '{entity_value}':")
    for idx, candidate in enumerate(candidates):
        print(f"{idx + 1}: {candidate}")
    while True:
        selection = input(f"Please select the most relevant option for '{entity_value}' (1-{len(candidates)}): ")
        try:
            position = int(selection)
        except ValueError:
            position = 0
        # Reject 0 and negative numbers explicitly: plain list indexing would
        # silently wrap around and pick an entry from the end of the list.
        if 1 <= position <= len(candidates):
            return candidates[position - 1]
        print("Invalid selection. Please choose a valid option.")


def query_pinecone_and_augment_input(user_input, entities, namespace, columns):
    """Replace entity mentions in ``user_input`` with values stored in Pinecone.

    For every column in ``columns`` that has not been searched yet (tracked in
    the module-level ``searched_cols``), the entity value extracted for that
    column is embedded and looked up in the ``namespace`` Pinecone namespace,
    filtered by ``column_name``. A single hit replaces the entity text in the
    input directly; several hits are shown to the user, who picks one.

    Args:
        user_input: Text to augment.
        entities: Mapping of table name -> {column name -> extracted value}.
        namespace: Table name; used both as key into ``entities`` and as the
            Pinecone namespace.
        columns: Column names to look up for this table.

    Returns:
        ``(augmented_input, pinecone_data)`` where ``pinecone_data`` maps each
        column with hits to the list of candidate values. If the lookup fails
        the error is printed and the text augmented so far is returned, so an
        error message is never passed on as if it were the user's question.
    """
    global searched_cols
    embedding_model = load_huggingface_model()
    pinecone_index = initialize_pinecone()
    augmented_input = user_input
    pinecone_data = {}

    table_entities = entities.get(namespace)
    if not isinstance(table_entities, dict):
        table_entities = {}

    for column_name in columns:
        if column_name in searched_cols:
            continue
        searched_cols.append(column_name)

        entity_value = table_entities.get(column_name)
        if not entity_value:
            continue  # Nothing was extracted for this column

        # The model may return numbers or other non-string values; both the
        # encoder input and str.replace need text.
        entity_text = entity_value if isinstance(entity_value, str) else str(entity_value)

        query_embedding = np.array(embedding_model.encode([entity_text])[0], dtype=np.float32)

        try:
            result = pinecone_index.query(
                namespace=namespace,
                vector=query_embedding.tolist(),
                filter={
                    "column_name": {"$eq": column_name}
                },
                top_k=3,
                include_values=True,
                include_metadata=True
            )

            matches = result.get('matches', [])
            if not matches:
                print(f"No matches found for {entity_value} in Pinecone.")
                continue

            # Matches lacking a 'unique_value' carry nothing to substitute.
            unique_values = [
                value
                for value in (
                    match['metadata'].get('unique_value')
                    for match in matches
                    if 'metadata' in match
                )
                if value is not None
            ]
            if not unique_values:
                continue

            pinecone_data[column_name] = unique_values
            if len(unique_values) > 1:
                selected_value = _select_pinecone_match(entity_value, unique_values)
            else:
                selected_value = unique_values[0]
            augmented_input = augmented_input.replace(entity_text, str(selected_value))
        except Exception as e:
            print(f"Error querying Pinecone: {str(e)}")
            return augmented_input, pinecone_data

    return augmented_input, pinecone_data


In [13]:
def clear_values():
    """Empty the module-level ``NAMESPACE``, ``columnnames`` and ``searched_cols``.

    The containers are cleared in place, so existing references to them stay
    valid.
    """
    global NAMESPACE, columnnames, searched_cols
    for shared_container in (NAMESPACE, columnnames, searched_cols):
        shared_container.clear()

In [14]:
def process_user_input(user_input):
    """Augment ``user_input`` with canonical entity values looked up in Pinecone.

    Steps: reset the shared state, read the database schema, ask OpenAI to
    extract table/column/value features from the text, then for each table
    with extracted fields let ``query_pinecone_and_augment_input`` rewrite the
    text, chaining the result from one table to the next.

    Args:
        user_input: Text typed by the user.

    Returns:
        The augmented text, or ``user_input`` unchanged when no usable
        features could be extracted.
    """
    clear_values()

    # The connection is only needed to read the schema; always release it.
    conn = connect_to_db()
    try:
        schema_df = fetch_schema(conn)
    finally:
        conn.close()

    extracted_features = extract_features_with_openai(user_input, schema_df)
    cleaned_json, _ = process_extracted_features(extracted_features)
    if not cleaned_json:
        # No parseable JSON came back; fall back to the original text rather
        # than failing on json.loads(None).
        return user_input

    cleaned_feature_dict = json.loads(cleaned_json)
    extract_nmaespace(cleaned_feature_dict)

    aug_result = user_input
    for table_name, fields in cleaned_feature_dict.items():
        if not isinstance(fields, dict):
            continue  # A flat "field": "value" entry has no columns to look up
        aug_result, _ = query_pinecone_and_augment_input(
            aug_result, cleaned_feature_dict, table_name, list(fields.keys())
        )
    return aug_result
    

In [15]:
# Fetch schema with column names and data types
def fetch_schema_with_data_types(conn):
    """Read table names, column names and data types of the ``public`` schema.

    Args:
        conn: Database connection handed to ``pd.read_sql``.

    Returns:
        DataFrame with the columns ``table_name``, ``column_name`` and
        ``data_type``.

    Raises:
        Exception: any failure is printed and then re-raised unchanged.
    """
    typed_column_sql = """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'public'
        """
    try:
        return pd.read_sql(typed_column_sql, conn)
    except Exception as read_error:
        print(f"Error fetching schema with data types: {read_error}")
        raise


In [16]:

# Format schema as a string for the prompt
def format_schema(schema_df):
    """Render the schema as one ``table: col (type), ...`` line per table.

    Args:
        schema_df: DataFrame with ``table_name``, ``column_name`` and
            ``data_type`` columns.

    Returns:
        A string with one newline-terminated line per table, in the order
        produced by ``groupby('table_name')``; empty when there are no rows.
    """
    table_lines = []
    for table_name, table_rows in schema_df.groupby('table_name'):
        column_descriptions = [
            f"{column_name} ({data_type})"
            for column_name, data_type in zip(table_rows['column_name'], table_rows['data_type'])
        ]
        joined_columns = ', '.join(column_descriptions)
        table_lines.append(f"{table_name}: {joined_columns}\n")
    return ''.join(table_lines)


In [17]:
    
#Fetch query explainer text
def fetch_query_explaination(text):
    """Return the text that precedes the first colon found on a single line.

    The pattern does not cross line breaks, so the result is the beginning of
    the first line that contains a ``:``.

    Args:
        text: Model reply to inspect.

    Returns:
        The captured text before the colon, or ``None`` when ``text`` has no
        colon.
    """
    before_colon = re.search(r'(.*?):', text)
    return before_colon.group(1) if before_colon else None


In [18]:

# Function to generate SQL query using GPT-4o-mini
def generate_sql_query(schema_str, user_input):
    """Ask the chat model for a SQL query answering ``user_input``.

    Args:
        schema_str: Textual description of the database schema.
        user_input: The (possibly augmented) user request.

    Returns:
        ``(sql_query, sql_response)``. Both elements are the model's full,
        unmodified reply; extracting the SQL from it is left to
        ``extract_sql_query``.
    """
    # Built line by line so the exact whitespace sent to the model is explicit.
    prompt = (
        "\n"
        "    The database contains the following schema:\n"
        f"    {schema_str}\n"
        "\n"
        "    Based on this schema and the user request:\n"
        f'    "{user_input}"\n'
        " \n"
        "    Generate an optimized SQL query that meets the user's intent.\n"
        "    The query should be efficient and use the correct table and column names.\n"
        "    "
    )

    system_instructions = "You are a helpful assistant specialized in generating SQL queries, ensuring the use of appropriate operators like LIKE or expressions in sql queries like '% %' for  matches if needed. Accurately map user input to the relevant tables and columns in the database based on the provided schema, using the LIKE operator for partial matches where necessary. Handle data type mismatches explicitly by casting to the appropriate type when required, ensuring correct query execution. Additionally, Manage variations in user input, such as case sensitivity or small spelling differences, using flexible matching techniques to generate precise and reliable SQL queries."

    conversation = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": prompt},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=conversation,
        max_tokens=500,
        temperature=0.7,
    )

    raw_reply = completion.choices[0].message.content
    return raw_reply, raw_reply


In [19]:

# Extract generated SQL Query
def extract_sql_query(response):
    """Pull the SQL statement out of a model reply.

    Preference order:
      1. the contents of the first ```` ```sql ```` fenced block (any case);
      2. the contents of the first fenced block of any kind;
      3. the whole reply, when it contains no fence at all.
    A fence that is never closed is read up to the end of the reply.

    Args:
        response: Full text returned by the model.

    Returns:
        The SQL text with surrounding whitespace removed; ``""`` for an empty
        or ``None`` reply.
    """
    if not response:
        return ""

    fenced = re.search(r"```sql\b(.*?)(?:```|\Z)", response, re.DOTALL | re.IGNORECASE)
    if fenced is None:
        # Fall back to a generic fence, skipping its optional language tag line.
        fenced = re.search(r"```[^\n`]*\n(.*?)(?:```|\Z)", response, re.DOTALL)
    if fenced is None:
        return response.strip()
    return fenced.group(1).strip()


In [20]:

#Generate Response
# Update the generate_response function
def generate_response(user_query, sql_result):
    """Turn a SQL result into a natural-language answer via the chat model.

    Args:
        user_query: The question the result answers.
        sql_result: Query result; interpolated into the prompt with ``str()``
            formatting.

    Returns:
        The content of the first choice returned by the chat completion.
    """
    prompt_lines = [
        f"User query: \"{user_query}\"",
        f"SQL result: {sql_result}",
        "Generate a natural language response from the result:",
    ]
    user_message = {"role": "user", "content": "\n".join(prompt_lines)}

    completion = openai.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[user_message],
        max_tokens=500,
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [21]:

def get_answer_from_chatbot(question, database_schema):
    """Answer a general question with the chat model.

    The module-level ``prompt_template`` is filled with an empty knowledge
    base, the schema text and the question, then sent to ``openai_model``.

    Args:
        question: The user's question.
        database_schema: Schema text included in the prompt.

    Returns:
        The stripped reply content; ``"No response content found."`` when the
        reply has no ``content`` attribute; or an error message string if
        anything raises.
    """
    try:
        filled_prompt = prompt_template.format(
            knowledge_base="",
            database_schema=database_schema,
            question=question,
        )
        reply = openai_model.invoke(input=filled_prompt)
        if hasattr(reply, 'content'):
            answer = reply.content.strip()
        else:
            answer = "No response content found."
        return answer
    except Exception as chat_error:
        return f"Error generating response from OpenAI: {str(chat_error)}"


In [22]:
        
# Function to execute the SQL query and print the results
def execute_sql_query(conn, sql_query):
    """Run ``sql_query`` on ``conn`` and return all rows.

    NOTE(review): ``sql_query`` is produced by a language model from user
    text and is executed verbatim; consider a read-only database role or a
    statement whitelist for this connection.

    Args:
        conn: Open database connection providing ``cursor()`` and ``rollback()``.
        sql_query: SQL text to execute.

    Returns:
        The list returned by ``cursor.fetchall()``, or ``None`` if execution
        or fetching failed (the error is printed).
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql_query)
            return cursor.fetchall()
    except Exception as e:
        print(f"Error executing SQL query: {e}")
        # A failed statement leaves the transaction aborted; roll back so the
        # same connection can still run later queries.
        try:
            conn.rollback()
        except Exception as rollback_error:
            print(f"Error rolling back transaction: {rollback_error}")
        return None


In [23]:
        
# Determine if user query is related to database or general knowledge
def determine_query_type(user_query, schema_df, threshold=75):
    """Classify a question as database-related or general knowledge.

    The lower-cased question is compared with every distinct lower-cased
    table name, then every distinct column name, using
    ``fuzz.partial_ratio``; the first score at or above ``threshold`` makes
    it a database question.

    Args:
        user_query: The user's question.
        schema_df: DataFrame with ``table_name`` and ``column_name`` columns.
        threshold: Minimum partial-ratio score that counts as a match.

    Returns:
        ``"database"`` on a match, otherwise ``"knowledge"``.
    """
    lowered_query = user_query.lower()

    # Tables are checked before columns, mirroring the schema hierarchy.
    candidate_groups = (
        schema_df['table_name'].str.lower().unique(),
        schema_df['column_name'].str.lower().unique(),
    )

    for candidate_names in candidate_groups:
        for candidate in candidate_names:
            if fuzz.partial_ratio(lowered_query, candidate) >= threshold:
                return "database"

    return "knowledge"



In [56]:
from datetime import datetime, timedelta

def contains_date_related_text(user_input):
    """Tell whether ``user_input`` mentions a date, month, year or time span.

    Recognised forms: numeric dates (``12/05/2024``, ``2024-05-12``), month
    names and three-letter abbreviations, four-digit years starting with 19
    or 20, durations such as ``7 or more days``, and relative phrases such as
    ``last month``. All patterns are matched case-insensitively, so
    ``7 Days`` is treated the same as ``7 days``.

    Args:
        user_input: Text to inspect.

    Returns:
        ``True`` if any pattern matches, otherwise ``False``.
    """
    date_pattern = r'\b(\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4}|\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})\b'
    month_pattern = r'\b(January|February|March|April|May|June|July|August|September|October|November|December|' \
                    r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b'
    year_pattern = r'\b(19|20)\d{2}\b'
    time_pattern = r'\b\d+\s*(or\s*(more|fewer|less))?\s*(days?|weeks?|months?|years?)\b'
    relative_terms_pattern = r'\b(this month|this year|last month|last year|next month|next year)\b'

    patterns = (
        date_pattern,
        month_pattern,
        year_pattern,
        time_pattern,
        relative_terms_pattern,
    )
    # IGNORECASE has no effect on the purely numeric patterns, so it is
    # applied uniformly to keep all five checks consistent.
    return any(re.search(pattern, user_input, re.IGNORECASE) for pattern in patterns)

In [26]:
import spacy
# Load the spaCy pipeline "en_core_web_sm" once when this cell runs;
# named_entity_recognition below calls it for every text it inspects.
nlp = spacy.load("en_core_web_sm")
def named_entity_recognition(text):
    """Report whether ``text`` contains a PERSON, ORG or GPE entity.

    Args:
        text: Text passed to the module-level ``nlp`` pipeline.

    Returns:
        ``True`` if any entity in ``doc.ents`` has one of those labels,
        otherwise ``False``.
    """
    wanted_labels = ("PERSON", "ORG", "GPE")
    parsed = nlp(text)
    return any(found.label_ in wanted_labels for found in parsed.ents)

In [46]:
def is_entity_present(schema_entities,user_text):
    """Check whether any schema entity name occurs in the user's text.

    The comparison is a case-insensitive substring test.

    Args:
        schema_entities: Iterable of entity names (e.g. column names).
        user_text: Text to search.

    Returns:
        ``True`` if at least one name is contained in ``user_text``,
        otherwise ``False``.
    """
    lowered_text = user_text.lower()
    return any(name.lower() in lowered_text for name in schema_entities)

In [58]:
# Main function to handle user queries
def process_user_query(user_input):
    """Answer one user question, via SQL for database questions or the chatbot otherwise.

    Database questions are optionally augmented through ``process_user_input``
    (skipped for date-related questions unless they also mention a named
    entity and a schema column), translated to SQL, executed, and the rows
    are turned into a natural-language answer followed by the model's
    explanation line when one is available.

    Args:
        user_input: Text typed by the user.

    Returns:
        The answer text, or an apology message when the query produced no
        rows or could not be executed.
    """
    conn = connect_to_db()
    try:
        schema_df = fetch_schema_with_data_types(conn)
        processed_schema_str = format_schema(schema_df)

        if determine_query_type(user_input, schema_df) != "database":
            # For non-database related queries, respond using the chatbot
            return get_answer_from_chatbot(user_input, processed_schema_str)

        is_entity = is_entity_present(schema_df['column_name'].tolist(), user_input)
        if contains_date_related_text(user_input):
            if named_entity_recognition(user_input) and is_entity:
                aug_input = process_user_input(user_input)
            else:
                aug_input = user_input
        else:
            aug_input = process_user_input(user_input)

        sql_query, sql_response = generate_sql_query(processed_schema_str, aug_input)
        sql_query = extract_sql_query(sql_query)
        explain_text = fetch_query_explaination(sql_response)
        print("Generated SQL Query:", sql_query)

        # Execute the generated SQL query; None signals an execution failure.
        rows = execute_sql_query(conn, sql_query)
        print("Row:", rows)
        if not rows:
            return "I'm sorry, but I'm unable to provide results. Could you please clarify your query so I can assist you better?"

        resp = generate_response(aug_input, rows)
        if explain_text is None:
            # The model reply had no "<explanation>:" line to append.
            return resp
        return resp + "\n" + explain_text
    finally:
        # Runs on every path, including early returns and errors.
        conn.close()


In [None]:
if __name__ == "__main__":
    # Interactive loop: keep answering questions until the user types
    # "exit" or "quit" (in any letter case).
    while (user_input := input("Enter your question: ")).lower() not in ['exit', 'quit']:
        response = process_user_query(user_input)
        print(response)

Enter your question:  give me the list of milestones which are delayed by 7 or more days


  schema_df = pd.read_sql(query, conn)


Generated SQL Query: SELECT *
FROM milestones
WHERE (end_date - CURRENT_DATE) > 7;
Row: [('MedGenome Inc. - Zoho Marketing Automation', 'Milestone Three-AMC', 'Uday Desai', datetime.date(2024, 7, 8), datetime.date(2025, 7, 7), 'Upcoming', 'zcr_749813000021513025', 'zcr_749813000021509027', datetime.date(2024, 5, 27), 365, 23, 'On Time', Decimal('0.00'), 342, 'Upcoming', Decimal('0.00'), 'Zoho Marketing Automation'), ('Aajeeth Innovation LLP - Zoho CRM Implementation', 'Milestone Three-AMC', 'Uday Desai', datetime.date(2024, 8, 10), datetime.date(2025, 8, 9), 'Upcoming', 'zcr_749813000022031001', 'zcr_749813000022027005', datetime.date(2024, 7, 11), 365, -10, 'On Time', Decimal('0.00'), 375, 'Upcoming', Decimal('0.00'), 'CRM'), ('COLUMBIA PACIFIC COMMUNITIES - Zoho Books, Zoho Creator & Zoho Analytics', 'Milestone Five-Annual maintenance charges - Post Go-Live Application support for Zoho Books, Z', 'Uday Desai', datetime.date(2024, 7, 22), datetime.date(2025, 6, 20), 'Upcoming', 'zcr_7

Enter your question:  give me the list of count of milestones which are delayed by 7 or more days


  schema_df = pd.read_sql(query, conn)


Generated SQL Query: SELECT COUNT(*) AS delayed_milestone_count
FROM milestones
WHERE (CURRENT_DATE - end_date) >= 7;
Row: [(1155,)]
There are 1,155 milestones that are delayed by 7 or more days.
To generate the SQL query that counts the number of milestones which are delayed by 7 or more days, we will need to


No matching entities found in user input.
