In [1]:
import openai
import psycopg2
import pandas as pd
import re
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec

# Initialize OpenAI API key
OPENAI_API_KEY = ""
openai.api_key = OPENAI_API_KEY

# Pinecone Initialization
PINECONE_API_KEY = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"
INDEX_NAME = "jagoai"

# Initialize HuggingFace Embeddings model
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Convert user input to vector
def text_to_vector(text):
    return embedder.embed_query(text)

# Initialize Pinecone client
def initialize_pinecone():
    # Create Pinecone instance
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Check if index exists and create if it doesn't
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=embedder.embedding_dim,  # Assuming your embedding dimension
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
    
    return pc.Index(INDEX_NAME)

# Database connection details
DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
DATABASE_USERNAME = "postgres"
DATABASE_PASSWORD = "valign#123"
DATABASE_DB = "python_test_poc"
PORT = 5432

# Function to connect to PostgreSQL database
def connect_to_db():
    try:
        conn = psycopg2.connect(
            dbname=DATABASE_DB,
            user=DATABASE_USERNAME,
            password=DATABASE_PASSWORD,
            host=DATABASE_HOST,
            port=PORT
        )
        return conn
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        raise

# Function to fetch schema from PostgreSQL database
def fetch_schema(conn):
    try:
        query = """
        SELECT table_name, column_name
        FROM information_schema.columns
        WHERE table_schema = 'public'
        """
        schema_df = pd.read_sql(query, conn)
        return schema_df
    except Exception as e:
        print(f"Error fetching schema: {e}")
        raise

# Function to process schema: remove special characters and convert to lowercase
def process_schema(schema_df):
    def clean_column_name(name):
        return re.sub(r'[^a-zA-Z]', '', name).lower()

    schema_df['processed_column_name'] = schema_df['column_name'].apply(clean_column_name)
    return schema_df

# LangChain PromptTemplate to augment user input
def augment_user_input(user_input, closest_match):
    prompt_template = PromptTemplate(
        input_variables=["user_input", "closest_match"],
        template="""
        User provided input: "{user_input}".
        Pinecone retrieved the closest match: "{closest_match}".

        Update the user input by replacing the original input with the closest match.
        """
    )
    llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0.7)
    chain = LLMChain(llm=llm, prompt=prompt_template)
    augmented_input = chain.run(user_input=user_input, closest_match=closest_match)
    return augmented_input.strip()

# Function to extract features (like project name, owner, etc.) using OpenAI's LLM
def extract_features_with_openai(user_input, processed_schema_df):
    schema_json = processed_schema_df.to_json(orient='records')
    
    # Construct prompt for OpenAI LLM
    prompt = f"""
    ## Database Schema Context
    {schema_json}

    ## User Input
    User has provided the following input: '{user_input}'.

    ## Instructions
    Based on the given user input and the database schema provided above, identify all features or values the user is trying to express. These features could include project names, owners, dates, statuses, or any other information that matches the schema. Extract them and return as a structured list.Also if you do not find feture or values than return them as empty string.

    Provide the output in the following format:
    - project_name: 
    - owner: 
    - date: 
    - status:
    - [Any other relevant column values based on schema]
    """

    try:
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=500,
            temperature=0.5
        )
        extracted_features = response.choices[0].text.strip()
        return extracted_features
    except openai.OpenAIError as e:
        print(f"Error with OpenAI: {e}")
        raise

# Function to parse and store extracted features into a list
def parse_extracted_features(extracted_features):
    feature_list = []
    
    # Using regex to extract key-value pairs
    for line in extracted_features.splitlines():
        match = re.match(r"-\s(\w+):\s(.+)", line)
        if match:
            key = match.group(1).strip()
            value = match.group(2).strip()
            # Only add to the feature list if the value is not empty
            if value and value.lower() not in {"none", "not specified", " ","N/A"}:
                feature_list.append(value)
    
    return feature_list

# Function to query Pinecone using the extracted features
def query_pinecone(index, feature_list):
    if not feature_list:
        print("No features to query.")
        return []

    closest_matches = []
    for feature in feature_list:
        vector = text_to_vector(feature)
        try:
            response = index.query(
                vector=vector,
                top_k=1,
                include_values=True,
                include_metadata=True
            )

            # Process and collect matching records
            if 'matches' in response and response['matches']:
                match = response['matches'][0]
                match_id = match.get('id', 'N/A')
                score = match.get('score', 'N/A')
                metadata = match.get('metadata', 'No metadata')
                values = match.get('values', 'No values')

                closest_match = metadata.get('value', 'No match found')
                print(f"Feature: {feature} - Closest Match: {closest_match}")
                closest_matches.append((feature, closest_match))
            else:
                print(f"No matches found for feature: {feature}")
        except Exception as e:
            print(f"An error occurred querying Pinecone for feature '{feature}':", e)

    return closest_matches

# Function to generate SQL query using OpenAI API and LangChain
def generate_sql_query(user_input, processed_schema_df, entities):
    schema_json = processed_schema_df.to_json(orient='records')

    prompt_template = """
    ## Database Schema Context
    {schema_json}

    ## User Input
    User input: "{user_input}"
    Extracted entities: {entities}

    ## Instructions
    Based on the user input and the database schema, generate an SQL query that correctly maps to the database tables and columns.
    Use the LIKE operator where necessary for partial matches.
    """

    context = prompt_template.format(
        schema_json=schema_json,
        user_input=user_input,
        entities=entities
    )

    try:
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=context,
            max_tokens=500,
            temperature=0.7
        )
        generated_query = response.choices[0].text.strip()
        return generated_query
    except openai.OpenAIError as e:
        print(f"Error generating SQL query: {e}")
        raise

# Function to clean the generated SQL query
def clean_sql_query(sql_query):
    sql_query_cleaned = re.sub(r'##.*\n', '', sql_query)
    sql_query_cleaned = sql_query_cleaned.replace("\n", " ").strip()
    return sql_query_cleaned

# Main function to process user input, fetch schema, extract features, and query Pinecone
def process_user_input_and_query_pinecone(user_input):
    # Connect to DB and fetch schema
    conn = connect_to_db()
    schema_df = fetch_schema(conn)
    processed_schema_df = process_schema(schema_df)

    # Extract features from user input using OpenAI's LLM
    extracted_features = extract_features_with_openai(user_input, processed_schema_df)
    
    print("Extracted Features from OpenAI:")
    print(extracted_features)

    # Parse extracted features into a list
    feature_list = parse_extracted_features(extracted_features)
    
    print("\nParsed Feature List:")
    print(feature_list)

    # Initialize Pinecone and query based on the feature list
    index = initialize_pinecone()
    closest_matches = query_pinecone(index, feature_list)

    # Augment user input with closest matches
    for feature, closest_match in closest_matches:
        user_input = augment_user_input(user_input, closest_match)
    
    print("\nAugmented User Input:")
    print(user_input)

    # Generate SQL query using augmented input
    generated_query = generate_sql_query(user_input, processed_schema_df, closest_matches)
    cleaned_query = clean_sql_query(generated_query)
    
    print("\nGenerated SQL Query:")
    print(cleaned_query)

    return cleaned_query

# Example usage
if __name__ == "__main__":
    user_input = "Show all projects owned by Mohammed"
    process_user_input_and_query_pinecone(user_input)


  schema_df = pd.read_sql(query, conn)


Extracted Features from OpenAI:
- project_name: 
   - owner: Mohammed
   - date: 
   - status: 
   - project_id: 
   - start_date: 
   - created_time: 
   - delivery_team: 
   - project_efforts: 
   - status: 
   - end_date: 
   - project_name: 
   - owner_name: 
   - user_name: Mohammed
   - role: 
   - project_name: 
   - start_date: 
   - end_date: 
   - status: 
   - project_efforts: 
   - created_time: 
   - delivery_team: 
   - owner: Mohammed

Parsed Feature List:
[]
No features to query.

Augmented User Input:
Show all projects owned by Mohammed

Generated SQL Query:
SELECT projectid, projectname, ownername      FROM projects      WHERE ownername LIKE '%Mohammed%'
