In [1]:
import os
import re

import openai
import pandas as pd
import psycopg2
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec

# Secrets are read from the environment so real keys do not have to live in
# the source. The literals are kept only as fallbacks, so behaviour is
# unchanged when the variables are not set.
# NOTE(review): the Pinecone fallback key has been committed in plain text and
# should be rotated, then removed from this file.

# Initialize OpenAI API key
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = OPENAI_API_KEY

# Pinecone Initialization
PINECONE_API_KEY = os.environ.get(
    "PINECONE_API_KEY", "9fbe58e4-9e72-4023-90eb-ba8d022916b5"
)
INDEX_NAME = "jagoai"

# Initialize HuggingFace Embeddings model
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Convert user input to vector
def text_to_vector(text):
    """Embed *text* with the module-level ``embedder`` and return the result."""
    embedding = embedder.embed_query(text)
    return embedding

# Initialize Pinecone client
def initialize_pinecone():
    """Return a handle to the Pinecone index named ``INDEX_NAME``.

    A client is created with ``PINECONE_API_KEY``. If no index with that name
    is listed, a serverless cosine index is created first, sized to the
    embedding model's output.

    Returns:
        The object returned by ``pc.Index(INDEX_NAME)``.
    """
    # Create Pinecone instance
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Check if index exists and create if it doesn't
    if INDEX_NAME not in pc.list_indexes().names():
        # HuggingFaceEmbeddings has no documented ``embedding_dim`` field, so
        # reading it raises AttributeError. Measure the dimension from an
        # actual embedding instead; this is only paid when the index is new.
        dimension = len(embedder.embed_query("dimension probe"))
        pc.create_index(
            name=INDEX_NAME,
            dimension=dimension,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-west-2')
        )

    return pc.Index(INDEX_NAME)

# Database connection details
# Each setting can be overridden through the environment so credentials do not
# have to be edited into the source. The literals are fallbacks that preserve
# the previous behaviour when the variables are unset.
# NOTE(review): the fallback password is committed in plain text; rotate it
# and drop the default once the environment is configured.
DATABASE_HOST = os.environ.get(
    "DATABASE_HOST",
    "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com",
)
DATABASE_USERNAME = os.environ.get("DATABASE_USERNAME", "postgres")
DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD", "valign#123")
DATABASE_DB = os.environ.get("DATABASE_DB", "python_test_poc")
# Kept as an int, as before. The variable is DATABASE_PORT rather than PORT
# because a generic PORT variable is often already set by hosting platforms.
PORT = int(os.environ.get("DATABASE_PORT", "5432"))

# Function to fetch schema from PostgreSQL database
def fetch_schema(conn):
    """Read the table/column listing of the ``public`` schema.

    Args:
        conn: Open database connection handed to ``pandas.read_sql``.

    Returns:
        DataFrame with the columns ``table_name`` and ``column_name``.

    Raises:
        Exception: Whatever the read raised, re-raised after being printed.
    """
    try:
        query = """
        SELECT table_name, column_name
        FROM information_schema.columns
        WHERE table_schema = 'public'
        """
        return pd.read_sql(sql=query, con=conn)
    except Exception as exc:
        # Report the failure, then let the caller see the original exception
        print(f"Error fetching schema: {exc}")
        raise

# Function to process schema: remove special characters and convert to lowercase
def process_schema(schema_df):
    """Add a ``processed_column_name`` column derived from ``column_name``.

    Each column name is reduced to its ASCII letters (every other character,
    digits and underscores included, is dropped) and then lower-cased.

    Args:
        schema_df: DataFrame containing a ``column_name`` column.

    Returns:
        The same DataFrame object, modified in place with the new column.
    """
    non_letters = re.compile(r'[^a-zA-Z]')

    def _letters_only_lower(raw_name):
        return non_letters.sub('', raw_name).lower()

    cleaned = schema_df['column_name'].apply(_letters_only_lower)
    schema_df['processed_column_name'] = cleaned
    return schema_df

# Function to connect to PostgreSQL database
def connect_to_db():
    """Open a psycopg2 connection from the module-level database settings.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        psycopg2.Error: Re-raised after the error has been printed.
    """
    settings = {
        "dbname": DATABASE_DB,
        "user": DATABASE_USERNAME,
        "password": DATABASE_PASSWORD,
        "host": DATABASE_HOST,
        "port": PORT,
    }
    try:
        return psycopg2.connect(**settings)
    except psycopg2.Error as exc:
        print(f"Error connecting to the database: {exc}")
        raise

# Function to extract features (like project name, owner, etc.) using OpenAI's LLM
def extract_features_with_openai(user_input, processed_schema_df):
    """Ask the OpenAI completions endpoint to list the features in *user_input*.

    The schema DataFrame is serialised to JSON records and embedded in the
    prompt together with the user's text.

    Args:
        user_input: Free-text query typed by the user.
        processed_schema_df: DataFrame serialised with ``to_json``.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.

    Raises:
        openai.OpenAIError: Re-raised after the error has been printed.
    """
    schema_json = processed_schema_df.to_json(orient='records')

    # Construct prompt for OpenAI LLM
    prompt = f"""
    ## Database Schema Context
    {schema_json}

    ## User Input
    User has provided the following input: '{user_input}'.

    ## Instructions
    Based on the given user input and the database schema provided above, identify all features or values the user is trying to express. These features could include project names, owners, dates, statuses, or any other information that matches the schema. Extract them and return as a structured list.

    Provide the output in the following format:
    - project_name: 
    - owner: 
    - date: 
    - status:
    - [Any other relevant column values based on schema]
    """

    request_args = {
        "model": "gpt-3.5-turbo-instruct",
        "prompt": prompt,
        "max_tokens": 500,
        "temperature": 0.5,
    }
    try:
        completion = openai.completions.create(**request_args)
        return completion.choices[0].text.strip()
    except openai.OpenAIError as exc:
        print(f"Error with OpenAI: {exc}")
        raise

# One "- key: value" bullet. Matched against a stripped line, so indentation is
# tolerated; the key may contain spaces and the space after ':' is optional.
_FEATURE_LINE_RE = re.compile(r"[-*•]\s*([^:]+?)\s*:\s*(.+)")

# Values (lower-cased) that mean "nothing was extracted" for a field
_EMPTY_FEATURE_VALUES = frozenset({"none", "not specified", "n/a", "null"})


# Function to parse and store extracted features into a list
def parse_extracted_features(extracted_features):
    """Collect the non-empty values from a bulleted ``- key: value`` listing.

    Args:
        extracted_features: Text whose lines look like ``- owner: Mohammed``.
            ``None`` or an empty string yields an empty list.

    Returns:
        The values in order of appearance. Lines without a ``key: value``
        bullet, bullets with no value, and placeholder values such as
        ``None`` or ``N/A`` are skipped. Keys are not returned.
    """
    if not extracted_features:
        return []

    feature_list = []
    for line in extracted_features.splitlines():
        # The prompt's template is indented, so the model may echo indented
        # bullets; strip before matching instead of anchoring at column 0
        match = _FEATURE_LINE_RE.match(line.strip())
        if match is None:
            continue
        value = match.group(2).strip()
        if value and value.lower() not in _EMPTY_FEATURE_VALUES:
            feature_list.append(value)

    return feature_list

# Function to query Pinecone using the extracted features
def query_pinecone(index, feature_list):
    """Search the index for each feature and print the closest match.

    Args:
        index: Object exposing ``query(vector=..., top_k=..., ...)``.
        feature_list: Feature strings to embed and look up. An empty or
            ``None`` list prints a notice and returns immediately.

    Returns:
        None. Results and per-feature query errors are printed.
    """
    if not feature_list:
        print("No features to query.")
        return

    for feature in feature_list:
        # Embedding happens outside the try block, so embedding errors propagate
        embedding = text_to_vector(feature)

        try:
            # Only the single nearest neighbour is requested
            result = index.query(
                vector=embedding,
                top_k=1,
                include_values=True,
                include_metadata=True,
            )

            hits = result['matches'] if 'matches' in result else None
            if not hits:
                print(f"No matches found for feature: {feature}")
                continue

            for hit in hits:
                hit_id = hit.get('id', 'N/A')
                hit_score = hit.get('score', 'N/A')
                hit_metadata = hit.get('metadata', 'No metadata')
                hit_values = hit.get('values', 'No values')

                print(f"Feature: {feature}")
                print(f"Match ID: {hit_id}")
                print(f"Score: {hit_score}")
                print(f"Metadata: {hit_metadata}")
                print(f"Values: {hit_values}")
                print("---")

        except Exception as err:
            # Best effort: report the failure and move on to the next feature
            print(f"An error occurred querying Pinecone for feature '{feature}':", err)

# Main function to process user input, fetch schema, extract features, and query Pinecone
def process_user_input_and_query_pinecone(user_input):
    """Run the full pipeline for one user query.

    Steps: read the database schema, ask the LLM to extract features from
    *user_input*, parse them into a list, then query Pinecone for each one.
    Intermediate results are printed.

    Args:
        user_input: Free-text query typed by the user.

    Returns:
        None.
    """
    # Connect to DB and fetch schema
    conn = connect_to_db()
    try:
        schema_df = fetch_schema(conn)
    finally:
        # The connection is only needed for the schema read; close it even if
        # the read fails so it is not leaked
        conn.close()
    processed_schema_df = process_schema(schema_df)

    # Extract features from user input using OpenAI's LLM
    extracted_features = extract_features_with_openai(user_input, processed_schema_df)

    # Print the extracted features
    print("Extracted Features from OpenAI:")
    print(extracted_features)

    # Parse extracted features into a list
    feature_list = parse_extracted_features(extracted_features)

    # Print the parsed feature list
    print("\nParsed Feature List:")
    print(feature_list)

    # Initialize Pinecone and query based on the feature list
    index = initialize_pinecone()
    query_pinecone(index, feature_list)

# Run the interactive flow only when executed directly (script or notebook
# cell), so importing this module does not block on input()
if __name__ == "__main__":
    # Get user input
    user_input = input("Enter your query: ")

    # Process user input and query Pinecone
    process_user_input_and_query_pinecone(user_input)




Enter your query:  Show all projects owned by Mohammed


  schema_df = pd.read_sql(query, conn)


Extracted Features from OpenAI:
- project_name: 
- owner: Mohammed
- date: 
- status: 
- [Any other relevant column values based on schema]

Parsed Feature List:
['Mohammed']
Feature: Mohammed
Match ID: 3_26
Score: 0.735457838
Metadata: {'row_index': 3.0, 'value': 'Mohammed Yusha'}
Values: [-0.0639873147, 0.0578757264, -0.0094227856, -0.00599367591, -0.00531403488, 0.0162085332, 0.0698724687, -0.0379670858, 0.094297722, 0.00845798757, 0.0382332392, -0.0944266, 0.118299507, 0.0379873216, 0.0316385776, -0.0118874731, -0.0099654682, 0.0490820929, -0.0103816027, -0.103197865, -0.0953224599, 0.0329789259, 0.0279532634, 0.000633339339, -0.0105809383, -0.0359946229, 0.0605912767, 0.000426612038, -0.0226309616, -0.0995256454, -0.0103521114, -0.0683196485, 0.017640667, 0.00382588455, -0.052651003, 0.0636867061, 0.0360691696, 0.0557411872, 0.0407768972, -0.0141909784, 0.0346541367, -0.0356390402, 0.0815275088, -0.0227831509, 0.105226263, 0.00820185523, -0.0148531925, -0.00151236216, 0.032476224,