In [4]:
import os
from datetime import datetime

import pandas as pd
import pinecone
import psycopg2
from pinecone.grpc import PineconeGRPC as Pinecone
from sentence_transformers import SentenceTransformer

# PostgreSQL database connection details.
# Every setting is read from the environment first; the literal is only a
# fallback so that existing runs keep working without any configuration.
# NOTE(review): the password and API key fallbacks below are real credentials
# committed to source. Rotate them, supply the values through the environment
# variables named here, and then delete the literal fallbacks.
DATABASE_HOST = os.environ.get(
    "DATABASE_HOST",
    "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com",
)
DATABASE_USERNAME = os.environ.get("DATABASE_USERNAME", "postgres")
DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD", "valign#123")
DATABASE_DB = os.environ.get("DATABASE_DB", "python_test_poc")
# Environment values are strings, so the port is converted explicitly.
PORT = int(os.environ.get("DATABASE_PORT", "5432"))

# Pinecone details
pinecone_api_key = os.environ.get(
    "PINECONE_API_KEY", "9fbe58e4-9e72-4023-90eb-ba8d022916b5"
)
index_name = os.environ.get("PINECONE_INDEX_NAME", "jagoai")
BATCH_SIZE = 200  # Adjust the batch size if necessary

# Function to connect to PostgreSQL database
def connect_to_db():
    """Open a PostgreSQL connection from the module-level settings.

    Reads DATABASE_DB, DATABASE_USERNAME, DATABASE_PASSWORD, DATABASE_HOST
    and PORT and passes them to ``psycopg2.connect``.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        psycopg2.Error: re-raised unchanged after the message is printed.
    """
    try:
        settings = {
            "dbname": DATABASE_DB,
            "user": DATABASE_USERNAME,
            "password": DATABASE_PASSWORD,
            "host": DATABASE_HOST,
            "port": PORT,
        }
        return psycopg2.connect(**settings)
    except psycopg2.Error as db_error:
        print(f"Error connecting to the database: {db_error}")
        raise

# Fetch schema with column names and data types
def fetch_schema_with_data_types(conn):
    """Return the text-like columns of the four tables this script indexes.

    Queries ``information_schema.columns`` for the ``public`` schema, keeping
    columns whose data type is 'character varying', 'text' or 'varchar' and
    whose table is one of projects, milestones, tasks or users.

    Args:
        conn: Database connection handed to ``pd.read_sql``.

    Returns:
        A DataFrame with columns table_name, column_name, data_type and
        character_maximum_length, ordered by table_name. The frame is also
        printed before it is returned.

    Raises:
        Exception: any failure is printed and then re-raised unchanged.
    """
    schema_query = """
        SELECT table_name, column_name, data_type, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = 'public'
          AND (data_type = 'character varying' OR data_type IN ('text', 'varchar'))
          AND table_name IN ('projects', 'milestones', 'tasks', 'users')
        ORDER BY table_name;
        """
    try:
        columns_frame = pd.read_sql(schema_query, conn)
        print(columns_frame)
    except Exception as err:
        print(f"Error fetching schema with data types: {err}")
        raise
    return columns_frame

# Fetch unique values from each column for specified tables
def fetch_unique_values(conn, table_name, column_name):
    """Return the distinct non-null values of one column as strings.

    Both names are treated as single, unqualified identifiers and are
    double-quoted in the generated SQL. Quoting lets names that are reserved
    words, contain upper-case letters or contain special characters resolve
    exactly as given instead of failing the query.

    Args:
        conn: Database connection handed to ``pd.read_sql``.
        table_name: Name of the table to read.
        column_name: Name of the column whose distinct values are wanted.

    Returns:
        A list of ``str`` values with nulls dropped, or an empty list if the
        query fails. This is a deliberate best-effort: the error is printed,
        not raised.
    """
    def quote_identifier(identifier):
        # Standard SQL quoting: wrap in double quotes, double any embedded quote.
        return '"' + str(identifier).replace('"', '""') + '"'

    try:
        query = (
            f"SELECT DISTINCT {quote_identifier(column_name)} "
            f"FROM {quote_identifier(table_name)}"
        )
        df = pd.read_sql(query, conn)
        # The query selects exactly one column, so read it by position rather
        # than relying on how the driver labels it.
        return df.iloc[:, 0].dropna().astype(str).tolist()
    except Exception as e:
        print(f"Error fetching unique values for {column_name} in {table_name}: {e}")
        # A failed statement leaves a psycopg2 connection in an aborted
        # transaction; without a rollback every later query on the same
        # connection would fail as well.
        try:
            conn.rollback()
        except Exception as rollback_error:
            print(f"Error rolling back after failed query: {rollback_error}")
        return []

# Load Hugging Face model for embeddings
def load_huggingface_model():
    """Create the sentence-embedding model used by this script.

    Returns:
        A ``SentenceTransformer`` built from the
        'sentence-transformers/all-mpnet-base-v2' model name.
    """
    return SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Generate embeddings for unique values and store them in a dictionary
def generate_embeddings(embedding_model, unique_values_dict):
    """Encode every column's unique values and collect the results per table.

    Args:
        embedding_model: Object with an ``encode(values)`` method.
        unique_values_dict: Mapping of table name -> column name -> list of
            unique values.

    Returns:
        A mapping of table name -> column name -> dict with two keys:
        "unique_values" (the input values) and "embeddings" (whatever
        ``encode`` returned). Columns without values get empty lists for
        both keys. If ``encode`` raises, the error is printed and
        "embeddings" is an empty list while "unique_values" is kept.
    """
    result = {}
    for table, column_map in unique_values_dict.items():
        per_table = {}
        result[table] = per_table
        for column, values in column_map.items():
            # Nothing to encode: record an empty entry and move on.
            if not values:
                print(f"No unique values found for {column} in {table}. Skipping embeddings.")
                per_table[column] = {"unique_values": [], "embeddings": []}
                continue

            try:
                vectors = embedding_model.encode(values)
            except Exception as err:
                print(f"Error generating embeddings for {column} in {table}: {err}")
                vectors = []
            per_table[column] = {"unique_values": values, "embeddings": vectors}
    return result

# Initialize Pinecone index
def initialize_pinecone():
    """Return a handle to the Pinecone index named by ``index_name``.

    Builds a ``Pinecone`` client from the module-level ``pinecone_api_key``
    and asks it for the index.
    """
    return Pinecone(api_key=pinecone_api_key).Index(index_name)

# Batch the embeddings for upserts
def batch_embeddings(upsert_data, batch_size):
    """Yield consecutive slices of ``upsert_data``.

    Args:
        upsert_data: Sliceable sequence to split.
        batch_size: Step between slice starts; each slice holds at most this
            many items.

    Yields:
        ``upsert_data[start:start + batch_size]`` for each start offset, in
        order. Nothing is yielded for an empty sequence.
    """
    yield from (
        upsert_data[start:start + batch_size]
        for start in range(0, len(upsert_data), batch_size)
    )

# Upsert embeddings into Pinecone with metadata for each table (namespace)
def upsert_embeddings(index, embeddings_dict):
    """Send every embedding to the index, one column at a time, in batches.

    For each table/column pair a record is built per embedding with:
      * id: "<table>_<column>_<position>", where position is the embedding's
        index within the column;
      * values: the embedding converted with ``.tolist()``;
      * metadata: table_name, column_name and the matching unique value.

    The records are split with ``batch_embeddings`` using BATCH_SIZE, each
    batch is passed to ``index.upsert(vectors=...)``, and a progress line is
    printed per batch.

    NOTE(review): the comment preceding this function mentions a namespace
    per table, but ``upsert`` is called without a ``namespace`` argument
    here; confirm which is intended.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        embeddings_dict: Mapping of table name -> column name -> dict with
            "unique_values" and "embeddings" entries.
    """
    for table, column_map in embeddings_dict.items():
        for column, payload in column_map.items():

            def build_record(position, vector):
                # Embeddings and unique values are matched up by position.
                value = payload['unique_values'][position]
                return {
                    "id": f"{table}_{column}_{position}",
                    "values": vector.tolist(),
                    "metadata": {
                        "table_name": table,
                        "column_name": column,
                        "unique_value": value,
                    },
                }

            records = [
                build_record(position, vector)
                for position, vector in enumerate(payload['embeddings'])
            ]

            # Batch the upsert to avoid exceeding size limits
            for chunk in batch_embeddings(records, BATCH_SIZE):
                index.upsert(vectors=chunk)
                print(f"Upserted batch for {column} in {table}")

# Main function to execute the entire process
def main():
    """Run the whole pipeline: read values, embed them, upsert to Pinecone.

    Steps:
      1. Connect to PostgreSQL and list the text-like columns to index.
      2. Fetch the distinct values of each of those columns.
      3. Close the database connection.
      4. Load the embedding model and encode the values.
      5. Upsert the embeddings into the Pinecone index.

    The connection is closed in a ``finally`` block as soon as the database
    reads are done. It is therefore released even when a step raises, and it
    is not held open while the model loads or the upserts run.
    """
    # Connect to the database
    conn = connect_to_db()
    try:
        # Fetch the schema and unique values for relevant columns
        schema_df = fetch_schema_with_data_types(conn)
        unique_values_dict = {}
        for table_name, column_name in zip(
            schema_df['table_name'], schema_df['column_name']
        ):
            table_values = unique_values_dict.setdefault(table_name, {})
            table_values[column_name] = fetch_unique_values(
                conn, table_name, column_name
            )
    finally:
        # Close database connection
        conn.close()

    # Load the Hugging Face model for embeddings
    embedding_model = load_huggingface_model()

    # Generate embeddings for unique values
    embeddings_dict = generate_embeddings(embedding_model, unique_values_dict)

    # Initialize Pinecone and upsert embeddings
    index = initialize_pinecone()
    upsert_embeddings(index, embeddings_dict)

# Execute main function
if __name__ == "__main__":
    main()


  schema_df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


    table_name       column_name          data_type  character_maximum_length
0   milestones      project_name  character varying                     255.0
1   milestones    milestone_name  character varying                     255.0
2   milestones        owner_name  character varying                     255.0
3   milestones  milestone_status  character varying                     100.0
4   milestones              flag  character varying                     100.0
..         ...               ...                ...                       ...
61       users        user_email               text                       NaN
62       users              role               text                       NaN
63       users           profile               text                       NaN
64       users            status               text                       NaN
65       users      active_users               text                       NaN

[66 rows x 4 columns]


  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = p

Upserted batch for project_name in milestones
Upserted batch for project_name in milestones
Upserted batch for project_name in milestones
Upserted batch for project_name in milestones
Upserted batch for milestone_name in milestones
Upserted batch for milestone_name in milestones
Upserted batch for milestone_name in milestones
Upserted batch for milestone_name in milestones
Upserted batch for milestone_name in milestones
Upserted batch for milestone_name in milestones
Upserted batch for milestone_name in milestones
Upserted batch for milestone_name in milestones
Upserted batch for owner_name in milestones
Upserted batch for milestone_status in milestones
Upserted batch for flag in milestones
Upserted batch for milestone_completion_mode in milestones
Upserted batch for milestone_id_string in milestones
Upserted batch for milestone_id_string in milestones
Upserted batch for milestone_id_string in milestones
Upserted batch for milestone_id_string in milestones
Upserted batch for milestone_