In [4]:
!pip install protoc-gen-openapiv2

Collecting protoc-gen-openapiv2
  Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl (7.9 kB)
Installing collected packages: protoc-gen-openapiv2
Successfully installed protoc-gen-openapiv2-0.0.1



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import psycopg2
import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone.grpc import PineconeGRPC as Pinecone

import os

# Database connection details.
# Secrets are NOT stored in the notebook: set DATABASE_PASSWORD in the
# environment before running. Non-secret settings keep their previous values
# as defaults and can be overridden through the environment as well.
# NOTE(review): the password and API key that used to be hard-coded here have
# been exposed in the notebook history and should be rotated.
DATABASE_HOST = os.environ.get(
    "DATABASE_HOST",
    "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com",
)
DATABASE_USERNAME = os.environ.get("DATABASE_USERNAME", "postgres")
DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD")  # None when unset
DATABASE_DB = os.environ.get("DATABASE_DB", "python_test_poc_two")
PORT = int(os.environ.get("DATABASE_PORT", "5432"))

# Pinecone details (the API key comes from the PINECONE_API_KEY environment variable)
pinecone_api_key = os.environ.get("PINECONE_API_KEY")  # None when unset
index_name = "smart-desk"
BATCH_SIZE = 200  # Vectors per upsert request; lower it if a request exceeds the size limit

def connect_to_db():
    """Open a PostgreSQL connection from the module-level connection settings.

    Returns:
        The connection object produced by ``psycopg2.connect``.

    Raises:
        psycopg2.Error: re-raised after the failure has been printed.
    """
    connection_params = {
        "dbname": DATABASE_DB,
        "user": DATABASE_USERNAME,
        "password": DATABASE_PASSWORD,
        "host": DATABASE_HOST,
        "port": PORT,
    }
    try:
        return psycopg2.connect(**connection_params)
    except psycopg2.Error as db_error:
        print(f"Error connecting to the database: {db_error}")
        raise

def fetch_schema_with_data_types(conn):
    """Return the string-typed columns of the tables this pipeline indexes.

    Queries ``information_schema.columns`` for columns in the ``public``
    schema whose type is a string type and whose table is one of
    ``projects``, ``milestones``, ``tasks`` or ``users``.

    The previous query mixed ``AND`` and ``OR`` without parentheses. Because
    ``AND`` binds tighter than ``OR``, the table filter was not applied to
    ``character varying`` columns and the schema filter was not applied to
    the ``text``/``varchar`` branch. All three conditions are now combined
    with ``AND`` and the types are listed in a single ``IN``.

    Args:
        conn: Open database connection handed to ``pd.read_sql``.

    Returns:
        DataFrame with columns ``table_name``, ``column_name``, ``data_type``
        and ``character_maximum_length``, ordered by ``table_name``.

    Raises:
        Exception: any error from the query is printed and re-raised.
    """
    try:
        query = """
        SELECT table_name, column_name, data_type, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = 'public'
          AND data_type IN ('character varying', 'text', 'varchar')
          AND table_name IN ('projects', 'milestones', 'tasks', 'users')
        ORDER BY table_name;
        """
        schema_df = pd.read_sql(query, conn)
        print(schema_df)
        return schema_df
    except Exception as e:
        print(f"Error fetching schema with data types: {e}")
        raise

def fetch_unique_values(conn, table_name, column_name):
    """Return the distinct non-null values of one column, as strings.

    Table and column names are wrapped in double quotes (with embedded quotes
    doubled) before being placed in the query, so names that are reserved
    words or contain upper-case characters are addressed exactly as given.
    A dotted ``table_name`` is treated as ``schema.table`` and each part is
    quoted separately. Quoted names are matched exactly, so pass names with
    the spelling the database stores (e.g. as listed by information_schema).

    Args:
        conn: Open database connection handed to ``pd.read_sql``.
        table_name: Name of the table to read.
        column_name: Name of the column to read.

    Returns:
        List of ``str`` values; an empty list when the query fails (the error
        is printed) or when the column has no non-null values.
    """
    def _quote_identifier(name):
        # Standard SQL identifier quoting: wrap in "..." and double any inner quote.
        return '"' + str(name).replace('"', '""') + '"'

    try:
        quoted_column = _quote_identifier(column_name)
        quoted_table = ".".join(
            _quote_identifier(part) for part in str(table_name).split(".")
        )
        query = f"SELECT DISTINCT {quoted_column} FROM {quoted_table}"
        df = pd.read_sql(query, conn)
        # The query selects exactly one column; read it by position so the
        # lookup does not depend on how the driver labels the result column.
        return df.iloc[:, 0].dropna().astype(str).tolist()
    except Exception as e:
        print(f"Error fetching unique values for {column_name} in {table_name}: {e}")
        return []

def fetch_all_unique_values_with_table(conn, schema_df):
    """Collect the distinct values of every column listed in ``schema_df``.

    Args:
        conn: Database connection passed through to ``fetch_unique_values``.
        schema_df: DataFrame with ``table_name`` and ``column_name`` columns.

    Returns:
        Nested dict ``{table_name: {column_name: [values, ...]}}`` with tables
        in order of first appearance in ``schema_df``.
    """
    return {
        table: {
            column: fetch_unique_values(conn, table, column)
            for column in schema_df.loc[schema_df['table_name'] == table, 'column_name']
        }
        for table in schema_df['table_name'].unique()
    }

def load_huggingface_model():
    """Instantiate the ``all-mpnet-base-v2`` SentenceTransformer used for embeddings."""
    return SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def generate_and_store_embeddings(embedding_model, unique_values_dict):
    """Encode every column's unique values and pair them with their embeddings.

    Args:
        embedding_model: Object exposing ``encode(list_of_values)``.
        unique_values_dict: ``{table_name: {column_name: [values, ...]}}``.

    Returns:
        ``{table_name: {column_name: {"unique_values": ..., "embeddings": ...}}}``.
        Columns without values get two empty lists. Columns whose encoding
        raised keep their values but get an empty ``embeddings`` list.
    """
    result = {}
    for table_name, columns in unique_values_dict.items():
        per_table = result.setdefault(table_name, {})
        for column_name, values in columns.items():
            # Guard clause: nothing to encode for this column.
            if not values:
                print(f"No unique values found for {column_name} in {table_name}. Skipping embeddings.")
                per_table[column_name] = {"unique_values": [], "embeddings": []}
                continue

            try:
                vectors = embedding_model.encode(values)
            except Exception as err:
                print(f"Error generating embeddings for {column_name} in {table_name}: {err}")
                vectors = []  # Fall back to an empty list when encoding fails

            per_table[column_name] = {"unique_values": values, "embeddings": vectors}
    return result


def initialize_pinecone():
    """Create a Pinecone client with ``pinecone_api_key`` and return the ``index_name`` index handle."""
    return Pinecone(api_key=pinecone_api_key).Index(index_name)

def batch_embeddings(upsert_data, batch_size):
    """Yield consecutive slices of ``upsert_data`` holding at most ``batch_size`` items each."""
    yield from (
        upsert_data[start:start + batch_size]
        for start in range(0, len(upsert_data), batch_size)
    )

def upsert_embeddings_into_pinecone(index, embeddings_dict):
    """Upsert every column's embeddings, using the table name as the namespace.

    Args:
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        embeddings_dict: ``{table: {column: {"unique_values": ..., "embeddings": ...}}}``;
            each embedding must expose ``tolist()``.

    Each vector gets the id ``<table>_<column>_<position>`` and metadata
    holding the column name and the value the vector was computed from.
    """
    def _build_record(table, column, payload, position, vector):
        # One upsert entry: id, raw vector values, and the originating value.
        source_value = payload['unique_values'][position]
        return {
            "id": f"{table}_{column}_{position}",
            "values": vector.tolist(),
            "metadata": {"column_name": column, "unique_value": source_value},
        }

    for table_name, columns in embeddings_dict.items():
        for column_name, data in columns.items():
            records = [
                _build_record(table_name, column_name, data, position, vector)
                for position, vector in enumerate(data['embeddings'])
            ]

            # Send in chunks of BATCH_SIZE to stay under the request size limit
            for chunk in batch_embeddings(records, BATCH_SIZE):
                index.upsert(vectors=chunk, namespace=table_name)
                print(f"Upserted batch for {column_name} in {table_name}")

def main():
    """Run the full pipeline: schema -> unique values -> embeddings -> Pinecone.

    The database connection is only needed for the first two steps. It is
    closed in a ``finally`` block as soon as they finish (or fail), so the
    connection is no longer leaked while the model loads and the upserts run.
    """
    # Step 1: Connect to the database
    conn = connect_to_db()
    try:
        # Step 2: Fetch the schema with metadata and data types, only for string columns
        schema_df = fetch_schema_with_data_types(conn)
        print("Schema with string data types fetched successfully.")

        # Step 3: Fetch all unique values along with table details
        unique_values_dict = fetch_all_unique_values_with_table(conn, schema_df)
        print("Unique values for string columns fetched successfully.")
    finally:
        # Always release the connection, even when a query above raised.
        conn.close()

    # Step 4: Load the Hugging Face model for embeddings
    embedding_model = load_huggingface_model()
    print("Hugging Face model loaded successfully.")

    # Step 5: Generate embeddings for all unique values
    embeddings_dict = generate_and_store_embeddings(embedding_model, unique_values_dict)
    print("Embeddings for string columns generated successfully.")

    # Step 6: Initialize Pinecone and upsert embeddings under each table's namespace
    pinecone_index = initialize_pinecone()
    upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)
    print("Embeddings upserted into Pinecone successfully.")

if __name__ == "__main__":
    main()

  schema_df = pd.read_sql(query, conn)


    table_name                column_name          data_type  \
0   milestones  milestone_completion_mode  character varying   
1   milestones               milestone_id  character varying   
2   milestones                 owner_name  character varying   
3   milestones           milestone_status  character varying   
4   milestones                 project_id  character varying   
5   milestones                application  character varying   
6   milestones                     status  character varying   
7   milestones               project_name  character varying   
8   milestones             milestone_name  character varying   
9     projects                  projectid  character varying   
10    projects                projectname  character varying   
11    projects                      owner  character varying   
12    projects                     status  character varying   
13    projects              delivery_team  character varying   
14       tasks     actual_time_taken_tem

  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = p

Unique values for string columns fetched successfully.




Hugging Face model loaded successfully.
No unique values found for actual_time_taken_temp in tasks. Skipping embeddings.
No unique values found for qc_owner in tasks. Skipping embeddings.
No unique values found for milestone_id1 in tasks. Skipping embeddings.
Embeddings for string columns generated successfully.
Upserted batch for milestone_completion_mode in milestones
Upserted batch for milestone_id in milestones
Upserted batch for milestone_id in milestones
Upserted batch for milestone_id in milestones
Upserted batch for milestone_id in milestones
Upserted batch for milestone_id in milestones
Upserted batch for milestone_id in milestones
Upserted batch for owner_name in milestones
Upserted batch for milestone_status in milestones
Upserted batch for project_id in milestones
Upserted batch for project_id in milestones
Upserted batch for application in milestones
Upserted batch for status in milestones
Upserted batch for project_name in milestones
Upserted batch for project_name in mil

In [25]:
import psycopg2
import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone.grpc import PineconeGRPC as Pinecone

import os

# Database connection details.
# The password is read from the DATABASE_PASSWORD environment variable instead
# of being stored in the notebook. The remaining settings default to their
# previous values and may also be overridden through the environment.
# NOTE(review): the password and API key previously written here are exposed
# in the notebook history and should be rotated.
DATABASE_HOST = os.environ.get(
    "DATABASE_HOST",
    "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com",
)
DATABASE_USERNAME = os.environ.get("DATABASE_USERNAME", "postgres")
DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD")  # None when unset
DATABASE_DB = os.environ.get("DATABASE_DB", "python_test_poc_two")
PORT = int(os.environ.get("DATABASE_PORT", "5432"))

# Pinecone details (the API key comes from the PINECONE_API_KEY environment variable)
pinecone_api_key = os.environ.get("PINECONE_API_KEY")  # None when unset
index_name = "smart-desk"
BATCH_SIZE = 200  # Vectors per upsert request; lower it if a request exceeds the size limit

def connect_to_db():
    """Connect to PostgreSQL with the module-level host, port, database and credentials.

    Returns:
        The connection returned by ``psycopg2.connect``.

    Raises:
        psycopg2.Error: printed, then re-raised unchanged.
    """
    try:
        connection = psycopg2.connect(
            host=DATABASE_HOST,
            port=PORT,
            dbname=DATABASE_DB,
            user=DATABASE_USERNAME,
            password=DATABASE_PASSWORD,
        )
    except psycopg2.Error as exc:
        print(f"Error connecting to the database: {exc}")
        raise
    else:
        return connection

def fetch_schema_with_data_types(conn):
    """List the string-typed columns of the public ``contacts`` table.

    Args:
        conn: Open database connection handed to ``pd.read_sql``.

    Returns:
        DataFrame with ``table_name``, ``column_name`` and ``data_type``
        columns; it is also printed before being returned.

    Raises:
        Exception: any failure is printed and re-raised.
    """
    string_columns_sql = """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'public'
        AND data_type IN ('character varying', 'text', 'varchar') AND table_name = 'contacts'
        """
    try:
        string_columns = pd.read_sql(string_columns_sql, conn)
        print(string_columns)
    except Exception as err:
        print(f"Error fetching schema with data types: {err}")
        raise
    return string_columns

def fetch_unique_values(conn, table_name, column_name):
    """Return the distinct non-null values of one column, as strings.

    Table and column names are wrapped in double quotes (with embedded quotes
    doubled) before being placed in the query, so names that are reserved
    words or contain upper-case characters are addressed exactly as given.
    A dotted ``table_name`` is treated as ``schema.table`` and each part is
    quoted separately. Quoted names are matched exactly, so pass names with
    the spelling the database stores (e.g. as listed by information_schema).

    Args:
        conn: Open database connection handed to ``pd.read_sql``.
        table_name: Name of the table to read.
        column_name: Name of the column to read.

    Returns:
        List of ``str`` values; an empty list when the query fails (the error
        is printed) or when the column has no non-null values.
    """
    def _quote_identifier(name):
        # Standard SQL identifier quoting: wrap in "..." and double any inner quote.
        return '"' + str(name).replace('"', '""') + '"'

    try:
        quoted_column = _quote_identifier(column_name)
        quoted_table = ".".join(
            _quote_identifier(part) for part in str(table_name).split(".")
        )
        query = f"SELECT DISTINCT {quoted_column} FROM {quoted_table}"
        df = pd.read_sql(query, conn)
        # The query selects exactly one column; read it by position so the
        # lookup does not depend on how the driver labels the result column.
        return df.iloc[:, 0].dropna().astype(str).tolist()
    except Exception as e:
        print(f"Error fetching unique values for {column_name} in {table_name}: {e}")
        return []

def fetch_all_unique_values_with_table(conn, schema_df):
    """Build ``{table_name: {column_name: [unique values]}}`` for the given schema.

    Args:
        conn: Database connection forwarded to ``fetch_unique_values``.
        schema_df: DataFrame providing ``table_name`` and ``column_name``.

    Returns:
        Dict keyed by table name, each value mapping the table's column names
        to the list returned by ``fetch_unique_values``.
    """
    collected = {}
    table_series = schema_df['table_name']
    for current_table in table_series.unique():
        # Columns that belong to the table currently being processed
        matching_columns = schema_df.loc[table_series == current_table, 'column_name']
        collected[current_table] = dict(
            (col, fetch_unique_values(conn, current_table, col))
            for col in matching_columns
        )
    return collected

def load_huggingface_model():
    """Return a SentenceTransformer built from the ``all-mpnet-base-v2`` checkpoint name."""
    checkpoint = 'sentence-transformers/all-mpnet-base-v2'
    return SentenceTransformer(checkpoint)

def generate_and_store_embeddings(embedding_model, unique_values_dict):
    """Attach embeddings to each column's list of unique values.

    Args:
        embedding_model: Object exposing ``encode(list_of_values)``.
        unique_values_dict: ``{table_name: {column_name: [values, ...]}}``.

    Returns:
        Dict of the same shape whose leaves are
        ``{"unique_values": ..., "embeddings": ...}``. ``embeddings`` is an
        empty list when the column has no values or when encoding raised.
    """
    def _entry_for(table_name, column_name, values):
        # Build the leaf record for a single column.
        if not values:
            print(f"No unique values found for {column_name} in {table_name}. Skipping embeddings.")
            return {"unique_values": [], "embeddings": []}
        try:
            return {
                "unique_values": values,
                "embeddings": embedding_model.encode(values),
            }
        except Exception as err:
            print(f"Error generating embeddings for {column_name} in {table_name}: {err}")
            # Keep the values but record that no embeddings are available
            return {"unique_values": values, "embeddings": []}

    return {
        table_name: {
            column_name: _entry_for(table_name, column_name, values)
            for column_name, values in columns.items()
        }
        for table_name, columns in unique_values_dict.items()
    }


def initialize_pinecone():
    """Return the ``index_name`` index from a Pinecone client built with ``pinecone_api_key``."""
    client = Pinecone(api_key=pinecone_api_key)
    return client.Index(index_name)

def batch_embeddings(upsert_data, batch_size):
    """Generate successive ``batch_size``-long slices of ``upsert_data`` (the last may be shorter)."""
    total = len(upsert_data)
    for lower in range(0, total, batch_size):
        upper = lower + batch_size
        yield upsert_data[lower:upper]

def upsert_embeddings_into_pinecone(index, embeddings_dict):
    """Write each column's vectors to Pinecone under the namespace of its table.

    Args:
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        embeddings_dict: ``{table: {column: {"unique_values": ..., "embeddings": ...}}}``;
            each embedding must expose ``tolist()``.

    Vector ids follow ``<table>_<column>_<position>``; metadata carries the
    column name and the value that was embedded.
    """
    for table_name in embeddings_dict:
        column_map = embeddings_dict[table_name]
        for column_name in column_map:
            payload = column_map[column_name]

            pending = []
            position = 0
            for vector in payload['embeddings']:
                source_value = payload['unique_values'][position]
                pending.append(
                    dict(
                        id=f"{table_name}_{column_name}_{position}",
                        values=vector.tolist(),
                        metadata={"column_name": column_name, "unique_value": source_value},
                    )
                )
                position += 1

            # Upsert chunk by chunk so a single request stays within size limits
            for chunk in batch_embeddings(pending, BATCH_SIZE):
                index.upsert(vectors=chunk, namespace=table_name)
                print(f"Upserted batch for {column_name} in {table_name}")

def main():
    """Run the full pipeline: schema -> unique values -> embeddings -> Pinecone.

    The database connection is used only for the schema and unique-value
    queries. It is closed in a ``finally`` block right after them, so it is
    released even when a query raises and is not held open (or leaked) while
    the model loads and the upserts run.
    """
    # Step 1: Connect to the database
    conn = connect_to_db()
    try:
        # Step 2: Fetch the schema with metadata and data types, only for string columns
        schema_df = fetch_schema_with_data_types(conn)
        print("Schema with string data types fetched successfully.")

        # Step 3: Fetch all unique values along with table details
        unique_values_dict = fetch_all_unique_values_with_table(conn, schema_df)
        print("Unique values for string columns fetched successfully.")
    finally:
        # Always release the connection, even when a query above raised.
        conn.close()

    # Step 4: Load the Hugging Face model for embeddings
    embedding_model = load_huggingface_model()
    print("Hugging Face model loaded successfully.")

    # Step 5: Generate embeddings for all unique values
    embeddings_dict = generate_and_store_embeddings(embedding_model, unique_values_dict)
    print("Embeddings for string columns generated successfully.")

    # Step 6: Initialize Pinecone and upsert embeddings under each table's namespace
    pinecone_index = initialize_pinecone()
    upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)
    print("Embeddings upserted into Pinecone successfully.")

if __name__ == "__main__":
    main()

  schema_df = pd.read_sql(query, conn)


Empty DataFrame
Columns: [table_name, column_name, data_type]
Index: []
Schema with string data types fetched successfully.
Unique values for string columns fetched successfully.




Hugging Face model loaded successfully.
Embeddings for string columns generated successfully.
Embeddings upserted into Pinecone successfully.
