In [10]:
import psycopg2
import pandas as pd
from datetime import datetime, timedelta
from sentence_transformers import SentenceTransformer
from pinecone.grpc import PineconeGRPC as Pinecone

# Database connection details.
# NOTE(review): these credentials were committed in plain text. Each value can
# now be overridden through an environment variable; the literal fallbacks are
# kept only so existing runs keep working. Rotate the secrets and remove the
# fallbacks once the environment is configured.
import os

DATABASE_HOST = os.environ.get(
    "DATABASE_HOST",
    "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com",
)
DATABASE_USERNAME = os.environ.get("DATABASE_USERNAME", "postgres")
DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD", "valign#123")
DATABASE_DB = os.environ.get("DATABASE_DB", "python_test_poc")
PORT = int(os.environ.get("DATABASE_PORT", "5432"))

# Pinecone details (the API key is a secret as well; see the note above).
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "9fbe58e4-9e72-4023-90eb-ba8d022916b5")
index_name = "smart-desk"
BATCH_SIZE = 200  # Adjust the batch size to avoid exceeding the size limit

# Function to connect to PostgreSQL database
def connect_to_db():
    """Open a PostgreSQL connection from the module-level database settings.

    Returns:
        The connection object produced by ``psycopg2.connect``.

    Raises:
        psycopg2.Error: Re-raised after the failure has been printed.
    """
    connection_settings = {
        "dbname": DATABASE_DB,
        "user": DATABASE_USERNAME,
        "password": DATABASE_PASSWORD,
        "host": DATABASE_HOST,
        "port": PORT,
    }
    try:
        return psycopg2.connect(**connection_settings)
    except psycopg2.Error as exc:
        print(f"Error connecting to the database: {exc}")
        raise

# Fetch schema with column names and data types, dynamically including string and date/timestamp types
def fetch_schema_with_data_types(conn):
    """Load the string and date/timestamp columns of the ``public`` schema.

    Args:
        conn: Open database connection handed to ``pandas.read_sql``.

    Returns:
        DataFrame with the columns ``table_name``, ``column_name`` and
        ``data_type``.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    # NOTE(review): the query has no ORDER BY, so the row order is whatever the
    # database returns.
    schema_sql = """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'public'
        AND data_type IN ('character varying', 'text', 'varchar', 'date', 'timestamp without time zone', 'timestamp with time zone')
        """
    try:
        return pd.read_sql(schema_sql, conn)
    except Exception as exc:
        print(f"Error fetching schema with data types: {exc}")
        raise

# Find the 'created date' column dynamically based on its data type
def find_created_date_column(schema_df, table_name):
    """Pick the column that most likely holds a table's creation date.

    Candidates are the table's ``date`` / ``timestamp`` columns. A candidate
    whose name contains ``creat`` (``created_at``, ``creation_date``, ...) is
    preferred, so that an unrelated date column such as a due date is not used
    just because it is listed first. If no name matches, the first candidate
    is returned.

    Args:
        schema_df: DataFrame with ``table_name``, ``column_name`` and
            ``data_type`` columns.
        table_name: Table whose columns are inspected.

    Returns:
        The chosen column name, or ``None`` when the table has no date or
        timestamp column or the lookup fails (the reason is printed).
    """
    date_types = ['date', 'timestamp without time zone', 'timestamp with time zone']
    try:
        in_table = schema_df['table_name'] == table_name
        is_date = schema_df['data_type'].isin(date_types)
        candidates = schema_df.loc[in_table & is_date, 'column_name'].tolist()
    except Exception as e:
        print(f"Error finding created date column for {table_name}: {e}")
        return None

    if not candidates:
        print(f"No date or timestamp column found for table {table_name}.")
        return None

    # Prefer a column that is named like a creation timestamp.
    for candidate in candidates:
        if 'creat' in str(candidate).lower():
            return candidate
    return candidates[0]

# Fetch unique values based on created date or time column dynamically
def fetch_unique_values(conn, table_name, column_name, created_date_column, from_date, to_date):
    """Return the distinct non-null values of a column for rows created in a window.

    The window is half-open: ``from_date <= created_date_column < to_date``.
    The previous ``BETWEEN`` form was inclusive at both ends, so passing
    "today" and "tomorrow" also matched rows dated tomorrow.

    Args:
        conn: Open database connection handed to ``pandas.read_sql``.
        table_name: Table to query.
        column_name: Column whose distinct values are returned.
        created_date_column: Date/timestamp column used for the window.
        from_date: Inclusive lower bound, rendered with ``str()``.
        to_date: Exclusive upper bound, rendered with ``str()``.

    Returns:
        The values converted to ``str``, or ``[]`` when the query fails (the
        error is printed).
    """
    def quote_ident(name):
        # Double-quoting keeps the exact spelling of the identifier (including
        # mixed case) and stops it from being parsed as arbitrary SQL.
        return '"' + str(name).replace('"', '""') + '"'

    def quote_literal(value):
        # Standard SQL string literal: embedded single quotes are doubled.
        return "'" + str(value).replace("'", "''") + "'"

    try:
        value_column = quote_ident(column_name)
        date_column = quote_ident(created_date_column)
        query = f"""
        SELECT DISTINCT {value_column}
        FROM {quote_ident(table_name)}
        WHERE {date_column} >= {quote_literal(from_date)}
          AND {date_column} < {quote_literal(to_date)}
        """
        df = pd.read_sql(query, conn)
        return df[column_name].dropna().astype(str).tolist()
    except Exception as e:
        print(f"Error fetching unique values for {column_name} in {table_name} based on {created_date_column}: {e}")
        return []

# Initialize SentenceTransformer model for Hugging Face embeddings
def load_huggingface_model():
    """Instantiate the ``all-mpnet-base-v2`` SentenceTransformer used for embeddings."""
    return SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Initialize Pinecone
def initialize_pinecone():
    """Create a Pinecone client from the module-level API key and return the configured index handle."""
    client = Pinecone(api_key=pinecone_api_key)
    return client.Index(index_name)

# Check if the unique value already exists in Pinecone
def check_existing_in_pinecone(index, table_name, column_name, unique_value):
    """Report whether a vector for this table/column/value is already stored.

    The lookup ID is ``"{table_name}_{column_name}_{unique_value}"`` and the
    table name is used as the namespace.

    Args:
        index: Object exposing ``fetch(ids=..., namespace=...)``.
        table_name: Namespace and first ID component.
        column_name: Second ID component.
        unique_value: Third ID component.

    Returns:
        ``True`` when the fetch response contains a non-empty ``'vectors'``
        entry; ``False`` otherwise, including when the lookup raises (the
        error is printed).
    """
    try:
        vector_id = f"{table_name}_{column_name}_{unique_value}"
        response = index.fetch(ids=[vector_id], namespace=table_name)
        found = bool(response and 'vectors' in response and response['vectors'])
        if found:
            print(f"Record {vector_id} already exists in Pinecone.")
        return found
    except Exception as exc:
        print(f"Error checking existence in Pinecone for {vector_id}: {exc}")
        return False

# Generate embeddings for each unique value and store them
def generate_and_store_embeddings(embedding_model, unique_values_dict, index):
    """Encode every value that ``check_existing_in_pinecone`` does not report as stored.

    Args:
        embedding_model: Object exposing ``encode(value)``.
        unique_values_dict: ``{table_name: {column_name: [values]}}``.
        index: Pinecone index handle forwarded to the existence check.

    Returns:
        ``{table_name: {column_name: {"unique_values": [...], "embeddings": [...]}}}``
        where both lists are aligned and contain only values that were newly
        encoded. A value whose encoding raises is skipped after the error is
        printed.
    """
    results = {}
    for table_name, columns in unique_values_dict.items():
        table_results = results[table_name] = {}
        for column_name, unique_values in columns.items():
            if not unique_values:
                print(f"No unique values found for {column_name} in {table_name}. Skipping embeddings.")
                table_results[column_name] = {
                    "unique_values": [],
                    "embeddings": []
                }
                continue

            kept_values = []
            vectors = []
            for value in unique_values:
                # Values already present in the index are not encoded again.
                if check_existing_in_pinecone(index, table_name, column_name, value):
                    continue
                try:
                    vector = embedding_model.encode(value)
                    vectors.append(vector)
                    kept_values.append(value)
                except Exception as exc:
                    print(f"Error generating embeddings for {column_name} in {table_name}: {exc}")
            table_results[column_name] = {
                "unique_values": kept_values,
                "embeddings": vectors
            }
    return results

# Batch the embeddings for upserts
def batch_embeddings(upsert_data, batch_size):
    """Lazily yield consecutive slices of ``upsert_data``, each at most ``batch_size`` long."""
    offsets = range(0, len(upsert_data), batch_size)
    yield from (upsert_data[start:start + batch_size] for start in offsets)

# Upsert embeddings into Pinecone with metadata for each table (namespace)
def upsert_embeddings_into_pinecone(index, embeddings_dict):
    """Upsert the embeddings into Pinecone, one namespace per table.

    Vector IDs are ``"{table_name}_{column_name}_{unique_value}"``, the same
    scheme ``check_existing_in_pinecone`` looks up. The previous positional
    IDs (``..._{i}``) never matched that lookup and restarted at 0 on every
    run, so each run overwrote vectors written by earlier runs.

    Args:
        index: Pinecone index handle exposing ``upsert(vectors=..., namespace=...)``.
        embeddings_dict: ``{table_name: {column_name: {"unique_values": [...],
            "embeddings": [...]}}}`` with aligned lists; each embedding must
            provide ``tolist()``.
    """
    # NOTE(review): raw values become part of the ID; confirm they satisfy
    # Pinecone's ID constraints (length / character set) for this data.
    for table_name, columns in embeddings_dict.items():
        for column_name, data in columns.items():
            upsert_data = [
                {
                    "id": f"{table_name}_{column_name}_{unique_value}",
                    "values": embedding.tolist(),
                    "metadata": {"column_name": column_name, "unique_value": unique_value},
                }
                for unique_value, embedding in zip(data['unique_values'], data['embeddings'])
            ]

            # Batch the upsert to avoid exceeding size limits
            for batch in batch_embeddings(upsert_data, BATCH_SIZE):
                index.upsert(vectors=batch, namespace=table_name)
                print(f"Upserted batch for {column_name} in {table_name}")

# Main function to execute the process
def main():
    """Sync embeddings of today's distinct string values into Pinecone.

    For every table in the fetched schema that has a date/timestamp column and
    a string column, the distinct values of the first string column for rows
    dated between today and tomorrow are collected. Values not yet present in
    Pinecone are embedded and upserted under the table's namespace.

    The database connection is closed in a ``finally`` block as soon as the
    values are collected, so it is released even when a step raises and is not
    held open while the model loads or Pinecone is called.
    """
    # Step 1: Connect to the database
    conn = connect_to_db()
    unique_values_dict = {}
    try:
        # Step 2: Fetch the schema with metadata and data types, including string and date/time columns
        schema_df = fetch_schema_with_data_types(conn)
        print("Schema with string and date/time data types fetched successfully.")

        # Step 3: Define the date range for today (for daily scheduler)
        # NOTE(review): datetime.now() is naive local time -- confirm it matches
        # the timezone of the stored timestamps.
        today = datetime.now()
        from_date = today.strftime('%Y-%m-%d')  # Start of the day
        to_date = (today + timedelta(days=1)).strftime('%Y-%m-%d')  # End of the day

        string_types = ['character varying', 'text', 'varchar']
        for table_name in schema_df['table_name'].unique():
            print(f"Processing table: {table_name}")

            # Step 4: Find the created date column dynamically for each table
            created_date_column = find_created_date_column(schema_df, table_name)
            if not created_date_column:
                print(f"No 'created date' or timestamp column found for table {table_name}, skipping.")
                continue

            # For simplicity, only the first string-based column is used
            string_columns = schema_df[
                (schema_df['table_name'] == table_name) &
                (schema_df['data_type'].isin(string_types))
            ]
            if string_columns.empty:
                print(f"No string columns found in table {table_name} to fetch unique values.")
                continue

            column_name = string_columns.iloc[0]['column_name']
            unique_values = fetch_unique_values(conn, table_name, column_name, created_date_column, from_date, to_date)
            unique_values_dict.setdefault(table_name, {})[column_name] = unique_values
            print(f"Unique values for column {column_name} in table {table_name} between {from_date} and {to_date}: {unique_values}")
    finally:
        # Nothing below touches the database; release the connection now,
        # including when one of the steps above raised.
        conn.close()

    # Step 5: Load the Hugging Face model for embeddings
    embedding_model = load_huggingface_model()
    print("Hugging Face model loaded successfully.")

    # Step 6: Initialize Pinecone and check for existing records before generating embeddings
    pinecone_index = initialize_pinecone()

    # Step 7: Generate embeddings for all unique values, avoiding duplicates
    embeddings_dict = generate_and_store_embeddings(embedding_model, unique_values_dict, pinecone_index)
    print("Embeddings for string columns generated successfully.")

    # Step 8: Upsert embeddings under each table's namespace, skipping existing records
    upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)


if __name__ == "__main__":
    main()


  schema_df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


Schema with string and date/time data types fetched successfully.
Processing table: tasks
Unique values for column task_id in table tasks between 2024-10-07 and 2024-10-08: []
Processing table: milestones
Unique values for column project_name in table milestones between 2024-10-07 and 2024-10-08: []
Processing table: projects
Unique values for column project_id in table projects between 2024-10-07 and 2024-10-08: []
Processing table: contacts
No date or timestamp column found for table contacts.
No 'created date' or timestamp column found for table contacts, skipping.
Processing table: users
No date or timestamp column found for table users.
No 'created date' or timestamp column found for table users, skipping.




Hugging Face model loaded successfully.
No unique values found for task_id in tasks. Skipping embeddings.
No unique values found for project_name in milestones. Skipping embeddings.
No unique values found for project_id in projects. Skipping embeddings.
Embeddings for string columns generated successfully.


In [None]:
# import psycopg2
# import pandas as pd
# from datetime import datetime, timedelta

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "python_test_poc"
# PORT = 5432

# # Function to connect to PostgreSQL database
# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise

# # Fetch schema with column names and data types, dynamically including 'date' or 'timestamp' types for created dates
# def fetch_schema_with_data_types(conn):
#     try:
#         query = """
#         SELECT table_name, column_name, data_type
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         AND data_type IN ('character varying', 'text', 'varchar', 'date', 'timestamp without time zone', 'timestamp with time zone')
#         """
#         schema_df = pd.read_sql(query, conn)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema with data types: {e}")
#         raise

# # Fetch unique values based on created date or time column dynamically
# def fetch_unique_values(conn, table_name, column_name, created_date_column, from_date, to_date):
#     try:
#         query = f"""
#         SELECT DISTINCT {column_name}
#         FROM {table_name}
#         WHERE {created_date_column} BETWEEN '{from_date}' AND '{to_date}'
#         """
#         df = pd.read_sql(query, conn)
#         return df[column_name].dropna().astype(str).tolist()
#     except Exception as e:
#         print(f"Error fetching unique values for {column_name} in {table_name} based on {created_date_column}: {e}")
#         return []

# # Find the 'created date' column dynamically based on its data type
# def find_created_date_column(schema_df, table_name):
#     try:
#         # Filter the schema for date or timestamp columns in the specified table
#         date_columns = schema_df[
#             (schema_df['table_name'] == table_name) & 
#             (schema_df['data_type'].isin(['date', 'timestamp without time zone', 'timestamp with time zone']))
#         ]
        
#         # Return the first 'date' or 'timestamp' column found (assuming it's the created date)
#         if not date_columns.empty:
#             return date_columns.iloc[0]['column_name']
#         else:
#             print(f"No date or timestamp column found for table {table_name}.")
#             return None
#     except Exception as e:
#         print(f"Error finding created date column for {table_name}: {e}")
#         return None

# # Main function to execute the process
# def main():
#     # Step 1: Connect to the database
#     conn = connect_to_db()

#     # Step 2: Fetch the schema with metadata and data types, including string and date/time columns
#     schema_df = fetch_schema_with_data_types(conn)
#     print("Schema with string and date/time data types fetched successfully.")

#     # Step 3: Define the date range for today (for daily scheduler)
#     today = datetime.now()
#     from_date = today.strftime('%Y-%m-%d')  # Start of the day
#     to_date = (today + timedelta(days=1)).strftime('%Y-%m-%d')  # End of the day

#     # Example: Iterate through tables dynamically, finding the created date column and fetching unique values
#     for table_name in schema_df['table_name'].unique():
#         print(f"Processing table: {table_name}")

#         # Step 4: Find the created date column dynamically for each table
#         created_date_column = find_created_date_column(schema_df, table_name)

#         # If a created date column is found, proceed to fetch unique values
#         if created_date_column:
#             # For simplicity, using the first string-based column for fetching unique values
#             string_columns = schema_df[
#                 (schema_df['table_name'] == table_name) & 
#                 (schema_df['data_type'].isin(['character varying', 'text', 'varchar']))
#             ]

#             # Fetch unique values only if there are string columns
#             if not string_columns.empty:
#                 column_name = string_columns.iloc[0]['column_name']  # Use the first string-based column
#                 unique_values = fetch_unique_values(conn, table_name, column_name, created_date_column, from_date, to_date)
#                 print(f"Unique values for column {column_name} in table {table_name} between {from_date} and {to_date}: {unique_values}")
#             else:
#                 print(f"No string columns found in table {table_name} to fetch unique values.")
#         else:
#             print(f"No 'created date' or timestamp column found for table {table_name}, skipping.")

#     # Close the connection after all operations
#     conn.close()

# if __name__ == "__main__":
#     main()


In [9]:
import psycopg2
import pandas as pd
from datetime import datetime, timedelta
from sentence_transformers import SentenceTransformer
from pinecone.grpc import PineconeGRPC as Pinecone

# Database connection details.
# NOTE(review): these credentials were committed in plain text. Each value can
# now be overridden through an environment variable; the literal fallbacks are
# kept only so existing runs keep working. Rotate the secrets and remove the
# fallbacks once the environment is configured.
import os

DATABASE_HOST = os.environ.get(
    "DATABASE_HOST",
    "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com",
)
DATABASE_USERNAME = os.environ.get("DATABASE_USERNAME", "postgres")
DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD", "valign#123")
DATABASE_DB = os.environ.get("DATABASE_DB", "python_test_poc")
PORT = int(os.environ.get("DATABASE_PORT", "5432"))

# Pinecone details (the API key is a secret as well; see the note above).
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "9fbe58e4-9e72-4023-90eb-ba8d022916b5")
index_name = "smart-desk"
BATCH_SIZE = 200  # Adjust the batch size to avoid exceeding the size limit

# Function to connect to PostgreSQL database
def connect_to_db():
    """Open a PostgreSQL connection from the module-level database settings.

    Returns:
        The connection object produced by ``psycopg2.connect``.

    Raises:
        psycopg2.Error: Re-raised after the failure has been printed.
    """
    connection_settings = {
        "dbname": DATABASE_DB,
        "user": DATABASE_USERNAME,
        "password": DATABASE_PASSWORD,
        "host": DATABASE_HOST,
        "port": PORT,
    }
    try:
        return psycopg2.connect(**connection_settings)
    except psycopg2.Error as exc:
        print(f"Error connecting to the database: {exc}")
        raise

# Fetch schema with column names and data types, dynamically including string and date/timestamp types
def fetch_schema_with_data_types(conn):
    """Load the string and date/timestamp columns of the ``public`` schema.

    Args:
        conn: Open database connection handed to ``pandas.read_sql``.

    Returns:
        DataFrame with the columns ``table_name``, ``column_name`` and
        ``data_type``.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    # NOTE(review): the query has no ORDER BY, so the row order is whatever the
    # database returns.
    schema_sql = """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'public'
        AND data_type IN ('character varying', 'text', 'varchar', 'date', 'timestamp without time zone', 'timestamp with time zone')
        """
    try:
        return pd.read_sql(schema_sql, conn)
    except Exception as exc:
        print(f"Error fetching schema with data types: {exc}")
        raise

# Find the 'created date' column dynamically based on its data type
def find_created_date_column(schema_df, table_name):
    """Pick the column that most likely holds a table's creation date.

    Candidates are the table's ``date`` / ``timestamp`` columns. A candidate
    whose name contains ``creat`` (``created_at``, ``creation_date``, ...) is
    preferred, so that an unrelated date column such as a due date is not used
    just because it is listed first. If no name matches, the first candidate
    is returned.

    Args:
        schema_df: DataFrame with ``table_name``, ``column_name`` and
            ``data_type`` columns.
        table_name: Table whose columns are inspected.

    Returns:
        The chosen column name, or ``None`` when the table has no date or
        timestamp column or the lookup fails (the reason is printed).
    """
    date_types = ['date', 'timestamp without time zone', 'timestamp with time zone']
    try:
        in_table = schema_df['table_name'] == table_name
        is_date = schema_df['data_type'].isin(date_types)
        candidates = schema_df.loc[in_table & is_date, 'column_name'].tolist()
    except Exception as e:
        print(f"Error finding created date column for {table_name}: {e}")
        return None

    if not candidates:
        print(f"No date or timestamp column found for table {table_name}.")
        return None

    # Prefer a column that is named like a creation timestamp.
    for candidate in candidates:
        if 'creat' in str(candidate).lower():
            return candidate
    return candidates[0]

# Fetch unique values based on created date or time column dynamically
def fetch_unique_values(conn, table_name, column_name, created_date_column, from_date, to_date):
    """Return the distinct non-null values of a column for rows created in a window.

    The window is half-open: ``from_date <= created_date_column < to_date``.
    The previous ``BETWEEN`` form was inclusive at both ends, so passing
    "today" and "tomorrow" also matched rows dated tomorrow.

    Args:
        conn: Open database connection handed to ``pandas.read_sql``.
        table_name: Table to query.
        column_name: Column whose distinct values are returned.
        created_date_column: Date/timestamp column used for the window.
        from_date: Inclusive lower bound, rendered with ``str()``.
        to_date: Exclusive upper bound, rendered with ``str()``.

    Returns:
        The values converted to ``str``, or ``[]`` when the query fails (the
        error is printed).
    """
    def quote_ident(name):
        # Double-quoting keeps the exact spelling of the identifier (including
        # mixed case) and stops it from being parsed as arbitrary SQL.
        return '"' + str(name).replace('"', '""') + '"'

    def quote_literal(value):
        # Standard SQL string literal: embedded single quotes are doubled.
        return "'" + str(value).replace("'", "''") + "'"

    try:
        value_column = quote_ident(column_name)
        date_column = quote_ident(created_date_column)
        query = f"""
        SELECT DISTINCT {value_column}
        FROM {quote_ident(table_name)}
        WHERE {date_column} >= {quote_literal(from_date)}
          AND {date_column} < {quote_literal(to_date)}
        """
        df = pd.read_sql(query, conn)
        return df[column_name].dropna().astype(str).tolist()
    except Exception as e:
        print(f"Error fetching unique values for {column_name} in {table_name} based on {created_date_column}: {e}")
        return []

# Initialize SentenceTransformer model for Hugging Face embeddings
def load_huggingface_model():
    """Instantiate the ``all-mpnet-base-v2`` SentenceTransformer used for embeddings."""
    return SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Generate embeddings for each unique value and store them
def generate_and_store_embeddings(embedding_model, unique_values_dict):
    """Encode the values of every column in one ``encode`` call per column.

    Args:
        embedding_model: Object exposing ``encode(list_of_values)``.
        unique_values_dict: ``{table_name: {column_name: [values]}}``.

    Returns:
        ``{table_name: {column_name: {"unique_values": ..., "embeddings": ...}}}``.
        Columns without values get two empty lists. When encoding raises, the
        error is printed and the column keeps its values with an empty
        ``"embeddings"`` list.
    """
    results = {}
    for table_name, columns in unique_values_dict.items():
        per_column = {}
        results[table_name] = per_column
        for column_name, unique_values in columns.items():
            if not unique_values:
                print(f"No unique values found for {column_name} in {table_name}. Skipping embeddings.")
                per_column[column_name] = {
                    "unique_values": [],
                    "embeddings": []
                }
                continue

            try:
                vectors = embedding_model.encode(unique_values)
            except Exception as exc:
                print(f"Error generating embeddings for {column_name} in {table_name}: {exc}")
                vectors = []  # Store an empty list if encoding fails
            per_column[column_name] = {
                "unique_values": unique_values,
                "embeddings": vectors
            }
    return results

# Initialize Pinecone
def initialize_pinecone():
    """Create a Pinecone client from the module-level API key and return the configured index handle."""
    client = Pinecone(api_key=pinecone_api_key)
    return client.Index(index_name)

# Batch the embeddings for upserts
def batch_embeddings(upsert_data, batch_size):
    """Lazily yield consecutive slices of ``upsert_data``, each at most ``batch_size`` long."""
    offsets = range(0, len(upsert_data), batch_size)
    yield from (upsert_data[start:start + batch_size] for start in offsets)

# Upsert embeddings into Pinecone with metadata for each table (namespace)
def upsert_embeddings_into_pinecone(index, embeddings_dict):
    """Upsert the embeddings into Pinecone, one namespace per table.

    Vector IDs are ``"{table_name}_{column_name}_{unique_value}"``. The
    previous positional IDs (``..._{i}``) restarted at 0 on every run, so a
    run that embeds only the current day's values overwrote vectors written
    by earlier runs. A value-derived ID is stable across runs.

    Args:
        index: Pinecone index handle exposing ``upsert(vectors=..., namespace=...)``.
        embeddings_dict: ``{table_name: {column_name: {"unique_values": [...],
            "embeddings": [...]}}}`` with aligned sequences; each embedding
            must provide ``tolist()``.
    """
    # NOTE(review): raw values become part of the ID; confirm they satisfy
    # Pinecone's ID constraints (length / character set) for this data.
    for table_name, columns in embeddings_dict.items():
        for column_name, data in columns.items():
            upsert_data = [
                {
                    "id": f"{table_name}_{column_name}_{unique_value}",
                    "values": embedding.tolist(),
                    "metadata": {"column_name": column_name, "unique_value": unique_value},
                }
                for unique_value, embedding in zip(data['unique_values'], data['embeddings'])
            ]

            # Batch the upsert to avoid exceeding size limits
            for batch in batch_embeddings(upsert_data, BATCH_SIZE):
                index.upsert(vectors=batch, namespace=table_name)
                print(f"Upserted batch for {column_name} in {table_name}")

# Main function to execute the process
def main():
    """Embed today's distinct string values and upsert them into Pinecone.

    For every table in the fetched schema that has a date/timestamp column and
    a string column, the distinct values of the first string column for rows
    dated between today and tomorrow are collected, embedded, and upserted
    under the table's namespace.

    The database connection is closed in a ``finally`` block as soon as the
    values are collected, so it is released even when a step raises and is not
    held open while the model loads or Pinecone is called.
    """
    # Step 1: Connect to the database
    conn = connect_to_db()
    unique_values_dict = {}
    try:
        # Step 2: Fetch the schema with metadata and data types, including string and date/time columns
        schema_df = fetch_schema_with_data_types(conn)
        print("Schema with string and date/time data types fetched successfully.")

        # Step 3: Define the date range for today (for daily scheduler)
        # NOTE(review): datetime.now() is naive local time -- confirm it matches
        # the timezone of the stored timestamps.
        today = datetime.now()
        from_date = today.strftime('%Y-%m-%d')  # Start of the day
        to_date = (today + timedelta(days=1)).strftime('%Y-%m-%d')  # End of the day

        string_types = ['character varying', 'text', 'varchar']
        for table_name in schema_df['table_name'].unique():
            print(f"Processing table: {table_name}")

            # Step 4: Find the created date column dynamically for each table
            created_date_column = find_created_date_column(schema_df, table_name)
            if not created_date_column:
                print(f"No 'created date' or timestamp column found for table {table_name}, skipping.")
                continue

            # For simplicity, only the first string-based column is used
            string_columns = schema_df[
                (schema_df['table_name'] == table_name) &
                (schema_df['data_type'].isin(string_types))
            ]
            if string_columns.empty:
                print(f"No string columns found in table {table_name} to fetch unique values.")
                continue

            column_name = string_columns.iloc[0]['column_name']
            unique_values = fetch_unique_values(conn, table_name, column_name, created_date_column, from_date, to_date)
            unique_values_dict.setdefault(table_name, {})[column_name] = unique_values
            print(f"Unique values for column {column_name} in table {table_name} between {from_date} and {to_date}: {unique_values}")
    finally:
        # Nothing below touches the database; release the connection now,
        # including when one of the steps above raised.
        conn.close()

    # Step 5: Load the Hugging Face model for embeddings
    embedding_model = load_huggingface_model()
    print("Hugging Face model loaded successfully.")

    # Step 6: Generate embeddings for all unique values
    embeddings_dict = generate_and_store_embeddings(embedding_model, unique_values_dict)
    print("Embeddings for string columns generated successfully.")

    # Step 7: Initialize Pinecone and upsert embeddings under each table's namespace
    pinecone_index = initialize_pinecone()
    upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)
    print("Embeddings upserted into Pinecone successfully.")


if __name__ == "__main__":
    main()


  schema_df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


Schema with string and date/time data types fetched successfully.
Processing table: tasks
Unique values for column task_id in table tasks between 2024-10-07 and 2024-10-08: []
Processing table: milestones
Unique values for column project_name in table milestones between 2024-10-07 and 2024-10-08: []
Processing table: projects
Unique values for column project_id in table projects between 2024-10-07 and 2024-10-08: []
Processing table: contacts
No date or timestamp column found for table contacts.
No 'created date' or timestamp column found for table contacts, skipping.
Processing table: users
No date or timestamp column found for table users.
No 'created date' or timestamp column found for table users, skipping.




Hugging Face model loaded successfully.
No unique values found for task_id in tasks. Skipping embeddings.
No unique values found for project_name in milestones. Skipping embeddings.
No unique values found for project_id in projects. Skipping embeddings.
Embeddings for string columns generated successfully.
Embeddings upserted into Pinecone successfully.


In [None]:
# #Version One Vector DB Setup

# import psycopg2
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# from pinecone.grpc import PineconeGRPC as Pinecone

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "postgres"
# PORT = 5432

# # Pinecone details
# pinecone_api_key = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"
# index_name = "smart-desk"
# BATCH_SIZE = 200  # Adjust the batch size to avoid exceeding the size limit

# # Function to connect to PostgreSQL database
# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise

# # Fetch schema with column names and data types, only including string types
# def fetch_schema_with_data_types(conn):
#     try:
#         query = """
#         SELECT table_name, column_name, data_type
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         AND data_type IN ('character varying', 'text', 'varchar')
#         """
#         schema_df = pd.read_sql(query, conn)
#         print(schema_df)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema with data types: {e}")
#         raise

# # Fetch unique values from each column along with table details
# def fetch_unique_values(conn, table_name, column_name):
#     try:
#         query = f"SELECT DISTINCT {column_name} FROM {table_name}"
#         df = pd.read_sql(query, conn)
#         return df[column_name].dropna().astype(str).tolist()
#     except Exception as e:
#         print(f"Error fetching unique values for {column_name} in {table_name}: {e}")
#         return []

# # Fetch all unique values for each column and map them to table details
# def fetch_all_unique_values_with_table(conn, schema_df):
#     unique_values_dict = {}
#     for table_name in schema_df['table_name'].unique():
#         unique_values_dict[table_name] = {}
#         table_columns = schema_df[schema_df['table_name'] == table_name]
#         for column_name in table_columns['column_name']:
#             unique_values = fetch_unique_values(conn, table_name, column_name)
#             unique_values_dict[table_name][column_name] = unique_values
#     return unique_values_dict

# # Initialize SentenceTransformer model for Hugging Face embeddings
# def load_huggingface_model():
#     model_name = 'sentence-transformers/all-mpnet-base-v2'
#     embedding_model = SentenceTransformer(model_name)
#     return embedding_model

# # Generate embeddings for each unique value and store them
# def generate_and_store_embeddings(embedding_model, unique_values_dict):
#     embeddings_dict = {}
#     for table_name, columns in unique_values_dict.items():
#         embeddings_dict[table_name] = {}
#         for column_name, unique_values in columns.items():
#             embeddings = embedding_model.encode(unique_values)
#             embeddings_dict[table_name][column_name] = {
#                 "unique_values": unique_values,
#                 "embeddings": embeddings
#             }
#     return embeddings_dict

# # Initialize Pinecone
# def initialize_pinecone():
#     pc = Pinecone(api_key=pinecone_api_key)
#     index = pc.Index(index_name)
#     return index

# # Batch the embeddings for upserts
# def batch_embeddings(upsert_data, batch_size):
#     for i in range(0, len(upsert_data), batch_size):
#         yield upsert_data[i:i + batch_size]

# # Upsert embeddings into Pinecone with metadata for each table (namespace)
# def upsert_embeddings_into_pinecone(index, embeddings_dict):
#     for table_name, columns in embeddings_dict.items():
#         for column_name, data in columns.items():
#             upsert_data = []
#             for i, embedding in enumerate(data['embeddings']):
#                 unique_value = data['unique_values'][i]
#                 vector_id = f"{table_name}_{column_name}_{i}"
#                 metadata = {"column_name": column_name, "unique_value": unique_value}

#                 upsert_data.append({
#                     "id": vector_id,
#                     "values": embedding.tolist(),
#                     "metadata": metadata
#                 })

#             # Batch the upsert to avoid exceeding size limits
#             for batch in batch_embeddings(upsert_data, BATCH_SIZE):
#                 index.upsert(vectors=batch, namespace=table_name)
#                 print(f"Upserted batch for {column_name} in {table_name}")

# # Main function to execute the process
# def main():
#     # Step 1: Connect to the database
#     conn = connect_to_db()

#     # Step 2: Fetch the schema with metadata and data types, only for string columns
#     schema_df = fetch_schema_with_data_types(conn)
#     print("Schema with string data types fetched successfully.")

#     # Step 3: Fetch all unique values along with table details
#     unique_values_dict = fetch_all_unique_values_with_table(conn, schema_df)
#     print("Unique values for string columns fetched successfully.")

#     # Step 4: Load the Hugging Face model for embeddings
#     embedding_model = load_huggingface_model()
#     print("Hugging Face model loaded successfully.")

#     # Step 5: Generate embeddings for all unique values
#     embeddings_dict = generate_and_store_embeddings(embedding_model, unique_values_dict)
#     print("Embeddings for string columns generated successfully.")

#     # Step 6: Initialize Pinecone and upsert embeddings under each table's namespace
#     pinecone_index = initialize_pinecone()
#     upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)
#     print("Embeddings upserted into Pinecone successfully.")

# if __name__ == "__main__":
#     main()


In [None]:
# #Updated Pinecone Code to Create and Upsert Embeddings

# import psycopg2
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# from pinecone.grpc import PineconeGRPC as Pinecone

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "postgres"
# PORT = 5432

# # Pinecone details
# pinecone_api_key = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"
# index_name = "smart-desk"
# BATCH_SIZE = 200  # Adjust the batch size to avoid exceeding the size limit

# # Function to connect to PostgreSQL database
# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise

# # Fetch schema with column names and data types, restricted to string-typed columns of the 'contacts' table
# def fetch_schema_with_data_types(conn):
#     try:
#         query = """
#         SELECT table_name, column_name, data_type
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         AND data_type IN ('character varying', 'text', 'varchar') AND table_name = 'contacts'
#         """
#         schema_df = pd.read_sql(query, conn)
#         print(schema_df)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema with data types: {e}")
#         raise

# # Fetch the distinct non-null values of a single column, converted to strings (returns an empty list on error)
# def fetch_unique_values(conn, table_name, column_name):
#     try:
#         query = f"SELECT DISTINCT {column_name} FROM {table_name}"
#         df = pd.read_sql(query, conn)
#         return df[column_name].dropna().astype(str).tolist()
#     except Exception as e:
#         print(f"Error fetching unique values for {column_name} in {table_name}: {e}")
#         return []

# # Fetch all unique values for each column and map them to table details
# def fetch_all_unique_values_with_table(conn, schema_df):
#     unique_values_dict = {}
#     for table_name in schema_df['table_name'].unique():
#         unique_values_dict[table_name] = {}
#         table_columns = schema_df[schema_df['table_name'] == table_name]
#         for column_name in table_columns['column_name']:
#             unique_values = fetch_unique_values(conn, table_name, column_name)
#             unique_values_dict[table_name][column_name] = unique_values
#     return unique_values_dict

# # Initialize SentenceTransformer model for Hugging Face embeddings
# def load_huggingface_model():
#     model_name = 'sentence-transformers/all-mpnet-base-v2'
#     embedding_model = SentenceTransformer(model_name)
#     return embedding_model

# # Generate embeddings for each column's unique values and store them.
# # Columns with no unique values, or whose encoding fails, are stored with an empty embeddings list.
# def generate_and_store_embeddings(embedding_model, unique_values_dict):
#     embeddings_dict = {}
#     for table_name, columns in unique_values_dict.items():
#         embeddings_dict[table_name] = {}
#         for column_name, unique_values in columns.items():
#             if unique_values:  # Check if there are any unique values
#                 try:
#                     embeddings = embedding_model.encode(unique_values)
#                     embeddings_dict[table_name][column_name] = {
#                         "unique_values": unique_values,
#                         "embeddings": embeddings
#                     }
#                 except Exception as e:
#                     print(f"Error generating embeddings for {column_name} in {table_name}: {e}")
#                     embeddings_dict[table_name][column_name] = {
#                         "unique_values": unique_values,
#                         "embeddings": []  # Store an empty list if encoding fails
#                     }
#             else:
#                 print(f"No unique values found for {column_name} in {table_name}. Skipping embeddings.")
#                 embeddings_dict[table_name][column_name] = {
#                     "unique_values": [],
#                     "embeddings": []
#                 }
#     return embeddings_dict


# # Initialize Pinecone
# def initialize_pinecone():
#     pc = Pinecone(api_key=pinecone_api_key)
#     index = pc.Index(index_name)
#     return index

# # Batch the embeddings for upserts
# def batch_embeddings(upsert_data, batch_size):
#     for i in range(0, len(upsert_data), batch_size):
#         yield upsert_data[i:i + batch_size]

# # Upsert embeddings into Pinecone with metadata for each table (namespace)
# def upsert_embeddings_into_pinecone(index, embeddings_dict):
#     for table_name, columns in embeddings_dict.items():
#         for column_name, data in columns.items():
#             upsert_data = []
#             for i, embedding in enumerate(data['embeddings']):
#                 unique_value = data['unique_values'][i]
#                 vector_id = f"{table_name}_{column_name}_{i}"
#                 metadata = {"column_name": column_name, "unique_value": unique_value}

#                 upsert_data.append({
#                     "id": vector_id,
#                     "values": embedding.tolist(),
#                     "metadata": metadata
#                 })

#             # Batch the upsert to avoid exceeding size limits
#             for batch in batch_embeddings(upsert_data, BATCH_SIZE):
#                 index.upsert(vectors=batch, namespace=table_name)
#                 print(f"Upserted batch for {column_name} in {table_name}")

# # Main function to execute the process
# def main():
#     # Step 1: Connect to the database
#     conn = connect_to_db()

#     # Step 2: Fetch the schema with metadata and data types, only for string columns
#     schema_df = fetch_schema_with_data_types(conn)
#     print("Schema with string data types fetched successfully.")

#     # Step 3: Fetch all unique values along with table details
#     unique_values_dict = fetch_all_unique_values_with_table(conn, schema_df)
#     print("Unique values for string columns fetched successfully.")

#     # Step 4: Load the Hugging Face model for embeddings
#     embedding_model = load_huggingface_model()
#     print("Hugging Face model loaded successfully.")

#     # Step 5: Generate embeddings for all unique values
#     embeddings_dict = generate_and_store_embeddings(embedding_model, unique_values_dict)
#     print("Embeddings for string columns generated successfully.")

#     # Step 6: Initialize Pinecone and upsert embeddings under each table's namespace
#     pinecone_index = initialize_pinecone()
#     upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)
#     print("Embeddings upserted into Pinecone successfully.")

# if __name__ == "__main__":
#     main()


In [None]:
# import psycopg2
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# from pinecone.grpc import PineconeGRPC as Pinecone

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "python_test_poc"
# PORT = 5432

# # Pinecone details
# pinecone_api_key = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"
# index_name = "smart-desk"
# BATCH_SIZE = 200  # Adjust the batch size to avoid exceeding the size limit

# # Function to connect to PostgreSQL database
# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise

# # Fetch schema with column names and data types
# def fetch_schema_with_data_types(conn):
#     try:
#         query = """
#         SELECT table_name, column_name, data_type
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         """
#         schema_df = pd.read_sql(query, conn)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema with data types: {e}")
#         raise

# # Fetch the distinct non-null values of a single column, converted to strings (returns an empty list on error)
# def fetch_unique_values(conn, table_name, column_name):
#     try:
#         query = f"SELECT DISTINCT {column_name} FROM {table_name}"
#         df = pd.read_sql(query, conn)
#         return df[column_name].dropna().astype(str).tolist()
#     except Exception as e:
#         print(f"Error fetching unique values for {column_name} in {table_name}: {e}")
#         return []

# # Fetch all unique values for each column and map them to table details
# def fetch_all_unique_values_with_table(conn, schema_df):
#     unique_values_dict = {}
#     for table_name in schema_df['table_name'].unique():
#         unique_values_dict[table_name] = {}
#         table_columns = schema_df[schema_df['table_name'] == table_name]
#         for column_name in table_columns['column_name']:
#             unique_values = fetch_unique_values(conn, table_name, column_name)
#             unique_values_dict[table_name][column_name] = unique_values
#     return unique_values_dict

# # Initialize SentenceTransformer model for Hugging Face embeddings
# def load_huggingface_model():
#     model_name = 'sentence-transformers/all-mpnet-base-v2'
#     embedding_model = SentenceTransformer(model_name)
#     return embedding_model

# # Generate embeddings for each unique value and store them
# def generate_and_store_embeddings(embedding_model, unique_values_dict):
#     embeddings_dict = {}
#     for table_name, columns in unique_values_dict.items():
#         embeddings_dict[table_name] = {}
#         for column_name, unique_values in columns.items():
#             embeddings = embedding_model.encode(unique_values)
#             embeddings_dict[table_name][column_name] = {
#                 "unique_values": unique_values,
#                 "embeddings": embeddings
#             }
#     return embeddings_dict

# # Initialize Pinecone
# def initialize_pinecone():
#     pc = Pinecone(api_key=pinecone_api_key)
#     index = pc.Index(index_name)
#     return index

# # Batch the embeddings for upserts
# def batch_embeddings(upsert_data, batch_size):
#     for i in range(0, len(upsert_data), batch_size):
#         yield upsert_data[i:i + batch_size]

# # Upsert embeddings into Pinecone with metadata for each table (namespace)
# def upsert_embeddings_into_pinecone(index, embeddings_dict):
#     for table_name, columns in embeddings_dict.items():
#         for column_name, data in columns.items():
#             upsert_data = []
#             for i, embedding in enumerate(data['embeddings']):
#                 unique_value = data['unique_values'][i]
#                 vector_id = f"{table_name}_{column_name}_{i}"
#                 metadata = {"column_name": column_name, "unique_value": unique_value}

#                 upsert_data.append({
#                     "id": vector_id,
#                     "values": embedding.tolist(),
#                     "metadata": metadata
#                 })

#             # Batch the upsert to avoid exceeding size limits
#             for batch in batch_embeddings(upsert_data, BATCH_SIZE):
#                 index.upsert(vectors=batch, namespace=table_name)
#                 print(f"Upserted batch for {column_name} in {table_name}")

# # Main function to execute the process
# def main():
#     # Step 1: Connect to the database
#     conn = connect_to_db()

#     # Step 2: Fetch the schema with metadata and data types
#     schema_df = fetch_schema_with_data_types(conn)
#     print("Schema with data types fetched successfully.")

#     # Step 3: Fetch all unique values along with table details
#     unique_values_dict = fetch_all_unique_values_with_table(conn, schema_df)
#     print("Unique values fetched successfully.")

#     # Step 4: Load the Hugging Face model for embeddings
#     embedding_model = load_huggingface_model()
#     print("Hugging Face model loaded successfully.")

#     # Step 5: Generate embeddings for all unique values
#     embeddings_dict = generate_and_store_embeddings(embedding_model, unique_values_dict)
#     print("Embeddings generated successfully.")

#     # Step 6: Initialize Pinecone and upsert embeddings under each table's namespace
#     pinecone_index = initialize_pinecone()
#     upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)
#     print("Embeddings upserted into Pinecone successfully.")

# if __name__ == "__main__":
#     main()


In [9]:
# #Updated Pinecone Code to Create and Upsert Embeddings

# import psycopg2
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# from pinecone.grpc import PineconeGRPC as Pinecone

# # Database connection details
# DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
# DATABASE_USERNAME = "postgres"
# DATABASE_PASSWORD = "valign#123"
# DATABASE_DB = "postgres"
# PORT = 5432

# # Pinecone details
# pinecone_api_key = "9fbe58e4-9e72-4023-90eb-ba8d022916b5"
# index_name = "smart-desk"
# BATCH_SIZE = 200  # Adjust the batch size to avoid exceeding the size limit

# # Function to connect to PostgreSQL database
# def connect_to_db():
#     try:
#         conn = psycopg2.connect(
#             dbname=DATABASE_DB,
#             user=DATABASE_USERNAME,
#             password=DATABASE_PASSWORD,
#             host=DATABASE_HOST,
#             port=PORT
#         )
#         return conn
#     except psycopg2.Error as e:
#         print(f"Error connecting to the database: {e}")
#         raise

# # Fetch schema with column names and data types, restricted to string-typed columns of the 'contacts' table
# def fetch_schema_with_data_types(conn):
#     try:
#         query = """
#         SELECT table_name, column_name, data_type
#         FROM information_schema.columns
#         WHERE table_schema = 'public'
#         AND data_type IN ('character varying', 'text', 'varchar') AND table_name = 'contacts'
#         """
#         schema_df = pd.read_sql(query, conn)
#         print(schema_df)
#         return schema_df
#     except Exception as e:
#         print(f"Error fetching schema with data types: {e}")
#         raise

# # Fetch the distinct non-null values of a single column, converted to strings (returns an empty list on error)
# def fetch_unique_values(conn, table_name, column_name):
#     try:
#         query = f"SELECT DISTINCT {column_name} FROM {table_name}"
#         df = pd.read_sql(query, conn)
#         return df[column_name].dropna().astype(str).tolist()
#     except Exception as e:
#         print(f"Error fetching unique values for {column_name} in {table_name}: {e}")
#         return []

# # Fetch all unique values for each column and map them to table details
# def fetch_all_unique_values_with_table(conn, schema_df):
#     unique_values_dict = {}
#     for table_name in schema_df['table_name'].unique():
#         unique_values_dict[table_name] = {}
#         table_columns = schema_df[schema_df['table_name'] == table_name]
#         for column_name in table_columns['column_name']:
#             unique_values = fetch_unique_values(conn, table_name, column_name)
#             unique_values_dict[table_name][column_name] = unique_values
#     return unique_values_dict

# # Initialize SentenceTransformer model for Hugging Face embeddings
# def load_huggingface_model():
#     model_name = 'sentence-transformers/all-mpnet-base-v2'
#     embedding_model = SentenceTransformer(model_name)
#     return embedding_model

# # Generate embeddings for each column's unique values and store them.
# # Columns with no unique values, or whose encoding fails, are stored with an empty embeddings list.
# def generate_and_store_embeddings(embedding_model, unique_values_dict):
#     embeddings_dict = {}
#     for table_name, columns in unique_values_dict.items():
#         embeddings_dict[table_name] = {}
#         for column_name, unique_values in columns.items():
#             if unique_values:  # Check if there are any unique values
#                 try:
#                     embeddings = embedding_model.encode(unique_values)
#                     embeddings_dict[table_name][column_name] = {
#                         "unique_values": unique_values,
#                         "embeddings": embeddings
#                     }
#                 except Exception as e:
#                     print(f"Error generating embeddings for {column_name} in {table_name}: {e}")
#                     embeddings_dict[table_name][column_name] = {
#                         "unique_values": unique_values,
#                         "embeddings": []  # Store an empty list if encoding fails
#                     }
#             else:
#                 print(f"No unique values found for {column_name} in {table_name}. Skipping embeddings.")
#                 embeddings_dict[table_name][column_name] = {
#                     "unique_values": [],
#                     "embeddings": []
#                 }
#     return embeddings_dict


# # Initialize Pinecone
# def initialize_pinecone():
#     pc = Pinecone(api_key=pinecone_api_key)
#     index = pc.Index(index_name)
#     return index

# # Batch the embeddings for upserts
# def batch_embeddings(upsert_data, batch_size):
#     for i in range(0, len(upsert_data), batch_size):
#         yield upsert_data[i:i + batch_size]

# # Upsert embeddings into Pinecone with metadata for each table (namespace)
# def upsert_embeddings_into_pinecone(index, embeddings_dict):
#     for table_name, columns in embeddings_dict.items():
#         for column_name, data in columns.items():
#             upsert_data = []
#             for i, embedding in enumerate(data['embeddings']):
#                 unique_value = data['unique_values'][i]
#                 vector_id = f"{table_name}_{column_name}_{i}"
#                 metadata = {"column_name": column_name, "unique_value": unique_value}

#                 upsert_data.append({
#                     "id": vector_id,
#                     "values": embedding.tolist(),
#                     "metadata": metadata
#                 })

#             # Batch the upsert to avoid exceeding size limits
#             for batch in batch_embeddings(upsert_data, BATCH_SIZE):
#                 index.upsert(vectors=batch, namespace=table_name)
#                 print(f"Upserted batch for {column_name} in {table_name}")

# # Main function to execute the process
# def main():
#     # Step 1: Connect to the database
#     conn = connect_to_db()

#     # Step 2: Fetch the schema with metadata and data types, only for string columns
#     schema_df = fetch_schema_with_data_types(conn)
#     print("Schema with string data types fetched successfully.")

#     # Step 3: Fetch all unique values along with table details
#     unique_values_dict = fetch_all_unique_values_with_table(conn, schema_df)
#     print("Unique values for string columns fetched successfully.")

#     # Step 4: Load the Hugging Face model for embeddings
#     embedding_model = load_huggingface_model()
#     print("Hugging Face model loaded successfully.")

#     # Step 5: Generate embeddings for all unique values
#     embeddings_dict = generate_and_store_embeddings(embedding_model, unique_values_dict)
#     print("Embeddings for string columns generated successfully.")

#     # Step 6: Initialize Pinecone and upsert embeddings under each table's namespace
#     pinecone_index = initialize_pinecone()
#     upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)
#     print("Embeddings upserted into Pinecone successfully.")

# if __name__ == "__main__":
#     main()


  schema_df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


   table_name                 column_name          data_type
0    contacts                contact_name  character varying
1    contacts               customer_name  character varying
2    contacts                 vendor_name  character varying
3    contacts                company_name  character varying
4    contacts                contact_type  character varying
5    contacts               currency_code  character varying
6    contacts                  first_name  character varying
7    contacts                   last_name  character varying
8    contacts                       email  character varying
9    contacts                       phone  character varying
10   contacts                      mobile  character varying
11   contacts                      gst_no  character varying
12   contacts               gst_treatment  character varying
13   contacts  place_of_contact_formatted  character varying
14   contacts            place_of_contact  character varying
15   contacts           

  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


Unique values for string columns fetched successfully.




Hugging Face model loaded successfully.
No unique values found for vendor_name in contacts. Skipping embeddings.
No unique values found for currency_code in contacts. Skipping embeddings.
No unique values found for first_name in contacts. Skipping embeddings.
No unique values found for last_name in contacts. Skipping embeddings.
No unique values found for email in contacts. Skipping embeddings.
No unique values found for phone in contacts. Skipping embeddings.
No unique values found for mobile in contacts. Skipping embeddings.
No unique values found for gst_no in contacts. Skipping embeddings.
No unique values found for gst_treatment in contacts. Skipping embeddings.
No unique values found for place_of_contact_formatted in contacts. Skipping embeddings.
No unique values found for place_of_contact in contacts. Skipping embeddings.
No unique values found for pan_no in contacts. Skipping embeddings.
Embeddings for string columns generated successfully.
Upserted batch for contact_name in c