In [1]:
import openai

In [2]:
import psycopg2
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from pinecone.grpc import PineconeGRPC as Pinecone
from sentence_transformers import SentenceTransformer, util
from pinecone import ServerlessSpec
# Database connection details
DATABASE_HOST = "database-test-postgress-instance.cpk2uyae6iza.ap-south-1.rds.amazonaws.com"
DATABASE_USERNAME = "postgres"
DATABASE_PASSWORD = "valign#123"
DATABASE_DB = "zoho_projects_data_v2"
PORT = 5432

# Pinecone details
pinecone_api_key = "pcsk_32zXtP_T9wkSxopwxc5vqKe56jjsi3c2tDysBNighCv4hDx8JtC2MrKduZe1QDKJwf99zX"
index_name = "jagoai"
BATCH_SIZE = 512  # Adjust the batch size to avoid exceeding the size limit

# Function to connect to PostgreSQL database
def connect_to_db():
    try:
        conn = psycopg2.connect(
            dbname=DATABASE_DB,
            user=DATABASE_USERNAME,
            password=DATABASE_PASSWORD,
            host=DATABASE_HOST,
            port=PORT
        )
        return conn
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        raise

# Fetch schema with column names and data types, only including string types
def fetch_schema_with_data_types(conn):
    try:
        query = """
         SELECT table_name, column_name, data_type, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = 'public'
          AND (data_type = 'character varying' OR data_type IN ('text', 'varchar'))
        ORDER BY table_name;
        """
        schema_df = pd.read_sql(query, conn)
        print(schema_df)
        return schema_df
    except Exception as e:
        print(f"Error fetching schema with data types: {e}")
        raise

# Fetch unique values from each column along with table details
def fetch_unique_values(conn, table_name, column_name):
    try:
        query = f"SELECT DISTINCT {column_name} FROM {table_name}"
        df = pd.read_sql(query, conn)
        return df[column_name].dropna().astype(str).tolist()
    except Exception as e:
        print(f"Error fetching unique values for {column_name} in {table_name}: {e}")
        return []

# Fetch all unique values for each column and map them to table details
def fetch_all_unique_values_with_table(conn, schema_df):
    unique_values_dict = {}
    for table_name in schema_df['table_name'].unique():
        unique_values_dict[table_name] = {}
        table_columns = schema_df[schema_df['table_name'] == table_name]
        for column_name in table_columns['column_name']:
            unique_values = fetch_unique_values(conn, table_name, column_name)
            unique_values_dict[table_name][column_name] = unique_values
    return unique_values_dict


# Generate embeddings for each unique value using BERT model
def generate_and_store_embeddings(unique_values_dict):
    openai.api_key='your api key'
    embeddings_dict = {}
    for table_name, columns in unique_values_dict.items():
        embeddings_dict[table_name] = {}
        for column_name, unique_values in columns.items():
            if unique_values:
                unique_values=set(unique_values)
                unique_values=list(unique_values)
                unique_values = [item for item in unique_values if not (isinstance(item, (int, float)) or item is None or item == "")]
                for val in unique_values:
                    print(val)
                    try:
                        #print("unique Values:",unique_values)
                        response = openai.embeddings.create(
                            model="text-embedding-3-large",
                            input=val
                        )
                        embedding = response.data[0].embedding
                        #print("Embeding:",embedding)
                        # Output the embedding
                        embeddings_dict[table_name][column_name] = {
                            "unique_values": val,
                            "embeddings": embedding}
                    except Exception as e:
                        #print("unique_values:",unique_values)
                        print(f"Error generating embeddings for {column_name} in {table_name}: {e}")
                        embeddings_dict[table_name][column_name] = {
                            "unique_values": val,
                            "embeddings": []
                        }
            else:
                print(f"No unique values found for {column_name} in {table_name}. Skipping embeddings.")
    return embeddings_dict

# Initialize Pinecone with Dot Product metric
def initialize_pinecone():
    pc = Pinecone(api_key=pinecone_api_key)
    '''pc.create_index(spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"),
        name=index_name,dimension=768, metric="cosine")'''
    index=pc.Index(index_name)
    return index

# Batch the embeddings for upserts
def batch_embeddings(upsert_data, batch_size):
    for i in range(0, len(upsert_data), batch_size):
        yield upsert_data[i:i + batch_size]

# Upsert embeddings into Pinecone with metadata for each table (namespace)
def upsert_embeddings_into_pinecone(index, embeddings_dict):
    for table_name, columns in embeddings_dict.items():
        print(table_name, columns)
        for column_name, data in columns.items():
            upsert_data = []
            for i, embedding in enumerate(data['embeddings']):
                unique_value = data['unique_values'][i]
                vector_id = f"{table_name}_{column_name}_{i}"
                metadata = {"column_name": column_name, "unique_value": unique_value}

                upsert_data.append({
                    "id": vector_id,
                    "values": embedding,
                    "metadata": metadata
                })

            # Batch the upsert to avoid exceeding size limits
            for batch in batch_embeddings(upsert_data, BATCH_SIZE):
                index.upsert(vectors=batch, namespace=table_name)
                print(f"Upserted batch for {column_name} in {table_name}")




In [3]:

# Step 1: Connect to the database
conn = connect_to_db()

# Step 2: Fetch the schema with metadata and data types, only for string columns
schema_df = fetch_schema_with_data_types(conn)
print("Schema with string data types fetched successfully.")


               table_name     column_name data_type character_maximum_length
0   issues_zoho_projects_     projectname      text                     None
1   issues_zoho_projects_   milestonename      text                     None
2   issues_zoho_projects_          module      text                     None
3   issues_zoho_projects_  classification      text                     None
4   issues_zoho_projects_    reproducible      text                     None
..                    ...             ...       ...                      ...
84   users_zoho_projects_          status      text                     None
85   users_zoho_projects_         profile      text                     None
86   users_zoho_projects_            role      text                     None
87   users_zoho_projects_       useremail      text                     None
88   users_zoho_projects_        username      text                     None

[89 rows x 4 columns]
Schema with string data types fetched successfully.


  schema_df = pd.read_sql(query, conn)


In [4]:

# Step 3: Fetch all unique values along with table details
unique_values_dict = fetch_all_unique_values_with_table(conn, schema_df)
print("Unique values for string columns fetched successfully.")



  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


Error fetching unique values for milestoneurl in milestones_zoho_projects_: Execution failed on sql: SELECT DISTINCT milestoneurl FROM milestones_zoho_projects_
server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.
server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.

unable to rollback
Error fetching unique values for milestonename in milestones_zoho_projects_: connection already closed
Error fetching unique values for delaypercentage in milestones_zoho_projects_: connection already closed
Error fetching unique values for milestonedelaytime in milestones_zoho_projects_: connection already closed
Error fetching unique values for flag in milestones_zoho_projects_: connection already closed
Error fetching unique values for duration in milestones_zoho_projects_: connection already closed
Error fetching unique values for actu

In [5]:
# Step 5: Generate embeddings for all unique values
embeddings_dict = generate_and_store_embeddings(unique_values_dict)
print("Embeddings for string columns generated successfully.")

Bahwan IT LLC - Zoho one
A Y & J Solicitors. - Zoho Creator
Sarthy geotech Zoho Books & Zoho Expense
Agappe - Zoho CRM Revamp Project
Ecovinal - Grading Process revaluation CR
Stanzen Order management
EMERGING MARKET SOLUTIONS
Voice Enabled AI Platform - Release 1.0
Davinci_Zoho Books
VADM ERP
Aura_Zoho Books
RGBSI Zoho Books
Sona_Zoho CRM_Zoho Books
IIFL SAMASTA Demand App Implementation CR
JSW One Platforms - Zoho Creator & Zoho Books | Dedicated Resource
Cyraacs Zoho People, Zoho Recruit and Zoho Payroll
Educational Initiatives_CRM CR2022
Axiobio
Ecovinal - Zoho Creator - CR
Amalfiee Ceramics_Zoho Inventory
Angadi Ventures Pvt. Ltd. Zoho Creator
VSInterconnect Zoho creator - Inventory and Order Management -Project 1
Namma Sampradaya_Zoho One
iDatalytics - Zoho Devt(NMSDC)- Roll Out
Vestian Global_Zoho Creator
Moolya_Zoho Books
IIFl Samasta CPL CR
Rooman_Zoho Books
ICAD: Zoho Creator - CR
Ecovinal - Zoho Creator CR
Connectivity_Zoho Creator Service Desk
ROCKSOLID - ZOHO BOOKS - AU
Ec

In [6]:
import pickle
#save embedings
def save_embeddings_to_pickle(embeddings_dict, file_path):
    try:
        # Save to file
        with open(file_path, "wb") as pickle_file:
            pickle.dump(embeddings_dict, pickle_file)
        print(f"Embeddings successfully saved to {file_path}")
    except Exception as e:
        print(f"Error saving embeddings to Pickle: {e}")

# Example usage
save_embeddings_to_pickle(embeddings_dict, "embeddings_oprnai_emb_large_update_1.pkl")

Embeddings successfully saved to embeddings_oprnai_emb_large_update_1.pkl


In [7]:
import pickle

def load_embeddings_from_pickle(file_path):
    try:
        with open(file_path, "rb") as pickle_file:
            embeddings_dict = pickle.load(pickle_file)
        print(f"Embeddings successfully loaded from {file_path}")
        return embeddings_dict
    except Exception as e:
        print(f"Error loading embeddings from Pickle: {e}")
        return {}

In [8]:
embeddings_dict = load_embeddings_from_pickle("embeddings_oprnai_emb_large_update_1.pkl")

Embeddings successfully loaded from embeddings_oprnai_emb_large_update_1.pkl


In [18]:
def upsert_embeddings_into_pinecone(index, embeddings_dict):
    for table_name, columns in embeddings_dict.items():
        print(f"Processing table: {table_name}")
        for column_name, data in columns.items():
            # Ensure embeddings and unique_values are appropriately structured
            embeddings = data['embeddings']
            if isinstance(data['unique_values'], str):
                # Expand unique_values to match the number of embeddings
                unique_values = [data['unique_values']] * len(embeddings)
            else:
                unique_values = data['unique_values']
            
            # Prepare data for upserting
            upsert_data = []
            for i, embedding in enumerate(embeddings):
                if not isinstance(embedding, list):
                    embedding = [embedding]  # Ensure each embedding is a list

                unique_value = unique_values[i]
                vector_id = f"{table_name}_{column_name}_{i}"
                metadata = {"column_name": column_name, "unique_value": unique_value}

                upsert_data.append({
                    "id": vector_id,
                    "values": embedding,
                    "metadata": metadata
                })

            # Batch the upsert to avoid exceeding size limits
            for batch in batch_embeddings(upsert_data, BATCH_SIZE):
                index.upsert(vectors=batch, namespace=table_name)
                print(f"Upserted batch for column '{column_name}' in table '{table_name}'")


In [9]:
# Step 6: Initialize Pinecone and upsert embeddings under each table's namespace
pinecone_index = initialize_pinecone()

In [19]:
upsert_embeddings_into_pinecone(pinecone_index, embeddings_dict)
print("Embeddings upserted into Pinecone successfully.")

Processing table: issues_zoho_projects_


PineconeException: UNKNOWN:Error received from peer  {grpc_message:"Vector dimension 1 does not match the dimension of the index 3072", grpc_status:3, created_time:"2024-11-28T02:28:30.8003789+00:00"}

In [11]:
# Assuming `pinecone_index` is your initialized index and `embeddings_dict` contains your data
# `embeddings_dict` is expected to be a dictionary where keys are IDs and values are embeddings

# Displaying the upserted IDs
upserted_ids = list(embeddings_dict.keys())
print("Upserted IDs:", upserted_ids)

# Fetch and display the data for each ID
for id in upserted_ids:
    response = pinecone_index.fetch(ids=[id])
    print(f"Data for ID {id}: {response}")


Upserted IDs: ['issues_zoho_projects_', 'milestones_zoho_projects_', 'project_groups_zoho_projects_', 'projects_zoho_projects_', 'tags_zoho_projects_', 'tasks_zoho_projects_', 'teams_zoho_projects_', 'test', 'timesheet_zoho_projects_', 'users_zoho_projects_']
Data for ID issues_zoho_projects_: {'namespace': '', 'usage': {'read_units': 0}, 'vectors': {}}
Data for ID milestones_zoho_projects_: {'namespace': '', 'usage': {'read_units': 0}, 'vectors': {}}
Data for ID project_groups_zoho_projects_: {'namespace': '', 'usage': {'read_units': 0}, 'vectors': {}}
Data for ID projects_zoho_projects_: {'namespace': '', 'usage': {'read_units': 0}, 'vectors': {}}
Data for ID tags_zoho_projects_: {'namespace': '', 'usage': {'read_units': 0}, 'vectors': {}}
Data for ID tasks_zoho_projects_: {'namespace': '', 'usage': {'read_units': 0}, 'vectors': {}}
Data for ID teams_zoho_projects_: {'namespace': '', 'usage': {'read_units': 0}, 'vectors': {}}
Data for ID test: {'namespace': '', 'usage': {'read_units'

In [12]:
upserted_ids = list(embeddings_dict.keys())

# Fetch all embeddings using their IDs
all_embeddings = {}
for id in upserted_ids:
    response = pinecone_index.fetch(ids=[id])
    all_embeddings[id] = response['vectors'].get(id, {}).get('values', None)

# Display the embeddings
for id, embedding in all_embeddings.items():
    print(f"ID: {id}, Embedding: {embedding}")

ID: issues_zoho_projects_, Embedding: None
ID: milestones_zoho_projects_, Embedding: None
ID: project_groups_zoho_projects_, Embedding: None
ID: projects_zoho_projects_, Embedding: None
ID: tags_zoho_projects_, Embedding: None
ID: tasks_zoho_projects_, Embedding: None
ID: teams_zoho_projects_, Embedding: None
ID: test, Embedding: None
ID: timesheet_zoho_projects_, Embedding: None
ID: users_zoho_projects_, Embedding: None


In [13]:
upserted_ids = list(embeddings_dict.keys())

# Fetch all embeddings using their IDs and print them
print("All Embedding Values:")
for id in upserted_ids:
    response = pinecone_index.fetch(ids=[id])
    embedding_values = response['vectors'].get(id, {}).get('values', None)
    if embedding_values:
        print(f"ID: {id}, Embedding Values: {embedding_values}")
    else:
        print(f"ID: {id} has no embedding data.")

All Embedding Values:
ID: issues_zoho_projects_ has no embedding data.
ID: milestones_zoho_projects_ has no embedding data.
ID: project_groups_zoho_projects_ has no embedding data.
ID: projects_zoho_projects_ has no embedding data.
ID: tags_zoho_projects_ has no embedding data.
ID: tasks_zoho_projects_ has no embedding data.
ID: teams_zoho_projects_ has no embedding data.
ID: test has no embedding data.
ID: timesheet_zoho_projects_ has no embedding data.
ID: users_zoho_projects_ has no embedding data.


In [None]:
#The issue is with single entity should be with single embeding
#Here we are creating single embeding for multiple entities
#Reolve it and unit test in first half(hopefuly the issues should be resolved