In [5]:
import pandas as pd
import re
import json
import numpy as np
from openai_manager import *
from initial_config import *
from db_manager import *

class Pinecone_manager:
    def __init__(self, schema_df):
        self.NAMESPACE = []  # Replace with your namespace
        self.columnnames = {}
        self.searched_cols = []
        self.searched_tables = []
        self.augmented_input = ''
        self.intermediate_input=''
        self.schema_df = schema_df
        self.extracted_Features = None
        self.cleaned_feature_dict = None
        self.tokenizer= None
        self.pinecone_index = None
        self.selection={}
        self.selection_required=False
        self.ic=Initialize_config()

    def clear_all(self):
        self.NAMESPACE = []  # Replace with your namespace
        self.columnnames = {}
        self.searched_cols = []
        self.searched_tables = []
        self.augmented_input = ''
        self.intermediate_input=''
        self.selection={}
        self.selection_required=False

    def process_user_input(self, user_input):
        self.extracted_Features = OpenAI_manager.extract_features_with_openai(OpenAI_manager, user_input, self.schema_df)
        #print(self.extracted_Features)

    def process_extracted_features(self):
        def clean_extracted_features(feature_dict):
            print(feature_dict)
            # Remove any keys with None or empty values
            cleaned_feature_dict = {k: v for k, v in feature_dict.items() if v not in [None, '', [], {}, 'none', 'null', 'n/a', 'not specified']}
            # Extract the non-null values into a list
            feature_list = list(cleaned_feature_dict.values())
            return cleaned_feature_dict, feature_list

        try:
            # Remove the "## Solution:" part and any other non-JSON text
            json_match = re.search(r'\{.*\}', self.extracted_Features, re.DOTALL)
            
            if json_match:
                # Extract the JSON part from the matched result
                cleaned_features = json_match.group(0)

                # Convert JSON string to a Python dictionary
                feature_dict = json.loads(cleaned_features)

                # Clean feature dictionary and feature list to remove nulls and empty values
                self.cleaned_feature_dict, feature_list = clean_extracted_features(feature_dict)

                # Return cleaned JSON and feature list
                return json.dumps(self.cleaned_feature_dict, indent=4), feature_list
            else:
                return None, []
        except (json.JSONDecodeError, ValueError) as e:
            print(f"Error parsing features: {e}")
            return None, []

    def extract_namespace(self):
        for key in self.extracted_dict.keys():
            self.NAMESPACE.append(key)
            self.columnnames[key] = extracted_dict[key]
    #reframe the input with selected values
    def call_query_pinecone1(self, user_input, p_i, data):
        print("pineconedata", data)
        for x in data.keys():
            selected=str(data[x])
            user_input=user_input.replace(x,selected)
        self.augmented_input=user_input
        print("augumented_input", self.augmented_input)
        self.selection_required=False
        
    #check if any multiple values for each entity found in vectorDB
    def call_query_pinecone(self, user_input, p_i):
        res=''
        self.pinecone_index = p_i
        for key, val in self.cleaned_feature_dict.items():
            columns = list(val.keys())
            tables=list(val.values())
            print(columns)
            if self.augmented_input == '':
                res = self.query_pinecone_and_augment_input(user_input, key, columns,tables)
            else:
                res = self.query_pinecone_and_augment_input(self.augmented_input, key, columns,tables)
        print("augumentedinput",res)
        self.clear_all()
        return res
    def create_prompt(self,query):
        """
        Creates a structured prompt to handle negation and context.
        """
        if "not" in query.lower():
            return f"Input query: {query}. Focus on the meaning and negation."
        return f"Input query: {query}. Interpret the query accurately."
    

    def query_pinecone_and_augment_input(self, user_input, namespace, columns,tables):
        openai.api_key=self.ic.return_key()
        self.augmented_input = user_input

        def flatten_dict(d, parent_key=''):
            items = []
            for k, v in d.items():
                new_key = f"{parent_key}.{k}" if parent_key else k
                if isinstance(v, dict):
                    items.extend(flatten_dict(v, new_key).items())
                else:
                    items.append((new_key, v))
            return dict(items)

        flat_entities = flatten_dict(self.cleaned_feature_dict)
        #print(flat_entities)
        for column_name,table_name in zip(columns,tables):
            print(column_name)
            
            if column_name not in self.searched_cols or table_name not in self.searched_tables:
                self.searched_cols.append(column_name)
                self.searched_tables.append(table_name)

                # Obtain the entity value corresponding to the current column
                entity_value = self.cleaned_feature_dict[namespace].get(column_name, None)
                #print(entity_value)
                if not entity_value:
                    print(True)
                    continue  # Skip to the next column if no value is found
                #entity_value=self.create_prompt(entity_value)

                # Generate the query embedding for the entity value
                split_entities = entity_value.split()
                if len(split_entities)>1:
                    entity_value = split_entities[1]
                response = openai.embeddings.create(
                    model="text-embedding-3-large",  # Correct embedding model
                    input=entity_value # Input must be a list
                )
                embedding = response.data[0].embedding

                try:
                    result = self.pinecone_index.query(
                        namespace=namespace,
                        vector=embedding,
                        filter={"column_name": {"$eq": column_name}},
                        top_k=3,
                        include_values=True,
                        include_metadata=True
                    )

                    matches = result.get('matches', [])
                    #print(matches)
                    if matches:
                        get_match=[]
                        # Sort matches by score in descending order
                        matches.sort(key=lambda x: x['score'], reverse=True)

                        # Check if multiple matches have a significant score difference
                        best_match = matches[0]
                        print("match1:",matches[0]['metadata'].get('unique_value', entity_value))
                        print("match2:",matches[1]['metadata'].get('unique_value', entity_value))
                        print("match3:",matches[2]['metadata'].get('unique_value', entity_value))
                        print("Best match:",matches[0]['metadata'].get('unique_value', entity_value))
                        best_score = best_match['score']
                        print("Best Score:",best_score)
                        selection_required = False
                        selected_match = best_match['metadata'].get('unique_value', entity_value)

                        # Check if any other match has a score difference < 0.1
                        for match in matches[1:]:
                            print("MAtch score:",match['score'])
                            score_diff = best_score - match['score']
                            if score_diff < 0.02:
                                selection_required = True
                                break
                            else:
                                continue
                                
                        if selection_required:
                            # Record the values for multiple values to select among the matches
                            print(f"Multiple matches found with significant score difference for '{entity_value}'. Please select:")
                            for i, match in enumerate(matches):
                                get_match.append(match['metadata'].get('unique_value', entity_value))
                            self.selection[entity_value]=get_match
                            self.selection_required=True
                        else:
                            best_match_for_1_entity = matches[0]['metadata'].get('unique_value', entity_value)
                            #print('best_match_for_1_entity', best_match_for_1_entity)
                            self.augmented_input = self.augmented_input.replace(entity_value, best_match_for_1_entity)

                        
                    else:
                        print(f"No matches found for {entity_value} in Pinecone.")
                except Exception as e:
                    print(f"Error querying Pinecone: {str(e)}")
            else:
                print("Column already searched")
        if self.selection_required==True:
            
            print("Selection dict:",self.selection)
            print("Recent:",self.intermediate_input)
            return {"selection": self.selection}
        else:
            return self.augmented_input


In [6]:
from schema_manager import *
DB=DB_Manager()
openai_manager=OpenAI_manager()
p=Initialize_config()
p.assign_pinecone_index()
p.process_openAI_model()
p.set_prompt_template()
db_name="zoho_projects_data_v2_backup"
conn = DB.connect(DATABASE_DB = f"{db_name}")
schema='public'
query = f"""
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = '{schema}'
        """
schema_manager=Schema_manager(conn,query,schema)
schema_manager.fetch_schema_with_data_types()
schema_manager.format_schema()

  self.schema_df=pd.read_sql(self.query, self.conn)


In [7]:
user_input="give me the count all not completed projects which have delayed milestone"
pine_cone=Pinecone_manager(schema_manager.schema_df)
pine_cone.process_user_input(user_input)
_, feature_list=pine_cone.process_extracted_features()
res1=pine_cone.call_query_pinecone(user_input,p.pinecone_index)
print(res1)



{'projects_zoho_projects_': {'status': 'not completed'}, 'milestones_zoho_projects_': {'status': 'delayed'}}
['status']
status
match1: Completed
match2: Cancelled
match3: To Do
Best match: Completed
Best Score: 0.872679412
MAtch score: 0.51363337
MAtch score: 0.362889528
['status']
status
match1: Overdue
match2: Archived
match3: Upcoming
Best match: Overdue
Best Score: 0.479148567
MAtch score: 0.395186186
MAtch score: 0.372353315
augumentedinput give me the count all not Completed projects which have Overdue milestone
give me the count all not Completed projects which have Overdue milestone


In [8]:
import pandas as pd
import re
import json
import openai
from openai_manager import OpenAI_manager
from initial_config import Initialize_config
from db_manager import DB_Manager
from schema_manager import Schema_manager

class PineconeManager:
    def __init__(self, schema_df):
        self.NAMESPACE = []
        self.columnnames = {}
        self.searched_cols = []
        self.searched_tables = []
        self.augmented_input = ''
        self.intermediate_input = ''
        self.schema_df = schema_df
        self.extracted_features = None
        self.cleaned_feature_dict = None
        self.tokenizer = None
        self.pinecone_index = None
        self.selection = {}
        self.selection_required = False
        self.ic = Initialize_config()

    def clear_all(self):
        """Reset all class variables to their initial state."""
        self.NAMESPACE = []
        self.columnnames = {}
        self.searched_cols = []
        self.searched_tables = []
        self.augmented_input = ''
        self.intermediate_input = ''
        self.selection = {}
        self.selection_required = False

    def process_user_input(self, user_input):
        """Extract features using OpenAI."""
        self.extracted_features = OpenAI_manager.extract_features_with_openai(
            OpenAI_manager, user_input, self.schema_df
        )

    def process_extracted_features(self):
        """Clean and parse extracted features into a structured format."""
        def clean_extracted_features(feature_dict):
            cleaned_feature_dict = {
                k: v for k, v in feature_dict.items()
                if v not in [None, '', [], {}, 'none', 'null', 'n/a', 'not specified']
            }
            feature_list = list(cleaned_feature_dict.values())
            return cleaned_feature_dict, feature_list

        try:
            json_match = re.search(r'\{.*\}', self.extracted_features, re.DOTALL)
            if json_match:
                cleaned_features = json_match.group(0)
                feature_dict = json.loads(cleaned_features)
                self.cleaned_feature_dict, feature_list = clean_extracted_features(feature_dict)
                return json.dumps(self.cleaned_feature_dict, indent=4), feature_list
            return None, []
        except (json.JSONDecodeError, ValueError) as e:
            print(f"Error parsing features: {e}")
            return None, []

    def call_query_pinecone(self, user_input, pinecone_index):
        """Process input with Pinecone to refine the query."""
        self.pinecone_index = pinecone_index
        for key, val in self.cleaned_feature_dict.items():
            columns = list(val.keys())
            tables = list(val.values())
            if not self.augmented_input:
                res = self.query_pinecone_and_augment_input(user_input, key, columns, tables)
            else:
                res = self.query_pinecone_and_augment_input(self.augmented_input, key, columns, tables)
        self.clear_all()
        return res

    def query_pinecone_and_augment_input(self, user_input, namespace, columns, tables):
        """Query Pinecone and augment user input with refined values."""
        openai.api_key = self.ic.return_key()
        self.augmented_input = user_input

        for column_name, table_name in zip(columns, tables):
            if column_name not in self.searched_cols or table_name not in self.searched_tables:
                self.searched_cols.append(column_name)
                self.searched_tables.append(table_name)

                entity_value = self.cleaned_feature_dict[namespace].get(column_name, None)
                if not entity_value:
                    continue

                response = openai.embeddings.create(
                    model="text-embedding-3-large",  # Correct embedding model
                    input=entity_value # Input must be a list
                )
                embedding = response.data[0].embedding

                try:
                    result = self.pinecone_index.query(
                        namespace=namespace,
                        vector=embedding,
                        filter={"column_name": {"$eq": column_name}},
                        top_k=3,
                        include_values=True,
                        include_metadata=True
                    )
                    matches = result.get('matches', [])
                    if matches:
                        matches.sort(key=lambda x: x['score'], reverse=True)
                        best_match = matches[0]
                        best_score = best_match['score']
                        selection_required = False

                        for match in matches[1:]:
                            score_diff = best_score - match['score']
                            if score_diff < 0.02:
                                selection_required = True
                                break

                        if selection_required:
                            self.selection[entity_value] = [
                                match['metadata'].get('unique_value', entity_value) for match in matches
                            ]
                            self.selection_required = True
                        else:
                            best_match_value = matches[0]['metadata'].get('unique_value', entity_value)
                            self.augmented_input = self.augmented_input.replace(entity_value, best_match_value)
                except Exception as e:
                    print(f"Error querying Pinecone: {str(e)}")
            else:
                print("Column already searched")

        return {"selection": self.selection} if self.selection_required else self.augmented_input



In [9]:
if __name__ == "__main__":
    DB = DB_Manager()
    openai_manager = OpenAI_manager()
    config = Initialize_config()
    config.assign_pinecone_index()
    config.process_openAI_model()
    config.set_prompt_template()

    db_name = "zoho_projects_data_v2_backup"
    conn = DB.connect(DATABASE_DB=db_name)
    schema = 'public'
    query = f"""
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = '{schema}'
    """
    schema_manager = Schema_manager(conn, query, schema)
    schema_manager.fetch_schema_with_data_types()
    schema_manager.format_schema()

    user_input = "give me the count of all not completed projects which have delayed milestone"
    pinecone_manager = PineconeManager(schema_manager.schema_df)
    pinecone_manager.process_user_input(user_input)
    _, feature_list = pinecone_manager.process_extracted_features()
    result = pinecone_manager.call_query_pinecone(user_input, config.pinecone_index)
    print(result)


  self.schema_df=pd.read_sql(self.query, self.conn)


give me the count of all Completed projects which have Overdue milestone


In [13]:
from nltk.corpus import wordnet

def get_unigram_from_ngram(ngram: str) -> str:
    """
    Finds a unigram with a similar meaning to a given bigram or trigram using WordNet.

    Args:
        ngram (str): Input bigram or trigram (e.g., 'machine learning').

    Returns:
        str: A unigram with a similar meaning, if found; otherwise, an empty string.
    """
    # Split the ngram into words
    words = ngram.split()

    # Generate synonyms using WordNet for each word
    synonyms = set()
    for word in words:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name())

    # Find the intersection of synonyms for all words
    # This is a naive heuristic; a more advanced method may use semantic similarity.
    common_synonyms = synonyms.intersection(words)
    print("common_synonyms:",common_synonyms)

    # Return a suitable unigram if found, otherwise a fallback
    return list(common_synonyms)[0] if common_synonyms else "No match found"

# Example usage
bigrams = ["not completed", "most heavy"]
trigrams = ["premier show time", "computer vision"]

for phrase in bigrams + trigrams:
    unigram = get_unigram_from_ngram(phrase)
    print(f"{phrase} -> {unigram}")


common_synonyms: {'not', 'completed'}
not completed -> not
common_synonyms: {'heavy', 'most'}
most heavy -> heavy
common_synonyms: {'show', 'premier', 'time'}
premier show time -> show
common_synonyms: {'vision', 'computer'}
computer vision -> vision


In [14]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

def find_closest_unigram(ngram: str, candidate_unigrams: list) -> str:
    """
    Finds the closest unigram to a given n-gram using embeddings.

    Args:
        ngram (str): Input bigram or trigram (e.g., 'machine learning').
        candidate_unigrams (list): List of unigrams to compare against.

    Returns:
        str: The closest unigram by semantic similarity.
    """
    # Compute embedding for the n-gram
    ngram_embedding = model.encode(ngram, convert_to_tensor=True)

    # Compute embeddings for unigrams
    unigram_embeddings = model.encode(candidate_unigrams, convert_to_tensor=True)

    # Find the most similar unigram
    similarities = util.pytorch_cos_sim(ngram_embedding, unigram_embeddings)
    closest_index = similarities.argmax().item()
    return candidate_unigrams[closest_index]

# Example usage
bigrams = ["machine learning", "deep learning"]
candidate_unigrams = ["AI", "automation", "technology", "intelligence"]

for phrase in bigrams:
    closest_unigram = find_closest_unigram(phrase, candidate_unigrams)
    print(f"{phrase} -> {closest_unigram}")


machine learning -> AI
deep learning -> AI


In [15]:
def get_specific_unigram(phrase: str) -> str:
    """
    Returns a specific unigram equivalent for a given bigram/trigram.

    Args:
        phrase (str): Input bigram or trigram (e.g., 'not completed').

    Returns:
        str: The specific unigram if found; otherwise, the original phrase.
    """
    mapping = {
        "not completed": "incomplete",
        "not started": "uninitiated",
        "not working": "broken",
        "not allowed": "prohibited",
        "natural language processing": "NLP",
        "machine learning": "AI"
    }
    return mapping.get(phrase.lower(), phrase)

phrases = ["not completed", "not started", "machine learning", "natural language processing"]
for phrase in phrases:
    unigram = get_specific_unigram(phrase)
    print(f"{phrase} -> {unigram}")


not completed -> incomplete
not started -> uninitiated
machine learning -> AI
natural language processing -> NLP
