In [68]:
from sentence_transformers import SentenceTransformer
from typing import List

class UnoplatEmbeddingGenerator:
    """Produce dense embeddings for text with a SentenceTransformer model.

    The model name and the encoding task used to be hard-coded (and the task
    was repeated in every method). Both are now constructor arguments whose
    defaults reproduce the previous behaviour exactly.

    Args:
        model_name: Identifier passed to ``SentenceTransformer``.
        task: Value forwarded to ``encode(..., task=...)``. Pass ``None`` to
            omit the ``task`` keyword entirely.
            NOTE(review): presumably needed for models whose ``encode`` does
            not accept ``task`` -- confirm against the model in use.
    """

    def __init__(self, model_name: str = 'jinaai/jina-embeddings-v3', task: str = 'retrieval.query'):
        # trust_remote_code=True was already enabled here and is kept as is.
        # NOTE(review): only point model_name at repositories you trust.
        self.model = SentenceTransformer(model_name, trust_remote_code=True)
        self.task = task

    def _encode(self, payload):
        """Call ``model.encode`` with the configured task and return ``.tolist()``."""
        encode_kwargs = {} if self.task is None else {'task': self.task}
        return self.model.encode(payload, **encode_kwargs).tolist()

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts.

        Args:
            texts: Strings to embed.

        Returns:
            One embedding (list of floats) per input text, in input order.
            An empty input yields an empty list without calling the model.
        """
        if not texts:
            return []
        return self._encode(texts)

    def generate_embeddings_for_single_text(self, text: str) -> List[float]:
        """Embed a single string and return its embedding as a flat list."""
        return self._encode(text)

In [69]:
from neo4j import GraphDatabase
import os
from getpass import getpass

# Connection settings are read from the environment so that no credential is
# stored in the notebook. URI and user fall back to the previous local
# defaults; the password has no default and is prompted for when
# NEO4J_PASSWORD is not set.
# NOTE(review): the password that used to be committed here should be rotated.
uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')

# Shared driver used by the query helpers and later cells in this notebook.
driver = GraphDatabase.driver(uri, auth=(user, password))


In [70]:
def run_query(query, parameters=None):
    """Execute a Cypher statement on the shared ``driver`` and return its rows.

    Args:
        query: Cypher text to run.
        parameters: Optional mapping of query parameters; passed through as is.

    Returns:
        Whatever ``result.data()`` yields for the executed statement. The
        result is read while the session is still open.
    """
    with driver.session() as db_session:
        rows = db_session.run(query, parameters).data()
    return rows

In [4]:
# Embed one natural-language question. The generator encodes it with the
# 'retrieval.query' task, and the resulting vector is what the vector-index
# query cell sends to Neo4j.
user_query = "which function processes string-based inputs to return an output string?"
embedding_generator = UnoplatEmbeddingGenerator()
embedding = embedding_generator.generate_embeddings_for_single_text(user_query)


In [5]:
# Define the Cypher query: ask the named vector index for the $k nearest nodes
# to $embedding, then return only the three highest-scoring ones.
cypher_query = """
CALL db.index.vector.queryNodes($index_name, $k, $embedding) 
YIELD node, score
RETURN node.qualified_name AS MethodName, node.objective AS Objective, score
ORDER BY score DESC
LIMIT 3;
"""

# Parameters for the query. Note that k (10) is larger than the LIMIT (3)
# applied in the query above, so only the top 3 of the 10 neighbours are shown.
parameters = {
    "index_name": "Method_implementation_embedding_vector_index",  # Name of the vector index to search
    "k": 10,  # Number of nearest neighbours requested from the index
    "embedding": embedding  # Query vector produced in the embedding cell
}

# Execute the query through the shared helper (returns a list of dicts)
results = run_query(cypher_query, parameters)

# Display the results, one record per line
for record in results:
    print(record)

{'MethodName': 'autograd.StringBasedFunction.forward', 'Objective': 'The `forward` function processes string-based inputs to return an output string with an optional role description, while logging the input-output relationship and setting the gradient function for backpropagation, thereby supporting machine learning optimization and feedback mechanisms.', 'score': 0.7843685150146484}
{'MethodName': 'autograd.StringBasedFunction.__init__', 'Objective': 'The `__init__` function of the `StringBasedFunction` class sets up an autograd function for string operations, initializing internal state and logging capabilities while ensuring type safety and integration within complex systems.', 'score': 0.7730774879455566}
{'MethodName': 'autograd.StringBasedFunction._backward_through_string_fn_base', 'Objective': 'The `_backward_through_string_fn_base` function computes gradients for a list of variables based on a response variable by constructing an input string and generating a backward prompt. 

In [103]:
from enum import Enum
import json
import dspy 
from pydantic import BaseModel, Field
from typing import List

import litellm
import os

# Keep LiteLLM's verbose logging switched off.
litellm.set_verbose=False
#ollama_provider = dspy.OllamaLocal(model="qwen2.5:72b-instruct-fp16",base_url="http://206.1.58.174:11434",temperature=0.0,format="json")


# Alternative provider served by an Ollama host; created but not activated below.
ollama_provider = dspy.LM(model="ollama/qwen2.5:72b-instruct-fp16",api_base="http://206.1.58.174:11434",temperature=0.0)
# The API key is taken from OPENAI_API_KEY so a real key never has to be pasted
# into the notebook. The original placeholder is kept as the fallback, so
# behaviour is unchanged when the variable is not set.
openai_provider = dspy.LM(model="openai/gpt-4o-mini",api_key=os.environ.get("OPENAI_API_KEY", "insert your api key here"),max_tokens=512,temperature=0.0)
# Every DSPy module defined in this notebook uses this LM by default.
dspy.configure(lm=openai_provider)

# Prepare intent descriptions
# Intent name -> human-readable description shown to the language model.
intent_descriptions = {
    "CODE_SUMMARIZATION": "User wants an overview or summary of the codebase.",
    "CODE_FEATURE": "User is looking for specific features that can be answered by going through the package summaries.",
    "FUNCTIONAL_IMPLEMENTATION": "User wants detailed understanding at the function level.",
}


def get_intent_context():
    """Render ``intent_descriptions`` as a bulleted text block.

    Returns:
        A string that starts with a header line, followed by one
        ``- **NAME**: description`` line per intent, each ending in a newline.
    """
    bullet_lines = [
        f"- **{intent_name}**: {description}\n"
        for intent_name, description in intent_descriptions.items()
    ]
    return "The possible user intents are:\n" + "".join(bullet_lines)

# class UserIntent(str, Enum):
#     CODE_SUMMARIZATION = "CODE_SUMMARIZATION"
#     CODE_FEATURE = "CODE_FEATURE"
#     FUNCTIONAL_IMPLEMENTATION = "FUNCTIONAL_IMPLEMENTATION"




# DSPy signature for intent classification: two string inputs (the user's
# question and a text description of the available intents) and one output,
# a list of intent names.
# NOTE(review): DSPy uses a Signature's docstring and the field `desc` strings
# as prompt text, so their wording is runtime behaviour and is left untouched.
# NOTE(review): `default_factory=list` is forwarded to dspy.OutputField --
# confirm that the installed DSPy version honours it.
class CodeConfluenceUserQuerySignature(dspy.Signature):
    """Based on user query and context of intents, return the user intent as list of intents in valid json format. Verify the json format strictly before returning."""
    user_query: str = dspy.InputField(desc="This will contain user query")
    intent_descriptions: str = dspy.InputField(desc="this will contain intents and their respective descriptions")
    user_intent: List[str] = dspy.OutputField(default_factory=list,desc="This will strictly return json format of list of items from intents")

    
   
class CodeConfluenceIntentDetectionModule(dspy.Module):
    """DSPy module that classifies a user query into one or more intents."""

    def __init__(self):
        super().__init__()
        # Typed chain-of-thought predictor built from the intent signature.
        self.intent_detection = dspy.TypedChainOfThought(CodeConfluenceUserQuerySignature)

    def forward(self, user_query: str):
        """Run intent detection for ``user_query``.

        The intent descriptions are rendered by ``get_intent_context`` and
        JSON-encoded before being handed to the predictor. The raw prediction
        is printed and then wrapped in a ``dspy.Prediction`` under ``answer``.
        """
        encoded_context = json.dumps(get_intent_context())
        prediction = self.intent_detection(
            user_query=user_query,
            intent_descriptions=encoded_context,
        )
        print(prediction)
        return dspy.Prediction(answer=prediction)


# Shared module instance used by later cells, followed by a smoke test on a
# query that mixes an overview request with a function-level question.
intent_module = CodeConfluenceIntentDetectionModule()
print(intent_module(user_query="tell me about codebase overview and which function processes string-based inputs to return an output string?"))

Prediction(
    reasoning='The user is asking for an overview of the codebase and specifically inquiring about a function that processes string-based inputs. This indicates a need for both a summary of the codebase and details about a specific function, which aligns with the intents of CODE_SUMMARIZATION and FUNCTIONAL_IMPLEMENTATION. We will include both intents in the response.',
    user_intent=['CODE_SUMMARIZATION', 'FUNCTIONAL_IMPLEMENTATION']
)
Prediction(
    answer=Prediction(
    reasoning='The user is asking for an overview of the codebase and specifically inquiring about a function that processes string-based inputs. This indicates a need for both a summary of the codebase and details about a specific function, which aligns with the intents of CODE_SUMMARIZATION and FUNCTIONAL_IMPLEMENTATION. We will include both intents in the response.',
    user_intent=['CODE_SUMMARIZATION', 'FUNCTIONAL_IMPLEMENTATION']
)
)


	1.	What is the primary purpose of this codebase?
	•	Understanding the main goal or functionality of the application or module.
	2.	How is the project structured?
	•	Exploring the directory layout, modules, and how they interact.
	3.	What are the core features and functionalities?
	•	Identifying the key components and services provided.
	4.	Where is the code related to specific features located?
	•	Locating classes, methods, or scripts that implement certain functionalities.
	5.	How do different components communicate?
	•	Understanding APIs, interfaces, and data flow between modules.
	6.	What are the entry points of the application?
	•	Finding the main functions, scripts, or controllers that start processes.
	7.	How is data handled and stored?
	•	Looking into databases, data models, and storage mechanisms used.
	8.	What external dependencies or services are used?
	•	Identifying third-party libraries, APIs, or microservices integrated into the project.
	9.	How is the application configured?
	•	Understanding configuration files, environment variables, and settings.
	10.	How are error handling and logging implemented?
	•	Reviewing how the application handles exceptions and records events.


In [105]:
# Sample questions; the end-to-end pipeline cell iterates over this list.
user_query_list = ["looking for function designed for handling datasets specifically from the BIG-Bench Hard challenge.",
                   "which function processes string-based inputs to return an output string?",
                   ]

In [91]:
def _vector_index_search(tx, index_name, return_columns, query_embedding, top_k):
    """Query one Neo4j vector index and return the matching records as a list.

    Shared implementation behind the ``search_similar_*`` helpers, which
    previously repeated the same query with only the index name and the
    returned columns changed.

    Args:
        tx: Object exposing ``run(query, **params)``.
        index_name: Name of the vector index, sent as the ``$index_name``
            query parameter.
        return_columns: Trusted Cypher fragment listing the projected columns
            (everything except ``score``). It is interpolated into the query
            text, so it must never come from user input.
        query_embedding: Query vector, sent as ``$embedding``.
        top_k: Number of nearest neighbours requested, sent as ``$top_k``.

    Returns:
        ``list`` of the records yielded by ``tx.run``, ordered by descending
        score.
    """
    query = f"""
    CALL db.index.vector.queryNodes($index_name, $top_k, $embedding)
    YIELD node, score
    RETURN {return_columns}, score
    ORDER BY score DESC
    """
    return list(tx.run(query, index_name=index_name, embedding=query_embedding, top_k=top_k))


def search_similar_functions(tx, query_embedding, top_k=5):
    """Return the ``top_k`` Method nodes closest to ``query_embedding``.

    Each record has ``function_name``, ``function_objective``,
    ``function_summary`` and ``score``.
    """
    return _vector_index_search(
        tx,
        'Method_objective_embedding_vector_index',
        "node.qualified_name AS function_name, node.objective AS function_objective, "
        "node.implementation_summary AS function_summary",
        query_embedding,
        top_k,
    )


def search_similar_packages(tx, query_embedding, top_k=5):
    """Return the ``top_k`` Package nodes closest to ``query_embedding``.

    Each record has ``package_name``, ``package_objective`` and ``score``.
    """
    return _vector_index_search(
        tx,
        'Package_objective_embedding_vector_index',
        "node.qualified_name AS package_name, node.objective AS package_objective",
        query_embedding,
        top_k,
    )


def search_similar_classes(tx, query_embedding, top_k=5):
    """Return the ``top_k`` Class nodes closest to ``query_embedding``.

    Each record has ``class_name``, ``class_objective`` and ``score``.
    """
    return _vector_index_search(
        tx,
        'Class_objective_embedding_vector_index',
        "node.qualified_name AS class_name, node.objective AS class_objective",
        query_embedding,
        top_k,
    )


def search_similar_codebases(tx, query_embedding, top_k=5):
    """Return the ``top_k`` Codebase nodes closest to ``query_embedding``.

    Each record has ``codebase_name``, ``codebase_objective`` and ``score``.
    """
    return _vector_index_search(
        tx,
        'Codebase_objective_embedding_vector_index',
        "node.qualified_name AS codebase_name, node.objective AS codebase_objective",
        query_embedding,
        top_k,
    )

def get_function_hierarchy_and_details(tx, function_name):
    """Fetch a method together with its enclosing class, package and codebase.

    Args:
        tx: Object exposing ``run(query, **params)``.
        function_name: Value matched against ``Method.qualified_name``.

    Returns:
        The value of ``.single()`` on the query result. The parent chain is
        matched with OPTIONAL MATCH, so the class/package/codebase columns may
        be null when no such path exists.
    """
    hierarchy_query = """
    MATCH (f:Method {qualified_name: $function_name})
    OPTIONAL MATCH (f)<-[:CONTAINS]-(c:Class)<-[:CONTAINS]-(p:Package)<-[:CONTAINS]-(cb:Codebase)
    RETURN 
        cb.qualified_name AS codebase_name,
        cb.objective AS codebase_objective,
        cb.implementation_summary AS codebase_summary,
        p.qualified_name AS package_name,
        p.objective AS package_objective,
        p.implementation_summary AS package_summary,
        c.qualified_name AS class_name,
        c.objective AS class_objective,
        c.implementation_summary AS class_summary,
        f.qualified_name AS function_name,
        f.objective AS function_objective,
        f.implementation_summary AS function_summary
    """
    query_result = tx.run(hierarchy_query, function_name=function_name)
    return query_result.single()



In [75]:

from typing import List
from loguru import logger as log


class UnoplatRerankEmbedding:
    """Score documents against queries with a SentenceTransformer model."""

    def __init__(self, sentence_transformer_model: str):
        # Prompt name applied to queries only; use "s2s_query" for
        # sentence-to-sentence tasks.
        self.query_prompt_name = "s2p_query"
        self.sentence_rerank_model = SentenceTransformer(sentence_transformer_model, trust_remote_code=False)

    def generate_rerank_embedding(self, query: List[str], documents: List[str]):
        """Return the query/document similarity matrix.

        Queries are encoded with ``prompt_name=self.query_prompt_name`` and
        documents without a prompt. The result of the model's ``similarity``
        call is logged at info level and returned unchanged.
        """
        model = self.sentence_rerank_model
        encoded_queries = model.encode(query, prompt_name=self.query_prompt_name)
        encoded_documents = model.encode(documents)
        similarities = model.similarity(encoded_queries, encoded_documents)
        log.info(f"Similarity Matrix: {similarities}")
        return similarities



In [76]:
import dspy 
from typing import Dict, List

import litellm
# Keep LiteLLM's verbose logging switched off for the rerank calls below.
litellm.set_verbose=False
# Earlier Ollama-based provider, kept for reference only:
#ollama_provider = dspy.OllamaLocal(model="qwen2.5:72b-instruct-fp16",base_url="http://206.1.58.174:11434",temperature=0.0,format="json")


# DSPy signature for LLM-based reranking: takes the user query plus a mapping
# of candidate function name -> description, and returns a mapping of function
# name -> integer relevance score.
# NOTE(review): DSPy uses a Signature's docstring and the field `desc` strings
# as prompt text, so their wording is runtime behaviour and is left untouched.
# NOTE(review): `relevant_answers` is annotated Dict[str,int] but is given
# `default_factory=list`; `dict` looks like the intended factory -- confirm.
class CodeConfluenceUserQueryReRankSignature(dspy.Signature):
    """Based on user query and possible answers, return the most relevant function names from the list based on the user query"""
    user_query: str = dspy.InputField(desc="This will contain user query")
    possible_answers: Dict[str,str] = dspy.InputField(desc="this will contain list of possibly relevant answers with function name and their description ")
    relevant_answers: Dict[str,int] = dspy.OutputField(default_factory=list,desc="return  the most relevant function names from the list based on the functions descriptions matching with user query with score from 1 to 10 with 10 being the highest match ")

    
   
class CodeConfluenceUserQueryReRankModule(dspy.Module):
    """DSPy module that asks the LM to score candidate functions for a query."""

    def __init__(self):
        super().__init__()
        # Typed chain-of-thought predictor built from the rerank signature.
        self.rerank_module = dspy.TypedChainOfThought(CodeConfluenceUserQueryReRankSignature)

    def forward(self, user_query: str, possible_answers: Dict[str, str]):
        """Rerank ``possible_answers`` for ``user_query``.

        Args:
            user_query: The user's question.
            possible_answers: Mapping of function name -> description. The
                annotation used to say ``List[str]``, which contradicted both
                the signature field (``Dict[str,str]``) and the dict that the
                pipeline cell actually passes in.

        Returns:
            ``dspy.Prediction`` whose ``answer`` attribute holds the raw
            predictor output (also printed for inspection).
        """
        rerank_answers = self.rerank_module(user_query=user_query,possible_answers=possible_answers)
        print(rerank_answers)
        return dspy.Prediction(answer=rerank_answers)
        
 


In [88]:
# pydantic data model to hold <Record codebase_name='textgrad' codebase_objective='Provide a modular framework for managing and validating language model engines, enhancing logging, error handling, and performance optimization, while supporting machine learning tasks and educational analysis through customizable prompts.' codebase_summary='The `engine` package provides a modular framework for managing and validating language model engines, essential for state management and evaluation in machine learning tasks, including those involving GPQA, MMLU, and GSM8K datasets. It implements a singleton pattern for consistent engine management, enhances logging capabilities, and includes comprehensive error handling and data caching. Utility functions optimize performance, such as image URL validation and efficient downloading, which are vital for structured task configuration and dataset management. The package also supports automatic differentiation and efficient gradient computation, integrating optimization strategies for parameter gradients, thereby improving usability in educational navigation and analysis through customizable prompts.' package_name='textgrad.tasks' package_objective='The `tasks` package aims to provide a comprehensive suite of tools for managing and evaluating GPQA, MMLU, and GSM8K datasets, facilitating structured task configuration, question extraction, answer evaluation, and efficient dataset management for enhanced educational navigation and analysis.' package_summary='The `tasks` package offers a comprehensive suite of tools for managing GPQA, MMLU, and GSM8K datasets, with a strong emphasis on the `GSM8K` class for extracting questions, answers, and reasoning for mathematical problem-solving. It efficiently supports the BIG-Bench Hard challenge by enabling structured task configuration, parameter validation, and effective data loading for reasoning questions. 
The `GPQA` class retrieves questions, shuffles multiple-choice answers, and evaluates responses using regular expressions for scoring accuracy. The `MMLU` class evaluates multiple-choice responses, integrating libraries for dataset management and evaluation metrics. The `GPQAInstanceDataset` class enhances usability with methods for answer extraction, scoring, and question retrieval, while the `MMLUInstanceDataset` class manages MMLU datasets, offering methods for question retrieval and answer formatting. The `big_bench_hard` class introduces advanced data processing methods for numeric extraction and string comparison, utilizing libraries for complex manipulations in machine learning evaluations. The `LeetCodeHardEval` class manages a dataset of challenging LeetCode problems, providing methods for retrieving problem descriptions and specific problem data, ensuring dataset availability. The `DataLoader` class efficiently manages datasets for machine learning by enabling batch processing and iteration, with customizable options for batch size and shuffling. Additionally, the package includes the `Dataset` abstract base class, which defines a template for structured datasets, requiring subclasses to implement indexing and providing a method for length retrieval. The package supports dynamic instance loading with robust error handling and configuration options, collectively improving educational navigation and user interaction for comprehensive dataset analysis and evaluation.' class_name='textgrad.tasks.mmlu' class_objective='The `mmlu` class evaluates multiple-choice responses for correctness using regular expressions and integrates libraries for dataset management and evaluation metrics.' class_summary='The `mmlu` class is designed for evaluating multiple-choice responses, utilizing regular expressions to assess answer correctness against a provided key. 
It integrates various libraries for dataset management and evaluation metrics, facilitating robust assessment processes.' function_name='textgrad.tasks.mmlu.eval_string_based' function_objective='The function `eval_string_based` evaluates a response string for a correct answer (A-D) by extracting the answer using regular expressions and comparing it to a provided correct answer, returning a score of 1.0 for a match and 0.0 otherwise, while handling cases with no answer found.' function_summary='The function `eval_string_based` is designed to evaluate a response string to determine if it contains a correct answer (A-D) based on a specified pattern. It extracts the answer from the response using regular expressions and compares it to the provided correct answer. The function returns a score of 1.0 for a correct match and 0.0 for an incorrect one. It effectively handles cases where no answer is found, ensuring robust evaluation. This function is part of the `mmlu` class, which may utilize various imports such as `platformdirs`, `textgrad.variable`, and `datasets` for enhanced functionality, although it does not directly interact with any specific fields or extensions.'>
# Pydantic model mirroring every column returned by
# get_function_hierarchy_and_details: name, objective and summary for the
# codebase, package, class and function levels. All twelve fields are required
# strings.
# NOTE(review): "Hiearchy" is a misspelling of "Hierarchy", but it is part of
# the public class name and is therefore left unchanged.
class CodeConfluenceFunctionHiearchy(BaseModel):
    # Codebase level
    codebase_name: str = Field(description="The name of the codebase")
    codebase_objective: str = Field(description="The objective of the codebase")
    codebase_summary: str = Field(description="The summary of the codebase")
    # Package level
    package_name: str = Field(description="The name of the package")
    package_objective: str = Field(description="The objective of the package")
    package_summary: str = Field(description="The summary of the package")
    # Class level
    class_name: str = Field(description="The name of the class")
    class_objective: str = Field(description="The objective of the class")
    class_summary: str = Field(description="The summary of the class")
    # Function level
    function_name: str = Field(description="The name of the function")
    function_objective: str = Field(description="The objective of the function")
    function_summary: str = Field(description="The summary of the function")

from typing import Optional


# Reduced view of the code hierarchy that is handed to the response generator.
# Every field defaults to None, so each one is now declared Optional: the
# hierarchy query uses OPTIONAL MATCH and can return nulls for the class,
# package and codebase columns, and a plain `str` field rejects an explicit
# None during validation.
# A `#` comment is used instead of a class docstring so the model's generated
# schema is not altered.
class CodeConfluenceFunctionHiearchySub(BaseModel):
    codebase_name: Optional[str] = Field(description="The name of the codebase",default=None)
    codebase_objective: Optional[str] = Field(description="The objective of the codebase",default=None)
    package_name: Optional[str] = Field(description="The name of the package",default=None)
    package_objective: Optional[str] = Field(description="The objective of the package",default=None)
    class_name: Optional[str] = Field(description="The name of the class",default=None)
    class_objective: Optional[str] = Field(description="The objective of the class",default=None)
    function_name: Optional[str] = Field(description="The name of the function",default=None)
    function_summary: Optional[str] = Field(description="The summary of the function",default=None)
    # Assigned after construction by the pipeline cell (rerank score).
    relevance_score: Optional[int] = Field(description="The relevance score of the function",default=None)

    

In [99]:
from typing import Optional
import dspy

# DSPy signature for answer generation: combines the user query, one code
# hierarchy object and the response accumulated so far into a final response.
# NOTE(review): DSPy uses a Signature's docstring and the field `desc` strings
# as prompt text, so their wording is runtime behaviour and is left untouched.
# NOTE(review): `existing_respone` is misspelled, but it is the field name
# callers pass as a keyword, so it must not be renamed here.
class CodeConfluenceUserQueryResponseSignature(dspy.Signature):
    """Generate a comprehensive response to the user query using the code hierarchy data."""
    user_query: str = dspy.InputField(desc="The user's original query.")
    code_hierarchy: CodeConfluenceFunctionHiearchySub = dspy.InputField(desc="The code hierarchy data relevant to the user query.")
    existing_respone : str = dspy.InputField(default="No existing response yet",desc="The existing response to the user query based on multiple code hiearchy. It will be empty in the first instance or if there is just one relevant code hiearchy for user query")
    final_response: str = dspy.OutputField(desc="final response based on user_query , code_hierarchy and existing_response if it exists")

class CodeConfluenceUserQueryResponseModule(dspy.Module):
    """DSPy module that drafts or refines an answer from one code hierarchy."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor built from the response signature.
        self.response_module = dspy.ChainOfThought(CodeConfluenceUserQueryResponseSignature)

    def forward(self, user_query: str, code_hierarchy: CodeConfluenceFunctionHiearchySub, existing_respone: Optional[str]):
        """Generate a response for ``user_query`` using ``code_hierarchy``.

        When ``existing_respone`` is ``None`` the placeholder text
        "No existing response yet" is sent instead. The predictor output is
        wrapped in a ``dspy.Prediction`` under ``answer``.
        """
        prior_response = "No existing response yet" if existing_respone is None else existing_respone
        prediction = self.response_module(
            user_query=user_query,
            code_hierarchy=code_hierarchy,
            existing_respone=prior_response,
        )
        return dspy.Prediction(answer=prediction)


# Shared module instance used by the pipeline cell.
response_module = CodeConfluenceUserQueryResponseModule()


In [106]:
from neo4j import GraphDatabase
import time

embedding_generator = UnoplatEmbeddingGenerator()
# rerank_embedding = UnoplatRerankEmbedding(sentence_transformer_model="dunzhang/stella_en_1.5B_v5")
rerank_module = CodeConfluenceUserQueryReRankModule()
litellm.set_verbose=False
# The `driver` created in the connection cell is reused here. Re-creating it
# duplicated the hard-coded password and left the previous driver unclosed.

for user_query in user_query_list:
    # Generate embedding for user query
    embedding_start = time.time()
    user_query_embedding = embedding_generator.generate_embeddings_for_single_text(user_query)
    embedding_time = time.time() - embedding_start

    user_intent_list: List[str] = intent_module(user_query=user_query).answer.user_intent

    # Only function-level questions are handled by this pipeline.
    if "FUNCTIONAL_IMPLEMENTATION" not in user_intent_list:
        continue

    # Search similar functions, then let the LM rerank them.
    search_start = time.time()
    # Sessions are opened in a `with` block so they are always closed; the
    # search helper materialises the records before the session exits.
    with driver.session() as session:
        results = search_similar_functions(session, user_query_embedding)
    function_objective_dict = {result["function_name"]: result["function_summary"] for result in results}
    print(function_objective_dict)
    rerank_results = rerank_module(user_query=user_query, possible_answers=function_objective_dict).answer.relevant_answers
    # Rerank scores run from 1 to 10; keep only strong matches.
    filtered_rerank_results = {k: v for k, v in rerank_results.items() if v > 7}
    search_time = time.time() - search_start

    # Get hierarchy for every retrieved function that survived the rerank filter.
    hierarchy_start = time.time()
    # Previously `context = Dict[...]` bound the typing alias itself to the name.
    context: Dict[str, CodeConfluenceFunctionHiearchySub] = {}
    with driver.session() as session:
        for function_name in function_objective_dict:
            if function_name not in filtered_rerank_results:
                continue
            function_hierarchy = get_function_hierarchy_and_details(session, function_name)
            if function_hierarchy is None:
                # .single() returned no row for this function name.
                print(f"No hierarchy found for {function_name}; skipping.")
                continue
            # Drop null columns (OPTIONAL MATCH can produce them) so the model
            # falls back to its field defaults instead of failing validation.
            populated_fields = {k: v for k, v in function_hierarchy.items() if v is not None}
            function_hierarchy_object = CodeConfluenceFunctionHiearchySub(**populated_fields)
            function_hierarchy_object.relevance_score = filtered_rerank_results[function_name]
            context[function_name] = function_hierarchy_object
    hierarchy_time = time.time() - hierarchy_start

    # Generate final response, refining it once per relevant function.
    response_start = time.time()
    existing_response = None
    final_response = None
    for function_name, code_hierarchy in context.items():
        final_response = response_module(user_query=user_query, code_hierarchy=code_hierarchy, existing_respone=existing_response)
        # Feed back only the generated text. The module wraps the predictor
        # output in `answer`, and the text lives in its `final_response` field.
        existing_response = final_response.answer.final_response
    response_time = time.time() - response_start

    print(f"Embedding time: {embedding_time:.2f}s")
    print(f"Search time: {search_time:.2f}s")
    print(f"Hierarchy time: {hierarchy_time:.2f}s")
    print(f"Response time: {response_time:.2f}s")
    print(f"Total time: {embedding_time + search_time + hierarchy_time + response_time:.2f}s")
    print("Final response:")
    if final_response is None:
        # Nothing passed the relevance filter; the old code crashed here on "".answer.
        print("No sufficiently relevant function was found for this query.")
    else:
        print(final_response.answer)

       


    

Prediction(
    reasoning='The user is looking for a function designed for handling datasets specifically from the BIG-Bench Hard challenge, which indicates a need for a detailed understanding at the function level. Therefore, the user intent is likely to be related to functional implementation.',
    user_intent=['FUNCTIONAL_IMPLEMENTATION']
)
{'textgrad.tasks.BigBenchHard.__init__': 'The `__init__` function initializes an instance of the `BigBenchHard` class, which is designed for handling datasets specifically from the BIG-Bench Hard challenge. This function requires a `task_name` parameter to specify the task being addressed. Additionally, it accepts an optional `root` directory to define where the dataset is located and a `split` type that defaults to "train". The function performs validation checks to ensure that the `root` directory is valid and that the specified `split` is one of the acceptable options. It constructs the path to the dataset file based on the provided parameter