In [None]:
import os
from edoc.gpt_helpers.gpt_basics import create_chat_completion
from pydantic import BaseModel, Field
from typing import List, Optional

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment; a later cell
# reads NEO4J_USERNAME / NEO4J_PASSWORD from it via os.getenv.
load_dotenv()

#### First step is to set up retrievers for the unstructured lookup. This will make use of the vector index.

- create extraction for named code entities in the text
- create vector lookup

In [None]:
class ProgrammingNamedEntities(BaseModel):
    """Structured-output schema for code entities mentioned in free text.

    Used with ``llm.with_structured_output`` so the model's answer is parsed
    into this shape.
    """

    # Typed as List[str] (rather than a bare List) so the generated JSON schema
    # declares string items and validation rejects non-string entries.
    # default_factory gives every instance its own empty list.
    entities: List[str] = Field(
        default_factory=list,
        description="Extracted programming specific named entities, such as named directories, "
        "files, functions, classes, or imports in a single list (name matters only).",
    )

def extract_code_entities(string_with_entities, model='gpt-4o-mini') -> list:
    """
    Extract programming-related named entities from unstructured text.

    Sends the text to an OpenAI chat model and asks for a structured answer that follows the
    ProgrammingNamedEntities schema (directories, files, imports, functions and classes).

    Args:
        string_with_entities (str): Unstructured text from which to extract entities.
        model (str): Name of the chat model to use.

    Returns:
        list: The extracted entity names as a single flat list. Empty when the input is
        None/blank or when the model produces no structured result.
    """
    # Nothing to extract from missing or whitespace-only input; skip the API call.
    if string_with_entities is None or (
        isinstance(string_with_entities, str) and not string_with_entities.strip()
    ):
        return []

    llm = ChatOpenAI(
        model_name=model
    )

    # Prompt that focuses the model on code-related entities only
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are extracting directories, files, imports, functions, and classes from the given text.",
            ),
            (
                "human",
                "Use the given format to extract information from the following input: {code_snippet}",
            ),
        ]
    )

    # Chain the prompt into the model, parsing the reply into ProgrammingNamedEntities
    entity_chain = prompt | llm.with_structured_output(ProgrammingNamedEntities)

    extraction = entity_chain.invoke({'code_snippet': string_with_entities})

    # The structured-output parser can yield None when the model returns nothing
    # parseable; treat that as "no entities" instead of raising AttributeError.
    if extraction is None:
        return []

    return extraction.entities

In [None]:
# Example: run the extractor on a chat-style message that mentions files,
# functions, classes and imports.
test_entity_string = """
"Hey, can you check the file src/utils/helpers.py? I think the process_data function is 
missing an import. You should import numpy and pandas at the beginning. Also, take a look 
at the DataProcessor class in models/data_processor.py. There's a bug in the transform_data 
method. Finally, the config/settings.json file might need an update to include 
the new API endpoint."
"""

programming_entities = extract_code_entities(test_entity_string)
print(programming_entities)

In [None]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from edoc.gpt_helpers.connect import connect_to_neo4j

# Neo4j connection settings; credentials are read from the environment.
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
URL = "bolt://localhost:7687"

# Hybrid (vector + keyword) retriever over the summaries of Chunk nodes.
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(model="text-embedding-3-small"),
    url=URL,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="chunkSummaryVectorIndex",
    # Name the full-text index explicitly. Without this, hybrid search falls back
    # to the library's default keyword index name, which would not match the
    # "code_summary_keyword" index requested later in this notebook for the same
    # label and text properties.
    keyword_index_name="code_summary_keyword",
    search_type="hybrid",
    node_label="Chunk",
    text_node_properties=["id", "summary"],
    embedding_node_property="summary_embedding"
)


In [None]:
# Free-text question to run against the chunk-summary index
question = """
What is this project about?
"""

# Keep only the text payload of the three best-matching documents
top_n_unstructured_data = [
    document.page_content
    for document in vector_index.similarity_search(question, k=3)
]

print('\n'.join(top_n_unstructured_data))

#### Extending this to work for any of our four indexes

In [None]:
def create_vector_index(vector_index_name, keyword_index_name, node_label, embedding_property, text_properties, model="text-embedding-3-small", search_type="hybrid"):
    """
    Build a Neo4jVector retriever on top of nodes that already exist in the graph.

    The connection details are taken from the module-level URL, NEO4J_USERNAME and
    NEO4J_PASSWORD values.

    Args:
        vector_index_name (str): Name of the vector index to use.
        keyword_index_name (str): Name of the keyword (full-text) index to use.
        node_label (str): Label of the nodes to index (e.g., 'Chunk', 'File', 'Directory').
        embedding_property (str): Node property that holds the embedding (e.g., 'summary_embedding').
        text_properties (list): Node text properties to include (e.g., ['id', 'summary', 'raw_code']).
        model (str): OpenAI embedding model name. Default is 'text-embedding-3-small'.
        search_type (str): Search mode forwarded to Neo4jVector (e.g., 'hybrid' or 'vector').
            Default is 'hybrid'.

    Returns:
        Neo4jVector: The vector index object.
    """
    embedder = OpenAIEmbeddings(model=model)

    connection = {
        "url": URL,
        "username": NEO4J_USERNAME,
        "password": NEO4J_PASSWORD,
    }
    index_settings = {
        "search_type": search_type,
        "index_name": vector_index_name,
        "keyword_index_name": keyword_index_name,
        "node_label": node_label,
        "text_node_properties": text_properties,
        "embedding_node_property": embedding_property,
    }

    return Neo4jVector.from_existing_graph(embedder, **connection, **index_settings)

def perform_similarity_search(vector_indexes, question, top_k=3):
    """
    Query every given vector index and collect the text of all hits.

    Args:
        vector_indexes (list): Neo4jVector objects to search, in the order they should be queried.
        question (str): The search query.
        top_k (int): Number of results requested from each index. Default is 3.

    Returns:
        list: The page_content strings of the hits, grouped by index in input order
        (so up to top_k entries per index, not top_k overall).
    """
    return [
        document.page_content
        for index in vector_indexes
        for document in index.similarity_search(question, k=top_k)
    ]


In [None]:
# Create vector indexes
# One retriever per (node label, text field) combination
chunk_summary_index = create_vector_index(
    vector_index_name="chunkSummaryVectorIndex",
    keyword_index_name="code_summary_keyword",
    node_label="Chunk",
    embedding_property="summary_embedding",
    text_properties=["id", "summary"],
)
chunk_raw_index = create_vector_index(
    vector_index_name="chunkRawVectorIndex",
    keyword_index_name="code_raw_keyword",
    node_label="Chunk",
    embedding_property="chunk_embedding",
    text_properties=["id", "raw_code"],
)
file_summary_index = create_vector_index(
    vector_index_name="fileSummaryVectorIndex",
    keyword_index_name="file_keyword",
    node_label="File",
    embedding_property="summary_embedding",
    text_properties=["path", "summary"],
)
dir_summary_index = create_vector_index(
    vector_index_name="dirSummaryVectorIndex",
    keyword_index_name="dir_keyword",
    node_label="Directory",
    embedding_property="summary_embedding",
    text_properties=["path", "summary"],
)


In [None]:
# Example 1: query both Chunk indexes (summaries and raw code)
vector_indexes = [chunk_summary_index, chunk_raw_index]
results = perform_similarity_search(
    vector_indexes,
    "Can you tell me more about `ResumeSection`?",
    top_k=2,
)

print('\n'.join(results))

In [None]:
# Example 2: query every summary index (chunk, file and directory level)
vector_indexes = [chunk_summary_index, file_summary_index, dir_summary_index]
results = perform_similarity_search(
    vector_indexes,
    "Summarize the contents of the project please",
    top_k=2,
)

print('\n'.join(results))

In [None]:
# Example 3: query the directory-summary index on its own
vector_indexes = [dir_summary_index]
results = perform_similarity_search(
    vector_indexes,
    "Summarize the file system from the top",
    top_k=3,
)

print('\n'.join(results))