#### Create base graph by walking through the directory and finding files and dirs, adding them to the graph

In [None]:
import os
from datetime import datetime
from neo4j import GraphDatabase
from dotenv import load_dotenv
from edoc.connect import connect_to_neo4j

class CodebaseGraph:
    """
    Mirror a directory tree into Neo4j.

    Directories are stored as ``Directory`` nodes, files as ``File`` nodes, and
    each parent directory is linked to its children with a ``CONTAINS`` relationship.
    """

    def __init__(self, uri="bolt://localhost:7687", user=None, password=None, openai_api_key=None):
        """
        Initialize the CodebaseGraph with a connection to Neo4j.

        Args:
            uri (str): The URI of the Neo4j database. Defaults to localhost.
            user (str): Username for Neo4j. If None, loads from environment variable NEO4J_USERNAME.
            password (str): Password for Neo4j. If None, loads from environment variable NEO4J_PASSWORD.
            openai_api_key (str): Key needed to access OpenAI API. If None, loads from
                environment variable OPENAI_API_KEY.

        Raises:
            ValueError: If the Neo4j credentials or the OpenAI API key cannot be resolved
                from the arguments or the environment.
        """
        # Load environment variables (e.g. from a .env file) before reading them below
        load_dotenv()

        # Resolve connection settings: explicit arguments win over the environment
        self.uri = uri
        self.NEO4J_USER = user or os.getenv("NEO4J_USERNAME")
        self.NEO4J_PASSWORD = password or os.getenv("NEO4J_PASSWORD")
        self.OPENAI_API_KEY = openai_api_key or os.getenv("OPENAI_API_KEY")

        if not self.NEO4J_USER or not self.NEO4J_PASSWORD:
            raise ValueError("NEO4J_USERNAME and NEO4J_PASSWORD must be provided either as arguments or environment variables.")

        if not self.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY must be provided either as an argument or an environment variable.")

        # NOTE(review): connect_to_neo4j() is called without arguments, so the uri/user/password
        # resolved above are not forwarded here; presumably the helper reads its own
        # configuration from the environment -- confirm against edoc.connect.
        self.kg = connect_to_neo4j()

    @staticmethod
    def _stat_timestamps(stats):
        """Return (created, last_modified) ISO-8601 strings derived from an os.stat result."""
        created = datetime.fromtimestamp(stats.st_ctime).isoformat()
        last_modified = datetime.fromtimestamp(stats.st_mtime).isoformat()
        return created, last_modified

    def get_file_info(self, file_path):
        """
        Get basic information about a file: type, size, last modified date and creation date.

        Args:
            file_path (str): The path to the file.

        Returns:
            dict: A dictionary with the keys ``type`` (extension without the dot, empty
            string when there is none), ``size`` (bytes), ``last_modified`` and ``created``
            (naive local-time ISO-8601 strings).

        Note:
            ``created`` is taken from ``st_ctime``, which is the creation time on Windows
            but the time of the last metadata change on Unix.
        """
        stats = os.stat(file_path)
        created, last_modified = self._stat_timestamps(stats)

        return {
            "type": os.path.splitext(file_path)[1][1:],  # Extension without the leading dot
            "size": stats.st_size,
            "last_modified": last_modified,
            "created": created,
        }

    def create_graph(self, root_directory):
        """
        Traverse a directory and create a graph in Neo4j representing the directory structure and file information.

        Every query uses MERGE, so re-running over the same tree does not duplicate nodes
        or relationships; properties are only written when a node is first created.

        Args:
            root_directory (str): The root directory to start traversing from.
        """
        for root, dirs, files in os.walk(root_directory):

            # Create node for the directory currently being visited
            created, last_modified = self._stat_timestamps(os.stat(root))
            self.kg.query(
                """
                MERGE (dir:Directory {name: $dir_name, path: $path})
                ON CREATE SET dir.created = $created, dir.last_modified = $last_modified
                """,
                {
                    'dir_name': os.path.basename(root),
                    'path': root,
                    'created': created,
                    'last_modified': last_modified,
                }
            )

            # Create nodes for subdirectories and link them to the current directory
            for subdir_name in dirs:
                subdir_path = os.path.join(root, subdir_name)
                created, last_modified = self._stat_timestamps(os.stat(subdir_path))
                self.kg.query(
                    """
                        MERGE (subdir:Directory {name: $dir_name, path: $subdir_path})
                        ON CREATE SET subdir.created = $created, subdir.last_modified = $last_modified
                        WITH subdir
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(subdir)
                    """,
                    {
                        'dir_name': subdir_name,
                        'subdir_path': subdir_path,
                        'parent_path': root,
                        'created': created,
                        'last_modified': last_modified,
                    }
                )

            # Create nodes for files and link them to the current directory
            for file_name in files:
                file_path = os.path.join(root, file_name)
                file_info = self.get_file_info(file_path)
                self.kg.query(
                    """
                        MERGE (file:File {name: $file_name, path: $file_path})
                        ON CREATE SET file.type = $type, file.size = $size, file.last_modified = $last_modified, file.created = $created
                        WITH file
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(file)
                    """,
                    {
                        'file_name': file_name,
                        'file_path': file_path,
                        'type': file_info['type'],
                        'size': file_info['size'],
                        'last_modified': file_info['last_modified'],
                        'created': file_info['created'],
                        'parent_path': root,
                    }
                )


In [None]:
# Example usage
# Constructing CodebaseGraph loads credentials from the environment and opens the
# Neo4j connection; create_graph then walks the given directory and writes it to the graph.
# NOTE(review): the path below is an absolute, machine-specific Windows path --
# change it to a directory that exists on your machine before running this cell.
codebase_graph = CodebaseGraph()
codebase_graph.create_graph("C:\\Users\\willd\\Documents\\Git\\graphRag\\test_project")

In [None]:
# Show the `schema` attribute of the graph connection after the import.
# NOTE(review): presumably a summary of node labels, properties and relationships --
# confirm against the object returned by edoc.connect.connect_to_neo4j.
print(codebase_graph.kg.schema)

#### Next, work on file enrichments. We will need to create functions to connect to the OpenAI API, as well as functions to perform embeddings and extractions

In [None]:
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load environment variables (via python-dotenv) and read the OpenAI key that the
# helper functions in this notebook pass to the OpenAI client.
# os.getenv returns None when the variable is not set.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

def create_chat_completion(messages, model='gpt-4o-mini'):
    """
    Send a conversation to the OpenAI chat completions endpoint and return the reply text.

    A new client is built on each call using the module-level OPENAI_API_KEY.

    Args:
        messages (list): Message dictionaries making up the conversation.
        model (str): Name of the OpenAI chat model. Defaults to 'gpt-4o-mini'.

    Returns:
        str: The message content of the first choice in the API response.
    """
    api_client = OpenAI(api_key=OPENAI_API_KEY)
    completion = api_client.chat.completions.create(messages=messages, model=model)
    # Only the first returned choice is used
    first_choice = completion.choices[0]
    return first_choice.message.content

def summarize_file_chunk(chunk_text, file_name, model='gpt-4o-mini'):
    """
    Ask the OpenAI chat model for a short, plain-language summary of a piece of a file.

    Args:
        chunk_text (str): The text to summarize.
        file_name (str): Name of the file the text came from; included in the prompt.
        model (str): Name of the OpenAI chat model. Defaults to 'gpt-4o-mini'.

    Returns:
        str: The summary text returned by create_chat_completion.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    # The user message carries both the file name and the raw chunk text
    user_message = {
        "role": "user",
        "content": f"I have this text from a file named {file_name}. The text is:\n{chunk_text}\nPlease summarize it in simple terms. Try to keep the summary brief, but maintain clarity.",
    }
    return create_chat_completion(messages=[system_message, user_message], model=model)


In [None]:
# Sample input for summarize_file_chunk: a file name plus a small Python snippet,
# held as plain text (it is never executed in this notebook).
# NOTE(review): test_chunk is not an f-string, so the doubled braces in the snippet's
# print line are kept as literal "{{" / "}}" characters in the text sent for summarization.
test_file_name = "utils.py"
test_chunk = """
import os
from dotenv import load_dotenv

def process_file(input_path, output_path):
    with open(input_path, 'r') as f:
        content = f.read()
    
    with open(output_path, 'w') as f:
        f.write(content.upper())

    print(f"Processed {{input_path}} and saved results to {{output_path}}")

"""

In [None]:
# Smoke test: summarize the sample chunk (this makes a live OpenAI API call) and print the result.
test_summary = summarize_file_chunk(file_name=test_file_name, chunk_text=test_chunk)
print(test_summary)

#### Up next we will create embeddings from a summary

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """
    Embed a piece of text with an OpenAI embedding model.

    A new client is built on each call using the module-level OPENAI_API_KEY.

    Args:
        text (str): The text to embed. Newline characters are replaced with spaces
            before the request is sent.
        model (str): Name of the OpenAI embedding model. Defaults to 'text-embedding-3-small'.

    Returns:
        list: The embedding vector of the input text, as a list of floats.
    """
    api_client = OpenAI(api_key=OPENAI_API_KEY)
    single_line = text.replace("\n", " ")
    # The text is sent as a one-element batch, so the only result is at index 0
    result = api_client.embeddings.create(input=[single_line], model=model)
    return result.data[0].embedding

In [None]:
# Smoke test: embed the summary produced earlier (live OpenAI API call).
test_embedding = get_embedding(text=test_summary)
# Last expression of the cell: the notebook displays the number of elements in the vector.
len(test_embedding)

### On to creating a structured output to use with LLMs. The goal is to take code and extract imports, functions, and classes. We use [this](https://medium.com/neo4j/enhancing-the-accuracy-of-rag-applications-with-knowledge-graphs-ad5e2ffab663) for inspiration.

In [None]:
from pydantic import BaseModel, Field
from typing import List, Optional

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# LangChain chat model used by the structured-extraction chain in this cell.
# NOTE(review): no API key is passed explicitly; ChatOpenAI presumably picks up
# OPENAI_API_KEY from the environment -- confirm.
llm=ChatOpenAI(
    model_name='gpt-4o-mini'
)

# Define a model to capture code entities
# Output schema for the extraction chain: three optional lists of free-form dicts.
# The entries are typed only as `dict`, so their keys are whatever the model returns;
# code consuming them should not assume that specific keys are present.
# NOTE(review): the class docstring and the Field descriptions are presumably included
# in the schema handed to the model by with_structured_output, so treat their wording
# as part of the prompt -- confirm before editing them.
class CodeEntities(BaseModel):
    """Identifying information about code entities."""
    
    # Import statements found in the snippet
    imports: Optional[List[dict]] = Field(
        default=[],
        description="All the import statements in the code, with the module "
        "and specific entities being imported.",
    )
    # Functions found in the snippet
    functions: Optional[List[dict]] = Field(
        default=[],
        description="All the function names in the code, including their parameters "
        "and types if available.",
    )
    # Classes found in the snippet
    classes: Optional[List[dict]] = Field(
        default=[],
        description="All the class names in the code, including their parameters "
        "and types if available.",
    )

# Prompt for extracting code entities: a system message describing what to extract and
# a human message that receives the code through the {code_snippet} template variable.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting imports, function names, and class names from the given code. "
            "For imports, provide the module and specific entities being imported. "
            "For functions and classes, include their parameters and types if available.",
        ),
        (
            "human",
            "Use the given format to extract information from the following input: {code_snippet}",
        ),
    ]
)

# Set up the chain to extract the structured output: the prompt is piped into the
# model, which is configured to return its answer as a CodeEntities instance.
# Invoke it with a dict containing the 'code_snippet' key.
entity_chain = prompt | llm.with_structured_output(CodeEntities)


In [None]:
# Sample input for entity_chain: a small Java class held as plain text. It contains
# two imports, one class with a constructor, and one method with typed parameters.
entity_example = """
import java.util.List;
import java.util.Optional;

public class ExampleClass {
    private String param1;
    private int param2;

    public ExampleClass(String param1, int param2) {
        this.param1 = param1;
        this.param2 = param2;
    }

    public boolean exampleMethod(List<String> arg1, Optional<Integer> arg2) {
        return arg2.isPresent();
    }
}

"""

In [None]:
# Run the extraction chain on the sample snippet (live LLM call) and print the result.
test_entities = entity_chain.invoke({'code_snippet': entity_example})


def _format_parameters(parameters):
    """Render parameter entries as 'name: type' pairs, tolerating missing keys.

    The extracted entries are untyped dicts produced by the model, so 'name' and
    'type' are not guaranteed to exist; missing values are shown as 'Unknown'.
    """
    rendered = []
    for param in parameters or []:
        if isinstance(param, dict):
            rendered.append(f"{param.get('name', 'Unknown')}: {param.get('type', 'Unknown')}")
        else:
            # The model may return a bare string instead of a dict
            rendered.append(str(param))
    return ", ".join(rendered)


# Each list on CodeEntities is Optional, so fall back to an empty list when it is None.
print("Imports:")
for imp in test_entities.imports or []:
    module = imp.get("module", "Unknown")
    entities = imp.get("entities") or []
    if isinstance(entities, str):
        # Avoid joining a single string character by character
        entities = [entities]
    imported_entities = ", ".join(str(entity) for entity in entities) or "all entities"
    print(f"  - {module}: {imported_entities}")

print("\nFunctions:")
for func in test_entities.functions or []:
    name = func.get("name", "Unnamed function")
    parameters = _format_parameters(func.get("parameters"))
    return_type = func.get("return_type", "Unknown")
    print(f"  - {name}({parameters}) -> {return_type}")

print("\nClasses:")
for cls in test_entities.classes or []:
    name = cls.get("name", "Unnamed class")
    parameters = _format_parameters(cls.get("parameters"))
    print(f"  - {name}({parameters})")