#### Create base graph by walking through the directory and finding files and dirs, adding them to the graph

In [None]:
import os
from datetime import datetime
from neo4j import GraphDatabase
from dotenv import load_dotenv
from edoc.connect import connect_to_neo4j

class CodebaseGraph:
    """Load a directory tree into Neo4j as ``Directory`` and ``File`` nodes."""

    def __init__(self, root_directory, uri="bolt://localhost:7687", user=None, password=None, openai_api_key=None):
        """
        Initialize the CodebaseGraph with a connection to Neo4j.

        Args:
            root_directory (str): The directory whose tree is walked by ``create_graph``.
            uri (str): The URI of the Neo4j database. Defaults to localhost.
            user (str): Username for Neo4j. If None, loads from environment variable NEO4J_USERNAME.
            password (str): Password for Neo4j. If None, loads from environment variable NEO4J_PASSWORD.
            openai_api_key (str): Key needed to access OpenAI API. If None, loads from
                environment variable OPENAI_API_KEY.

        Raises:
            ValueError: If the Neo4j credentials or the OpenAI API key cannot be resolved.
        """
        # Load environment variables
        load_dotenv()

        # Explicit arguments win; the environment is only a fallback.
        self.uri = uri
        self.NEO4J_USER = user or os.getenv("NEO4J_USERNAME")
        self.NEO4J_PASSWORD = password or os.getenv("NEO4J_PASSWORD")
        self.OPENAI_API_KEY = openai_api_key or os.getenv("OPENAI_API_KEY")

        self.root_directory = root_directory

        if not self.NEO4J_USER or not self.NEO4J_PASSWORD:
            raise ValueError("NEO4J_USERNAME and NEO4J_PASSWORD must be provided either as arguments or environment variables.")

        if not self.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY must be provided either as an argument or environment variable.")

        # NOTE(review): uri/user/password are stored on the instance but not
        # passed to connect_to_neo4j(); presumably it reads its own
        # configuration -- confirm against edoc.connect.
        self.kg = connect_to_neo4j()

    @staticmethod
    def _stat_times(stats):
        """Return ``(created, last_modified)`` ISO-8601 strings for an ``os.stat`` result."""
        # st_ctime is creation time on Windows but metadata-change time on
        # most Unix systems, so "created" is only approximate there.
        created = datetime.fromtimestamp(stats.st_ctime).isoformat()
        last_modified = datetime.fromtimestamp(stats.st_mtime).isoformat()
        return created, last_modified

    def get_file_info(self, file_path):
        """
        Get information about a file: type (extension), size, last modified date and creation date.

        Args:
            file_path (str): The path to the file.

        Returns:
            dict: A dictionary with the keys ``type``, ``size``, ``last_modified`` and ``created``.

        Raises:
            OSError: If the file cannot be stat'ed.
        """
        stats = os.stat(file_path)
        created, last_modified = self._stat_times(stats)

        return {
            "type": os.path.splitext(file_path)[1][1:],  # Extension without the dot
            "size": stats.st_size,
            "last_modified": last_modified,
            "created": created,
        }

    def _load_dirs_and_files_to_graph(self):
        """
        Traverse ``self.root_directory`` and create a graph in Neo4j representing the
        directory structure and file information.

        Every directory and file becomes a node; a ``CONTAINS`` relationship links each
        parent directory to its children. Timestamps and file details are written with
        ``ON CREATE SET``, so they are only stored when a node is first created.
        """
        for root, dirs, files in os.walk(self.root_directory):

            # normpath drops a trailing separator so a root passed as "src/"
            # is named "src" instead of the empty string.
            dir_name = os.path.basename(os.path.normpath(root))
            created, last_modified = self._stat_times(os.stat(root))

            # Create node for the directory
            self.kg.query(
                """
                MERGE (dir:Directory {name: $dir_name, path: $path})
                ON CREATE SET dir.created = $created, dir.last_modified = $last_modified
                """,
                {
                    'dir_name': dir_name,
                    'path': root,
                    'created': created,
                    'last_modified': last_modified,
                }
            )

            # Create nodes for subdirectories
            for subdir_name in dirs:
                subdir_path = os.path.join(root, subdir_name)
                created, last_modified = self._stat_times(os.stat(subdir_path))
                self.kg.query(
                    """
                        MERGE (subdir:Directory {name: $dir_name, path: $subdir_path})
                        ON CREATE SET subdir.created = $created, subdir.last_modified = $last_modified
                        WITH subdir
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(subdir)
                    """,
                    {
                        'dir_name': subdir_name,
                        'subdir_path': subdir_path,
                        'parent_path': root,
                        'created': created,
                        'last_modified': last_modified,
                    }
                )

            # Create nodes for files
            for file_name in files:
                file_path = os.path.join(root, file_name)
                file_info = self.get_file_info(file_path)
                self.kg.query(
                    """
                        MERGE (file:File {name: $file_name, path: $file_path})
                        ON CREATE SET file.type = $type, file.size = $size, file.last_modified = $last_modified, file.created = $created
                        WITH file
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(file)
                    """,
                    {
                        'file_name': file_name,
                        'file_path': file_path,
                        'type': file_info['type'],
                        'size': file_info['size'],
                        'last_modified': file_info['last_modified'],
                        'created': file_info['created'],
                        'parent_path': root,
                    }
                )

    def create_graph(self):
        """Build the base graph of directories and files for ``self.root_directory``."""
        self._load_dirs_and_files_to_graph()

        


#### Next, work on file enrichments. We will need to create functions to connect to the OpenAI API, as well as functions to perform embeddings and extractions

In [None]:
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load variables from a .env file into the environment, then read the OpenAI
# key once so the helper functions in later cells can share it.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

def create_chat_completion(messages, model='gpt-4o-mini'):
    """
    Send a conversation to the OpenAI chat completions endpoint and return the reply text.

    Args:
        messages (list): Message dictionaries (``role`` / ``content``) making up the conversation.
        model (str): Name of the OpenAI model to query. Default is 'gpt-4o-mini'.

    Returns:
        str: The message content of the first choice in the API response.
    """
    # A new client is built on every call, authenticated with the module-level key.
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    completion = openai_client.chat.completions.create(messages=messages, model=model)
    first_choice = completion.choices[0]
    return first_choice.message.content

def summarize_file_chunk(chunk_text, file_name, model='gpt-4o-mini'):
    """
    Ask the OpenAI chat model for a short, plain-language summary of one file chunk.

    Args:
        chunk_text (str): The chunk of file text to summarize.
        file_name (str): Name of the file the chunk came from; included in the prompt.
        model (str): Name of the OpenAI model to query. Default is 'gpt-4o-mini'.

    Returns:
        str: The model's summary of the chunk.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {
        "role": "user",
        "content": f"I have this text from a file named {file_name}. The text is:\n{chunk_text}\nPlease summarize it in simple terms. Try to keep the summary brief, but maintain clarity.",
    }
    return create_chat_completion(messages=[system_message, user_message], model=model)


In [None]:
# Sample inputs for trying out summarize_file_chunk: a file name and a small
# Python snippet held as plain text.
test_file_name = "utils.py"
# This is a regular (non-f) string, so the doubled braces in the print line
# below stay in the sample text exactly as written.
test_chunk = """
import os
from dotenv import load_dotenv

def process_file(input_path, output_path):
    with open(input_path, 'r') as f:
        content = f.read()
    
    with open(output_path, 'w') as f:
        f.write(content.upper())

    print(f"Processed {{input_path}} and saved results to {{output_path}}")

"""

In [None]:
# Summarize the sample chunk (this calls the OpenAI API) and show the result.
test_summary = summarize_file_chunk(file_name=test_file_name, chunk_text=test_chunk)
print(test_summary)

#### Up next we will create embeddings from a summary

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """
    Embed a piece of text with an OpenAI embedding model.

    Args:
        text (str): Text to embed. Newlines are replaced with spaces before the request is sent.
        model (str): Name of the OpenAI embedding model. Default is 'text-embedding-3-small'.

    Returns:
        list: The embedding vector returned for the input text.
    """
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    single_line = text.replace("\n", " ")
    # The API takes a batch of inputs; one text is sent, so take the first result.
    response = openai_client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [None]:
# Embed the summary produced above and display the length of the vector.
test_embedding = get_embedding(text=test_summary)
len(test_embedding)

### On to creating a structured output to use with LLMs. The goal is to take code and extract imports, functions, and classes. We use [this](https://medium.com/neo4j/enhancing-the-accuracy-of-rag-applications-with-knowledge-graphs-ad5e2ffab663) for inspiration.

In [None]:
from pydantic import BaseModel, Field
from typing import List, Optional

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Shared chat model used by extract_code_entities for structured extraction.
llm = ChatOpenAI(model_name='gpt-4o-mini')

# One parameter of an extracted function or class; referenced by
# FunctionEntity.parameters and ClassEntity.parameters. Both fields are
# required strings.
class Parameter(BaseModel):
    """Model representing a function or class parameter."""
    name: str
    type: str

# One extracted function: its name, its parameters, and an optional return
# type that defaults to None when not supplied.
class FunctionEntity(BaseModel):
    """Model representing a function entity."""
    name: str
    parameters: List[Parameter]
    return_type: Optional[str] = None

# One extracted class: its name and a list of Parameter entries.
class ClassEntity(BaseModel):
    """Model representing a class entity."""
    name: str
    parameters: List[Parameter]

# One extracted import statement: the module plus the names imported from it.
class ImportEntity(BaseModel):
    """Model representing an import entity."""
    module: str
    entities: List[str]

# Top-level schema passed to llm.with_structured_output in
# extract_code_entities. Each field defaults to an empty list, so a snippet
# with no imports, functions or classes still validates.
# NOTE(review): the docstring and Field descriptions presumably become part of
# the schema sent to the model -- treat them as prompt text when editing.
class CodeEntities(BaseModel):
    """Identifying information about code entities."""
    
    imports: List[ImportEntity] = Field(
        default=[],
        description="All the import statements in the code, with the module "
        "and specific entities being imported.",
    )
    functions: List[FunctionEntity] = Field(
        default=[],
        description="All the function names in the code, including their parameters, returns, "
        "and types if available.",
    )
    classes: List[ClassEntity] = Field(
        default=[],
        description="All the class names in the code, including their parameters "
        "and types if available.",
    )

def extract_code_entities(code_string):
    """
    Extracts code entities from a given code string, including imports, function names, and class names.

    This function uses a language model to analyze the provided code string and extract relevant code entities.
    For imports, it identifies the module and specific entities being imported. For functions and classes, it
    captures their names, parameters, and types if available.

    Args:
        code_string (str): The code snippet as a string from which to extract entities.

    Returns:
        dict: The extracted entities as a plain dictionary with the keys ``imports``,
        ``functions`` and ``classes`` (the ``CodeEntities`` model converted to a dict).
    """
    # Modify the prompt to focus on extracting code entities
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are extracting imports, function names, and class names from the given code. "
                "For imports, provide the module and specific entities being imported. "
                "For functions and classes, include their parameters and types if available.",
            ),
            (
                "human",
                "Use the given format to extract information from the following input: {code_snippet}",
            ),
        ]
    )

    # Set up the chain to extract the structured output
    entity_chain = prompt | llm.with_structured_output(CodeEntities)

    entities = entity_chain.invoke({'code_snippet': code_string})

    # Pydantic v2 deprecates ``.dict()`` in favour of ``.model_dump()``; use the
    # new name when it exists and fall back for Pydantic v1 models.
    if hasattr(entities, "model_dump"):
        return entities.model_dump()
    return entities.dict()

In [None]:
# Sample input for extract_code_entities: a small Java class with two imports,
# a constructor and one method, held as plain text.
entity_example = """
import java.util.List;
import java.util.Optional;

public class ExampleClass {
    private String param1;
    private int param2;

    public ExampleClass(String param1, int param2) {
        this.param1 = param1;
        this.param2 = param2;
    }

    public boolean exampleMethod(List<String> arg1, Optional<Integer> arg2) {
        return arg2.isPresent();
    }
}

"""

In [None]:
# Run the extractor on the Java sample (this calls the LLM) and display the
# resulting dictionary as the cell output.
test_entities = extract_code_entities(entity_example)

test_entities

#### Now prepare to chunk files to be iterated through

In [None]:
from langchain.text_splitter import PythonCodeTextSplitter

# Split the sample into chunks using the same size/overlap values that
# CodebaseGraph later uses as its defaults.
# NOTE(review): entity_example is Java source while the splitter is the
# Python-specific one -- presumably just a smoke test; confirm.
text_splitter = PythonCodeTextSplitter(chunk_size=500, chunk_overlap=25)
chunks = text_splitter.split_text(entity_example)

In [None]:
# Display the chunks produced by the splitter in the previous cell.
chunks

In [None]:
def read_file_contents(file_path):
    """
    Read a file as UTF-8 text, reporting failures instead of raising.

    Args:
        file_path (str): Location of the file to read.

    Returns:
        str: The full text of the file, or None when the file could not be
        opened or decoded (the error is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            return source.read()
    except Exception as e:
        # Best-effort read: report the problem and let the caller skip the file.
        print(f"An error occurred while reading the file: {e}")
    return None

In [None]:
import os
import json
from datetime import datetime
from neo4j import GraphDatabase
from dotenv import load_dotenv
from edoc.connect import connect_to_neo4j
from tqdm import tqdm

from langchain.text_splitter import PythonCodeTextSplitter


class CodebaseGraph:
    """Load a directory tree into Neo4j and enrich its files with chunks and code entities."""

    def __init__(
            self,
            root_directory,
            uri="bolt://localhost:7687",
            user=None,
            password=None,
            openai_api_key=None,
            chunk_size=500,
            chunk_overlap=25
    ):
        """
        Initialize the CodebaseGraph with a connection to Neo4j.

        Args:
            root_directory (str): The directory whose tree is walked by ``create_graph``.
            uri (str): The URI of the Neo4j database. Defaults to localhost.
            user (str): Username for Neo4j. If None, loads from environment variable NEO4J_USERNAME.
            password (str): Password for Neo4j. If None, loads from environment variable NEO4J_PASSWORD.
            openai_api_key (str): Key needed to access OpenAI API. If None, loads from
                environment variable OPENAI_API_KEY.
            chunk_size (int): ``chunk_size`` passed to the text splitter when files are chunked.
            chunk_overlap (int): ``chunk_overlap`` passed to the text splitter.

        Raises:
            ValueError: If the Neo4j credentials or the OpenAI API key cannot be resolved.
        """
        # Load environment variables
        load_dotenv()

        # Explicit arguments win; the environment is only a fallback.
        self.uri = uri
        self.NEO4J_USER = user or os.getenv("NEO4J_USERNAME")
        self.NEO4J_PASSWORD = password or os.getenv("NEO4J_PASSWORD")
        self.OPENAI_API_KEY = openai_api_key or os.getenv("OPENAI_API_KEY")

        self.root_directory = root_directory

        if not self.NEO4J_USER or not self.NEO4J_PASSWORD:
            raise ValueError("NEO4J_USERNAME and NEO4J_PASSWORD must be provided either as arguments or environment variables.")

        if not self.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY must be provided either as an argument or environment variable.")

        # NOTE(review): uri/user/password are stored on the instance but not
        # passed to connect_to_neo4j(); presumably it reads its own
        # configuration -- confirm against edoc.connect.
        self.kg = connect_to_neo4j()

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    @staticmethod
    def _stat_times(stats):
        """Return ``(created, last_modified)`` ISO-8601 strings for an ``os.stat`` result."""
        # st_ctime is creation time on Windows but metadata-change time on
        # most Unix systems, so "created" is only approximate there.
        created = datetime.fromtimestamp(stats.st_ctime).isoformat()
        last_modified = datetime.fromtimestamp(stats.st_mtime).isoformat()
        return created, last_modified

    @staticmethod
    def _serialize_parameters(parameters):
        """Return a JSON string holding the ``name`` and ``type`` of each parameter dict."""
        return json.dumps([{'name': param['name'], 'type': param['type']} for param in parameters])

    def get_file_info(self, file_path):
        """
        Get information about a file: type (extension), size, last modified date and creation date.

        Args:
            file_path (str): The path to the file.

        Returns:
            dict: A dictionary with the keys ``type``, ``size``, ``last_modified`` and ``created``.

        Raises:
            OSError: If the file cannot be stat'ed.
        """
        stats = os.stat(file_path)
        created, last_modified = self._stat_times(stats)

        return {
            "type": os.path.splitext(file_path)[1][1:],  # Extension without the dot
            "size": stats.st_size,
            "last_modified": last_modified,
            "created": created,
        }

    def read_file_contents(self, file_path):
        """
        Opens a file and reads its contents as UTF-8 text.

        Args:
            file_path (str): The path to the file to be read.

        Returns:
            str: The contents of the file as a string, or None if the file could not be
            opened or decoded (the error is printed).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            # Best-effort read: files that are not UTF-8 text are reported and skipped.
            print(f"An error occurred while reading the file: {e}")
            return None

    def _load_dirs_and_files_to_graph(self):
        """
        Traverse ``self.root_directory`` and create a graph in Neo4j representing the
        directory structure and file information.

        Every directory and file becomes a node; a ``CONTAINS`` relationship links each
        parent directory to its children. Timestamps and file details are written with
        ``ON CREATE SET``, so they are only stored when a node is first created.
        """
        for root, dirs, files in os.walk(self.root_directory):

            # normpath drops a trailing separator so a root passed as "src/"
            # is named "src" instead of the empty string.
            dir_name = os.path.basename(os.path.normpath(root))
            created, last_modified = self._stat_times(os.stat(root))

            # Create node for the directory
            self.kg.query(
                """
                MERGE (dir:Directory {name: $dir_name, path: $path})
                ON CREATE SET dir.created = $created, dir.last_modified = $last_modified
                """,
                {
                    'dir_name': dir_name,
                    'path': root,
                    'created': created,
                    'last_modified': last_modified,
                }
            )

            # Create nodes for subdirectories
            for subdir_name in dirs:
                subdir_path = os.path.join(root, subdir_name)
                created, last_modified = self._stat_times(os.stat(subdir_path))
                self.kg.query(
                    """
                        MERGE (subdir:Directory {name: $dir_name, path: $subdir_path})
                        ON CREATE SET subdir.created = $created, subdir.last_modified = $last_modified
                        WITH subdir
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(subdir)
                    """,
                    {
                        'dir_name': subdir_name,
                        'subdir_path': subdir_path,
                        'parent_path': root,
                        'created': created,
                        'last_modified': last_modified,
                    }
                )

            # Create nodes for files
            for file_name in files:
                file_path = os.path.join(root, file_name)
                file_info = self.get_file_info(file_path)
                self.kg.query(
                    """
                        MERGE (file:File {name: $file_name, path: $file_path})
                        ON CREATE SET file.type = $type, file.size = $size, file.last_modified = $last_modified, file.created = $created
                        WITH file
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(file)
                    """,
                    {
                        'file_name': file_name,
                        'file_path': file_path,
                        'type': file_info['type'],
                        'size': file_info['size'],
                        'last_modified': file_info['last_modified'],
                        'created': file_info['created'],
                        'parent_path': root,
                    }
                )

    def _store_chunks(self, file_path, chunks):
        """
        Store each chunk of one file as a ``Chunk`` node and gather the entities found in it.

        Args:
            file_path (str): Path of the file the chunks belong to.
            chunks (list): Chunk texts in file order.

        Returns:
            tuple: ``(imports, functions, classes)`` -- ``imports`` maps a module name to the
            set of names imported from it; ``functions`` and ``classes`` map a name to the
            properties of its first occurrence in the file.
        """
        unique_imports = {}
        unique_functions = {}
        unique_classes = {}

        for idx, chunk in enumerate(chunks):
            chunk_id = f"{file_path}_chunk_{idx}"
            chunk_summary = summarize_file_chunk(chunk_text=chunk, file_name=file_path)
            summary_embedding = get_embedding(chunk_summary)
            chunk_embedding = get_embedding(chunk)

            # Create the chunk node and link it to the file. The numeric index
            # is stored so chunks can later be ordered by position; the id is
            # a string and would sort "_chunk_10" before "_chunk_2".
            self.kg.query("""
                MERGE (chunk:Chunk {id: $chunk_id})
                SET chunk.index = $chunk_index,
                    chunk.raw_code = $raw_code,
                    chunk.summary = $summary,
                    chunk.summary_embedding = $summary_embedding,
                    chunk.chunk_embedding = $chunk_embedding
                WITH chunk
                MATCH (file:File {path: $file_path})
                MERGE (file)-[:CONTAINS]->(chunk)
            """, {
                'chunk_id': chunk_id,
                'chunk_index': idx,
                'raw_code': chunk,
                'summary': chunk_summary,
                'summary_embedding': summary_embedding,
                'chunk_embedding': chunk_embedding,
                'file_path': file_path
            })

            # Extract and process code entities for each chunk
            chunk_entities = extract_code_entities(chunk)

            # Imports of the same module are merged across chunks.
            for imp in chunk_entities['imports']:
                unique_imports.setdefault(imp['module'], set()).update(imp['entities'])

            # For functions and classes the first occurrence of a name wins.
            for func in chunk_entities['functions']:
                if func['name'] not in unique_functions:
                    unique_functions[func['name']] = {
                        'parameters': self._serialize_parameters(func['parameters']),
                        'return_type': func['return_type']
                    }

            for cls in chunk_entities['classes']:
                if cls['name'] not in unique_classes:
                    unique_classes[cls['name']] = {
                        'parameters': self._serialize_parameters(cls['parameters'])
                    }

        return unique_imports, unique_functions, unique_classes

    def _store_entities(self, file_path, unique_imports, unique_functions, unique_classes):
        """Create ``Import``, ``Function`` and ``Class`` nodes and link them to the file."""
        # Create and link unique import nodes
        for module, entities in unique_imports.items():
            self.kg.query("""
                MERGE (import:Import {module: $module})
                SET import.entities = $entities
                WITH import
                MATCH (file:File {path: $file_path})
                MERGE (file)-[:CALLS]->(import)
            """, {
                'module': module,
                # Sorted so the stored list does not depend on set iteration order.
                'entities': sorted(entities),
                'file_path': file_path
            })

        # Create and link unique function nodes
        for name, func in unique_functions.items():
            self.kg.query("""
                MERGE (function:Function {name: $name})
                SET function.parameters = $parameters, function.return_type = $return_type
                WITH function
                MATCH (file:File {path: $file_path})
                MERGE (file)-[:DEFINES]->(function)
            """, {
                'name': name,
                'parameters': func['parameters'],
                'return_type': func['return_type'],
                'file_path': file_path
            })

        # Create and link unique class nodes
        for name, cls in unique_classes.items():
            self.kg.query("""
                MERGE (class:Class {name: $name})
                SET class.parameters = $parameters
                WITH class
                MATCH (file:File {path: $file_path})
                MERGE (file)-[:DEFINES]->(class)
            """, {
                'name': name,
                'parameters': cls['parameters'],
                'file_path': file_path
            })

    def _link_chunks(self, file_path):
        """Chain the chunks of one file with ``NEXT`` relationships in chunk order."""
        self.kg.query("""
            MATCH (file:File {path: $file_path})-[:CONTAINS]->(chunk:Chunk)
            WITH chunk ORDER BY chunk.index ASC
            WITH collect(chunk) AS chunks
            CALL apoc.nodes.link(chunks, 'NEXT')
            RETURN count(*)
        """, {
            'file_path': file_path
        })

    def _enrich_graph(self):
        """
        Enriches the knowledge graph by processing files, creating and linking code chunks, and extracting unique code entities.

        Only files that have no ``Chunk`` node yet are processed, and files whose contents
        cannot be read as text are skipped.
        """
        text_splitter = PythonCodeTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

        # Query to find files without chunk nodes
        query = """
        MATCH (file:File)
        WHERE NOT (file)-[:CONTAINS]->(:Chunk)
        RETURN file.path AS file_path
        """

        result = self.kg.query(query)
        file_paths = [record['file_path'] for record in result]

        for file_path in tqdm(file_paths):
            file_contents = self.read_file_contents(file_path)
            if file_contents is None:
                continue

            chunks = text_splitter.split_text(file_contents)
            imports, functions, classes = self._store_chunks(file_path, chunks)
            self._store_entities(file_path, imports, functions, classes)
            self._link_chunks(file_path)

    def create_graph(self):
        """Build the directory/file graph, then enrich files with chunks and code entities."""
        self._load_dirs_and_files_to_graph()

        self._enrich_graph()

        


#### The final step will be to go through the graph, look at files with summaries, and then use those summaries to summarize files. This aggregation process can then be applied to all files to summarize directories. After this we need to create a vector index for the code snippets and the summaries

In [None]:
def summarize_list_of_summaries(summaries, model='gpt-4o-mini'):
    """
    Condense several lower-level summaries into one higher-level summary using OpenAI's language model.

    Args:
        summaries (list[str]): Existing summaries (of chunks, files or subdirectories) to merge.
            They are joined with newlines and used as the context of the prompt.
        model (str): The OpenAI model to use. Default is 'gpt-4o-mini'.

    Returns:
        str: The model's combined summary of the given context.
    """
    context = '\n'.join(summaries)

    # The instructions are kept verbatim; only the joined context is interpolated.
    user_content = f"""I would like you to summarize some information for me. The goal is to condense understanding across a file system.
            It may include chunk summaries (which are smaller sections of a file), file summaries, or subdirectory summareis. 
            The goal is to gain global understanding by merging themes at higher levels.
            Please use the given context to make a summary, no need to confirm. Simply return simple and brief summaries based on context. The context is: {context}."""

    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": user_content}
    return create_chat_completion(messages=[system_message, user_message], model=model)

In [None]:
import os
import json
from datetime import datetime
from neo4j import GraphDatabase
from dotenv import load_dotenv
from edoc.connect import connect_to_neo4j
from tqdm import tqdm

from langchain.text_splitter import PythonCodeTextSplitter


class CodebaseGraph:
    def __init__(
            self,
            root_directory,
            uri="bolt://localhost:7687",
            user=None,
            password=None,
            openai_api_key=None,
            chunk_size=500,
            chunk_overlap=25
    ):
        """
        Store the configuration, validate credentials and connect to Neo4j.

        Args:
            root_directory (str): Directory whose contents are loaded into the graph.
            uri (str): The URI of the Neo4j database. Defaults to localhost.
            user (str): Username for Neo4j. If None, loads from environment variable NEO4J_USERNAME.
            password (str): Password for Neo4j. If None, loads from environment variable NEO4J_PASSWORD.
            openai_api_key (str): Key needed to access OpenAI API. If None, loads from
                environment variable OPENAI_API_KEY.
            chunk_size (int): Chunk size handed to the text splitter in `_enrich_graph`.
            chunk_overlap (int): Chunk overlap handed to the text splitter in `_enrich_graph`.

        Raises:
            ValueError: If the Neo4j credentials or the OpenAI API key cannot be
                resolved from the arguments or the environment.
        """
        # Load environment variables (e.g. from a .env file) before reading them
        load_dotenv()

        self.root_directory = root_directory
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Explicit arguments win; environment variables are the fallback
        self.uri = uri
        self.NEO4J_USER = user or os.getenv("NEO4J_USERNAME")
        self.NEO4J_PASSWORD = password or os.getenv("NEO4J_PASSWORD")
        self.OPENAI_API_KEY = openai_api_key or os.getenv("OPENAI_API_KEY")

        if not self.NEO4J_USER or not self.NEO4J_PASSWORD:
            raise ValueError("NEO4J_USERNAME and NEO4J_PASSWORD must be provided either as arguments or environment variables.")

        if not self.OPENAI_API_KEY:
            # Previously this reported the Neo4j variables, pointing users at the wrong setting.
            raise ValueError("OPENAI_API_KEY must be provided either as an argument or environment variable.")

        # NOTE(review): connect_to_neo4j() is called without arguments, so the
        # uri/user/password resolved above are not forwarded by this code;
        # presumably the helper reads its own configuration - TODO confirm.
        self.kg = connect_to_neo4j()
        

    def get_file_info(self, file_path):
        """
        Get information about a file, including type, size, last modified date, creation date, permissions, owner, and hash.

        Args:
            file_path (str): The path to the file.

        Returns:
            dict: A dictionary containing file information.
        """
        stats = os.stat(file_path)
        file_type = os.path.splitext(file_path)[1][1:]  # Get file extension without the dot
        size = stats.st_size
        last_modified = datetime.fromtimestamp(stats.st_mtime).isoformat()
        created = datetime.fromtimestamp(stats.st_ctime).isoformat()


        return {
            "type": file_type,
            "size": size,
            "last_modified": last_modified,
            "created": created,
        }
    
    def read_file_contents(self, file_path):
        """
        Opens a file and reads its contents as text.

        Args:
            file_path (str): The path to the file to be read.

        Returns:
            str: The contents of the file as a string.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                contents = file.read()
            return contents
        except Exception as e:
            print(f"An error occurred while reading the file: {e}")
            return None
        
    def _should_skip_file(self, file_path):
        """
        Determine if a file should be skipped based on its type or size.

        Args:
            file_path (str): The path to the file.

        Returns:
            bool: True if the file should be skipped, False otherwise.
        """
        # Get the file extension
        _, ext = os.path.splitext(file_path)

        # Define file types to skip
        skip_extensions = {
            '.lock', '.png', '.jpg', '.gif', '.pdf', '.zip', '.class', '.o', '.out',
            '.md', '.rst', '.csv', '.tsv'
        }

        # Define directories to skip
        skip_directories = {'node_modules', '.git', '.svn'}

        # Check if the file extension is in the skip list
        if ext.lower() in skip_extensions:
            return True

        # Check if the file is in a directory that should be skipped
        if any(skip_dir in file_path for skip_dir in skip_directories):
            return True

        # Optionally, skip large files (e.g., > 5MB)
        if os.path.getsize(file_path) > 5 * 1024 * 1024:
            return True

        return False


    def _load_dirs_and_files_to_graph(self):
        """
        Traverse a directory and create a graph in Neo4j representing the directory structure and file information.

        Args:
            root_directory (str): The root directory to start traversing from.
        """
        print("Creating file and dir nodes from Walk")
        for root, dirs, files in os.walk(self.root_directory):

            dir_name = os.path.basename(root)

            # Create node for the directory
            self.kg.query(
                """
                MERGE (dir:Directory {name: $dir_name, path: $path})
                ON CREATE SET dir.created = $created, dir.last_modified = $last_modified
                """,
                {
                    'dir_name':dir_name,
                    'path':root,
                    'created':datetime.fromtimestamp(os.stat(root).st_ctime).isoformat(),
                    'last_modified':datetime.fromtimestamp(os.stat(root).st_mtime).isoformat(),
                }
            )

            # Create nodes for subdirectories
            for dir_name in dirs:
                dir_path = os.path.join(root, dir_name)
                self.kg.query(
                    """
                        MERGE (subdir:Directory {name: $dir_name, path: $subdir_path})
                        ON CREATE SET subdir.created = $created, subdir.last_modified = $last_modified
                        WITH subdir
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(subdir)
                    """,
                    {
                        'dir_name':dir_name,
                        'subdir_path':dir_path, 
                        'parent_path':root,
                        'created':datetime.fromtimestamp(os.stat(dir_path).st_ctime).isoformat(),
                        'last_modified':datetime.fromtimestamp(os.stat(dir_path).st_mtime).isoformat(),
                    }
                )

            # Create nodes for files
            for file_name in files:
                file_path = os.path.join(root, file_name)
                file_info = self.get_file_info(file_path)
                self.kg.query(
                    """
                        MERGE (file:File {name: $file_name, path: $file_path})
                        ON CREATE SET file.type = $type, file.size = $size, file.last_modified = $last_modified, file.created = $created
                        WITH file
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(file)
                    """,
                    {
                        'file_name':file_name, 
                        'file_path':file_path, 
                        'type':file_info['type'], 
                        'size':file_info['size'], 
                        'last_modified':file_info['last_modified'], 
                        'created':file_info['created'], 
                        'parent_path':root
                    }
                )

    def _enrich_graph(self):
        """
        Enriches the knowledge graph by processing files, creating and linking code chunks, and extracting unique code entities.

        Steps, per `File` node that has no `CONTAINS` relationship to a `Chunk`
        and is not rejected by `_should_skip_file`:

        1. Read the file and split it with `PythonCodeTextSplitter`.
        2. For every chunk: summarize it, embed the summary and the raw chunk,
           MERGE a `Chunk` node (id `<file path>_chunk_<6-digit index>`) and
           attach it to the file.
        3. Collect imports, functions and classes reported by
           `extract_code_entities` across the file's chunks, de-duplicated by
           module / name (the first function or class of a given name wins).
        4. MERGE `Import`, `Function` and `Class` nodes and link them to the file.
        5. Link the file's chunks in id order with `NEXT` relationships.

        Files that `read_file_contents` could not read (None) are left untouched.

        NOTE(review): `Import`, `Function` and `Class` nodes are MERGEd on
        module / name alone, so equal names in different files resolve to the
        same node and the `SET` clauses overwrite its properties.
        """
        text_splitter = PythonCodeTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

        # Query to find files without chunk nodes
        query = """
        MATCH (file:File)
        WHERE NOT (file)-[:CONTAINS]->(:Chunk)
        RETURN file.path AS file_path
        """

        result = self.kg.query(query)
        file_paths = [record['file_path'] for record in result]

        # Drop binary/doc/vendored/oversized files before doing any LLM work
        file_paths = [file for file in file_paths if not self._should_skip_file(file)]

        for file in tqdm(file_paths, desc='Creating chunks from files'):
            file_contents = self.read_file_contents(file)

            # None means the file could not be read; it is silently skipped
            if file_contents is not None:
                chunks = text_splitter.split_text(file_contents)

                # Initialize collections for unique entities
                unique_imports = {}
                unique_functions = {}
                unique_classes = {}

                for idx, chunk in enumerate(chunks):
                    # Zero-padded index keeps string ordering of ids equal to chunk order
                    chunk_id = f"{file}_chunk_{idx:06d}"
                    chunk_summary = summarize_file_chunk(chunk_text=chunk, file_name=file)
                    summary_embedding = get_embedding(chunk_summary)
                    chunk_embedding = get_embedding(chunk)

                    # Create the chunk node and link it to the file
                    self.kg.query("""
                        MERGE (chunk:Chunk {id: $chunk_id})
                        SET chunk.raw_code = $raw_code, 
                            chunk.summary = $summary, 
                            chunk.summary_embedding = $summary_embedding, 
                            chunk.chunk_embedding = $chunk_embedding
                        WITH chunk
                        MATCH (file:File {path: $file_path})
                        MERGE (file)-[:CONTAINS]->(chunk)
                    """, {
                        'chunk_id': chunk_id,
                        'raw_code': chunk,
                        'summary': chunk_summary,
                        'summary_embedding': summary_embedding,
                        'chunk_embedding': chunk_embedding,
                        'file_path': file
                    })

                    # Extract and process code entities for each chunk
                    # (the result is indexed below by 'imports', 'functions' and 'classes')
                    chunk_entities = extract_code_entities(chunk)

                    # Collect unique imports: one set of imported entities per module
                    for imp in chunk_entities['imports']:
                        module_name = imp['module']
                        if module_name not in unique_imports:
                            unique_imports[module_name] = set(imp['entities'])
                        else:
                            unique_imports[module_name].update(imp['entities'])

                    # Collect unique functions; parameters are stored as a JSON string
                    for func in chunk_entities['functions']:
                        func_name = func['name']
                        if func_name not in unique_functions:
                            unique_functions[func_name] = {
                                'parameters': json.dumps([{'name': param['name'], 'type': param['type']} for param in func['parameters']]),
                                'return_type': func['return_type']
                            }

                    # Collect unique classes; parameters are stored as a JSON string
                    for cls in chunk_entities['classes']:
                        cls_name = cls['name']
                        if cls_name not in unique_classes:
                            unique_classes[cls_name] = {
                                'parameters': json.dumps([{'name': param['name'], 'type': param['type']} for param in cls['parameters']])
                            }

                # Store unique entities in the graph

                # Create and link unique import nodes
                for module, entities in unique_imports.items():
                    self.kg.query("""
                        MERGE (import:Import {module: $module})
                        SET import.entities = $entities
                        WITH import
                        MATCH (file:File {path: $file_path})
                        MERGE (file)-[:CALLS]->(import)
                    """, {
                        'module': module,
                        # sets are converted to lists before being sent as a query parameter
                        'entities': list(entities),
                        'file_path': file
                    })

                # Create and link unique function nodes
                for name, func in unique_functions.items():
                    self.kg.query("""
                        MERGE (function:Function {name: $name})
                        SET function.parameters = $parameters, function.return_type = $return_type
                        WITH function
                        MATCH (file:File {path: $file_path})
                        MERGE (file)-[:DEFINES]->(function)
                    """, {
                        'name': name,
                        'parameters': func['parameters'],
                        'return_type': func['return_type'],
                        'file_path': file
                    })

                # Create and link unique class nodes
                for name, cls in unique_classes.items():
                    self.kg.query("""
                        MERGE (class:Class {name: $name})
                        SET class.parameters = $parameters
                        WITH class
                        MATCH (file:File {path: $file_path})
                        MERGE (file)-[:DEFINES]->(class)
                    """, {
                        'name': name,
                        'parameters': cls['parameters'],
                        'file_path': file
                    })

                # Link all chunks in sequence using APOC's `NEXT` relationship
                self.kg.query("""
                    MATCH (file:File {path: $file_path})-[:CONTAINS]->(chunk:Chunk)
                    WITH chunk ORDER BY chunk.id ASC
                    WITH collect(chunk) AS chunks
                    CALL apoc.nodes.link(chunks, 'NEXT')
                    RETURN count(*)
                """, {
                    'file_path': file
                })

    def _find_files_without_summaries(self):
        """
        Find all files in the graph that do not have summaries.

        Returns:
            List[str]: A list of file paths that do not have summaries.
        """
        query = """
        MATCH (file:File)
        WHERE file.summary IS NULL
        RETURN file.path AS file_path
        """
        result = self.kg.query(query)
        return [record['file_path'] for record in result]

    def _find_directories_without_summaries(self):
        """
        Find all directories in the graph that do not have summaries.

        Returns:
            List[str]: A list of directory paths that do not have summaries.
        """
        query = """
        MATCH (dir:Directory)
        WHERE dir.summary IS NULL
        RETURN dir.path AS dir_path
        """
        result = self.kg.query(query)
        return [record['dir_path'] for record in result]
    
    def _find_nodes_without_embeddings(self):
        """
        Find all files and directories in the graph that have summaries but do not have embeddings.

        Returns:
            List[dict]: A list of dictionaries containing the node type ('File' or 'Directory') and the path.
        """
        query = """
        MATCH (n)
        WHERE n.summary IS NOT NULL AND n.summary_embedding IS NULL AND (n:File OR n:Directory)
        RETURN labels(n) AS node_type, n.path AS node_path
        """
        result = self.kg.query(query)
        return [{'node_type': record['node_type'][0], 'node_path': record['node_path']} for record in result]
    
    
    def _summarize_file_from_chunks(self, file_path):
        """
        Build a file-level summary from the summaries of the file's chunks.

        The chunk summaries are fetched in chunk-id order, prefixed with a
        'Chunk summaries' header entry, merged by `summarize_list_of_summaries`,
        and the result is written to the `summary` property of the `File` node.

        Args:
            file_path (str): The path to the file to summarize.

        Returns:
            The value returned by `summarize_list_of_summaries` (the file summary).
        """
        # Fetch the per-chunk summaries in document order
        rows = self.kg.query("""
        MATCH (file:File {path: $file_path})-[:CONTAINS]->(chunk:Chunk)
        RETURN chunk.summary AS chunk_summary
        ORDER BY chunk.id ASC
        """, {'file_path': file_path})

        # The leading header entry labels the context for the model
        summaries = ['Chunk summaries']
        summaries.extend(row['chunk_summary'] for row in rows)

        file_summary = summarize_list_of_summaries(summaries)

        # Persist the result on the File node
        self.kg.query("""
            MATCH (file:File {path: $file_path})
            SET file.summary = $file_summary
        """, {
            'file_path': file_path,
            'file_summary': file_summary
        })

        return file_summary
    
    def _summarize_directory(self, directory_path):
        """
        Summarize a directory based on the summaries of its files and subdirectories.

        Subdirectories that already carry a summary are reused as-is; only
        subdirectories without one are summarized recursively. File summaries
        that are missing (None or empty) are left out of the context.

        Args:
            directory_path (str): The path to the directory to summarize.

        Returns:
            The value returned by `summarize_list_of_summaries` (the directory
            summary), which is also stored on the `Directory` node.
        """
        # Query to get summaries of all files directly contained in the directory
        file_query = """
        MATCH (dir:Directory {path: $directory_path})-[:CONTAINS]->(file:File)
        RETURN file.summary AS file_summary
        """
        file_result = self.kg.query(file_query, {'directory_path': directory_path})
        # Files without a summary would otherwise put None into the joined context
        file_summaries = [record['file_summary'] for record in file_result if record['file_summary']]

        # Query to get all subdirectories directly contained in the directory,
        # together with any summary they already have
        subdir_query = """
        MATCH (dir:Directory {path: $directory_path})-[:CONTAINS]->(subdir:Directory)
        RETURN subdir.path AS subdir_path, subdir.summary AS subdir_summary
        """
        subdir_result = self.kg.query(subdir_query, {'directory_path': directory_path})

        # Recursively summarize each subdirectory if it doesn't already have a
        # summary; reusing existing ones avoids repeating the model call for a
        # whole subtree every time one of its ancestors is summarized.
        subdir_summaries = []
        for record in subdir_result:
            subdir_summary = record['subdir_summary']
            if not subdir_summary:
                subdir_summary = self._summarize_directory(record['subdir_path'])
            subdir_summaries.append(subdir_summary)

        # Combine file summaries and subdirectory summaries
        all_summaries = ['File summaries: '] + file_summaries + ['Subdirectory summaries: '] + subdir_summaries

        # Summarize the list of all summaries (files + subdirectories)
        directory_summary = summarize_list_of_summaries(all_summaries)

        # Store the directory summary in the graph under the "summary" attribute
        self.kg.query("""
            MATCH (dir:Directory {path: $directory_path})
            SET dir.summary = $directory_summary
        """, {
            'directory_path': directory_path,
            'directory_summary': directory_summary
        })

        return directory_summary

    def _automate_summarization(self):
        """
        Fill in missing summaries and embeddings across the graph.

        Runs three passes in order: summarize every file that has no summary,
        summarize every directory that has no summary, then embed every file or
        directory summary that has no embedding yet.
        """
        # Pass 1: files (their summaries feed the directory summaries)
        for path in tqdm(self._find_files_without_summaries(), desc='Summarizing files'):
            self._summarize_file_from_chunks(path)

        # Pass 2: directories, looked up only after the file pass has finished
        for path in tqdm(self._find_directories_without_summaries(), desc='Summarizing directories'):
            self._summarize_directory(path)

        # Pass 3: embeddings for the summaries written above
        self._generate_and_store_embeddings()

    def _generate_and_store_embeddings(self):
        """
        Embed the summaries of files and directories that have no embedding yet.

        For each node reported by `_find_nodes_without_embeddings`, the summary
        is read back from the graph, embedded with `get_embedding`, and stored
        in the node's `summary_embedding` property.
        """
        pending = self._find_nodes_without_embeddings()

        for entry in tqdm(pending, desc='Creating File and Directory embeddings'):
            label = entry['node_type']
            path = entry['node_path']

            # Labels cannot be passed as query parameters, hence the f-string;
            # the path itself is still sent as a parameter.
            fetch_query = f"""
            MATCH (n:{label} {{path: $node_path}})
            RETURN n.summary AS summary
            """
            rows = self.kg.query(fetch_query, {'node_path': path})

            vector = get_embedding(rows[0]['summary'])

            store_query = f"""
            MATCH (n:{label} {{path: $node_path}})
            SET n.summary_embedding = $embedding
            """
            self.kg.query(store_query, {
                'node_path': path,
                'embedding': vector
            })

    def _create_vector_index(self, label, property_name="summary_embeddings", index_name=None, dimensions=1536):
        """
        Create a vector index for the specified label if it does not already exist.

        Args:
            label (str): The label of the nodes (e.g., 'File', 'Directory', 'Chunk').
            property_name (str): The property name on which the vector index is created. Default is 'summary_embeddings'.
            index_name (str): The name of the index. If None, it will default to 'labelVectorIndex'.
            dimensions (int): The dimensionality of the vectors. Default is 1536.
        """
        if not index_name:
            index_name = f"{label.lower()}VectorIndex"

        query = f"""
        CREATE VECTOR INDEX {index_name} IF NOT EXISTS
        FOR (n:{label})
        ON n.{property_name}
        OPTIONS {{
            indexConfig: {{
                `vector.dimensions`: {dimensions},
                `vector.similarity_function`: 'cosine'
            }}
        }}
        """
        try:
            self.kg.query(query)
            print(f"Vector index {index_name} for label {label} created successfully.")
        except Exception as e:
            print(f"An error occurred while creating the vector index: {e}")

    def _create_all_vector_indexes(self):
        """
        Create vector indexes for chunks, files, and directories. The indexes are separated for chunks and summaries.
        """
        # Create index for chunks
        self._create_vector_index(label="Chunk", property_name="chunk_embeddings", index_name="chunkRawVectorIndex")
        self._create_vector_index(label="Chunk", property_name="summary_embeddings", index_name="chunkSummaryVectorIndex")

        # Create index for files and directories
        self._create_vector_index(label="File", property_name="summary_embeddings", index_name="fileSummaryVectorIndex")
        self._create_vector_index(label="Directory", property_name="summary_embeddings", index_name="dirSummaryVectorIndex")




    def create_graph(self):
        """
        Run the whole graph-building pipeline in order.

        Loads directory and file nodes, creates chunks and code entities,
        summarizes files and directories, stores summary embeddings, and
        finally creates the vector indexes.
        """

        self._load_dirs_and_files_to_graph()

        self._enrich_graph()

        self._automate_summarization()

        # _automate_summarization already ends with this call; running it again
        # only processes nodes that still have a summary but no embedding.
        self._generate_and_store_embeddings()

        self._create_all_vector_indexes()



#### Refactor since class has gotten huge

In [None]:
def read_file_contents(file_path):
    """
    Read a file as UTF-8 text.

    Args:
        file_path (str): The path to the file to be read.

    Returns:
        str | None: The file's text, or None when the file could not be opened
        or decoded (the error is printed, not raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as error:
        # Best effort: report the problem and let the caller skip this file.
        print(f"An error occurred while reading the file: {error}")
    return None

def should_skip_file(file_path):
    """
    Determine if a file should be skipped based on its type, location or size.

    A file is skipped when its extension is in the skip list, when any
    component of its path is one of the skipped directory names, when it is
    larger than 5 MB, or when its size cannot be read at all.

    Args:
        file_path (str): The path to the file.

    Returns:
        bool: True if the file should be skipped, False otherwise.
    """
    # File types that are not worth chunking (binary, lock or data/docs files)
    skip_extensions = {
        '.lock', '.png', '.jpg', '.gif', '.pdf', '.zip', '.class', '.o', '.out',
        '.md', '.rst', '.csv', '.tsv'
    }

    # Directory names whose contents are never processed
    skip_directories = {'node_modules', '.git', '.svn'}

    _, ext = os.path.splitext(file_path)
    if ext.lower() in skip_extensions:
        return True

    # Compare whole path components rather than substrings, so that names such
    # as ".github" or ".gitignore" are not mistaken for ".git".
    components = os.path.normpath(file_path).split(os.sep)
    if not skip_directories.isdisjoint(components):
        return True

    # Skip large files (> 5MB). A path that cannot be stat'ed (deleted since it
    # was recorded, broken symlink) cannot be processed either, so it is
    # skipped instead of aborting the caller's loop.
    try:
        return os.path.getsize(file_path) > 5 * 1024 * 1024
    except OSError:
        return True

class FileSystemProcessor:
    """Mirrors a directory tree into a graph as `Directory` and `File` nodes."""

    def __init__(
            self,
            root_directory
    ):
        """
        Remember which directory tree to load.

        Args:
            root_directory (str): The directory to be extracted into knowledge.
        """

        self.root_directory = root_directory

    def _get_file_info(self, file_path):
        """
        Collect basic metadata for a file from a single `os.stat` call.

        Args:
            file_path (str): The path to the file.

        Returns:
            dict: Keys "type" (extension without the leading dot, "" if none),
            "size" (bytes), "last_modified" and "created" (ISO-8601 strings in
            local time).
        """
        stats = os.stat(file_path)
        file_type = os.path.splitext(file_path)[1][1:]  # Get file extension without the dot

        # NOTE: "created" is derived from st_ctime, which is the creation time
        # on Windows but the last metadata change on Unix.
        return {
            "type": file_type,
            "size": stats.st_size,
            "last_modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
            "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
        }

    def load_dirs_and_files_to_graph(self, kg):
        """
        Traverse a directory and create a graph in Neo4j representing the directory structure and file information.

        For every directory visited by `os.walk` this MERGEs a `Directory`
        node, MERGEs a `Directory` node for each subdirectory and a `File` node
        for each file, and connects them to their parent with `CONTAINS`
        relationships. Timestamps and file metadata are only written when a
        node is first created (`ON CREATE SET`).

        Files whose metadata cannot be read (e.g. broken symlinks) are reported
        and left out of the graph instead of aborting the walk.

        Args:
            kg (Neo4jGraph): graph object to complete cypher queries
        """
        print("Creating file and dir nodes from Walk")
        for root, dirs, files in os.walk(self.root_directory):

            root_stats = os.stat(root)

            # Create node for the directory. normpath strips a trailing
            # separator so a root given as "project/" is named "project"
            # rather than "".
            kg.query(
                """
                MERGE (dir:Directory {name: $dir_name, path: $path})
                ON CREATE SET dir.created = $created, dir.last_modified = $last_modified
                """,
                {
                    'dir_name': os.path.basename(os.path.normpath(root)),
                    'path': root,
                    'created': datetime.fromtimestamp(root_stats.st_ctime).isoformat(),
                    'last_modified': datetime.fromtimestamp(root_stats.st_mtime).isoformat(),
                }
            )

            # Create nodes for subdirectories and attach them to the current directory
            for subdir_name in dirs:
                subdir_path = os.path.join(root, subdir_name)
                subdir_stats = os.stat(subdir_path)
                kg.query(
                    """
                        MERGE (subdir:Directory {name: $dir_name, path: $subdir_path})
                        ON CREATE SET subdir.created = $created, subdir.last_modified = $last_modified
                        WITH subdir
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(subdir)
                    """,
                    {
                        'dir_name': subdir_name,
                        'subdir_path': subdir_path,
                        'parent_path': root,
                        'created': datetime.fromtimestamp(subdir_stats.st_ctime).isoformat(),
                        'last_modified': datetime.fromtimestamp(subdir_stats.st_mtime).isoformat(),
                    }
                )

            # Create nodes for files and attach them to the current directory
            for file_name in files:
                file_path = os.path.join(root, file_name)
                try:
                    file_info = self._get_file_info(file_path)
                except OSError as error:
                    # os.walk lists broken symlinks as files; stat'ing them fails.
                    print(f"Skipping {file_path}: {error}")
                    continue
                kg.query(
                    """
                        MERGE (file:File {name: $file_name, path: $file_path})
                        ON CREATE SET file.type = $type, file.size = $size, file.last_modified = $last_modified, file.created = $created
                        WITH file
                        MATCH (parent:Directory {path: $parent_path})
                        MERGE (parent)-[:CONTAINS]->(file)
                    """,
                    {
                        'file_name': file_name,
                        'file_path': file_path,
                        'type': file_info['type'],
                        'size': file_info['size'],
                        'last_modified': file_info['last_modified'],
                        'created': file_info['created'],
                        'parent_path': root
                    }
                )

class GraphBuilder:
    """
    Enrich an existing File/Directory graph with code chunks, code entities and vector indexes.

    All graph access goes through ``self.kg.query(cypher, params)``.
    """

    def __init__(
            self, 
            kg,
            chunk_size=500,
            chunk_overlap=25
    ):
        """
        Initialize the GraphBuilder with an existing graph connection.

        Args:
            kg (Neo4jGraph): graph object to complete cypher queries
            chunk_size (int): chunk size passed to the text splitter
                (NOTE(review): unit depends on the splitter's length function - confirm)
            chunk_overlap (int): overlap between consecutive chunks, passed to the text splitter
        """
        self.kg = kg
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def enrich_graph(self):
        """
        Enriches the knowledge graph by processing files, creating and linking code chunks, and extracting unique code entities.

        For every File node that has no Chunk yet (and is not rejected by
        ``should_skip_file``) this method:

        1. splits the file contents into chunks,
        2. stores each chunk with its summary and both embeddings,
        3. collects the imports, functions and classes found in the chunks,
        4. stores those entities and links them to the file,
        5. chains the file's chunks with ``NEXT`` relationships.
        """
        text_splitter = PythonCodeTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

        # Query to find files without chunk nodes
        query = """
        MATCH (file:File)
        WHERE NOT (file)-[:CONTAINS]->(:Chunk)
        RETURN file.path AS file_path
        """

        result = self.kg.query(query)
        file_paths = [record['file_path'] for record in result]

        file_paths = [file for file in file_paths if not should_skip_file(file)]

        for file in tqdm(file_paths, desc='Creating chunks from files'):
            file_contents = read_file_contents(file)

            # Files whose contents could not be read are left without chunks
            if file_contents is None:
                continue

            chunks = text_splitter.split_text(file_contents)

            # Per-file collections; the first definition seen for a name wins
            unique_imports = {}
            unique_functions = {}
            unique_classes = {}

            for idx, chunk in enumerate(chunks):
                # Zero-padded index keeps lexicographic order equal to chunk order,
                # which the NEXT-linking query below relies on.
                chunk_id = f"{file}_chunk_{idx:06d}"
                chunk_summary = summarize_file_chunk(chunk_text=chunk, file_name=file)
                summary_embedding = get_embedding(chunk_summary)
                chunk_embedding = get_embedding(chunk)

                # Create the chunk node and link it to the file
                self.kg.query("""
                    MERGE (chunk:Chunk {id: $chunk_id})
                    SET chunk.raw_code = $raw_code, 
                        chunk.summary = $summary, 
                        chunk.summary_embedding = $summary_embedding, 
                        chunk.chunk_embedding = $chunk_embedding
                    WITH chunk
                    MATCH (file:File {path: $file_path})
                    MERGE (file)-[:CONTAINS]->(chunk)
                """, {
                    'chunk_id': chunk_id,
                    'raw_code': chunk,
                    'summary': chunk_summary,
                    'summary_embedding': summary_embedding,
                    'chunk_embedding': chunk_embedding,
                    'file_path': file
                })

                # Extract and process code entities for each chunk
                chunk_entities = extract_code_entities(chunk)

                # Collect unique imports (entities of the same module are unioned)
                for imp in chunk_entities['imports']:
                    unique_imports.setdefault(imp['module'], set()).update(imp['entities'])

                # Collect unique functions
                for func in chunk_entities['functions']:
                    func_name = func['name']
                    if func_name not in unique_functions:
                        unique_functions[func_name] = {
                            'parameters': json.dumps([{'name': param['name'], 'type': param['type']} for param in func['parameters']]),
                            'return_type': func['return_type']
                        }

                # Collect unique classes
                for cls in chunk_entities['classes']:
                    cls_name = cls['name']
                    if cls_name not in unique_classes:
                        unique_classes[cls_name] = {
                            'parameters': json.dumps([{'name': param['name'], 'type': param['type']} for param in cls['parameters']])
                        }

            # Store unique entities in the graph.
            # NOTE: Import/Function/Class nodes are merged on module/name only, so
            # nodes are shared between files and SET overwrites earlier properties.

            # Create and link unique import nodes
            for module, entities in unique_imports.items():
                self.kg.query("""
                    MERGE (import:Import {module: $module})
                    SET import.entities = $entities
                    WITH import
                    MATCH (file:File {path: $file_path})
                    MERGE (file)-[:CALLS]->(import)
                """, {
                    'module': module,
                    'entities': list(entities),
                    'file_path': file
                })

            # Create and link unique function nodes
            for name, func in unique_functions.items():
                self.kg.query("""
                    MERGE (function:Function {name: $name})
                    SET function.parameters = $parameters, function.return_type = $return_type
                    WITH function
                    MATCH (file:File {path: $file_path})
                    MERGE (file)-[:DEFINES]->(function)
                """, {
                    'name': name,
                    'parameters': func['parameters'],
                    'return_type': func['return_type'],
                    'file_path': file
                })

            # Create and link unique class nodes
            for name, cls in unique_classes.items():
                self.kg.query("""
                    MERGE (class:Class {name: $name})
                    SET class.parameters = $parameters
                    WITH class
                    MATCH (file:File {path: $file_path})
                    MERGE (file)-[:DEFINES]->(class)
                """, {
                    'name': name,
                    'parameters': cls['parameters'],
                    'file_path': file
                })

            # Link all chunks in sequence using APOC's `NEXT` relationship
            self.kg.query("""
                MATCH (file:File {path: $file_path})-[:CONTAINS]->(chunk:Chunk)
                WITH chunk ORDER BY chunk.id ASC
                WITH collect(chunk) AS chunks
                CALL apoc.nodes.link(chunks, 'NEXT')
                RETURN count(*)
            """, {
                'file_path': file
            })

    def _create_vector_index(self, label, property_name="summary_embeddings", index_name=None, dimensions=1536):
        """
        Create a vector index for the specified label if it does not already exist.

        Errors raised by the graph are caught and printed; nothing is re-raised.

        Args:
            label (str): The label of the nodes (e.g., 'File', 'Directory', 'Chunk').
            property_name (str): The property name on which the vector index is created.
                Default is 'summary_embeddings' (kept for backward compatibility).
                NOTE: the embeddings written by ``enrich_graph`` are stored under
                'summary_embedding' and 'chunk_embedding' (singular), so callers
                should pass the property name explicitly.
            index_name (str): The name of the index. If None, it will default to 'labelVectorIndex'.
            dimensions (int): The dimensionality of the vectors. Default is 1536.
        """
        if not index_name:
            index_name = f"{label.lower()}VectorIndex"

        # Label, property and index name cannot be passed as Cypher parameters,
        # so they are interpolated; only pass trusted identifiers here.
        query = f"""
        CREATE VECTOR INDEX {index_name} IF NOT EXISTS
        FOR (n:{label})
        ON n.{property_name}
        OPTIONS {{
            indexConfig: {{
                `vector.dimensions`: {dimensions},
                `vector.similarity_function`: 'cosine'
            }}
        }}
        """
        try:
            self.kg.query(query)
            print(f"Vector index {index_name} for label {label} created successfully.")
        except Exception as e:
            print(f"An error occurred while creating the vector index: {e}")

    def create_all_vector_indexes(self):
        """
        Create vector indexes for chunks, files, and directories. The indexes are separated for chunks and summaries.

        The indexed properties match the ones actually written to the graph:
        ``chunk_embedding`` and ``summary_embedding``.

        NOTE(review): because of ``IF NOT EXISTS``, an index with the same name that
        was previously created on the old (plural) property presumably has to be
        dropped manually before this takes effect - confirm against the database.
        """
        # Create index for chunks
        self._create_vector_index(label="Chunk", property_name="chunk_embedding", index_name="chunkRawVectorIndex")
        self._create_vector_index(label="Chunk", property_name="summary_embedding", index_name="chunkSummaryVectorIndex")

        # Create index for files and directories
        self._create_vector_index(label="File", property_name="summary_embedding", index_name="fileSummaryVectorIndex")
        self._create_vector_index(label="Directory", property_name="summary_embedding", index_name="dirSummaryVectorIndex")

class SummaryManager:
    """
    Create summaries and summary embeddings for File and Directory nodes.

    All graph access goes through ``self.kg.query(cypher, params)``.
    """

    def __init__(
            self, 
            kg
    ):
        """
        Initialize the SummaryManager with an existing graph connection.

        Args:
            kg (Neo4jGraph): graph object to complete cypher queries
        """
        self.kg = kg
    
    def _find_files_without_summaries(self):
        """
        Find all files in the graph that do not have summaries.

        Returns:
            List[str]: A list of file paths that do not have summaries.
        """
        query = """
        MATCH (file:File)
        WHERE file.summary IS NULL
        RETURN file.path AS file_path
        """
        result = self.kg.query(query)
        return [record['file_path'] for record in result]

    def _find_directories_without_summaries(self):
        """
        Find all directories in the graph that do not have summaries.

        Returns:
            List[str]: A list of directory paths that do not have summaries.
        """
        query = """
        MATCH (dir:Directory)
        WHERE dir.summary IS NULL
        RETURN dir.path AS dir_path
        """
        result = self.kg.query(query)
        return [record['dir_path'] for record in result]

    @staticmethod
    def _pick_node_type(labels):
        """
        Pick 'File' or 'Directory' from a node's label list.

        A node may carry additional labels and their order is not guaranteed, so
        the first label is only used as a fallback.

        Args:
            labels (List[str]): The labels of a node.

        Returns:
            str: 'File' or 'Directory' when present, otherwise the first label.
        """
        for label in labels:
            if label in ('File', 'Directory'):
                return label
        return labels[0]
    
    def _find_nodes_without_embeddings(self):
        """
        Find all files and directories in the graph that have summaries but do not have embeddings.

        Returns:
            List[dict]: A list of dictionaries containing the node type ('File' or 'Directory') and the path.
        """
        query = """
        MATCH (n)
        WHERE n.summary IS NOT NULL AND n.summary_embedding IS NULL AND (n:File OR n:Directory)
        RETURN labels(n) AS node_type, n.path AS node_path
        """
        result = self.kg.query(query)
        return [
            {'node_type': self._pick_node_type(record['node_type']), 'node_path': record['node_path']}
            for record in result
        ]
    
    
    def _summarize_file_from_chunks(self, file_path):
        """
        Summarize a file based on the summaries of its chunks.

        The summary is also stored on the File node.

        Args:
            file_path (str): The path to the file to summarize.

        Returns:
            str: The summary of the file.
        """
        # Query to get summaries of all chunks associated with the file
        query = """
        MATCH (file:File {path: $file_path})-[:CONTAINS]->(chunk:Chunk)
        RETURN chunk.summary AS chunk_summary
        ORDER BY chunk.id ASC
        """
        result = self.kg.query(query, {'file_path': file_path})
        chunk_summaries = ['Chunk summaries'] + [record['chunk_summary'] for record in result]

        # Summarize the list of chunk summaries
        file_summary = summarize_list_of_summaries(chunk_summaries)

        # Store the file summary in the graph under the "summary" attribute
        self.kg.query("""
            MATCH (file:File {path: $file_path})
            SET file.summary = $file_summary
        """, {
            'file_path': file_path,
            'file_summary': file_summary
        })

        return file_summary
    
    def _summarize_directory(self, directory_path):
        """
        Summarize a directory based on the summaries of its files and subdirectories.

        Subdirectories that already have a stored summary are reused; only the
        ones without a summary are summarized recursively. The resulting summary
        is stored on the Directory node.

        Args:
            directory_path (str): The path to the directory to summarize.

        Returns:
            str: The summary of the directory.
        """
        # Query to get summaries of all files directly contained in the directory
        file_query = """
        MATCH (dir:Directory {path: $directory_path})-[:CONTAINS]->(file:File)
        RETURN file.summary AS file_summary
        """
        file_result = self.kg.query(file_query, {'directory_path': directory_path})
        # Files that have not been summarized yet contribute nothing
        file_summaries = [
            record['file_summary'] for record in file_result
            if record['file_summary'] is not None
        ]

        # Query to get all subdirectories directly contained in the directory,
        # together with any summary they already have
        subdir_query = """
        MATCH (dir:Directory {path: $directory_path})-[:CONTAINS]->(subdir:Directory)
        RETURN subdir.path AS subdir_path, subdir.summary AS subdir_summary
        """
        subdir_result = self.kg.query(subdir_query, {'directory_path': directory_path})

        # Recursively summarize each subdirectory if it doesn't already have a summary
        subdir_summaries = []
        for record in subdir_result:
            subdir_summary = record.get('subdir_summary')
            if subdir_summary is None:
                subdir_summary = self._summarize_directory(record['subdir_path'])
            subdir_summaries.append(subdir_summary)

        # Combine file summaries and subdirectory summaries
        all_summaries = ['File summaries: '] + file_summaries + ['Subdirectory summaries: ']  + subdir_summaries

        # Summarize the list of all summaries (files + subdirectories)
        directory_summary = summarize_list_of_summaries(all_summaries)

        # Store the directory summary in the graph under the "summary" attribute
        self.kg.query("""
            MATCH (dir:Directory {path: $directory_path})
            SET dir.summary = $directory_summary
        """, {
            'directory_path': directory_path,
            'directory_summary': directory_summary
        })

        return directory_summary

    def _generate_and_store_embeddings(self):
        """
        Generate embeddings for files and directories that have summaries but lack embeddings.
        """
        nodes_without_embeddings = self._find_nodes_without_embeddings()

        for node in tqdm(nodes_without_embeddings, desc='Creating File and Directory embeddings'):
            # Retrieve the summary of the node (labels cannot be query parameters,
            # so the node type is interpolated into the query text)
            query = f"""
            MATCH (n:{node['node_type']} {{path: $node_path}})
            RETURN n.summary AS summary
            """
            result = self.kg.query(query, {'node_path': node['node_path']})
            summary = result[0]['summary']

            # Generate the embedding for the summary
            embedding = get_embedding(summary)

            # Store the embedding back in the graph
            query = f"""
            MATCH (n:{node['node_type']} {{path: $node_path}})
            SET n.summary_embedding = $embedding
            """
            self.kg.query(query, {
                'node_path': node['node_path'],
                'embedding': embedding
            })

    def automate_summarization(self):
        """
        Automate the summarization process for files and directories in the graph.

        Files are summarized first, then directories, then embeddings are created
        for every node that has a summary but no embedding.
        """
        # Summarize files without summaries
        files_without_summaries = self._find_files_without_summaries()
        for file_path in tqdm(files_without_summaries, desc='Summarizing files'):
            self._summarize_file_from_chunks(file_path)

        # Summarize directories without summaries. A subdirectory's path is its
        # parent's path plus a further component, so it is always longer; going
        # longest-first summarizes children before parents, and each parent then
        # reuses the stored child summaries instead of recomputing them.
        directories_without_summaries = sorted(
            self._find_directories_without_summaries(), key=len, reverse=True
        )
        for dir_path in tqdm(directories_without_summaries, desc='Summarizing directories'):
            self._summarize_directory(dir_path)

        # Generate embeddings for nodes that have summaries but no embeddings
        self._generate_and_store_embeddings()

class CodebaseGraph:
    """
    Facade that wires together the file-system processor, graph builder and summary manager.
    """

    def __init__(
            self, 
            root_directory, 
            uri="bolt://localhost:7687", 
            user=None, 
            password=None, 
            openai_api_key=None,
            chunk_size=500,
            chunk_overlap=25
    ):
        """
        Initialize the CodebaseGraph with a connection to Neo4j.

        Args:
            root_directory (str): The directory to be extracted into knowledge.
            uri (str): The URI of the Neo4j database. Defaults to localhost.
            user (str): Username for Neo4j. If None, loads from environment variable NEO4J_USERNAME.
            password (str): Password for Neo4j. If None, loads from environment variable NEO4J_PASSWORD.
            openai_api_key (str): Key needed to access OpenAI API. If None, loads from environment variable OPENAI_API_KEY.
            chunk_size (int): chunk size forwarded to the GraphBuilder
            chunk_overlap (int): chunk overlap forwarded to the GraphBuilder

        Raises:
            ValueError: If the Neo4j credentials or the OpenAI API key are missing.
        """
        # Load environment variables
        load_dotenv()

        # Set up the Neo4j connection
        self.uri = uri
        self.NEO4J_USER = user or os.getenv("NEO4J_USERNAME")
        self.NEO4J_PASSWORD =  password or os.getenv("NEO4J_PASSWORD")
        self.OPENAI_API_KEY = openai_api_key or os.getenv("OPENAI_API_KEY")

        self.root_directory = root_directory

        if not self.NEO4J_USER or not self.NEO4J_PASSWORD:
            raise ValueError("NEO4J_USERNAME and NEO4J_PASSWORD must be provided either as arguments or environment variables.")
        
        if not self.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY must be provided either as an argument or an environment variable.")

        # NOTE(review): connect_to_neo4j() is called without uri/user/password, so
        # explicitly passed credentials are presumably not used for the connection
        # - confirm against edoc.connect.
        self.kg = connect_to_neo4j()

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        self.fs_processor = FileSystemProcessor(root_directory)
        self.graph_builder = GraphBuilder(
            self.kg, 
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        )
        self.summary_manager = SummaryManager(self.kg)

    def create_graph(self):
        """
        Build the full graph.

        Runs, in order: loading directories and files, enriching the graph with
        chunks and entities, summarizing, and creating the vector indexes.
        """
        self.fs_processor.load_dirs_and_files_to_graph(self.kg)
        self.graph_builder.enrich_graph()
        self.summary_manager.automate_summarization()
        self.graph_builder.create_all_vector_indexes()

In [None]:
# Example usage: build the complete knowledge graph for a local test project.
# The path below is machine-specific; point it at a project on your own machine.
graph_root_directory = "C:\\Users\\willd\\Documents\\Git\\graphRag\\test_project"
codebase_graph = CodebaseGraph(root_directory=graph_root_directory)
codebase_graph.create_graph()

#### We need to improve summarizations. Do this in two steps:

1. Create an ASCII visualization of the file system to be passed to the summarization components (for files and dirs)
2. Create a markdown template for the summaries, possibly splitting the file and dir components

Step (2) can be done by adapting the existing code in the `build_tools\utils.py` and `summary_tools\utils.py` files

In [3]:
import os
from edoc.gpt_helpers.gpt_basics import create_chat_completion

def _build_indented_tree(root_directory):
    """
    Render the directory tree below ``root_directory`` as an indented outline.

    Each directory is written as ``name/`` and each file as ``name``, indented
    by four spaces per nesting level. Directories and files are visited in
    sorted order so the outline is the same on every run.

    Args:
        root_directory (str): The root directory to walk.

    Returns:
        str: The outline, one entry per line, each line newline-terminated.
    """
    # abspath also strips a trailing separator, which would otherwise give the
    # root an empty basename and shift every nesting level.
    base = os.path.abspath(str(root_directory))

    lines = []
    for current, dirs, files in os.walk(base):
        # Sorting in place makes os.walk descend in a deterministic order
        dirs.sort()

        # Depth is derived from the path relative to the root rather than from
        # a string replace, which would also hit repeated path fragments.
        relative = os.path.relpath(current, base)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1

        indent = ' ' * 4 * level
        lines.append(f"{indent}{os.path.basename(current)}/")

        sub_indent = ' ' * 4 * (level + 1)
        lines.extend(f"{sub_indent}{name}" for name in sorted(files))

    return ''.join(f"{line}\n" for line in lines)


def generate_ascii_structure(root_directory, model='gpt-4o-mini'):
    """
    Generates an ASCII file structure from the root directory using OpenAI's language model.

    Args:
        root_directory (str): The root directory to summarize.
        model (str): The OpenAI model to use. Default is 'gpt-4o-mini'.

    Returns:
        str: The ASCII file structure summarized by the model.
    """
    # Step 1: Create a basic tree structure using os.walk()
    file_structure = _build_indented_tree(root_directory)

    # Step 2: Generate the prompt for the LLM
    prompt = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"I have the following file structure:\n{file_structure}\nPlease convert this into a clean and simple ASCII tree format. No need for any extra words, just the tree please."}
    ]
    
    # Step 3: Use the LLM to refine the file structure into a well-formatted ASCII tree
    ascii_tree = create_chat_completion(messages=prompt, model=model)
    
    return ascii_tree


In [None]:
# Render the ASCII tree for a local project and show it.
# The path below is machine-specific; point it at a project on your own machine.
tree_root_directory = "C:\\Users\\willd\\Documents\\Git\\resume_website\\react-resume-template"
ascii_structure_as_str = generate_ascii_structure(root_directory=tree_root_directory)
print(ascii_structure_as_str)