In [1]:
%pip install langchain_community langchain-openai python-dotenv tqdm



Note: you may need to restart the kernel to use updated packages.


In [2]:
# setup
import os

import os
import subprocess
from datetime import datetime

def get_git_root(repo_directory):
    """
    Ask Git for the top-level directory of the repository containing a path.

    Args:
        repo_directory (str): Any directory inside the Git working tree.

    Returns:
        str: The repository root as printed by ``git rev-parse --show-toplevel``
        (surrounding whitespace stripped), or None when the command fails or
        cannot be run.
    """
    command = ['git', '-C', repo_directory, 'rev-parse', '--show-toplevel']
    try:
        completed = subprocess.run(
            command,
            check=True,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        return completed.stdout.strip()
    except subprocess.CalledProcessError:
        # Non-zero exit status from git: the path is not inside a working tree.
        print(f"Directory {repo_directory} is not a Git repository.")
    except Exception as e:
        # Anything else, e.g. the git executable could not be started.
        print(f"Error determining Git root for {repo_directory}: {e}")
    return None

# Path to the Flutter/Dart project directory.
# Set TRADEWORK_REPO_DIR in the process environment to point at another checkout;
# the original hard-coded location remains the default. (The .env file is loaded
# further down in this cell, so it cannot supply this variable.)
repo_directory = os.environ.get(
    "TRADEWORK_REPO_DIR",
    "/Volumes/Mac-External/Development/Tradework/tradework_platform",
)

# Report whether the directory exists (informational only; the Git check below decides).
if os.path.exists(repo_directory):
    print(f"Directory exists: {repo_directory}")
else:
    print(f"Directory does not exist: {repo_directory}")

# Determine the Git root; every later cell needs it, so stop here if it is missing.
git_root = get_git_root(repo_directory)
if git_root:
    print(f"Git repository root: {git_root}")
else:
    # exit() is an interactive-shell helper; raising stops this cell (and a
    # "Run All") with a visible error instead.
    raise RuntimeError("The specified directory is not within a Git repository.")

import os
from dotenv import load_dotenv

# Load the .env file.
# NOTE(review): presumably this supplies the OpenAI API key used by ChatOpenAI
# in a later cell — confirm which variables the .env file is expected to hold.
load_dotenv()

Directory exists: /Volumes/Mac-External/Development/Tradework/tradework_platform
Git repository root: /Volumes/Mac-External/Development/Tradework/tradework_platform


True

In [3]:

# Maps a file extension to the token that opens a comment in that file type.
# Only the opening token is stored, so the block styles (HTML/XML '<!--', CSS '/*')
# have no matching closer here.
COMMENT_STYLES = {
    # Line comments; JSON has no comment syntax, '//' is used for this purpose anyway.
    **dict.fromkeys(('.dart', '.json', '.js', '.ts', '.cpp', '.java'), '//'),
    # Hash comments.
    **dict.fromkeys(('.py', '.yaml', '.yml'), '#'),
    # Markup comments.
    **dict.fromkeys(('.html', '.xml'), '<!--'),
    '.css': '/*',
    '.md': '```',  # Markdown: code-fence marker (if needed)
}

def get_comment_style(file_path):
    """Return the comment token for a file, chosen by its extension ('#' when unknown)."""
    extension = os.path.splitext(file_path)[1]
    if extension in COMMENT_STYLES:
        return COMMENT_STYLES[extension]
    return '#'  # Fallback for extensions that are not in the table


In [4]:
# Prompt template used to generate the metadata header for one file.
# Placeholders: app_context, file_name, module_name, date_created, last_updated,
# existing_comments and code_content. The file content and the platform context
# must appear in the template, otherwise the model is asked to describe a file
# it never sees.
metadata_prompt = """
You are an AI assistant helping with code documentation for the TradeWork platform. TradeWork is a UK-based property management platform built with Firebase as the backend and Flutter as the Web App frontend. The platform connects landlords, developers, contractors, and subcontractors, supporting project and tender management, vendor selection, and payment processing.

Platform context:
{app_context}

The following information is provided to you:
- File: {file_name}
- Module: {module_name}
- Date Created: {date_created}
- Last Updated: {last_updated}
- Existing Comments: {existing_comments}

File content:
{code_content}

Based on the file content and the provided information, generate a detailed metadata block. If any field cannot be determined from the file content, infer the best possible answer based on the context of the platform.

The format should be:

# File: {file_name}
# Module: {module_name}
# Description: A brief description of what the file does based on its content.
# Dependencies: Any dependencies or related files (e.g., Firebase, chat, payment integration, AI chatbot).
# Components: The major components or classes defined in the file, such as widgets, BLoCs, models.
# Role: Whether the file is role-specific (e.g., Landlord, Developer, Contractor, Subcontractor).
# Author: Piers
# Date Created: {date_created}
# Last Updated: {last_updated}
# Related Files: Any related files (BLoC, models, services, etc.).
# Key: Keywords like bloc, widget, model, firebase, payment, chat, etc.

Return only the metadata block in exactly this format, with no code fences or additional commentary.
"""

In [5]:
# Static description of the TradeWork platform and its folder layout.
# It is passed to process_repository/process_file as `app_context`; process_file
# appends the contents of the core files to it before invoking the chain.
app_context = """
TradeWork is a UK-based platform that aggregates resources and professionals within the property market. It allows landlords, developers, and contractors to manage projects, jobs, and tasks. The platform is designed to streamline property-related workflows by providing access to a wide network of skilled professionals and data resources.

Key details:
- Backend: Firebase (authentication, Firestore, functions for backend logic).
- Frontend: Flutter Web App.
- Role-based workflows: Landlord, Developer, Contractor, Subcontractor.
- Core functionalities: Project creation, tender management, vendor selection, chat, peer ratings, invoice management, payment integration, and AI assistance.

Folder structure:
1. Core Shared Components (twcore/):
   - application/: Manages common logic and state using BLoCs (e.g., authentication, chat).
   - models/: Core data models (e.g., user, auth, company) shared across all users.
   - services/: Shared services for external integrations, AI, payment, and utilities.
   - widgets/: Reusable UI components used throughout the app.

2. Shared Features (shared_features/):
   - Shared features like AI, project management, and document viewing follow the structure:
     - application/: Contains BLoCs managing feature-specific state.
     - models/: Feature-specific models.
     - ui/: Desktop and mobile-specific UI components.
     - services/: Services specific to each feature.
     - widgets/: Reusable widgets for feature-related tasks.

3. User-Specific Features (users/):
   - Each user type (e.g., landlord, contractor, developer) has a similar internal structure:
     - Top-Level: Contains user-specific application/, models/, ui/, services/, and widgets/ to handle the overall user interface and core tasks.
     - Features: Within each user, specific features (e.g., property management for landlords or bid management for subcontractors) follow the same folder structure:
       - application/: BLoCs for managing state related to the feature.
       - models/: Feature-specific models.
       - ui/: Desktop/mobile-specific UI components for the feature.
       - services/: Services for handling feature-specific logic.
       - widgets/: Reusable widgets for feature-related tasks.
"""

# Helper Functions

In [6]:
from datetime import datetime

def has_metadata(file_path, comment_style, required_keys=None):
    """
    Checks whether the top of a file already contains a complete metadata block.

    The leading lines of the file are scanned; each line has the comment prefix
    and surrounding whitespace removed and is then compared against the
    required keys. The file counts as documented only when every required key
    has been seen, so a partial header (for example just a ``# File:`` line) is
    reported as missing and will be regenerated.

    Args:
        file_path (str): Absolute path to the file.
        comment_style (str): The comment prefix based on file type (e.g., '//', '#').
        required_keys (list, optional): Metadata keys that must all be present.
            Defaults to the eleven keys of the standard header.

    Returns:
        bool: True if all required metadata keys are found, False otherwise
        (including when the file cannot be read; the error is logged).
    """
    if required_keys is None:
        required_keys = [
            "# File:",
            "# Module:",
            "# Description:",
            "# Dependencies:",
            "# Components:",
            "# Role:",
            "# Author:",
            "# Date Created:",
            "# Last Updated:",
            "# Related Files:",
            "# Key:"
        ]

    # Scan twice as many lines as there are keys so that a few extra header
    # lines (blank comment lines, a stray code fence) do not hide a full block.
    max_lines = 2 * len(required_keys)
    missing = set(required_keys)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for _ in range(max_lines):
                line = file.readline()
                if not line:
                    break  # Reached EOF
                # Remove comment prefix and leading/trailing whitespace
                stripped_line = line.strip().lstrip(comment_style).strip()
                # Tick off every key this line satisfies
                missing -= {key for key in missing if stripped_line.startswith(key)}
                if not missing:
                    return True
        return False
    except Exception as e:
        logging.error(f"Error checking metadata in file {file_path}: {e}")
        return False
def detect_folder_context(file_path):
    """
    Classify a file by which top-level area name appears in its path.

    The markers are tested in order as plain, case-sensitive substrings of
    ``file_path``; the first one that occurs determines the label.

    Returns:
        str: "Core Shared Components", "Shared Features",
        "User-Specific Features", or "Unknown" when no marker occurs.
    """
    labelled_markers = (
        ('twcore', "Core Shared Components"),
        ('shared_features', "Shared Features"),
        ('users', "User-Specific Features"),
    )
    for marker, label in labelled_markers:
        if marker in file_path:
            return label
    return "Unknown"

def get_comment_style(file_path):
    """
    Return the comment token for a file, chosen by its extension.

    The lookup is case-sensitive and falls back to '#' for unknown extensions.
    This definition carries its own extension table.
    """
    prefix_by_extension = dict.fromkeys(
        ('.dart', '.json', '.js', '.ts', '.cpp', '.java'), '//'
    )
    prefix_by_extension.update(dict.fromkeys(('.py', '.yaml', '.yml'), '#'))
    prefix_by_extension.update(dict.fromkeys(('.html', '.xml'), '<!--'))
    prefix_by_extension['.css'] = '/*'
    prefix_by_extension['.md'] = '```'

    extension = os.path.splitext(file_path)[1]
    return prefix_by_extension.get(extension, '#')  # '#' when the extension is unknown

def extract_imports(file_content):
    """Return, unmodified, every line whose first non-blank text begins with 'import'."""
    return [
        source_line
        for source_line in file_content.splitlines()
        if source_line.lstrip().startswith('import')
    ]

def extract_top_comments(file_content, comment_style):
    """
    Split a file into its leading comment lines and everything after them.

    Lines are taken from the top for as long as their stripped text starts with
    ``comment_style``; the first line that does not ends the header.

    Returns:
        tuple: (header, remainder). ``header`` is the stripped comment lines
        joined with newlines; ``remainder`` is the untouched remaining lines
        joined with newlines (so a trailing newline in the input is not kept).
    """
    all_lines = file_content.splitlines()

    header_lines = []
    for raw_line in all_lines:
        candidate = raw_line.strip()
        if not candidate.startswith(comment_style):
            break  # First non-comment line ends the header
        header_lines.append(candidate)

    remaining_lines = all_lines[len(header_lines):]
    return "\n".join(header_lines), "\n".join(remaining_lines)


import subprocess
from datetime import datetime
import os

def get_git_creation_date(file_path, git_root):
    """
    Retrieves the creation date of a file based on Git history.

    The date is the author date of the oldest commit that added the file
    (renames are followed via ``--follow``).

    Args:
        file_path (str): Absolute path to the file.
        git_root (str): Absolute path to the Git repository root.

    Returns:
        str: Creation date in 'YYYY-MM-DD' format, or None if the file is not
        tracked, has no add commit in its history, or Git cannot be run.
    """
    try:
        # Convert absolute file path to relative path from git_root
        relative_path = os.path.relpath(file_path, git_root)

        # Ensure the file is tracked by git (non-zero exit -> CalledProcessError)
        subprocess.run(
            ['git', '-C', git_root, 'ls-files', '--error-unmatch', relative_path],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )

        # List the author dates of every commit that added the file
        result = subprocess.run(
            ['git', '-C', git_root, 'log', '--diff-filter=A', '--follow', '--format=%aI', '--', relative_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True
        )
        add_dates = [line.strip() for line in result.stdout.splitlines() if line.strip()]
        if not add_dates:
            print(f"No creation date found in Git history for {relative_path}.")
            return None

        # git log prints newest commits first, so when a file was deleted and
        # re-added the original creation is the LAST entry, not the first.
        creation_date_str = add_dates[-1]
        return datetime.fromisoformat(creation_date_str).strftime('%Y-%m-%d')
    except subprocess.CalledProcessError:
        print(f"File {file_path} is not tracked by Git.")
        return None
    except Exception as e:
        print(f"Error getting Git creation date for {file_path}: {e}")
        return None

def get_file_metadata(file_path, git_root):
    """
    Retrieves the creation and last updated dates of a file.

    The creation date comes from Git history when available. Otherwise it falls
    back to the filesystem: the birth time where the platform reports one,
    else ``st_ctime``. The last-updated date is always the file's ``st_mtime``.

    Args:
        file_path (str): Absolute path to the file.
        git_root (str): Absolute path to the Git repository root.

    Returns:
        tuple: (date_created, last_updated) in 'YYYY-MM-DD' format; either
        element is "Unknown" when it cannot be determined.
    """
    date_created = get_git_creation_date(file_path, git_root)
    if not date_created:
        # Fallback to filesystem timestamps if Git fails
        try:
            file_stats = os.stat(file_path)
            # On Unix/macOS st_ctime is the last inode change, which is reset
            # whenever the file is rewritten; st_birthtime (where present) is
            # the real creation time.
            created_ts = getattr(file_stats, 'st_birthtime', file_stats.st_ctime)
            date_created = datetime.fromtimestamp(created_ts).strftime('%Y-%m-%d')
        except Exception as e:
            print(f"Error getting st_ctime for {file_path}: {e}")
            date_created = "Unknown"

    try:
        last_updated = datetime.fromtimestamp(os.stat(file_path).st_mtime).strftime('%Y-%m-%d')
    except Exception as e:
        print(f"Error getting st_mtime for {file_path}: {e}")
        last_updated = "Unknown"

    return date_created, last_updated

import logging

# Send log records to file_processing.log, appending across runs.
# The threshold is INFO, so logging.debug(...) calls are not recorded.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='file_processing.log',
    filemode='a',
)

In [7]:
import re
def extract_import_path(import_statement, current_file_path, lib_directory, package_name='tradework_platform'):
    """Resolves a Dart import statement to a local file path.

    Handles:
    - imports of this project's own package (``package:<package_name>/...``),
      mapped onto ``lib_directory``;
    - relative imports, with or without a leading ``.``, resolved against the
      directory of ``current_file_path``.

    Imports carrying ``as`` / ``show`` / ``hide`` clauses are recognised too.
    Imports with any other scheme (``dart:``, other ``package:`` names) are ignored.

    Args:
        import_statement (str): One source line containing the import.
        current_file_path (str): Path of the file the import appears in.
        lib_directory (str): The project's ``lib`` directory.
        package_name (str): Name of the project's own Dart package.

    Returns:
        str: Path of the imported file if it exists on disk, otherwise None.
    """
    # Capture the quoted URI only; no ';' is required right after the closing
    # quote, so "import 'x.dart' as x;" and "... show Y;" also match.
    match = re.search(r"import\s+['\"]([^'\"]+)['\"]", import_statement)
    if not match:
        return None  # Not an import line

    import_path = match.group(1)
    own_package_prefix = f'package:{package_name}/'

    if import_path.startswith(own_package_prefix):
        # Strip the prefix (once) and map the remainder onto lib_directory
        candidate = os.path.join(lib_directory, import_path[len(own_package_prefix):])
    elif re.match(r"[A-Za-z][A-Za-z0-9+.\-]*:", import_path):
        # Any other URI scheme (dart:, third-party package:) is not a local file
        return None
    else:
        # Scheme-less URIs are relative to the importing file's directory
        base_dir = os.path.dirname(current_file_path)
        candidate = os.path.abspath(os.path.join(base_dir, import_path))
        # Ensure the path points to a Dart file
        if not candidate.endswith('.dart'):
            candidate += '.dart'

    return candidate if os.path.exists(candidate) else None

In [8]:
from tqdm import tqdm
def process_file(file_path, memory, core_files, processed_files, chain, app_context, lib_directory, git_root, only_missing=False):
    """
    Generates a metadata header for one source file and writes it back in place.

    Steps:
    - Skips auto-generated files (*.freezed.dart, *.g.dart) and files already
      in ``processed_files``.
    - When ``only_missing`` is set, skips files for which ``has_metadata`` is True.
    - Splits off the existing top comments and sends the remaining code, the
      app context (plus the contents of ``core_files``) and the file dates to ``chain``.
    - Replaces the old top comments with the generated block, each line
      prefixed with the file's comment token.
    - Recursively processes every import that resolves to a local file.

    The file is left untouched when it cannot be read, when the chain raises,
    or when the chain returns no usable text.

    Args:
        file_path (str): Path of the file to process.
        memory: Not used by this function; passed through to recursive calls.
        core_files (list): Paths whose contents are appended to the app context.
        processed_files (set): Paths already rewritten; updated in place.
        chain: Object whose ``invoke(dict)`` returns a mapping with a 'text' entry.
        app_context (str): Static platform description for the prompt.
        lib_directory (str): Project ``lib`` directory, used to resolve imports.
        git_root (str): Git repository root, used to look up file dates.
        only_missing (bool): Skip files that already have a metadata header.

    Returns:
        None
    """
    # Skip files that end with .freezed.dart or .g.dart or already processed
    if file_path.endswith(('.freezed.dart', '.g.dart')) or file_path in processed_files:
        logging.info(f"Skipping file: {file_path}")
        return

    logging.info(f"Processing file: {file_path}")

    # Determine the comment style based on file extension
    comment_style = get_comment_style(file_path)

    # If only_missing flag is set, check for existing metadata
    if only_missing and has_metadata(file_path, comment_style):
        logging.info(f"Metadata already exists. Skipping file: {file_path}")
        return

    try:
        # Read the file content
        with open(file_path, 'r', encoding='utf-8') as file:
            code_content = file.read()
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        return

    # Get file metadata (creation and last updated dates)
    date_created, last_updated = get_file_metadata(file_path, git_root)

    # Get folder context to add to the metadata
    folder_context = detect_folder_context(file_path)

    # Extract the filename
    file_name = os.path.basename(file_path)

    # Extract existing comments from the top of the file
    existing_comments, code_content_without_comments = extract_top_comments(code_content, comment_style)

    # Prepare context by including core files content
    context_files_content = ""
    for core_file in core_files:
        if os.path.exists(core_file):
            try:
                with open(core_file, 'r', encoding='utf-8') as cf:
                    context_files_content += cf.read() + "\n"
            except Exception as e:
                logging.error(f"Error reading core file {core_file}: {e}")

    # Use the LLM to generate the metadata
    try:
        result = chain.invoke({
            "code_content": code_content_without_comments,
            "app_context": app_context + "\n" + context_files_content,
            "last_updated": last_updated,
            "date_created": date_created,
            "file_name": file_name,
            "module_name": folder_context,
            "existing_comments": existing_comments
        })
        generated_metadata = result.get('text', '').strip()
        logging.debug(f"LLM Output for {file_path}:\n{generated_metadata}\n")
    except Exception as e:
        logging.error(f"Error invoking LLM for file {file_path}: {e}")
        return

    # Drop Markdown code-fence lines the model may have wrapped the block in;
    # they are not part of the header and would otherwise be written to the file.
    metadata_lines = [
        line for line in generated_metadata.splitlines()
        if not line.strip().startswith("```")
    ]

    # Writing an empty header would delete the existing top comments and put
    # nothing in their place, so leave the file alone in that case.
    if not any(line.strip() for line in metadata_lines):
        logging.error(f"LLM returned no metadata for {file_path}; leaving file unchanged")
        return

    # Verify that all required keys are present
    required_keys = [
        "# File:",
        "# Module:",
        "# Description:",
        "# Dependencies:",
        "# Components:",
        "# Role:",
        "# Author:",
        "# Date Created:",
        "# Last Updated:",
        "# Related Files:",
        "# Key:"
    ]

    missing_keys = [key for key in required_keys if key not in generated_metadata]
    if missing_keys:
        logging.warning(f"Missing metadata keys in {file_path}: {missing_keys}")

    # Add comment style to each line of the metadata (no trailing whitespace on blank lines)
    metadata_comment = "\n".join(f"{comment_style} {line}".rstrip() for line in metadata_lines)

    # extract_top_comments re-joins the code without its final newline; restore
    # it so the rewrite does not strip the newline at end of file.
    new_content = metadata_comment + "\n" + code_content_without_comments
    if code_content.endswith("\n") and not new_content.endswith("\n"):
        new_content += "\n"

    # Insert generated metadata at the start of the file
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(new_content)
    except Exception as e:
        logging.error(f"Error writing metadata to file {file_path}: {e}")
        return

    # Add the file to the processed set before recursing so import cycles terminate
    processed_files.add(file_path)

    # Extract imports and process dependencies
    import_lines = extract_imports(code_content_without_comments)
    for import_line in import_lines:
        # Extract the path from the import and process the imported file
        import_path = extract_import_path(import_line, file_path, lib_directory)
        if import_path and import_path not in processed_files:
            process_file(import_path, memory, core_files, processed_files, chain, app_context, lib_directory, git_root, only_missing)



In [9]:
from tqdm import tqdm

def process_repository(directory, memory, chain, app_context, git_root, only_missing=False, chunk_size=20):
    """
    Run process_file over every Dart file below ``directory``.

    - Core files (main.dart, services/locator.dart, routes/routes.dart) go first.
    - A shared set of processed paths prevents a file being rewritten twice.
    - The remaining files are handled in case-insensitive path order behind a
      tqdm progress bar.

    ``chunk_size`` is accepted but not used by this function.
    """
    lib_directory = directory  # Assuming 'directory' is the 'lib' folder

    # main.dart is only listed when it exists; the other two core files are
    # always listed and checked for existence wherever they are used.
    core_files = []
    entry_point = os.path.join(lib_directory, "main.dart")
    if os.path.exists(entry_point):
        core_files.append(entry_point)
    core_files += [
        os.path.join(lib_directory, "services", "locator.dart"),
        os.path.join(lib_directory, "routes", "routes.dart"),
    ]

    # Paths that have already been rewritten
    processed_files = set()

    # Core files first
    for core_file in core_files:
        if not os.path.exists(core_file) or core_file in processed_files:
            continue
        process_file(core_file, memory, core_files, processed_files, chain, app_context, lib_directory, git_root, only_missing)

    # Gather every hand-written Dart file (generated ones are excluded)
    dart_files = []
    for root, _dirs, files in os.walk(lib_directory):
        for file_name in files:
            if not file_name.endswith(".dart"):
                continue
            if file_name.endswith(('.freezed.dart', '.g.dart')):
                continue
            dart_files.append(os.path.join(root, file_name))
    dart_files.sort(key=str.lower)

    # Leave out whatever the core pass (and its import recursion) already handled
    files_to_process = [path for path in dart_files if path not in processed_files]
    total_files = len(files_to_process)
    print(f"Total files to process: {total_files}\n")

    # One progress-bar tick per file
    with tqdm(total=total_files, desc="Processing files", unit="file") as progress:
        for file_path in files_to_process:
            process_file(file_path, memory, core_files, processed_files, chain, app_context, lib_directory, git_root, only_missing)
            progress.update(1)

    print("\nProcessing complete.")


In [10]:

from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain.memory import ConversationBufferWindowMemory








# The repository's lib directory, which is what process_repository walks
lib_directory = os.path.join(repo_directory, "lib")

# Chat model used to write the metadata (GPT-4o-mini, or another GPT model of your choice)
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.7)

# Prompt template for file processing, built from metadata_prompt
prompt = PromptTemplate(
    template=metadata_prompt,
    input_variables=[
        "code_content",
        "app_context",
        "last_updated",
        "date_created",
        "file_name",
        "module_name",
        "existing_comments",
    ],
)

# Chain that feeds the filled-in prompt to the model
chain = LLMChain(prompt=prompt, llm=llm)

# Windowed conversation memory (k=5); it is handed to process_repository below
memory = ConversationBufferWindowMemory(k=5)



  warn_deprecated(


In [None]:
# Only annotate files that do not have a metadata header yet.
# Set to False to process all files.
only_missing = True

# Process the lib directory: core files first, then every remaining Dart file
process_repository(
    lib_directory,
    memory,
    chain,
    app_context,
    git_root,
    only_missing=only_missing,
)
