In [55]:
# Install required packages
%pip install langchain_community langchain-pinecone tqdm python-dotenv ipywidgets tenacity



Note: you may need to restart the kernel to use updated packages.


In [56]:
import os
import re
import json
import logging
from dotenv import load_dotenv
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm

import pinecone
import openai

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone as LangchainPineconeVectorStore
from langchain.docstore.document import Document


In [57]:
# Load environment variables from .env file
load_dotenv()

# Retrieve API keys and configurations
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')  # e.g., "us-west1-gcp"

# Validate API keys
if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key not found in environment variables.")
if not PINECONE_API_KEY:
    raise ValueError("Pinecone API key not found in environment variables.")
if not PINECONE_ENVIRONMENT:
    raise ValueError("Pinecone environment not found in environment variables.")

In [58]:
# Configure logging
logging.basicConfig(
    filename='file_processing.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)


In [59]:
import getpass
import os
import time

from pinecone import Pinecone, ServerlessSpec
if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")
# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)




In [60]:
import time

index_name = "tradework-platform-code"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="eu-west-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [61]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [62]:
# Set the path to your Flutter/Dart project directory
repo_directory = "/Volumes/Mac-External/Development/Tradework/tradework_platform"

# Verify that the directory exists
if not os.path.exists(repo_directory):
    raise FileNotFoundError(f"Directory does not exist: {repo_directory}")
else:
    print(f"Directory exists: {repo_directory}")


Directory exists: /Volumes/Mac-External/Development/Tradework/tradework_platform


In [63]:
def extract_metadata(file_path, comment_style='//'):
    """
    Extracts metadata from the top of a Dart file.

    Args:
        file_path (str): Path to the Dart file.
        comment_style (str): Comment prefix used in the metadata.

    Returns:
        dict: Dictionary containing metadata keys and their values.
    """
    metadata = {}
    required_keys = [
        "# File:",
        "# Module:",
        "# Description:",
        "# Dependencies:",
        "# Components:",
        "# Role:",
        "# Author:",
        "# Date Created:",
        "# Last Updated:",
        "# Related Files:",
        "# Key:"
    ]
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                stripped_line = line.strip()
                if stripped_line.startswith(comment_style):
                    # Remove comment prefix and whitespace
                    content = stripped_line.lstrip(comment_style).strip()
                    for key in required_keys:
                        if content.startswith(key):
                            value = content[len(key):].strip()
                            metadata[key.strip("#:")] = value
                            break
                else:
                    break  # Stop reading after the initial comments
    except Exception as e:
        logging.error(f"Error extracting metadata from {file_path}: {e}")
    
    return metadata


In [64]:
def load_file_content(file_path):
    """
    Loads the content of a Dart file.

    Args:
        file_path (str): Path to the Dart file.

    Returns:
        str: Content of the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        logging.error(f"Error loading file content from {file_path}: {e}")
        return ""

In [65]:
def detect_folder_context(file_path):
    """
    Determines the folder context based on the file path.

    Args:
        file_path (str): Absolute path to the file.

    Returns:
        str: Folder context description.
    """
    if 'twcore' in file_path:
        return "Core Shared Components"
    elif 'shared_features' in file_path:
        return "Shared Features"
    elif 'users' in file_path:
        return "User-Specific Features"
    return "Unknown"

In [66]:
def extract_imports(file_content):
    """Extracts import statements from a Dart file."""
    import_lines = []
    for line in file_content.splitlines():
        if line.strip().startswith('import'):
            import_lines.append(line)
    return import_lines

In [67]:
def extract_import_path(import_statement, current_file_path, lib_directory):
    """
    Extracts the file path from a Dart import statement.

    Handles relative paths and package paths for local files.

    Args:
        import_statement (str): The import statement line.
        current_file_path (str): Absolute path to the current file.
        lib_directory (str): Absolute path to the lib directory.

    Returns:
        str or None: Absolute path to the imported file, or None if not found.
    """
    # Match import statements like: import 'some_relative_path.dart';
    match = re.search(r"import\s+['\"]([^'\"]+)['\"];", import_statement)
    
    if match:
        import_path = match.group(1)
        
        # Handle relative paths
        if import_path.startswith('.'):
            # Convert relative paths to absolute paths based on the current file's location
            base_dir = os.path.dirname(current_file_path)
            absolute_path = os.path.abspath(os.path.join(base_dir, import_path))
            # Ensure the path points to a Dart file
            if not absolute_path.endswith('.dart'):
                absolute_path += '.dart'
            if os.path.exists(absolute_path):
                return absolute_path
        
        # Handle package paths
        elif import_path.startswith('package:tradework_platform/'):
            # Remove 'package:tradework_platform/' and map to lib_directory
            relative_path = import_path.replace('package:tradework_platform/', '')
            absolute_path = os.path.join(lib_directory, relative_path)
            if os.path.exists(absolute_path):
                return absolute_path
        
        # Handle other package paths if needed
        elif import_path.startswith('package:'):
            # You can add custom logic to map other package imports to actual file paths in your repo if needed
            return None
        
    return None  # If not a valid import line or if a package import (ignored for now)


In [68]:
def has_metadata(file_path, comment_style, required_keys=None):
    """
    Checks if the file already contains the metadata block.

    Args:
        file_path (str): Absolute path to the file.
        comment_style (str): The comment prefix based on file type (e.g., '//', '#').
        required_keys (list, optional): List of required metadata keys. Defaults to None.

    Returns:
        bool: True if all required metadata keys are found, False otherwise.
    """
    if required_keys is None:
        required_keys = [
            "File:",
            "Module:",
            "Description:",
            "Dependencies:",
            "Components:",
            "Role:",
            "Author:",
            "Date Created:",
            "Last Updated:",
            "Related Files:",
            "Key:"
        ]

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # Read the first N lines where N is the number of required keys
            for _ in range(len(required_keys)):
                line = file.readline()
                if not line:
                    break  # Reached EOF
                # Remove comment prefix and leading/trailing whitespace
                stripped_line = line.strip().lstrip(comment_style).strip()
                # Check for each required key
                for key in required_keys:
                    if stripped_line.startswith(key):
                        return True
        return False
    except Exception as e:
        logging.error(f"Error checking metadata in file {file_path}: {e}")
        return False


Collect All Relevant Dart Files
Gather all Dart files in the lib directory, excluding auto-generated files.

In [69]:
from pathlib import Path

# Define the lib directory
lib_directory = Path(repo_directory) / "lib"

# Collect all Dart files excluding those ending with .freezed.dart or .g.dart
dart_files = list(lib_directory.rglob("*.dart"))
filtered_dart_files = [f for f in dart_files if not f.name.endswith(('.freezed.dart', '.g.dart'))]

print(f"Total Dart files found: {len(filtered_dart_files)}")


Total Dart files found: 1041


Extract Metadata from All Files
Create a list of dictionaries containing file paths and their associated metadata.

In [70]:
file_metadata_list = []

for file_path in tqdm(filtered_dart_files, desc="Extracting metadata"):
    metadata = extract_metadata(file_path)
    if metadata:
        metadata['file_path'] = str(file_path)
        file_metadata_list.append(metadata)
    else:
        logging.warning(f"No metadata found in file: {file_path}")


Extracting metadata:   0%|          | 0/1041 [00:00<?, ?it/s]

Chunking the Files
Use LangChain's RecursiveCharacterTextSplitter to split the file content into manageable chunks.

In [71]:
def load_file_content(file_path):
    """
    Loads the content of a Dart file.

    Args:
        file_path (str): Path to the Dart file.

    Returns:
        str: Content of the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        logging.error(f"Error loading file content from {file_path}: {e}")
        return ""

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust as needed
    chunk_overlap=100,  # Adjust as needed
    separators=["\n\n", "\n", " ", ""]
)

# List to hold all chunks with metadata
all_chunks = []

for metadata in tqdm(file_metadata_list, desc="Chunking files"):
    file_path = metadata['file_path']
    content = load_file_content(file_path)
    
    # Remove existing metadata comments to avoid redundancy
    comment_style = '//'
    content_without_metadata = re.sub(rf'^{comment_style}.*\n', '', content, flags=re.MULTILINE)
    
    # Split content into chunks
    chunks = text_splitter.split_text(content_without_metadata)
    
    for i, chunk in enumerate(chunks):
        chunk_metadata = metadata.copy()
        chunk_metadata.update({
            "chunk_id": f"{Path(file_path).stem}_chunk_{i+1}",
            "chunk_text": chunk,
            "file_name": Path(file_path).name,
            "module_name": metadata.get("Module", "Unknown"),
            "description": metadata.get("Description", ""),
            "dependencies": metadata.get("Dependencies", ""),
            "components": metadata.get("Components", ""),
            "role": metadata.get("Role", ""),
            "author": metadata.get("Author", ""),
            "date_created": metadata.get("Date Created", ""),
            "last_updated": metadata.get("Last Updated", ""),
            "related_files": metadata.get("Related Files", ""),
            "key": metadata.get("Key", "")
        })
        all_chunks.append(chunk_metadata)

print(f"Total chunks created: {len(all_chunks)}")


Chunking files:   0%|          | 0/1041 [00:00<?, ?it/s]

Total chunks created: 5201


In [51]:

from tenacity import retry, wait_random_exponential, stop_after_attempt

# Function to generate embedding for a given text
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_embedding(text):
    try:
        return embeddings.embed_query(text)
    except Exception as e:
        logging.error(f"Error generating embedding: {e}")
        return None

# Generate embeddings for all chunks
successful_embeddings = 0

for chunk in tqdm(all_chunks, desc="Generating embeddings"):
    text = chunk['chunk_text']
   
    embedding = get_embedding(text)
    if embedding:
        chunk['embedding'] = embedding
        successful_embeddings += 1
    else:
        logging.warning(f"Failed to generate embedding for chunk: {chunk['chunk_id']}")

print(f"Successfully generated embeddings for {successful_embeddings} out of {len(all_chunks)} chunks.")



Generating embeddings:   0%|          | 0/5201 [00:00<?, ?it/s]

In [52]:
documents = []

for chunk in all_chunks:
    if 'embedding' in chunk:
        doc = Document(
            page_content=chunk['chunk_text'],
            metadata={
                "file_path": chunk['file_path'],
                "file_name": chunk['file_name'],
                "module_name": chunk['module_name'],
                "description": chunk['description'],
                "dependencies": chunk['dependencies'],
                "components": chunk['components'],
                "role": chunk['role'],
                "author": chunk['author'],
                "date_created": chunk['date_created'],
                "last_updated": chunk['last_updated'],
                "related_files": chunk['related_files'],
                "key": chunk['key'],
                "chunk_id": chunk['chunk_id']
            }
        )
        documents.append(doc)

print(f"Total documents to add: {len(documents)}")


Total documents to add: 64


In [53]:
# Assuming 'documents' is correctly populated
if documents:
    vector_store.add_documents(documents)
    print(f"Added {len(documents)} documents to Pinecone.")
else:
    print("No documents to add to Pinecone.")



Added 64 documents to Pinecone.
