# Graph Data Review and Prompt Example Preparation

This notebook helps review the current graph data, export it for archival purposes, and prepare examples for few-shot prompting.

## Features:
- Load all data from Neo4j graph
- Display data in interactive pandas DataFrame (full content, no trimming)
- Edit cell contents
- Remove unwanted rows
- Export to CSV
- Prepare prompt examples


In [1]:
import sys
import os
import logging

# Add src to path
sys.path.append('../src')
sys.path.append('..')

import pandas as pd
import json
from datetime import datetime
from typing import List, Dict, Any

# Try to import optional widgets, but don't fail if not available
try:
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
    WIDGETS_AVAILABLE = True
except ImportError:
    WIDGETS_AVAILABLE = False
    print("ℹ️ ipywidgets not available - using simple text interface")

from src.graph.neo4j_client import Neo4jClient

# Setup basic logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("graph_review")

print("✅ Imports successful")
if WIDGETS_AVAILABLE:
    print("✅ Interactive widgets available")
else:
    print("ℹ️ Using text-only interface")


✅ Imports successful
✅ Interactive widgets available


## 1. Connect to Neo4j and Load Data


In [2]:
# Initialize Neo4j client
try:
    # Use default Neo4j settings for docker-compose setup
    neo4j_client = Neo4jClient(
        uri="bolt://localhost:7687",
        user="neo4j", 
        password="password"
    )
    print("✅ Connected to Neo4j")
    
    # Test the connection
    health = neo4j_client.health_check()
    if health:
        print("✅ Neo4j health check passed")
    else:
        print("⚠️ Neo4j connected but health check failed")
        
except Exception as e:
    print(f"❌ Failed to connect to Neo4j: {e}")
    print("Make sure Neo4j is running with: docker-compose up")
    print("If using a different setup, check your connection parameters")


INFO:src.graph.neo4j_client:Successfully connected to Neo4j
INFO:src.graph.neo4j_client:Successfully created or verified constraint on :GtdNote(content_hash).
INFO:src.graph.neo4j_client:Initialized Neo4j client with URI: bolt://localhost:7687


✅ Connected to Neo4j
✅ Neo4j health check passed


In [3]:
def load_all_graph_data():
    """Load all relevant data from the graph for review."""
    
    # Main query to get all notes with their relationships
    query = """
    MATCH (n:GtdNote)-[:RECORDED_ON]->(d:Day)
    OPTIONAL MATCH (n)-[:HAS_TAG]->(t:Tag)
    OPTIONAL MATCH (n)-[:MENTIONS]->(e:Entity)
    OPTIONAL MATCH (n)-[:HAS_CHILD]->(child:GtdNote)
    OPTIONAL MATCH (parent:GtdNote)-[:HAS_CHILD]->(n)
    
    RETURN 
        n.content as content,
        n.llm_summary as llm_summary,
        n.line_number as line_number,
        n.content_hash as content_hash,
        n.created_at as created_at,
        d.date as date,
        collect(DISTINCT t.name) as tags,
        collect(DISTINCT {name: e.name, type: e.type}) as entities,
        collect(DISTINCT child.content) as children,
        collect(DISTINCT parent.content) as parents
    ORDER BY n.line_number
    """
    
    try:
        results = neo4j_client.execute_query(query)
        print(f"✅ Loaded {len(results)} notes from graph")
        return results
    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return []

# Load the data
graph_data = load_all_graph_data()


✅ Loaded 91 notes from graph


## 2. Convert to DataFrame and Display


In [4]:
def clean_data_for_display(data):
    """Clean and prepare data for DataFrame display - keeping full content."""
    cleaned = []
    
    for item in data:
        cleaned_item = {
            'line_number': item.get('line_number', ''),
            'content': item.get('content', ''),
            'llm_summary': item.get('llm_summary', ''),
            'date': item.get('date', ''),
            'tags': ', '.join([tag for tag in item.get('tags', []) if tag]),  # Filter out None/empty
            'entities': item.get('entities', []),  # Keep full entity objects
            'entities_formatted': ', '.join([f"{e['name']} ({e['type']})" for e in item.get('entities', []) if e.get('name')]),
            'children_count': len([c for c in item.get('children', []) if c]),
            'parents_count': len([p for p in item.get('parents', []) if p]),
            'content_hash': item.get('content_hash', ''),  # Full hash
            'created_at': str(item.get('created_at', '')) if item.get('created_at') else ''
        }
        cleaned.append(cleaned_item)
    
    return cleaned

# Create DataFrame
if graph_data:
    cleaned_data = clean_data_for_display(graph_data)
    df = pd.DataFrame(cleaned_data)
    
    print(f"📊 DataFrame created with {len(df)} rows and {len(df.columns)} columns")
    print(f"\nColumns: {list(df.columns)}")
    
    # Display first few rows with full content
    print("\n📋 First 5 rows preview:")
    if WIDGETS_AVAILABLE:
        # Show full content in widgets
        pd.set_option('display.max_colwidth', None)
        display(df.head(5))
    else:
        # Simple text display with full content
        for i, row in df.head(5).iterrows():
            print(f"\n{'='*80}")
            print(f"Row {i} | Line {row['line_number']} | Date: {row['date']}")
            print(f"Content: {row['content']}")
            print(f"Summary: {row['llm_summary']}")
            print(f"Tags: {row['tags']}")
            print(f"Entities: {row['entities_formatted']}")
            print(f"Children/Parents: {row['children_count']}/{row['parents_count']}")
            print(f"Hash: {row['content_hash']}")
else:
    print("❌ No data to display")


📊 DataFrame created with 91 rows and 11 columns

Columns: ['line_number', 'content', 'llm_summary', 'date', 'tags', 'entities', 'entities_formatted', 'children_count', 'parents_count', 'content_hash', 'created_at']

📋 First 5 rows preview:


Unnamed: 0,line_number,content,llm_summary,date,tags,entities,entities_formatted,children_count,parents_count,content_hash,created_at
0,3,- Getting things done,A note referencing the concept or task of getting things done.,19.06,project,"[{'name': None, 'type': None}]",,12,0,bbf2f6534814d0534908eb1b15e3e497da4236eda733e12964c8e235773c20d0,2025-06-20T13:32:23.458000000+00:00
1,4,- Write down short introduction what we want.,A task to write a short introduction about what is desired.,19.06,sub5minutes,"[{'name': None, 'type': None}]",,1,1,fbc61e8c7aeff9c0e52799490c98f5e92344fdfc2b8e9aea61b12b13beafa0ed,2025-06-20T13:32:23.955000000+00:00
2,5,"- We want to create locally running, llm and neo4j based system, that will allow me to take notes as this file and they will be automatically processed by llm, ingested to knowledge graph. For interface we'll use streamlit, for backend we'll go with fastapi, for llm hosting we'll use ollama. For coding we're using VSCode branch Cursor, so I'm thinking about using devcontainters to contain all app elements. System will allow me to take notes, they will be highlighted, tagged automatically, you can follow the graph to get more context information. Dynamic system. What's also important - we want to create it in a way that will allow to use ollama or other providers as OpenAI, Google or other local system.","The note describes creating a local system integrating LLM, knowledge graph, and various tech stack components for note-taking, processing, and flexible deployment options.",19.06,reference,"[{'name': 'streamlit', 'type': 'Technology'}, {'name': 'neo4j', 'type': 'Technology'}, {'name': 'Cursor', 'type': 'Software Branch'}, {'name': 'VSCode', 'type': 'Technology'}, {'name': 'ollama', 'type': 'Technology'}, {'name': 'devcontainters', 'type': 'Technology'}, {'name': 'llm', 'type': 'Technology'}, {'name': 'Google', 'type': 'Technology'}, {'name': 'fastapi', 'type': 'Technology'}, {'name': 'OpenAI', 'type': 'Technology'}]","streamlit (Technology), neo4j (Technology), Cursor (Software Branch), VSCode (Technology), ollama (Technology), devcontainters (Technology), llm (Technology), Google (Technology), fastapi (Technology), OpenAI (Technology)",0,1,195e9aa8e0f34f8f94104c99b1ea7bc20986fe906234e0178d74f30251abf1b3,2025-06-20T13:32:25.682000000+00:00
3,6,"- Find on X post about ""template"" for Vibe Coding.",A task to find an X post about a template for Vibe Coding.,19.06,done,"[{'name': 'template', 'type': 'Technology'}, {'name': 'X', 'type': 'Platform'}, {'name': 'Vibe Coding', 'type': 'Project'}]","template (Technology), X (Platform), Vibe Coding (Project)",4,1,464f5a7cfa85235dff6a85dd2458a76c7d4cec34370b3b164f1194323f810b99,2025-06-20T13:32:26.505000000+00:00
4,7,- Here it is: https://pbs.twimg.com/media/GtfC0G8XQAAYJ-c?format=jpg&name=large,The note provides a link to an image hosted on Twitter's media server.,19.06,reference,"[{'name': 'https://pbs.twimg.com/media/GtfC0G8XQAAYJ-c?format=jpg&name=large', 'type': 'Image'}]",https://pbs.twimg.com/media/GtfC0G8XQAAYJ-c?format=jpg&name=large (Image),0,1,8b99fd5e9bd211838ca9e36f320972936dcff7f8a0a71e1cabcbed2af4178180,2025-06-20T13:32:27.416000000+00:00


## 3. Interactive Data Review and Editing


In [5]:
# Make a working copy for editing
if 'df' in locals():
    working_df = df.copy()
    working_df['keep_for_prompts'] = True  # Add a column to mark rows for prompt examples
    working_df['prompt_category'] = ''  # Category for grouping prompt examples
    
    print(f"✅ Working DataFrame created with {len(working_df)} rows")
    print("Added columns: 'keep_for_prompts' and 'prompt_category'")
else:
    print("❌ No DataFrame available to work with")


✅ Working DataFrame created with 91 rows
Added columns: 'keep_for_prompts' and 'prompt_category'


In [6]:
def browse_rows(start_index=0, num_rows=5):
    """Display a range of rows for detailed browsing - FULL CONTENT, NO TRIMMING."""
    if 'working_df' not in locals() and 'working_df' not in globals():
        print("❌ No working DataFrame available")
        return
    
    end_index = min(start_index + num_rows, len(working_df))
    
    print(f"\n📖 Rows {start_index} to {end_index-1} - FULL CONTENT:")
    print("=" * 120)
    
    for i in range(start_index, end_index):
        row = working_df.iloc[i]
        keep_status = "✅" if row['keep_for_prompts'] else "❌"
        category = f"[{row['prompt_category']}]" if row['prompt_category'] else "[no category]"
        
        print(f"\n🔸 ROW {i:3d} | Line {row['line_number']:3} | {keep_status} {category}")
        print("-" * 80)
        
        # FULL CONTENT - NO TRIMMING
        print(f"📝 CONTENT:")
        print(f"   {row['content']}")
        
        print(f"\n🤖 LLM SUMMARY:")
        print(f"   {row['llm_summary']}")
        
        print(f"\n🏷️ TAGS:")
        print(f"   {row['tags']}")
        
        # SHOW ENTITIES FULLY - THIS WAS MISSING
        print(f"\n🎯 ENTITIES:")
        if row['entities_formatted']:
            print(f"   {row['entities_formatted']}")
            # Also show structured entities
            entities = row['entities']
            if entities:
                print("   Structured entities:")
                for entity in entities:
                    if entity.get('name'):
                        print(f"     • {entity['name']} ({entity.get('type', 'unknown')})")
        else:
            print("   No entities extracted")
        
        print(f"\n📊 METADATA:")
        print(f"   Date: {row['date']}")
        print(f"   Children: {row['children_count']} | Parents: {row['parents_count']}")
        print(f"   Hash: {row['content_hash']}")
        print(f"   Created: {row['created_at']}")

# Browse first 3 rows with full detail
if 'working_df' in locals():
    browse_rows(0, 3)
    print(f"\n🎯 Use browse_rows(start_index, num_rows) to see more rows")
    print(f"🎯 Use edit_row(index) function below to edit specific rows")



📖 Rows 0 to 2 - FULL CONTENT:

🔸 ROW   0 | Line   3 | ✅ [no category]
--------------------------------------------------------------------------------
📝 CONTENT:
   - Getting things done

🤖 LLM SUMMARY:
   A note referencing the concept or task of getting things done.

🏷️ TAGS:
   project

🎯 ENTITIES:
   No entities extracted

📊 METADATA:
   Date: 19.06
   Children: 12 | Parents: 0
   Hash: bbf2f6534814d0534908eb1b15e3e497da4236eda733e12964c8e235773c20d0
   Created: 2025-06-20T13:32:23.458000000+00:00

🔸 ROW   1 | Line   4 | ✅ [no category]
--------------------------------------------------------------------------------
📝 CONTENT:
   - Write down short introduction what we want.

🤖 LLM SUMMARY:
   A task to write a short introduction about what is desired.

🏷️ TAGS:
   sub5minutes

🎯 ENTITIES:
   No entities extracted

📊 METADATA:
   Date: 19.06
   Children: 1 | Parents: 1
   Hash: fbc61e8c7aeff9c0e52799490c98f5e92344fdfc2b8e9aea61b12b13beafa0ed
   Created: 2025-06-20T13:32:23.9550000

In [7]:
def edit_row(row_index):
    """Simple row editor - shows all details for editing."""
    if 'working_df' not in locals() and 'working_df' not in globals():
        print("❌ No working DataFrame available")
        return
        
    if row_index >= len(working_df):
        print(f"❌ Row {row_index} doesn't exist. Max index: {len(working_df)-1}")
        return
    
    row = working_df.iloc[row_index]
    
    print(f"\n📝 EDITING ROW {row_index} (Line {row['line_number']}):")
    print("=" * 80)
    print(f"CONTENT: {row['content']}")
    print(f"SUMMARY: {row['llm_summary']}")
    print(f"TAGS: {row['tags']}")
    print(f"ENTITIES: {row['entities_formatted']}")
    print(f"KEEP FOR PROMPTS: {row['keep_for_prompts']}")
    print(f"CATEGORY: {row['prompt_category']}")
    print("=" * 80)
    
    print("✏️ EDITING FUNCTIONS:")
    print(f"update_content({row_index}, 'new content')")
    print(f"update_summary({row_index}, 'new summary')")
    print(f"update_tags({row_index}, 'tag1, tag2')")
    print(f"mark_for_prompts({row_index}, True/False)")
    print(f"set_category({row_index}, 'category_name')")

def update_content(row_index, new_content):
    """Update content for a specific row."""
    if row_index < len(working_df):
        working_df.at[row_index, 'content'] = new_content
        print(f"✅ Updated content for row {row_index}")
    else:
        print(f"❌ Row {row_index} doesn't exist")

def update_summary(row_index, new_summary):
    """Update summary for a specific row."""
    if row_index < len(working_df):
        working_df.at[row_index, 'llm_summary'] = new_summary
        print(f"✅ Updated summary for row {row_index}")
    else:
        print(f"❌ Row {row_index} doesn't exist")

def update_tags(row_index, new_tags):
    """Update tags for a specific row."""
    if row_index < len(working_df):
        working_df.at[row_index, 'tags'] = new_tags
        print(f"✅ Updated tags for row {row_index}")
    else:
        print(f"❌ Row {row_index} doesn't exist")

def mark_for_prompts(row_index, keep=True):
    """Mark/unmark row for prompt examples."""
    if row_index < len(working_df):
        working_df.at[row_index, 'keep_for_prompts'] = keep
        status = "marked" if keep else "unmarked"
        print(f"✅ Row {row_index} {status} for prompts")
    else:
        print(f"❌ Row {row_index} doesn't exist")

def set_category(row_index, category):
    """Set category for a specific row."""
    if row_index < len(working_df):
        working_df.at[row_index, 'prompt_category'] = category
        print(f"✅ Set category '{category}' for row {row_index}")
    else:
        print(f"❌ Row {row_index} doesn't exist")

print("✅ Row editing functions loaded")
print("Use edit_row(index) to see full editing options for a specific row")


✅ Row editing functions loaded
Use edit_row(index) to see full editing options for a specific row


In [8]:
edit_row(1)


📝 EDITING ROW 1 (Line 4):
CONTENT: - Write down short introduction what we want.
SUMMARY: A task to write a short introduction about what is desired.
TAGS: sub5minutes
ENTITIES: 
KEEP FOR PROMPTS: True
CATEGORY: 
✏️ EDITING FUNCTIONS:
update_content(1, 'new content')
update_summary(1, 'new summary')
update_tags(1, 'tag1, tag2')
mark_for_prompts(1, True/False)
set_category(1, 'category_name')


In [9]:
mark_for_prompts(1, False)

✅ Row 1 unmarked for prompts


## 4. Export Functions


In [10]:
def export_to_csv(filename=None, include_all=True):
    """Export data to CSV file."""
    if 'working_df' not in locals() and 'working_df' not in globals():
        print("❌ No working DataFrame available")
        return
    
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"graph_data_export_{timestamp}.csv"
    
    # Determine which rows to export
    if include_all:
        export_df = working_df.copy()
        print(f"📤 Exporting all {len(export_df)} rows")
    else:
        export_df = working_df[working_df['keep_for_prompts']].copy()
        print(f"📤 Exporting {len(export_df)} marked rows")
    
    try:
        # Create exports directory if it doesn't exist
        os.makedirs('../exports', exist_ok=True)
        filepath = f"../exports/{filename}"
        
        export_df.to_csv(filepath, index=False)
        print(f"✅ Data exported to: {filepath}")
        
        return filepath
    except Exception as e:
        print(f"❌ Export failed: {e}")
        return None

def export_prompt_examples(filename=None):
    """Export data formatted specifically for prompt examples."""
    if 'working_df' not in locals() and 'working_df' not in globals():
        print("❌ No working DataFrame available")
        return
    
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"prompt_examples_{timestamp}.json"
    
    # Get only marked rows
    prompt_df = working_df[working_df['keep_for_prompts']].copy()
    
    if len(prompt_df) == 0:
        print("❌ No rows marked for prompt examples")
        return
    
    # Group by category
    examples_by_category = {}
    
    for category in prompt_df['prompt_category'].unique():
        if not category:  # Skip empty categories
            category = 'uncategorized'
        
        category_rows = prompt_df[prompt_df['prompt_category'] == category]
        
        examples = []
        for _, row in category_rows.iterrows():
            example = {
                'input': str(row['content']),
                'output': {
                    'summary': str(row['llm_summary']),
                    'tags': [tag.strip() for tag in str(row['tags']).split(',') if tag.strip()],
                    'entities': row['entities'],  # Full entity objects
                    'line_number': int(row['line_number']) if pd.notna(row['line_number']) else None
                },
                'metadata': {
                    'date': str(row['date']),
                    'content_hash': str(row['content_hash'])
                }
            }
            examples.append(example)
        
        examples_by_category[category] = examples
    
    # Create final structure
    export_data = {
        'generated_at': datetime.now().isoformat(),
        'total_examples': len(prompt_df),
        'categories': examples_by_category
    }
    
    try:
        os.makedirs('../exports', exist_ok=True)
        filepath = f"../exports/{filename}"
        
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)
        
        print(f"✅ Prompt examples exported to: {filepath}")
        print(f"\n📊 Examples by category:")
        for cat, examples in examples_by_category.items():
            print(f"  {cat}: {len(examples)} examples")
        
        return filepath
    except Exception as e:
        print(f"❌ Export failed: {e}")
        return None

print("📤 Export functions ready:")
print("export_to_csv() - Export all data")  
print("export_prompt_examples() - Export for few-shot prompting")


📤 Export functions ready:
export_to_csv() - Export all data
export_prompt_examples() - Export for few-shot prompting
