In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

import sys
from pathlib import Path
import time

# Add project root to path
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))

from src.dbs.graphdb import GraphDB

In [3]:
# Open the graph database that every demo cell below queries through `db`.
db = GraphDB(db_path="../data/sqlite/graphdb.db")

# Report header (three lines: rule, title, rule).
print("=" * 70, "DATABASE STATISTICS", "=" * 70, sep="\n")

# Pull the aggregate counts once, then print the headline totals.
stats = db.stats()
print(f"\nDatabase: {db.db_path}")
print(f"Total Nodes: {stats['total_nodes']:,}")
print(f"Total Edges: {stats['total_edges']:,}")
print(f"Total Embeddings: {stats['total_embeddings']:,}")

# Per-type breakdowns, listed alphabetically by type name.
print("\nNodes by Type:")
for node_type, count in sorted(stats['nodes_by_type'].items()):
    print(f"  {node_type:20s}: {count:6,}")

print("\nEdges by Relation Type:")
for rel_type, count in sorted(stats['edges_by_type'].items()):
    print(f"  {rel_type:30s}: {count:6,}")


DATABASE STATISTICS

Database: ../data/sqlite/graphdb.db
Total Nodes: 16,362
Total Edges: 55,607
Total Embeddings: 5,047

Nodes by Type:
  affiliation         :    692
  author              :  8,680
  conference          :     20
  domain              :  1,923
  paper               :  5,047

Edges by Relation Type:
  author_in_affiliation         :  7,244
  author_write_paper            : 14,096
  paper_cite_paper              :  8,583
  paper_in_domain               : 20,637
  paper_in_venue                :  5,047


## Demo 1: RAG (0-hop) - Vector Search Only

Pure semantic search using vector embeddings. This is the foundation of RAG - retrieving relevant documents based on semantic similarity.


In [4]:
def demo_rag(query: str, k: int = 3):
    """Run the 0-hop (plain RAG) demo and print the retrieved papers.

    The query is handed to ``db.search_by_text`` and the hits are listed
    together with the time the search call took.

    Args:
        query: Free-text search string.
        k: Number of papers to request from the search.

    Returns:
        Whatever ``db.search_by_text`` returned; each hit is read as a
        mapping with ``name``, ``id`` and ``similarity`` keys.
    """
    rule = "=" * 70
    print(rule)
    print("RAG (0-hop): Vector Search")
    print(rule)
    print(f"\nQuery: '{query}'")
    print(f"Retrieving top {k} papers...\n")

    # Only the search call is timed; printing is excluded from the latency.
    t0 = time.perf_counter()
    results = db.search_by_text(query, k=k)
    latency = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

    print(f"Results ({len(results)} papers, {latency:.2f}ms):")
    print("-" * 70)
    rank = 0
    for hit in results:
        rank += 1
        print(f"{rank}. {hit['name']}")
        print(f"   ID: {hit['id']}")
        print(f"   Score: {hit['similarity']:.4f}")
        print()

    return results

# Example query: run the 0-hop demo once and keep the returned hits in
# `results_rag` so they can be inspected in later cells.
query1 = "machine learning"
results_rag = demo_rag(query1, k=3)


RAG (0-hop): Vector Search

Query: 'machine learning'
Retrieving top 3 papers...

Results (3 papers, 367.77ms):
----------------------------------------------------------------------
1. Machine Learning by Function Decomposition
   ID: 5A7BA6C1
   Score: 0.6120

2. Mechanism design via machine learning
   ID: 7F4BED95
   Score: 0.5755

3. Detecting spam blogs: a machine learning approach
   ID: 06518482
   Score: 0.5334



## Demo 2: 1-Hop GraphRAG - Paper → Authors

Extend RAG with graph traversal: find papers semantically, then traverse to their authors. This adds entity context to the retrieved documents.


In [5]:
def demo_1hop_graphrag(query: str, k: int = 3):
    """Run the 1-hop demo: search for papers, then attach each paper's authors.

    For every hit from ``db.search_by_text`` the incoming
    ``author_write_paper`` neighbours are fetched with ``db.get_neighbors``
    and stored on the hit under the ``'authors'`` key (the hit dicts are
    modified in place).

    Args:
        query: Free-text search string.
        k: Number of papers to request from the search.

    Returns:
        The list of hits, each now carrying an ``'authors'`` list.
    """
    rule = "=" * 70
    print(rule)
    print("1-Hop GraphRAG: Paper → Authors")
    print(rule)
    print(f"\nQuery: '{query}'")
    print(f"Retrieving top {k} papers and their authors...\n")

    # The timed section covers the search plus one neighbour lookup per paper.
    started = time.perf_counter()
    papers = db.search_by_text(query, k=k)
    for hit in papers:
        hit['authors'] = db.get_neighbors(
            hit['id'],
            direction='in',
            relation_types=['author_write_paper'],
        )
    latency = (time.perf_counter() - started) * 1000  # seconds -> milliseconds

    print(f"Results ({len(papers)} papers, {latency:.2f}ms):")
    print("-" * 70)

    for rank, hit in enumerate(papers, start=1):
        writers = hit['authors']
        print(f"{rank}. {hit['name']}")
        print(f"   ID: {hit['id']}")
        print(f"   Score: {hit['similarity']:.4f}")
        print(f"   Authors ({len(writers)}):")
        for person in writers:
            print(f"      • {person['node_name']}")
        print()

    # Sum of per-paper author counts (an author on two papers counts twice).
    total_authors = sum(len(hit['authors']) for hit in papers)
    print(f"Summary: {len(papers)} papers, {total_authors} total authors")
    return papers

# Example query: run the 1-hop demo and keep the author-enriched hits in
# `results_1hop`.
query2 = "information retrieval"
results_1hop = demo_1hop_graphrag(query2, k=3)


1-Hop GraphRAG: Paper → Authors

Query: 'information retrieval'
Retrieving top 3 papers and their authors...

Results (3 papers, 460.74ms):
----------------------------------------------------------------------
1. A case-based approach to intelligent information retrieval
   ID: 7EA5DD49
   Score: 0.6544
   Authors (2):
      • jody j daniels
      • edwina l rissland

2. Discriminative models for information retrieval
   ID: 7B402A2B
   Score: 0.5558
   Authors (1):
      • ramesh nallapati

3. A hidden Markov model information retrieval system
   ID: 7CE4C917
   Score: 0.5547
   Authors (3):
      • tim leek
      • david miller
      • richard schwartz

Summary: 3 papers, 6 total authors


## Demo 3: 2-Hop GraphRAG - Paper → Authors → Affiliations

Multi-hop traversal: find papers, traverse to authors, then to their affiliations. This demonstrates how graph structure enriches retrieval with institutional context.


In [6]:
def demo_2hop_graphrag(query: str, k: int = 3):
    """Run the 2-hop demo: papers, their authors, and the authors' affiliations.

    Each hit from ``db.search_by_text`` gets an ``'authors'`` list (incoming
    ``author_write_paper`` neighbours), and each author gets an
    ``'affiliations'`` list (outgoing ``author_in_affiliation`` neighbours).
    Both are written onto the returned dicts in place.

    Args:
        query: Free-text search string.
        k: Number of papers to request from the search.

    Returns:
        The list of hits, enriched with authors and affiliations.
    """
    rule = "=" * 70
    print(rule)
    print("2-Hop GraphRAG: Paper → Authors → Affiliations")
    print(rule)
    print(f"\nQuery: '{query}'")
    print(f"Retrieving top {k} papers, authors, and affiliations...\n")

    # The timed section covers the search and every neighbour lookup.
    started = time.perf_counter()
    papers = db.search_by_text(query, k=k)
    for hit in papers:
        # Hop 1: who wrote this paper.
        hit['authors'] = db.get_neighbors(
            hit['id'],
            direction='in',
            relation_types=['author_write_paper'],
        )
        # Hop 2: where each of those authors is affiliated.
        for person in hit['authors']:
            person['affiliations'] = db.get_neighbors(
                person['node_id'],
                direction='out',
                relation_types=['author_in_affiliation'],
            )
    latency = (time.perf_counter() - started) * 1000  # seconds -> milliseconds

    print(f"Results ({len(papers)} papers, {latency:.2f}ms):")
    print("-" * 70)

    for rank, hit in enumerate(papers, start=1):
        print(f"{rank}. {hit['name']}")
        print(f"   ID: {hit['id']}")
        print(f"   Score: {hit['similarity']:.4f}")
        print(f"   Authors ({len(hit['authors'])}):")

        for person in hit['authors']:
            places = person['affiliations']
            print(f"      • {person['node_name']}")
            if not places:
                print("        → No affiliation")
                continue
            print(f"        Affiliations ({len(places)}):")
            for place in places:
                print(f"          → {place['node_name']}")

        print()

    # Both totals are plain sums over the printed lists (repeats are counted).
    total_authors = sum(len(hit['authors']) for hit in papers)
    total_affs = sum(
        len(person['affiliations'])
        for hit in papers
        for person in hit['authors']
    )
    print(f"Summary: {len(papers)} papers, {total_authors} authors, {total_affs} affiliations")
    return papers

# Example query: run the 2-hop demo and keep the enriched hits (authors plus
# affiliations) in `results_2hop`.
query3 = "deep learning"
results_2hop = demo_2hop_graphrag(query3, k=3)


2-Hop GraphRAG: Paper → Authors → Affiliations

Query: 'deep learning'
Retrieving top 3 papers, authors, and affiliations...

Results (3 papers, 352.98ms):
----------------------------------------------------------------------
1. Deep learning via semi-supervised embedding
   ID: 80B42CFC
   Score: 0.6169
   Authors (3):
      • jason weston
        Affiliations (2):
          → royal holloway university of london
          → nec
      • ronan collobert
        Affiliations (1):
          → nec
      • frederic ratle
        Affiliations (1):
          → university of lausanne

2. Deep learning via Hessian-free optimization
   ID: 0BBA56E8
   Score: 0.5752
   Authors (1):
      • james martens
        Affiliations (1):
          → university of toronto

3. A unified architecture for natural language processing: deep neural networks with multitask learning
   ID: 7E30D880
   Score: 0.5320
   Authors (2):
      • jason weston
        Affiliations (2):
          → max planck society
     

## Demo 4: Inverse Traversal - Paper → Author → Co-authored Papers

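Bidirectional graph traversal: find papers, get authors, then find other papers by those authors. This demonstrates discovering related papers through shared authorship. Note that the "co-authored papers" listed under each author are simply that author's other papers in the graph (the seed paper excluded); they are not necessarily joint work with the seed paper's other authors.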


In [7]:
def demo_inverse_traversal(query: str, k: int = 2, max_shown: int = 5):
    """Run the inverse-traversal demo: Paper → Author → Other Papers.

    Seed papers come from ``db.search_by_text``. For each seed the incoming
    ``author_write_paper`` neighbours (its authors) are fetched, and for each
    author the outgoing ``author_write_paper`` neighbours (papers they wrote)
    are fetched with the current seed paper filtered out. The results are
    written onto the returned dicts in place under ``'authors'`` and
    ``'other_papers'``.

    Args:
        query: Free-text search string.
        k: Number of seed papers to request from the search.
        max_shown: Maximum number of other papers printed per author; the
            remainder is summarised as "... and N more". Negative values are
            treated as 0.

    Returns:
        The list of seed papers, enriched with authors and their other papers.

    Note:
        The "related papers discovered" figure in the summary counts distinct
        paper ids, so a paper reached through several authors is counted once.
        The author figure is the sum of per-seed author counts.
    """
    max_shown = max(max_shown, 0)

    print("=" * 70)
    print("Inverse Traversal: Paper → Author → Co-authored Papers")
    print("=" * 70)
    print(f"\nQuery: '{query}'")
    print("Finding papers, then discovering co-authored papers...\n")

    start_time = time.perf_counter()

    # Vector search
    papers = db.search_by_text(query, k=k)

    for paper in papers:
        # Forward hop (edge direction reversed): paper -> its authors.
        paper['authors'] = db.get_neighbors(
            paper['id'],
            direction='in',
            relation_types=['author_write_paper']
        )

        # Inverse hop: author -> everything they wrote, minus this seed paper.
        for author in paper['authors']:
            written = db.get_neighbors(
                author['node_id'],
                direction='out',
                relation_types=['author_write_paper']
            )
            author['other_papers'] = [
                p for p in written if p['node_id'] != paper['id']
            ]

    latency = (time.perf_counter() - start_time) * 1000  # seconds -> milliseconds

    print(f"Results ({len(papers)} seed papers, {latency:.2f}ms):")
    print("-" * 70)

    total_authors = 0
    # Distinct ids of papers reached via authors; assumes node ids are
    # hashable (the saved outputs show string ids).
    related_ids = set()
    for i, paper in enumerate(papers, 1):
        print(f"{i}. Seed Paper: {paper['name']}")
        print(f"   ID: {paper['id']}")
        print(f"   Authors ({len(paper['authors'])}):")

        for author in paper['authors']:
            others = author['other_papers']
            print(f"      • {author['node_name']}")
            if others:
                print(f"        Co-authored Papers ({len(others)}):")
                for other_paper in others[:max_shown]:
                    print(f"          → {other_paper['node_name']}")
                hidden = len(others) - max_shown
                if hidden > 0:
                    print(f"          ... and {hidden} more")
            else:
                print("        → No other papers found")
            related_ids.update(p['node_id'] for p in others)

        total_authors += len(paper['authors'])
        print()

    print(f"Summary: {len(papers)} seed papers, {total_authors} authors, {len(related_ids)} related papers discovered")
    return papers

# Example query: run the inverse-traversal demo with two seed papers and keep
# the enriched seeds in `results_inverse`.
query4 = "neural networks"
results_inverse = demo_inverse_traversal(query4, k=2)


Inverse Traversal: Paper → Author → Co-authored Papers

Query: 'neural networks'
Finding papers, then discovering co-authored papers...

Results (2 seed papers, 295.52ms):
----------------------------------------------------------------------
1. Seed Paper: How to Train Neural Networks
   ID: 5E72175B
   Authors (2):
      • hansgeorg zimmermann
        → No other papers found
      • ralph neuneier
        Co-authored Papers (2):
          → Risk sensitive reinforcement learning
          → Optimal Asset Allocation using Adaptive Dynamic Programming

2. Seed Paper: Intrusion detection with neural networks
   ID: 7FEC5C75
   Authors (3):
      • j p ryan
        → No other papers found
      • mengjang lin
        → No other papers found
      • risto miikkulainen
        Co-authored Papers (2):
          → Forming neural networks through efficient and adaptive coevolution
          → SARDNET: A Self-Organizing Feature Map for Sequences

Summary: 2 seed papers, 5 authors, 4 related pap

In [8]:
# Clean up: close the database connection opened at the top of the notebook.
# The demo functions read the global `db`, so re-running them after this cell
# requires re-running the initialization cell first.
db.close()
print("Database connection closed.")


Database connection closed.
