# GPU-Accelerated Graph Analysis with cuGraph

This notebook compares NetworkX (CPU) vs cuGraph (GPU) for transaction graph analysis.

**Instructions:**
1. Runtime > Change runtime type > T4 GPU
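2. Have your `hound_edges.csv` file ready (it must contain `from_wallet` and `to_wallet` columns); the upload cell will prompt you for it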
3. Run all cells

In [None]:
# Install RAPIDS cuGraph (takes ~2-3 minutes)
!pip install cudf-cu12 cugraph-cu12 --extra-index-url=https://pypi.nvidia.com -q
print("Installation complete!")

In [None]:
# Verify GPU: print the nvidia-smi report for this runtime.
# If the command fails or lists no device, change the runtime type to a GPU
# (see the instructions at the top of the notebook) before continuing.
!nvidia-smi

In [None]:
import time
import pandas as pd
import networkx as nx

# Optional GPU stack: the notebook still runs the CPU (NetworkX) cells when
# cuDF/cuGraph cannot be imported. GPU_AVAILABLE gates every cuGraph cell below.
try:
    import cudf
    import cugraph
except ImportError as import_err:
    GPU_AVAILABLE = False
    print(f"cuGraph not available: {import_err}")
else:
    GPU_AVAILABLE = True
    print("cuGraph loaded successfully!")

In [None]:
# Run this cell and choose your CSV file when prompted.
# The data-loading cell below reads the file by the relative name
# 'hound_edges.csv', so the uploaded file must carry exactly that name.
from google.colab import files
uploaded = files.upload()  # Click to upload hound_edges.csv

In [None]:
# Load the edge list: one row per transaction, with `from_wallet` and
# `to_wallet` columns.
df = pd.read_csv('hound_edges.csv')

# Count distinct wallets over BOTH columns together. Adding the two per-column
# nunique() values would count every wallet that both sends and receives twice.
n_unique_wallets = pd.concat([df['from_wallet'], df['to_wallet']]).nunique()

print(f"Loaded {len(df)} edges")
print(f"Unique wallets: {n_unique_wallets}")
df.head()

In [None]:
# cuGraph needs integer vertex ids, so assign every distinct wallet (sender or
# receiver) a dense id in order of first appearance.
all_wallets = pd.concat([df['from_wallet'], df['to_wallet']]).unique()

# Forward (wallet -> id) and reverse (id -> wallet) lookup tables.
wallet_to_id = dict(zip(all_wallets, range(len(all_wallets))))
id_to_wallet = dict(enumerate(all_wallets))

# Attach the integer endpoints to the edge table as `src` / `dst`.
for id_col, wallet_col in (('src', 'from_wallet'), ('dst', 'to_wallet')):
    df[id_col] = df[wallet_col].map(wallet_to_id)

print(f"Mapped {len(wallet_to_id)} unique wallets to integer IDs")

## NetworkX (CPU) Analysis

In [None]:
# Build the directed NetworkX graph from the integer edge list and time it.
#
# Edges are added in bulk from the two id columns. Iterating the frame with
# DataFrame.iterrows() would build a pandas Series per row, so the measured
# "build time" would mostly be pandas overhead rather than graph construction,
# which would overstate the GPU speedup reported later.
start = time.time()
G_nx = nx.DiGraph()
G_nx.add_edges_from(zip(df['src'].tolist(), df['dst'].tolist()))
nx_build_time = time.time() - start

print(f"NetworkX graph: {G_nx.number_of_nodes()} nodes, {G_nx.number_of_edges()} edges")
print(f"Build time: {nx_build_time:.3f}s")

In [None]:
# Time one PageRank run over the CPU graph (NetworkX defaults).
start = time.time()
nx_pagerank = nx.pagerank(G_nx)
nx_pr_time = time.time() - start

print(f"NetworkX PageRank time: {nx_pr_time:.3f}s")

# Rank node ids by descending score and keep the five best as (id, score)
# pairs. sorted() is stable, so tied scores keep their dict order.
ranked_nodes = sorted(nx_pagerank, key=nx_pagerank.get, reverse=True)
top_pr = [(node_id, nx_pagerank[node_id]) for node_id in ranked_nodes[:5]]

print("\nTop 5 wallets by PageRank (CPU):")
for node_id, score in top_pr:
    # Translate the integer id back to the wallet string for display.
    wallet = id_to_wallet[node_id]
    print(f"  {wallet[:24]}... score={score:.6f}")

In [None]:
# Time weakly connected components on the CPU graph. The generator is
# materialised inside the timed region so the full traversal is measured.
start = time.time()
nx_components = list(nx.weakly_connected_components(G_nx))
nx_cc_time = time.time() - start

# Size of the biggest component (each component is a set of node ids).
largest_component_size = max(len(component) for component in nx_components)

print(f"NetworkX Connected Components time: {nx_cc_time:.3f}s")
print(f"Found {len(nx_components)} components")
print(f"Largest component: {largest_component_size} nodes")

## cuGraph (GPU) Analysis

In [None]:
if not GPU_AVAILABLE:
    print("GPU not available - skipping cuGraph")
else:
    # Time the whole GPU build: copying the edge columns into a cuDF frame
    # plus constructing the directed cuGraph graph from it.
    start = time.time()

    # Edge list as a cuDF DataFrame, built from the integer id columns.
    edge_columns = {
        'src': df['src'].values,
        'dst': df['dst'].values,
    }
    gdf = cudf.DataFrame(edge_columns)

    # Directed graph over the same edges the NetworkX graph was built from.
    G_cu = cugraph.Graph(directed=True)
    G_cu.from_cudf_edgelist(gdf, source='src', destination='dst')
    cu_build_time = time.time() - start

    print(f"cuGraph graph: {G_cu.number_of_vertices()} nodes, {G_cu.number_of_edges()} edges")
    print(f"Build time: {cu_build_time:.3f}s")
    build_speedup = nx_build_time / cu_build_time
    print(f"\nSpeedup vs NetworkX: {build_speedup:.1f}x")

In [None]:
if GPU_AVAILABLE:
    # Time one PageRank run on the GPU graph (cuGraph defaults).
    start = time.time()
    cu_pagerank = cugraph.pagerank(G_cu)
    cu_pr_time = time.time() - start

    pr_speedup = nx_pr_time / cu_pr_time
    print(f"cuGraph PageRank time: {cu_pr_time:.3f}s")
    print(f"Speedup vs NetworkX: {pr_speedup:.1f}x")

    # Keep the five highest-scoring vertices, then copy just those rows to
    # host memory for printing.
    top_cu = cu_pagerank.sort_values('pagerank', ascending=False).head(5)
    print("\nTop 5 wallets by PageRank (GPU):")
    for record in top_cu.to_pandas().itertuples(index=False):
        # Translate the integer vertex id back to the wallet string.
        wallet = id_to_wallet[int(record.vertex)]
        print(f"  {wallet[:24]}... score={record.pagerank:.6f}")

In [None]:
if GPU_AVAILABLE:
    # Connected components on the GPU. The edge list is loaded into an
    # undirected graph first, which is the counterpart of NetworkX's weakly
    # connected components on the directed graph. Building this graph is
    # deliberately left outside the timed region.
    G_cu_undirected = cugraph.Graph(directed=False)
    G_cu_undirected.from_cudf_edgelist(gdf, source='src', destination='dst')

    start = time.time()
    cu_components = cugraph.connected_components(G_cu_undirected)
    cu_cc_time = time.time() - start

    # Each vertex carries a component label; distinct labels = components.
    n_components = cu_components['labels'].nunique()

    cc_speedup = nx_cc_time / cu_cc_time
    report_lines = [
        f"cuGraph Connected Components time: {cu_cc_time:.3f}s",
        f"Speedup vs NetworkX: {cc_speedup:.1f}x",
        f"Found {n_components} components",
    ]
    print("\n".join(report_lines))

## Performance Summary

In [None]:
def _summary_row(label, cpu_time, gpu_time=None):
    """Format one row of the CPU-vs-GPU comparison table.

    Args:
        label: Algorithm name shown in the first column.
        cpu_time: NetworkX wall-clock time in seconds.
        gpu_time: cuGraph wall-clock time in seconds, or None when no GPU
            timing exists.

    Returns:
        The formatted row. The GPU and speedup columns read 'N/A' when
        `gpu_time` is None; the speedup also reads 'N/A' when `gpu_time` is
        not positive, so the ratio is never a division by zero.
    """
    if gpu_time is None:
        gpu_col = speedup_col = 'N/A'
    else:
        gpu_col = f"{gpu_time:.3f}"
        speedup_col = f"{cpu_time / gpu_time:.1f}x" if gpu_time > 0 else 'N/A'
    return f"{label:<25} {cpu_time:<15.3f} {gpu_col:<15} {speedup_col}"


# The GPU timings only exist when the cuGraph cells actually ran. Applying a
# float format spec such as ':.1f' to the string 'N/A' raises ValueError, so
# the missing case is handled before any numeric formatting takes place.
if GPU_AVAILABLE:
    gpu_build_time, gpu_pr_time, gpu_cc_time = cu_build_time, cu_pr_time, cu_cc_time
else:
    gpu_build_time = gpu_pr_time = gpu_cc_time = None

print("="*60)
print("PERFORMANCE COMPARISON")
print("="*60)
print(f"Dataset: {len(df)} edges, {len(wallet_to_id)} nodes")
print()
print(f"{'Algorithm':<25} {'NetworkX (CPU)':<15} {'cuGraph (GPU)':<15} {'Speedup':<10}")
print("-"*60)
print(_summary_row('Graph Build', nx_build_time, gpu_build_time))
print(_summary_row('PageRank', nx_pr_time, gpu_pr_time))
print(_summary_row('Connected Components', nx_cc_time, gpu_cc_time))
print()
print("Note: GPU speedup increases dramatically with larger graphs (100K+ edges)")

## Wash Trading Detection (CPU reciprocal-edge scan + GPU betweenness centrality)

In [None]:
# Find reciprocal edges (A->B and B->A) - strong wash trading signal.

# Number of transactions per directed (src, dst) pair, computed once so the
# report below does not have to rescan the whole frame for every pair.
edge_counts = df.groupby(['src', 'dst']).size().to_dict()


def _pair_volume(pair):
    """Total number of transactions between the two wallets, both directions."""
    s, d = pair
    return edge_counts[(s, d)] + edge_counts[(d, s)]


# Keep each reciprocal pair once (s < d, which also drops self-loops) and rank
# by combined transaction count, so the "top" pairs printed below really are
# the most active ones. Ties are broken by the ids to keep the order stable.
reciprocals = sorted(
    ((s, d) for (s, d) in edge_counts if s < d and (d, s) in edge_counts),
    key=lambda pair: (-_pair_volume(pair), pair),
)

print(f"Found {len(reciprocals)} reciprocal wallet pairs")
print("\nTop reciprocal pairs (potential wash trading):")
for src, dst in reciprocals[:10]:
    src_wallet = id_to_wallet[src]
    dst_wallet = id_to_wallet[dst]
    fwd_count = edge_counts[(src, dst)]
    rev_count = edge_counts[(dst, src)]
    print(f"  {src_wallet[:16]}... <-> {dst_wallet[:16]}... ({fwd_count}/{rev_count} txs)")

In [None]:
if GPU_AVAILABLE:
    # Betweenness centrality on the GPU graph: wallets with high scores act
    # as bridges between other wallets.
    print("Computing Betweenness Centrality on GPU...")
    start = time.time()
    bc = cugraph.betweenness_centrality(G_cu)
    bc_time = time.time() - start

    print(f"Time: {bc_time:.3f}s")
    print("\nTop bridge wallets (high betweenness):")

    # Keep the ten highest scores, then copy just those rows to host memory.
    top_bc = bc.sort_values('betweenness_centrality', ascending=False).head(10)
    for record in top_bc.to_pandas().itertuples(index=False):
        # Translate the integer vertex id back to the wallet string.
        wallet = id_to_wallet[int(record.vertex)]
        print(f"  {wallet[:24]}... score={record.betweenness_centrality:.6f}")