# Document Network Analysis

**Duration:** ~30 min | **Platform:** Kaggle dual Tesla T4

This notebook builds a **document similarity network** using embeddings,
performs network analysis (centrality, community detection), and uses
the LLM to summarize discovered clusters.

### What you'll learn
1. Generate document embeddings
2. Build a similarity network
3. Run network analysis (degree centrality, connected-component communities)
4. Visualize with Graphistry
5. LLM-powered cluster summarization

In [None]:
!pip install -q git+https://github.com/llamatelemetry/llamatelemetry.git@v1.2.0

import llamatelemetry
from llamatelemetry.llama import ServerManager, LlamaCppClient
from llamatelemetry.kaggle import rapids_gpu
from huggingface_hub import hf_hub_download

# Start telemetry for this notebook under the "doc-network" service name.
llamatelemetry.init(service_name="doc-network")

# Fetch the GGUF model file from the Hugging Face Hub into the given cache dir.
model_path = hf_hub_download(
    cache_dir="/root/.cache/huggingface",
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
    filename="google_gemma-3-1b-it-Q4_K_M.gguf",
)

# Launch options gathered in one place so they are easy to tweak.
# NOTE(review): tensor_split="1.0,0.0" presumably keeps the whole model on the
# first GPU, leaving the second one free for later cells; confirm against the
# ServerManager documentation.
server_options = {
    "model_path": model_path,
    "gpu_layers": 99,
    "tensor_split": "1.0,0.0",
    "ctx_size": 2048,
}

mgr = ServerManager()
mgr.start_server(**server_options)
mgr.wait_until_ready(timeout=60)

# NOTE(review): the URL is hard-coded; presumably port 8090 is the port the
# managed server listens on by default. Verify before changing either side.
client = LlamaCppClient(base_url="http://127.0.0.1:8090")
print("Ready")

## Generate Document Embeddings

Use the server's embedding endpoint to create vector representations of documents.

In [None]:
import numpy as np
from llamatelemetry.embeddings import cosine_similarity

# Toy corpus for the similarity network. Every entry is a dict with three
# string fields: "id" (short node identifier), "title" (display label) and
# "text" (the sentence that is sent to the embedding call).
documents = [
    {"id": "D1", "title": "Neural Networks", "text": "Neural networks are computing systems inspired by biological neural networks in the brain."},
    {"id": "D2", "title": "Deep Learning", "text": "Deep learning uses multiple layers of neural networks to learn representations of data."},
    {"id": "D3", "title": "Transformers", "text": "Transformer architecture uses self-attention mechanisms for sequence-to-sequence tasks."},
    {"id": "D4", "title": "CUDA Programming", "text": "CUDA enables parallel computing on NVIDIA GPUs for scientific and ML workloads."},
    {"id": "D5", "title": "GPU Memory", "text": "GPU memory management is critical for training large models that exceed VRAM capacity."},
    {"id": "D6", "title": "Quantization", "text": "Model quantization reduces precision of weights to fit larger models in limited memory."},
    {"id": "D7", "title": "Fine-tuning", "text": "Fine-tuning adapts pre-trained models to specific tasks using domain-specific data."},
    {"id": "D8", "title": "LoRA", "text": "Low-Rank Adaptation trains small adapter matrices instead of updating all model weights."},
    {"id": "D9", "title": "Inference", "text": "LLM inference optimizations include batching, KV caching, and speculative decoding."},
    {"id": "D10", "title": "Observability", "text": "OpenTelemetry provides distributed tracing and metrics for monitoring ML pipelines."},
]

# Embed each document's text with the server client (one call per document)
# and stack the resulting vectors into a single array, one row per document.
embeddings = np.array([client.embed(entry["text"]) for entry in documents])
print(f"Generated {len(embeddings)} embeddings, dimension={embeddings.shape[1]}")

## Build Document Network

Create an edge between every pair of documents whose cosine similarity is at or above a threshold.

In [None]:
import pandas as pd

SIMILARITY_THRESHOLD = 0.5
EDGE_COLUMNS = ["src", "dst", "similarity"]

# Fill the symmetric pairwise similarity matrix and collect edges in a single
# pass over the unordered document pairs. The diagonal is left at 0 because
# self-similarity is never consulted when building edges.
n = len(documents)
sim_matrix = np.zeros((n, n))
edges = []
for i in range(n):
    for j in range(i + 1, n):
        sim = cosine_similarity(embeddings[i], embeddings[j])
        sim_matrix[i, j] = sim
        sim_matrix[j, i] = sim
        # One undirected edge per pair that reaches the threshold.
        if sim_matrix[i, j] >= SIMILARITY_THRESHOLD:
            edges.append({
                "src": documents[i]["id"],
                "dst": documents[j]["id"],
                "similarity": round(sim_matrix[i, j], 3),
            })

# Explicit columns keep the src/dst/similarity schema even when no pair reaches
# the threshold; without them an empty edge list yields a frame with no columns
# and any column-based access to the edge table fails.
edge_df = pd.DataFrame(edges, columns=EDGE_COLUMNS)
node_df = pd.DataFrame([{"id": d["id"], "title": d["title"]} for d in documents])

print(f"Document network: {len(node_df)} nodes, {len(edge_df)} edges (threshold={SIMILARITY_THRESHOLD})")
if not edge_df.empty:
    print(edge_df.to_string(index=False))

## Network Analysis

Compute degree centrality (scaled by the highest degree in the network) and detect communities as connected components.

In [None]:
@llamatelemetry.task(name="centrality-analysis")
def compute_centrality(node_df, edge_df):
    """Return each document's degree, scaled by the largest degree found.

    Every id in ``node_df`` starts at degree 0; each row of ``edge_df`` adds
    one to both its "src" and its "dst" endpoint. The counts are then divided
    by the maximum count, or by 1 when there are no nodes or no edges, so the
    result maps node id to a value between 0 and 1.
    """
    counts = dict.fromkeys((rec["id"] for _, rec in node_df.iterrows()), 0)
    for _, edge in edge_df.iterrows():
        for endpoint in (edge["src"], edge["dst"]):
            counts[endpoint] = counts.get(endpoint, 0) + 1

    # Guard the division: an edgeless (or empty) network has a peak of 0.
    peak = max(counts.values(), default=0)
    if peak <= 0:
        peak = 1
    return {node_id: count / peak for node_id, count in counts.items()}

@llamatelemetry.task(name="community-detection")
def detect_communities(node_df, edge_df):
    """Assign every document to a community via connected components.

    Two documents share a community when a chain of edges links them; a
    document without edges forms a community of its own.

    Args:
        node_df: DataFrame with an "id" column, one row per document.
        edge_df: DataFrame with "src" and "dst" columns, one row per
            undirected edge.

    Returns:
        Dict mapping node id to an integer community id. Ids are numbered
        from 0 in the order in which components are first reached while
        walking the nodes (``node_df`` order first).
    """
    # Function-scope import so the cell does not depend on another cell's imports.
    from collections import deque

    adjacency = {row["id"]: set() for _, row in node_df.iterrows()}
    for _, row in edge_df.iterrows():
        src, dst = row["src"], row["dst"]
        # setdefault tolerates an endpoint that is missing from node_df instead
        # of raising KeyError, matching how compute_centrality treats such edges.
        adjacency.setdefault(src, set()).add(dst)
        adjacency.setdefault(dst, set()).add(src)

    # Breadth-first search from every node that has not been labelled yet.
    communities = {}
    community_id = 0
    for start in adjacency:
        if start in communities:
            continue
        communities[start] = community_id
        queue = deque([start])  # deque gives O(1) pops from the left
        while queue:
            current = queue.popleft()
            for neighbour in adjacency[current]:
                if neighbour not in communities:
                    communities[neighbour] = community_id
                    queue.append(neighbour)
        community_id += 1
    return communities

centrality = compute_centrality(node_df, edge_df)
communities = detect_communities(node_df, edge_df)

# The first column is 8 wide in both header and rows: the "Document" heading is
# itself 8 characters, so a narrower column pushes the header out of line with
# the data beneath it.
header = f"{'Document':<8} {'Title':<20} {'Centrality':<12} {'Community'}"
print(header)
print("-" * len(header))
for doc in documents:
    did = doc["id"]
    # Documents missing from either result fall back to 0 centrality / community -1.
    print(f"{did:<8} {doc['title']:<20} {centrality.get(did, 0):<12.2f} {communities.get(did, -1)}")

## Clustering and Visualization

In [None]:
# NOTE(review): rapids_gpu(1) presumably pins this block to GPU index 1; confirm
# against the llamatelemetry.kaggle documentation.
with rapids_gpu(1):
    try:
        import graphistry

        # Attach the per-node metrics that drive point size and colour.
        node_df["centrality"] = node_df["id"].map(centrality)
        node_df["community"] = node_df["id"].map(communities)
        node_df["size"] = (node_df["centrality"] * 25 + 5).astype(int)

        community_palette = ["blue", "green", "red", "orange", "purple"]
        bindings = {
            "point_title": "title",
            "point_size": "size",
            "edge_weight": "similarity",
        }
        graph = graphistry.edges(edge_df, "src", "dst").nodes(node_df, "id")
        g = graph.bind(**bindings).encode_point_color("community", palette=community_palette)
        g.plot()
    except Exception as exc:
        # Best effort: any failure above (import, data prep, plotting) is
        # reported and replaced by a plain-text listing of the communities.
        print(f"Graphistry: {exc}")
        community_groups = {}
        for doc in documents:
            community_groups.setdefault(communities.get(doc["id"], -1), []).append(doc["title"])
        for cid in sorted(community_groups):
            print(f"  Community {cid}: {', '.join(community_groups[cid])}")

## LLM-Powered Cluster Summarization

Use the LLM to generate a summary for each discovered community.

In [None]:
# Bucket the documents by community id (-1 for a document with no assignment).
community_docs = {}
for doc in documents:
    community_docs.setdefault(communities.get(doc["id"], -1), []).append(doc)

# Ask the model for a one-sentence theme per community, in ascending id order.
for cid in sorted(community_docs):
    docs = community_docs[cid]
    titles = [d["title"] for d in docs]
    bullet_lines = [f"- {d['title']}: {d['text']}" for d in docs]
    doc_texts = "\n".join(bullet_lines)
    prompt = f"Summarize the common theme of these documents in one sentence:\n{doc_texts}"

    # Each request runs inside its own span, tagged with the community id.
    with llamatelemetry.span("summarize-cluster", community_id=cid):
        resp = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=64,
            temperature=0.3,
        )
        summary = resp.choices[0].message.content

    print(f"Community {cid} ({', '.join(titles)}):")
    print(f"  {summary}\n")

## Summary

This notebook demonstrated:
- **Embedding-based similarity** networks from document collections
- **Network analysis**: degree centrality and community detection
- **GPU-accelerated visualization** with Graphistry
- **LLM summarization** of discovered clusters

In [None]:
# Stop the model server, then shut telemetry down. The finally clause makes
# sure llamatelemetry.shutdown() still runs when stopping the server raises.
try:
    mgr.stop_server()
finally:
    llamatelemetry.shutdown()
print("Done.")