# Wikipedia Embeddings Visualization

This notebook lets you visualize the 3D embeddings of Wikipedia pages stored in your SQLite database (the code opens `chunk_log.db` from the current working directory). You can explore clusters of similar pages and see how they are organized in 3D space.

## Features:
- Interactive 3D visualization of embeddings
- Cluster-by-cluster exploration
- Page title annotations instead of numeric IDs
- Color-coded clusters
- Zoom, rotate, and pan capabilities

In [None]:
import sqlite3
import json
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler
import ipywidgets as widgets
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

# Set up database connection
def get_db_connection(db_path='chunk_log.db'):
    """Open a new SQLite connection to the chunk log database.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``'chunk_log.db'``, which is resolved relative to the current
            working directory.

    Returns:
        A new ``sqlite3.Connection``. The caller is responsible for closing it.
    """
    return sqlite3.connect(db_path)

def load_embeddings_for_cluster(cluster_id, namespace='enwiki_namespace_0', limit=None):
    """Load 3D embeddings and page information for a specific cluster.

    Args:
        cluster_id: Value matched against ``page_vector.cluster_id``.
        namespace: Value matched against ``chunk_log.namespace``.
        limit: Maximum number of rows to return. A falsy value (``None`` or
            ``0``) means no limit.

    Returns:
        A DataFrame ordered by ``page_id`` holding the queried columns
        (``page_id``, ``three_d_vector``, ``title``, ``url``, ``cluster_id``)
        plus ``vector`` (the NumPy array parsed from ``three_d_vector``) and
        its first three components as ``x``, ``y`` and ``z``. Rows whose
        ``three_d_vector`` cannot be parsed into a 1-D array with at least
        three components get the placeholder vector ``[0, 0, 0]``.
    """
    query = """
        SELECT 
            pv.page_id,
            pv.three_d_vector,
            pl.title,
            pl.url,
            pv.cluster_id
        FROM page_vector pv
        INNER JOIN page_log pl ON pv.page_id = pl.page_id
        INNER JOIN chunk_log cl ON pl.chunk_name = cl.chunk_name
        WHERE pv.cluster_id = ?
        AND cl.namespace = ?
        AND pv.three_d_vector IS NOT NULL
        ORDER BY pv.page_id
    """
    params = [cluster_id, namespace]

    if limit:
        # Bind the limit as a query parameter instead of formatting it into
        # the SQL text.
        query += " LIMIT ?"
        params.append(int(limit))

    conn = get_db_connection()
    try:
        df = pd.read_sql_query(query, conn, params=tuple(params))
    finally:
        # Release the connection even when the query raises.
        conn.close()

    def parse_vector(vector_str):
        """Decode one JSON-encoded vector, falling back to ``[0, 0, 0]``."""
        try:
            vec = np.array(json.loads(vector_str))
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
            return np.array([0, 0, 0])
        if vec.ndim != 1 or vec.size < 3:
            # Not indexable as v[0]..v[2]; use the same placeholder as a parse failure.
            return np.array([0, 0, 0])
        return vec

    df['vector'] = df['three_d_vector'].apply(parse_vector)
    df['x'] = df['vector'].apply(lambda v: v[0])
    df['y'] = df['vector'].apply(lambda v: v[1])
    df['z'] = df['vector'].apply(lambda v: v[2])

    return df

def get_cluster_info(namespace='enwiki_namespace_0'):
    """Get information about all clusters.

    Args:
        namespace: Value matched against ``chunk_log.namespace``.

    Returns:
        A DataFrame with one row per non-NULL ``cluster_id``, ordered by
        ``cluster_id``, with columns ``cluster_id``, ``page_count`` (rows in
        the cluster) and ``projected_count`` (rows whose ``three_d_vector``
        is not NULL).
    """
    query = """
        SELECT 
            cluster_id,
            COUNT(*) as page_count,
            SUM(CASE WHEN three_d_vector IS NOT NULL THEN 1 ELSE 0 END) as projected_count
        FROM page_vector
        INNER JOIN page_log ON page_vector.page_id = page_log.page_id
        INNER JOIN chunk_log ON page_log.chunk_name = chunk_log.chunk_name
        WHERE chunk_log.namespace = ?
        AND cluster_id IS NOT NULL
        GROUP BY cluster_id
        ORDER BY cluster_id
    """

    conn = get_db_connection()
    try:
        return pd.read_sql_query(query, conn, params=(namespace,))
    finally:
        # Release the connection even when the query raises.
        conn.close()

In [None]:
# Fetch the per-cluster summary table and preview its first ten rows
cluster_info = get_cluster_info()
cluster_total = len(cluster_info)
print(f"Found {cluster_total} clusters")
display(cluster_info.iloc[:10])

In [None]:
# Controls for the 3D view: a cluster picker and a cap on the pages plotted.
# Each dropdown label embeds the cluster id, which the change handler parses back out.
cluster_labels = [
    f"Cluster {row['cluster_id']} ({row['page_count']} pages, {row['projected_count']} projected)"
    for _, row in cluster_info.iterrows()
]

cluster_dropdown = widgets.Dropdown(
    options=cluster_labels,
    description='Select Cluster:',
    style=dict(description_width='initial'),
)

limit_slider = widgets.IntSlider(
    value=100,
    min=10,
    max=1000,
    step=10,
    description='Max Pages:',
    style=dict(description_width='initial'),
)

def on_cluster_change(change):
    """Redraw the 3D scatter plot for the currently selected cluster.

    The cluster and page limit are read from ``cluster_dropdown`` and
    ``limit_slider`` rather than from ``change``, so the same handler can
    serve both widgets.

    Args:
        change: Change notification dict. Only value changes
            (``type == 'change'`` and ``name == 'value'``) are handled;
            anything else is ignored.
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return

    selected_text = cluster_dropdown.value
    if not selected_text:
        # Nothing is selected (e.g. the option list is empty), so there is
        # no label to parse a cluster id from.
        print("No cluster selected")
        return

    # Labels look like "Cluster <id> (...)": take the token after the prefix.
    cluster_id = int(selected_text.split('Cluster ')[1].split(' ')[0])
    limit = limit_slider.value

    # Load data for selected cluster
    df = load_embeddings_for_cluster(cluster_id, limit=limit)

    if df.empty:
        print(f"No projected vectors found for cluster {cluster_id}")
        return

    # Create 3D scatter plot; hovering a marker shows the page title and coordinates
    fig = go.Figure(data=[go.Scatter3d(
        x=df['x'],
        y=df['y'],
        z=df['z'],
        mode='markers',
        marker=dict(
            size=5,
            color=df['cluster_id'],
            colorscale='Viridis',
            showscale=False,
            opacity=0.8
        ),
        text=df['title'],
        hovertemplate='<b>%{text}</b><br>' +
                     'X: %{x:.3f}<br>' +
                     'Y: %{y:.3f}<br>' +
                     'Z: %{z:.3f}<br>' +
                     '<extra></extra>',
        showlegend=False
    )])

    fig.update_layout(
        title=f'3D Visualization of Cluster {cluster_id} ({len(df)} pages)',
        scene=dict(
            xaxis_title='X',
            yaxis_title='Y',
            zaxis_title='Z',
            xaxis=dict(backgroundcolor="rgb(200, 200, 230)"),
            yaxis=dict(backgroundcolor="rgb(200, 200, 230)"),
            zaxis=dict(backgroundcolor="rgb(200, 200, 230)"),
            camera=dict(
                eye=dict(x=1.2, y=1.2, z=0.6)
            )
        ),
        width=800,
        height=600,
        margin=dict(l=0, r=0, b=0, t=40)
    )

    fig.show()

# Redraw whenever either control's value changes
for control in (cluster_dropdown, limit_slider):
    control.observe(on_cluster_change, names='value')

# Show the controls under a short prompt
print("Select a cluster to visualize:")
control_panel = widgets.VBox([cluster_dropdown, limit_slider])
display(control_panel)

# Draw the initially selected cluster by passing a hand-built value-change notification
initial_change = {'type': 'change', 'name': 'value', 'new': cluster_dropdown.value}
on_cluster_change(initial_change)

In [None]:
# Alternative: 2D visualization using PCA projection
from sklearn.decomposition import PCA

def create_2d_visualization(cluster_id, limit=200):
    """Create a 2D visualization using PCA.

    Loads the cluster's vectors, reduces them to two principal components
    and shows a labelled scatter plot. Prints a message and returns without
    plotting when the cluster has no projected vectors.

    Args:
        cluster_id: Cluster whose pages are plotted.
        limit: Maximum number of pages to load.
    """
    df = load_embeddings_for_cluster(cluster_id, limit=limit)

    if df.empty:
        print(f"No projected vectors found for cluster {cluster_id}")
        return

    # Extract 3D vectors and apply PCA to reduce to 2D
    vectors = np.array(df['vector'].tolist())
    if len(df) >= 2:
        vectors_2d = PCA(n_components=2).fit_transform(vectors)
    else:
        # PCA requires n_components <= n_samples, so it cannot fit a single
        # point; place the lone page at the origin instead of raising.
        vectors_2d = np.zeros((len(df), 2))

    df['pca_x'] = vectors_2d[:, 0]
    df['pca_y'] = vectors_2d[:, 1]

    # Create 2D scatter plot with each point labelled by its page title
    fig = px.scatter(
        df,
        x='pca_x',
        y='pca_y',
        text='title',
        title=f'2D PCA Projection of Cluster {cluster_id}',
        hover_data=['title', 'url'],
        width=800,
        height=600
    )

    fig.update_traces(
        textposition='top center',
        textfont_size=8,
        marker=dict(size=8, opacity=0.7)
    )

    fig.show()

# Controls for the 2D view: a cluster picker and a cap on the pages plotted.
# Each dropdown label embeds the cluster id, which the change handler parses back out.
cluster_2d_labels = [
    f"Cluster {row['cluster_id']} ({row['page_count']} pages, {row['projected_count']} projected)"
    for _, row in cluster_info.iterrows()
]

cluster_2d_dropdown = widgets.Dropdown(
    options=cluster_2d_labels,
    description='Select Cluster (2D):',
    style=dict(description_width='initial'),
)

limit_2d_slider = widgets.IntSlider(
    value=100,
    min=10,
    max=500,
    step=10,
    description='Max Pages:',
    style=dict(description_width='initial'),
)

def on_2d_cluster_change(change):
    """Redraw the 2D PCA plot for the currently selected cluster.

    The cluster and page limit are read from ``cluster_2d_dropdown`` and
    ``limit_2d_slider`` rather than from ``change``, so the same handler can
    serve both widgets.

    Args:
        change: Change notification dict. Only value changes
            (``type == 'change'`` and ``name == 'value'``) are handled;
            anything else is ignored.
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return

    selected_text = cluster_2d_dropdown.value
    if not selected_text:
        # Nothing is selected (e.g. the option list is empty), so there is
        # no label to parse a cluster id from.
        print("No cluster selected")
        return

    # Labels look like "Cluster <id> (...)": take the token after the prefix.
    cluster_id = int(selected_text.split('Cluster ')[1].split(' ')[0])
    limit = limit_2d_slider.value
    create_2d_visualization(cluster_id, limit)

# Redraw whenever either 2D control's value changes
for control_2d in (cluster_2d_dropdown, limit_2d_slider):
    control_2d.observe(on_2d_cluster_change, names='value')

# Show the controls under a short prompt
print("\nSelect a cluster for 2D PCA visualization:")
control_panel_2d = widgets.VBox([cluster_2d_dropdown, limit_2d_slider])
display(control_panel_2d)

# Draw the initially selected cluster by passing a hand-built value-change notification
initial_2d_change = {'type': 'change', 'name': 'value', 'new': cluster_2d_dropdown.value}
on_2d_cluster_change(initial_2d_change)

In [None]:
# Cluster statistics overview
def show_cluster_statistics(namespace='enwiki_namespace_0'):
    """Show statistics about all clusters.

    Draws a bar chart of the percentage of projected pages per cluster and
    prints an overall summary.

    Args:
        namespace: Value matched against ``chunk_log.namespace``.

    Returns:
        A DataFrame with one row per non-NULL ``cluster_id`` and columns
        ``cluster_id``, ``total_pages``, ``projected_pages`` (rows whose
        ``three_d_vector`` is not NULL) and ``projection_percentage``.
    """
    query = """
        SELECT 
            cluster_id,
            COUNT(*) as total_pages,
            SUM(CASE WHEN three_d_vector IS NOT NULL THEN 1 ELSE 0 END) as projected_pages,
            AVG(CASE WHEN three_d_vector IS NOT NULL THEN 1 ELSE 0 END) * 100 as projection_percentage
        FROM page_vector
        INNER JOIN page_log ON page_vector.page_id = page_log.page_id
        INNER JOIN chunk_log ON page_log.chunk_name = chunk_log.chunk_name
        WHERE chunk_log.namespace = ?
        AND cluster_id IS NOT NULL
        GROUP BY cluster_id
        ORDER BY cluster_id
    """

    conn = get_db_connection()
    try:
        stats_df = pd.read_sql_query(query, conn, params=(namespace,))
    finally:
        # Release the connection even when the query raises.
        conn.close()

    # Create a bar chart showing projection progress
    fig = px.bar(
        stats_df,
        x='cluster_id',
        y='projection_percentage',
        title='Projection Progress by Cluster',
        labels={'cluster_id': 'Cluster ID', 'projection_percentage': 'Projected (%)'},
        width=800,
        height=400
    )

    fig.add_hline(y=100, line_dash="dash", line_color="red", annotation_text="100% Complete")
    fig.update_layout(yaxis_range=[0, 100])
    fig.show()

    # Show summary statistics
    total_pages = stats_df['total_pages'].sum()
    total_projected = stats_df['projected_pages'].sum()
    # Avoid dividing by zero when the query returned no pages.
    overall_percentage = (total_projected / total_pages) * 100 if total_pages else 0.0

    print("\n=== Projection Summary ===")
    print(f"Total pages: {total_pages:,}")
    print(f"Projected pages: {total_projected:,}")
    print(f"Overall completion: {overall_percentage:.1f}%")
    print(f"Clusters with 100% projection: {(stats_df['projection_percentage'] == 100).sum()}")
    print(f"Clusters with 0% projection: {(stats_df['projection_percentage'] == 0).sum()}")

    return stats_df

# Render the projection-progress chart, print the summary, and keep the
# returned per-cluster DataFrame in `cluster_stats`
cluster_stats = show_cluster_statistics()

## How to Use This Notebook

1. **3D Visualization**: Use the first dropdown to select a cluster and adjust the page limit with the slider. The 3D plot will show you how pages are distributed in the embedding space.

2. **2D Visualization**: Use the second dropdown for a 2D PCA projection, which can be easier to interpret and faster to render.

3. **Cluster Statistics**: The final section shows you the overall progress of projection across all clusters.

## Tips for Exploration:
- **Zoom**: Use your mouse wheel or trackpad to zoom in/out
- **Rotate**: Click and drag to rotate the 3D view
- **Pan**: Shift+click and drag to pan the view
- **Hover**: Hover over points to see page titles
- **Adjust limits**: Use the sliders to control how many pages are displayed

## Next Steps:
- If you see interesting clusters, you can run the `project` command to project more clusters
- Use the page URLs to explore the actual Wikipedia articles
- Look for patterns in how similar pages are grouped together