# Quick Graph Dataset Analysis
This notebook scans the graph files under `submodules/graph-token/graphs`, computes basic statistics (nodes, edges, degree stats, connected components), and shows simple visualizations for a quick sanity-check. It's intentionally conservative about how many files it reads so it stays fast.

In [1]:
# Imports
import os
from pathlib import Path
import glob
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

ModuleNotFoundError: No module named 'seaborn'

In [2]:
# Configure the graphs directory (absolute path ensures notebook runs from any cwd)
BASE = Path('/data/young/capstone/graph-learning-benchmarks/submodules/graph-token/graphs')
assert BASE.exists(), f'Graphs base dir not found: {BASE}'
# list top-level dataset directories (e.g., 'ba', 'er', 'path', ...)
datasets = sorted([p.name for p in BASE.iterdir() if p.is_dir()])
print('Found datasets:', datasets)

Found datasets: ['ba', 'complete', 'er', 'path', 'sbm', 'sfn', 'star']


In [3]:
def analyze_graph(path: Path) -> dict:
    """Read a graphml file and return basic stats.
    Returns a dict with: dataset, split, filename, nodes, edges, avg_deg, max_deg, min_deg, components, directed
    """
    try:
        G = nx.read_graphml(path)
    except Exception as e:
        return {'error': str(e), 'path': str(path)}
    n = G.number_of_nodes()
    m = G.number_of_edges()
    directed = nx.is_directed(G)
    if n > 0:
        degrees = [d for _, d in G.degree()]
        avg_deg = sum(degrees) / n
        max_deg = max(degrees)
        min_deg = min(degrees)
    else:
        degrees = []
        avg_deg = max_deg = min_deg = 0
    # connected components: use weakly connected for directed graphs
    try:
        if directed:
            comps = nx.number_weakly_connected_components(G)
        else:
            comps = nx.number_connected_components(G)
    except Exception:
        comps = None
    # clustering (undirected) - may raise for very large graphs, handle safely
    try:
        if directed:
            clu = nx.average_clustering(G.to_undirected())
        else:
            clu = nx.average_clustering(G)
    except Exception:
        clu = None
    return dict(path=str(path), nodes=n, edges=m, avg_deg=avg_deg, max_deg=max_deg, min_deg=min_deg, components=comps, directed=directed, clustering=clu)

In [4]:
# Scan dataset folders and analyze a limited number of graphs for a quick summary
results = []
limit_per_split = 100  # max files to read per dataset split to keep analysis quick
for ds in datasets:
    ds_dir = BASE / ds
    # expected splits like train/valid/test under each dataset
    for split in sorted([p for p in ds_dir.iterdir() if p.is_dir()]):
        files = sorted(list(split.glob('*.graphml')))
        if not files:
            continue
        to_take = files[:limit_per_split]
        for p in to_take:
            res = analyze_graph(p)
            # attach metadata
            if 'error' not in res:
                res['dataset'] = ds
                res['split'] = split.name
            else:
                res['dataset'] = ds
                res['split'] = split.name
            results.append(res)
print(f'Analyzed {len(results)} graph files (up to {limit_per_split} per split).')

Analyzed 2100 graph files (up to 100 per split).


In [8]:
# Build DataFrame and show summary
df = pd.DataFrame(results)
# separate error rows if any
if 'error' in df.columns:
    error_df = df[df['error'].notnull()]
else:
    error_df = pd.DataFrame([])
# Convert numeric columns
for c in ['nodes','edges','avg_deg','max_deg','min_deg','components','clustering']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
# Build a safe boolean mask for rows without errors
if 'error' in df.columns:
    good_mask = df['path'].notnull() & df['error'].isna()
else:
    good_mask = df['path'].notnull()
summary = df[good_mask].groupby('dataset').agg({
    'nodes':['count','mean','median','min','max'],
    'edges':['mean','median'],
    'avg_deg':['mean','median']
})
summary.columns = ['_'.join(col).strip() for col in summary.columns.values]
summary = summary.reset_index()
display(summary)
if not error_df.empty:
    print('Some files failed to read (showing up to 10):')
    display(error_df.head(10))

Unnamed: 0,dataset,nodes_count,nodes_mean,nodes_median,nodes_min,nodes_max,edges_mean,edges_median,avg_deg_mean,avg_deg_median
0,ba,300,12.08,12.0,5,19,29.376667,22.0,4.35056,3.75
1,complete,300,12.073333,12.0,5,19,75.633333,66.0,11.073333,11.0
2,er,300,12.086667,12.0,5,19,37.416667,23.5,5.413207,4.461538
3,path,300,12.073333,12.0,5,19,11.073333,11.0,1.807667,1.833333
4,sbm,300,11.926667,12.0,5,19,36.736667,30.0,5.375482,5.0
5,sfn,300,12.073333,12.0,5,19,15.36,15.0,2.499552,2.5
6,star,300,12.073333,12.0,5,19,11.073333,11.0,1.807667,1.833333


In [9]:
# Simple plots: nodes distribution across datasets (small sample).
plot_df = df[df['path'].notnull() & df.get('error', pd.Series()).isna()]
if not plot_df.empty:
    plt.figure(figsize=(12,6))
    sns.boxplot(x='dataset', y='nodes', data=plot_df)
    plt.title('Nodes distribution by dataset (sample)')
    plt.xticks(rotation=45)
    plt.show()
    plt.figure(figsize=(12,6))
    sns.scatterplot(x='nodes', y='edges', hue='dataset', data=plot_df, alpha=0.6)
    plt.title('Edges vs Nodes (sample)')
    plt.show()
else:
    print('No graph data available to plot.')

No graph data available to plot.


In [None]:
# Visualize a few small graphs using networkx (up to 2 per dataset).
from pathlib import Path
viz_out = Path('notebooks/graph_viz_images')
viz_out.mkdir(exist_ok=True)
# use the safe good_mask computed earlier (falls back to path-only if no 'error' column)
if 'error' in df.columns:
    good_mask = df['path'].notnull() & df['error'].isna()
else:
    good_mask = df['path'].notnull()
small = df[good_mask & (df['nodes'] <= 80)] if 'nodes' in df.columns else df[good_mask].head(10)
if small.empty:
    print('No small graphs found (nodes <= 80). Increase threshold or ensure nodes were computed.')
else:
    for ds, group in small.groupby('dataset'):
        for _, row in group.head(2).iterrows():
            p = Path(row['path'])
            try:
                G = nx.read_graphml(p)
            except Exception as e:
                print('Failed to read', p, e)
                continue
            # choose a layout - spring for general graphs
            plt.figure(figsize=(6,6))
            try:
                pos = nx.spring_layout(G, seed=42)
            except Exception:
                pos = None
            nx.draw(G, pos=pos, node_size=40, linewidths=0.1, edge_color='#999999', node_color='#1f78b4', with_labels=False)
            title = f"{ds}/{row.get('split','?')} - {p.name} (n={row.get('nodes','?')}, m={row.get('edges','?')})"
            plt.title(title)
            out = viz_out / f"{ds}_{row.get('split','')}_{p.stem}.png"
            plt.savefig(out, dpi=150, bbox_inches='tight')
            plt.show()
            print('Saved', out)

**Next steps / tips**
- Run the notebook to produce the summary and plots.
- To expand the analysis, increase `limit_per_split` or compute degree distributions per-graph.
- If you want me to run this analysis now and save outputs (figures / CSV), tell me and I will execute the notebook and return results.