# Community Subgraph Density Analysis

This notebook analyzes community detection results from run 142 by:
1. Loading the full graph from CSV
2. Loading community assignments from JSON
3. Creating a subgraph for each community with at least 20 nodes (smaller communities are skipped)
4. Computing density metrics
5. Ranking communities by density

In [1]:
import pandas as pd
import json
import networkx as nx
import numpy as np
from ast import literal_eval
import warnings
# NOTE(review): called without a category or module filter, this hides every
# warning for the rest of the session, including deprecation notices from
# pandas and NetworkX that may be relevant to this analysis. Consider narrowing.
warnings.filterwarnings('ignore')

# Simple confirmation that the import cell ran to completion
print("Libraries imported successfully")

Libraries imported successfully


## 1. Load Data Files

In [2]:
# Read the node table that defines the full graph (one row per node)
csv_path = "/home/zeneto/projects/kg4ai-community_detection/data/v0.0/df_nq_version0.csv"
df = pd.read_csv(csv_path)

# Sanity summary: row count and column names, followed by a preview of the data
row_count = df.shape[0]
column_names = list(df.columns)
print(f"Loaded graph data with {row_count} nodes")
print(f"Columns: {column_names}")
print("\nFirst few rows:")
df.head()

Loaded graph data with 108071 nodes
Columns: ['id', 'd_properties_document_title', 'd_properties_title_encode', 'cites_ids']

First few rows:


Unnamed: 0,id,d_properties_document_title,d_properties_title_encode,cites_ids
0,0,Therefore sign,Therefore_sign,"[6352, 2622, 5393, 3079, 15650, 33906, 13375, ..."
1,1,Watchman (law enforcement),Watchman_(law_enforcement),"[4286, 4215, 2819, 864, 5115, 3220, 21000, 156..."
2,3,Super Bowl 50 halftime show,Super_Bowl_50_halftime_show,"[6341, 1832, 360, 1739, 4108, 138, 73, 1941, 1..."
3,5,A Whiter Shade of Pale,A_Whiter_Shade_of_Pale,"[547, 1646, 3958, 423, 9518, 8998, 9020, 8938,..."
4,6,Globe,Globe,"[258, 2819, 600, 1188, 1564, 15616, 21607, 123..."


In [3]:
# Read the community assignments of run 142 (one entry per detection algorithm)
json_path = "/home/zeneto/projects/kg4ai-community_detection/community_detection/output/run142-NQv0-best_run/run142_raw_communities.json"

with open(json_path) as handle:
    communities_data = json.load(handle)

# Report which algorithms are present and how many communities each one produced
algorithm_names = list(communities_data)
print("Available algorithms:", algorithm_names)
for algo in algorithm_names:
    print(f"{algo}: {len(communities_data[algo])} communities")

Available algorithms: ['infomap', 'louvain', 'leiden', 'k-means']
infomap: 1249 communities
louvain: 24 communities
leiden: 19 communities
k-means: 40 communities


## 2. Preprocess the Data

In [4]:
# Parse the cites_ids column: each cell holds the string representation of a
# list (e.g. "[6352, 2622, ...]"), which literal_eval converts to a real list.
df['cites_ids_parsed'] = df['cites_ids'].apply(literal_eval)

# Map each node ID to the list of node IDs it cites. Zipping the two columns
# avoids DataFrame.iterrows(), which builds a Series per row and is slow on
# large tables. If an ID occurs more than once, the last row wins.
node_citations = dict(zip(df['id'], df['cites_ids_parsed']))

# Every node that appears either as a row of the table or as a citation target
all_nodes = set(df['id'].tolist())
for cited_nodes in node_citations.values():
    all_nodes.update(cited_nodes)

print(f"Total unique nodes in the graph: {len(all_nodes)}")
print(f"Nodes with citation data: {len(node_citations)}")

Total unique nodes in the graph: 108071
Nodes with citation data: 108071


## 3. Build the Full Graph

In [5]:
# Assemble the complete citation network as a directed graph:
# an edge (u, v) means that node u cites node v.
G = nx.DiGraph()

# Register every known node up front so that nodes without any edge are kept too
G.add_nodes_from(all_nodes)

# Attach each title from the node table to its node as the 'title' attribute
titles_by_id = dict(zip(df['id'], df['d_properties_document_title']))
for node_id, node_title in titles_by_id.items():
    G.nodes[node_id]['title'] = node_title

# Flatten the citation mapping into (citing, cited) pairs and add them as edges
edges = [
    (source, target)
    for source, targets in node_citations.items()
    for target in targets
]
G.add_edges_from(edges)

print("Full graph created:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")
print(f"  Graph density: {nx.density(G):.6f}")

Full graph created:
  Nodes: 108071
  Edges: 5122983
  Graph density: 0.000439


## 4. Analyze Communities by Algorithm

In [6]:
def analyze_community_subgraphs(graph, communities_dict, min_size=20):
    """
    Build the subgraph of every community and compute its density metrics.

    Args:
        graph: NetworkX graph object containing the full graph
        communities_dict: Dictionary with algorithm names as keys and community
            lists (each community a list of node IDs) as values
        min_size: Communities listing fewer than this many nodes are skipped.
            Defaults to 20, the cut-off this analysis has always applied.

    Returns:
        DataFrame with one row per analysed community and the columns
        Algorithm, Community_ID, Num_Nodes, Num_Edges and Density.
        Community_ID is the position of the community inside
        communities_dict[algorithm]. The columns are present even when no
        community qualifies, so callers can always select them.
    """
    columns = ['Algorithm', 'Community_ID', 'Num_Nodes', 'Num_Edges', 'Density']
    results = []

    for algorithm, communities in communities_dict.items():
        if len(communities) == 0:
            print(f"Skipping {algorithm} - no communities found")
            continue

        print(f"\nAnalyzing {algorithm} with {len(communities)} communities...")

        for comm_id, community_nodes in enumerate(communities):
            if len(community_nodes) < min_size:
                # Skip communities below the configured minimum size
                continue

            # Create subgraph for this community
            subgraph = graph.subgraph(community_nodes)

            # Compute metrics on the subgraph itself: its node count can differ
            # from len(community_nodes) if the list has duplicates or unknown IDs
            num_nodes = subgraph.number_of_nodes()
            num_edges = subgraph.number_of_edges()

            # NetworkX density calculation (guarded so that min_size < 2 is safe)
            if num_nodes > 1:
                density = nx.density(subgraph)
            else:
                density = 0.0

            # Store results
            results.append({
                'Algorithm': algorithm,
                'Community_ID': comm_id,
                'Num_Nodes': num_nodes,
                'Num_Edges': num_edges,
                'Density': density
            })

            # Progress indicator for large community sets. It is only reached by
            # communities that pass the size filter, so a multiple of 50 is not
            # reported when the community at that position was skipped.
            if (comm_id + 1) % 50 == 0:
                print(f"  Processed {comm_id + 1}/{len(communities)} communities")

    return pd.DataFrame(results, columns=columns)

# Compute the density metrics for every algorithm found in the JSON file
results_df = analyze_community_subgraphs(graph=G, communities_dict=communities_data)
analyzed_total = results_df.shape[0]
print(f"\nAnalysis complete. Total communities analyzed: {analyzed_total}")


Analyzing infomap with 1249 communities...
  Processed 50/1249 communities
  Processed 100/1249 communities
  Processed 150/1249 communities
  Processed 200/1249 communities
  Processed 250/1249 communities
  Processed 300/1249 communities
  Processed 400/1249 communities

Analyzing louvain with 24 communities...

Analyzing leiden with 19 communities...

Analyzing k-means with 40 communities...

Analysis complete. Total communities analyzed: 460


## 5. Summary Statistics by Algorithm

In [7]:
# Per-algorithm overview of the density and size of the analysed communities
if len(results_df) == 0:
    print("No valid communities found for analysis.")
else:
    print("Summary Statistics by Algorithm:")
    print("=" * 50)

    # sort=False walks the algorithms in order of first appearance in results_df
    for algorithm, algo_data in results_df.groupby('Algorithm', sort=False):
        density_values = algo_data['Density']

        print(f"\n{algorithm.upper()}:")
        print(f"  Total communities: {len(algo_data)}")
        print(f"  Average density: {density_values.mean():.6f}")
        print(f"  Median density: {density_values.median():.6f}")
        print(f"  Max density: {density_values.max():.6f}")
        print(f"  Min density: {density_values.min():.6f}")
        print(f"  Average community size: {algo_data['Num_Nodes'].mean():.2f} nodes")
        print(f"  Average edges per community: {algo_data['Num_Edges'].mean():.2f}")

Summary Statistics by Algorithm:

INFOMAP:
  Total communities: 377
  Average density: 0.199457
  Median density: 0.140336
  Max density: 0.782656
  Min density: 0.002168
  Average community size: 277.44 nodes
  Average edges per community: 7273.71

LOUVAIN:
  Total communities: 24
  Average density: 0.170148
  Median density: 0.026509
  Max density: 0.878546
  Min density: 0.001449
  Average community size: 4480.58 nodes
  Average edges per community: 150726.08

LEIDEN:
  Total communities: 19
  Average density: 0.188669
  Median density: 0.005739
  Max density: 0.836327
  Min density: 0.001390
  Average community size: 5659.68 nodes
  Average edges per community: 190744.58

K-MEANS:
  Total communities: 40
  Average density: 0.005001
  Median density: 0.004184
  Max density: 0.016522
  Min density: 0.000600
  Average community size: 2688.35 nodes
  Average edges per community: 34547.38


## 6. Ranked Communities by Density

In [8]:
# Order every analysed community from densest to sparsest and show both ends
if len(results_df) == 0:
    print("No communities to rank.")
else:
    by_density = results_df.sort_values('Density', ascending=False)
    ranked_communities = by_density.reset_index(drop=True)

    # Lift the pandas display limits (this changes the options session-wide)
    for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
        pd.set_option(option_name, None)

    top_20 = ranked_communities.head(20)
    bottom_10 = ranked_communities.tail(10)
    rule = "=" * 70

    print("Top 20 Communities by Density:")
    print(rule)
    print(top_20.to_string(index=False, float_format='%.6f'))

    print("\n\nBottom 10 Communities by Density:")
    print(rule)
    print(bottom_10.to_string(index=False, float_format='%.6f'))

Top 20 Communities by Density:
Algorithm  Community_ID  Num_Nodes  Num_Edges  Density
  louvain            13         48       1982 0.878546
   leiden            17         50       2049 0.836327
  infomap           177         53       2157 0.782656
  infomap           364         21        325 0.773810
  infomap           114        158      19148 0.771910
  infomap           244         46       1418 0.685024
  infomap           333         27        473 0.673789
  infomap           339         27        450 0.641026
  infomap           253         32        633 0.638105
  infomap           326         26        413 0.635385
  infomap           372         22        290 0.627706
  infomap           225         37        836 0.627628
  infomap           358         25        373 0.621667
  infomap           354         22        287 0.621212
  infomap           323         23        314 0.620553
  infomap           147         66       2613 0.609091
  infomap           293         39

## 7. Save Results

In [9]:
# Persist the ranked table and a per-algorithm summary as CSV files.
# NOTE: a later cell rebinds ranked_communities to a size-filtered frame with an
# extra Node_Range column, so re-running this cell afterwards writes that
# filtered table instead of the complete ranking.
if len(results_df) == 0:
    print("No results to save.")
else:
    output_path = "/home/zeneto/projects/kg4ai-community_detection/notebook/sample/community_density_analysis.csv"
    ranked_communities.to_csv(output_path, index=False)
    print(f"Results saved to: {output_path}")

    # One summary row per algorithm, in order of first appearance in results_df
    summary_stats = [
        {
            'Algorithm': algorithm,
            'Total_Communities': len(algo_data),
            'Avg_Density': algo_data['Density'].mean(),
            'Median_Density': algo_data['Density'].median(),
            'Max_Density': algo_data['Density'].max(),
            'Min_Density': algo_data['Density'].min(),
            'Avg_Community_Size': algo_data['Num_Nodes'].mean(),
            'Avg_Edges_Per_Community': algo_data['Num_Edges'].mean(),
        }
        for algorithm, algo_data in results_df.groupby('Algorithm', sort=False)
    ]

    summary_df = pd.DataFrame(summary_stats)
    summary_path = "/home/zeneto/projects/kg4ai-community_detection/notebook/sample/algorithm_summary_stats.csv"
    summary_df.to_csv(summary_path, index=False)
    print(f"Summary statistics saved to: {summary_path}")

    print("\nAnalysis complete!")

Results saved to: /home/zeneto/projects/kg4ai-community_detection/notebook/sample/community_density_analysis.csv
Summary statistics saved to: /home/zeneto/projects/kg4ai-community_detection/notebook/sample/algorithm_summary_stats.csv

Analysis complete!


## 8. Additional Insights

In [10]:
# Community-size bins (number of nodes). With right=False each bin is half-open:
# "0-100" covers [0, 100), "100-200" covers [100, 200), and so on.
bins = [0, 100, 200, 300, 400, 500]
labels = ["0-100", "100-200", "200-300", "300-400", "400-500"]

# Tag each community with its size bin, then drop the communities that fall
# outside every bin (500 nodes or more). assign() builds a new frame instead of
# adding the column to the existing one in place.
ranked_communities = (
    ranked_communities
    .assign(Node_Range=pd.cut(ranked_communities["Num_Nodes"], bins=bins, labels=labels, right=False))
    .dropna(subset=["Node_Range"])
)

# Get the most dense community in each bin. observed=True limits the groups to
# bins that actually contain a community. Under older pandas defaults
# (observed=False) an empty bin still forms a group, for which idxmax has no
# row label to return and the .loc lookup below fails.
densest_row_labels = ranked_communities.groupby("Node_Range", observed=True)["Density"].idxmax()
most_dense_per_bin = ranked_communities.loc[densest_row_labels]
most_dense_per_bin

Unnamed: 0,Algorithm,Community_ID,Num_Nodes,Num_Edges,Density,Node_Range
0,louvain,13,48,1982,0.878546,0-100
4,infomap,114,158,19148,0.77191,100-200
154,infomap,73,297,16795,0.191043,200-300
155,infomap,40,320,19325,0.189312,300-400
315,infomap,84,421,10283,0.058155,400-500


In [11]:
import re


def get_community_subgraph(graph, communities_data, algorithm, community_id):
    """
    Extract one detected community from the full graph as a separate graph.

    Args:
        graph: NetworkX graph object holding the full graph
        communities_data: dict mapping each algorithm name to its list of communities
        algorithm: str, key into communities_data
        community_id: int, position of the community within communities_data[algorithm]

    Returns:
        Copy of the subgraph of `graph` restricted to the community's nodes
    """
    member_nodes = communities_data[algorithm][community_id]
    community_view = graph.subgraph(member_nodes)
    # Return a copy rather than the subgraph object itself
    return community_view.copy()

# Extract the densest community of each size bin as its own graph. Zipping the
# two columns avoids iterrows() and its per-row Series.
subgraphs = [
    get_community_subgraph(G, communities_data, algorithm, community_id)
    for algorithm, community_id in zip(most_dense_per_bin['Algorithm'], most_dense_per_bin['Community_ID'])
]

# Save the subgraphs to GraphML files
for i, sg in enumerate(subgraphs):
    output_file = f"/home/zeneto/projects/kg4ai-community_detection/notebook/sample/subgraph_{i}.graphml"
    nx.write_graphml(sg, output_file)
    print(f"Subgraph {i} saved to {output_file}")

# Words too generic to be useful as community keywords
stop_words = {
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
    'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'list'
}

for i, sg in enumerate(subgraphs):
    print(f"Subgraph {i}: Nodes={sg.number_of_nodes()}, Edges={sg.number_of_edges()}, Density={nx.density(sg):.6f}")

    # Find keywords in the titles of the nodes in the subgraph. Only string
    # titles are used: a title missing from the source table would presumably
    # be stored as NaN (a float) and make ' '.join() raise a TypeError.
    titles = [
        data['title']
        for _, data in sg.nodes(data=True)
        if isinstance(data.get('title'), str)
    ]
    text = ' '.join(titles).lower()
    # Use regex to extract only ASCII words with 3+ letters
    all_words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
    word_freq = pd.Series(all_words).value_counts()
    word_freq = word_freq[~word_freq.index.isin(stop_words)]

    print(f"  Top keywords: {word_freq.head(10).to_dict()}")

Subgraph 0 saved to /home/zeneto/projects/kg4ai-community_detection/notebook/sample/subgraph_0.graphml
Subgraph 1 saved to /home/zeneto/projects/kg4ai-community_detection/notebook/sample/subgraph_1.graphml
Subgraph 2 saved to /home/zeneto/projects/kg4ai-community_detection/notebook/sample/subgraph_2.graphml
Subgraph 3 saved to /home/zeneto/projects/kg4ai-community_detection/notebook/sample/subgraph_3.graphml
Subgraph 4 saved to /home/zeneto/projects/kg4ai-community_detection/notebook/sample/subgraph_4.graphml
Subgraph 0: Nodes=48, Edges=1982, Density=0.878546
  Top keywords: {'gun': 45, 'laws': 45, 'new': 4, 'carry': 2, 'california': 2, 'states': 2, 'united': 2, 'virginia': 2, 'carolina': 2, 'campus': 1}
Subgraph 1: Nodes=158, Edges=19148, Density=0.771910
  Top keywords: {'visa': 152, 'citizens': 86, 'requirements': 85, 'policy': 66, 'south': 4, 'united': 4, 'republic': 3, 'korea': 3, 'north': 3, 'saint': 3}
Subgraph 2: Nodes=297, Edges=16795, Density=0.191043
  Top keywords: {'series