Spectral Clustering

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
graph=pickle.load(open('./graphs/graph_match2499719_team1609.pkl','rb'))
print(len(graph.nodes))
print(graph.nodes[40])
print(len(graph.edges))
print(graph.edges[25, 26])

39
{'event_count': 28, 'most_common_event': 'PASS', 'unique_players': 5, 'zone_name': 'OUTSIDE', 'role_distribution': array([0.2, 0.2, 0.6, 0. ])}
293
{'weight': 9, 'transition_frequency': 0.4090909090909091, 'most_common_event': 'PASS', 'start_zone_name': 'LEFT_WING_MID_THIRD_ATT', 'end_zone_name': 'LEFT_HALF_MID_THIRD_ATT'}


We have to encode all event types

In [3]:
def encode_event_semantic(event_type):
    """Group events by semantic meaning"""
    # Group related events
    attacking_events = ['PASS', 'SHOT', 'CARRY', 'TAKE_ON']
    defensive_events = ['CLEARANCE', 'INTERCEPTION', 'PRESSURE', 'RECOVERY', 'DUEL']
    goalkeeper_events = ['GOALKEEPER']
    disruption_events = ['FOUL_COMMITTED', 'CARD', 'MISCONTROL', 'BALL_OUT']
    meta_events = ['FORMATION_CHANGE', 'SUBSTITUTION', 'PLAYER_ON', 'PLAYER_OFF']
    
    if event_type in attacking_events:
        return [1, 0, 0, 0, 0, 0]  # Attacking
    elif event_type in defensive_events:
        return [0, 1, 0, 0, 0, 0]  # Defensive
    elif event_type in goalkeeper_events:
        return [0, 0, 1, 0, 0, 0]  # Goalkeeper
    elif event_type in disruption_events:
        return [0, 0, 0, 1, 0, 0]  # Disruption
    elif event_type in meta_events:
        return [0, 0, 0, 0, 1, 0]  # Meta
    else:
        return [0, 0, 0, 0, 0, 1]  # Generic/Other

In [29]:
from spectral_build_vizualization import discover_tactical_patterns

labels, nodes = discover_tactical_patterns(graph, k=5)

Louvaine

In [30]:
import community as community_louvain  # pip install python-louvain
import numpy as np
import networkx as nx

graph_undirected = graph.to_undirected() if isinstance(graph, nx.DiGraph) else graph
partition_louvain = community_louvain.best_partition(graph_undirected)
labels_louvain = np.array([partition_louvain[node] for node in graph.nodes()])

## Silhouette score

The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette value ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters. 

In [31]:
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
node_features = []
for node in graph.nodes():
    
    node_data = graph.nodes[node]
    
    # Create feature vector from the node attributes
    features = [
        node_data['event_count'],
        node_data['unique_players'],
        *encode_event_semantic(node_data['most_common_event']),
        *node_data['role_distribution']  # This unpacks the 4 values
    ]
    node_features.append(features)

node_features = np.array(node_features)
# Standardize features so as to have mean=0 and variance=1 
# Otherwise, features with larger scales can dominate the distance calculations
node_features = scalar.fit_transform(node_features)
score = silhouette_score(node_features, np.array(labels))
print(score)

0.06744593697943813


In [32]:

score_louvain = silhouette_score(node_features, labels_louvain)
print(score_louvain)

-0.1579616440776901


## Calinski–Harabasz (CH) Index
Defined as the ratio of the between-cluster separation (BCSS) to the within-cluster dispersion (WCSS), normalized by their number of degrees of freedom. BCSS (Between-Cluster Sum of Squares) is the weighted sum of squared Euclidean distances between each cluster centroid (mean) and the overall data centroid (mean). WCSS (Within-Cluster Sum of Squares) is the sum of squared Euclidean distances between the data points and their respective cluster centroids.

- < 10: Very poor separation
- 10-50: Poor to moderate separation
- 50-100: Moderate separation
- 100+: Good separation
- 1000+: Excellent separation

In [33]:
from sklearn.metrics import calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
labels, nodes = discover_tactical_patterns(graph, k=4)
node_features = []
for node in graph.nodes():
    
    node_data = graph.nodes[node]
    
    # Create feature vector from the node attributes
    features = [
        node_data['event_count'],
        node_data['unique_players'],
        *encode_event_semantic(node_data['most_common_event']),
        *node_data['role_distribution']  # This unpacks the 4 values
    ]
    node_features.append(features)

node_features = np.array(node_features)
node_features = scalar.fit_transform(node_features)
ch_score = calinski_harabasz_score(node_features, np.array(labels))
print(ch_score)

4.845613062537386


In [34]:
ch_score_louvain = calinski_harabasz_score(node_features, labels_louvain)
print(ch_score_louvain)

2.0995861384397916


## Davies Bouldin Score

| **DB Score** | **Interpretation** |
|:-------------:|:------------------|
| < 0.5 | Excellent separation and compactness |
| 0.5 – 1.0 | Good clustering, well-separated |
| 1.0 – 2.0 | Moderate clustering quality |
| > 2.0 | Poor clustering — clusters overlap or are not compact |


In [35]:
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
labels, nodes = discover_tactical_patterns(graph, k=4)
node_features = []
for node in graph.nodes():
    
    node_data = graph.nodes[node]
    
    # Create feature vector from the node attributes
    features = [
        node_data['event_count'],
        node_data['unique_players'],
        *encode_event_semantic(node_data['most_common_event']),
        *node_data['role_distribution']  # This unpacks the 4 values
    ]
    node_features.append(features)

node_features = np.array(node_features)
node_features = scalar.fit_transform(node_features)
db_score = davies_bouldin_score(node_features, np.array(labels))
print(db_score)

2.179876664409763


In [36]:
db_score_louvain = davies_bouldin_score(node_features, labels_louvain)
print(db_score_louvain)

2.6276015904342236


## ANOVA (Analysis of Variance) 
Tests whether cluster means differ significantly across features. It tells you if your clusters are actually separating the data in meaningful ways. What ANOVA Tells You:

- High F-statistic, Low p-value (<0.05): Feature significantly differs between clusters
- Low F-statistic, High p-value (>0.05): Feature doesn't help distinguish clusters

In [37]:
from scipy import stats
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
labels, nodes = discover_tactical_patterns(graph, k=4)
node_features = []
for node in graph.nodes():
    
    node_data = graph.nodes[node]
    
    # Create feature vector from the node attributes
    features = [
        node_data['event_count'],
        node_data['unique_players'],
        *encode_event_semantic(node_data['most_common_event']),
        *node_data['role_distribution']  # This unpacks the 4 values
    ]
    node_features.append(features)

node_features = np.array(node_features)
node_features = scalar.fit_transform(node_features)

def anova_test_clustering(features, labels, feature_names=None):
    """
    Perform ANOVA test for each feature across clusters
    
    Args:
        features: numpy array (n_samples, n_features)
        labels: cluster labels
        feature_names: list of feature names
    
    Returns:
        DataFrame with F-statistics and p-values
    """
    n_features = features.shape[1]
    
    if feature_names is None:
        feature_names = [f'Feature_{i}' for i in range(n_features)]
    
    results = []
    
    for i in range(n_features):
        # Split feature values by cluster
        groups = [features[labels == cluster, i] for cluster in np.unique(labels)]
        
        # Perform one-way ANOVA
        f_stat, p_value = stats.f_oneway(*groups)
        
        results.append({
            'Feature': feature_names[i],
            'F-statistic': f_stat,
            'p-value': p_value,
            'Significant': 'Yes' if p_value < 0.05 else 'No'
        })
    
    return pd.DataFrame(results).sort_values('F-statistic', ascending=False)

feature_names = (
    ['event_count', 'unique_players'] + 
    [f'semantic_{i}' for i in range(6)] + 
    [f'role_{i}' for i in range(4)]
)

anova_results = anova_test_clustering(node_features, labels, feature_names)
print(anova_results.to_string(index=False))

# Summary
significant_features = anova_results[anova_results['Significant'] == 'Yes']
print(f"\n{len(significant_features)}/{len(feature_names)} features significantly differ between clusters")

       Feature  F-statistic      p-value Significant
   event_count    23.074817 2.026728e-08         Yes
unique_players    20.636997 7.111502e-08         Yes
        role_2     4.322023 1.078288e-02         Yes
    semantic_0     2.503939 7.516454e-02          No
    semantic_1     2.503939 7.516454e-02          No
        role_3     2.106765 1.170313e-01          No
        role_0     1.929914 1.427285e-01          No
        role_1     1.681839 1.886984e-01          No
    semantic_2          NaN          NaN          No
    semantic_3          NaN          NaN          No
    semantic_4          NaN          NaN          No
    semantic_5          NaN          NaN          No

3/12 features significantly differ between clusters


  res = hypotest_fun_out(*samples, **kwds)


In [38]:
anova_results_louvain = anova_test_clustering(node_features, labels_louvain, feature_names)
print(anova_results_louvain.to_string(index=False))
# Summary
significant_features_louvain = anova_results_louvain[anova_results_louvain['Significant'] == 'Yes']
print(f"\n{len(significant_features_louvain)}/{len(feature_names)} features significantly differ between Louvain clusters")

       Feature  F-statistic  p-value Significant
   event_count     3.630442 0.014411         Yes
unique_players     3.376068 0.019795         Yes
        role_2     2.091791 0.103482          No
        role_0     1.873603 0.137707          No
        role_3     1.735801 0.164894          No
    semantic_0     1.550968 0.209744          No
    semantic_1     1.550968 0.209744          No
        role_1     1.439084 0.242376          No
    semantic_2          NaN      NaN          No
    semantic_3          NaN      NaN          No
    semantic_4          NaN      NaN          No
    semantic_5          NaN      NaN          No

2/12 features significantly differ between Louvain clusters


  res = hypotest_fun_out(*samples, **kwds)


## Modularity

Modularity measures how well a network is divided into communities/clusters. It compares the number of edges within clusters vs what you'd expect in a random network.

Range: -1 to 1

- \> 0.3: Good community structure
- \> 0.5: Strong community structure
- \> 0.7: Very strong community structure
- < 0.3: Weak or no community structure
Negative: Worse than random

Key difference: Unlike Silhouette/CH, modularity uses the graph structure (edges), not just node features!

In [39]:
import networkx as nx

labels, nodes = discover_tactical_patterns(graph, k=4)

def calculate_modularity(graph, labels):
    """
    Calculate modularity for a clustering on a graph
    
    Args:
        graph: NetworkX graph (DiGraph or Graph)
        labels: cluster assignment for each node
    
    Returns:
        modularity score
    """
    # Create communities from labels
    # NetworkX expects a list of sets, one set per community
    unique_labels = np.unique(labels)
    node_list = list(graph.nodes())
    
    communities = []
    for label in unique_labels:
        # Get nodes in this cluster
        cluster_nodes = [node_list[i] for i, l in enumerate(labels) if l == label]
        communities.append(set(cluster_nodes))
    
    # Calculate modularity
    modularity = nx.community.modularity(graph, communities)
    
    return modularity

modularity_score = calculate_modularity(graph, labels)
print(f"Modularity Score: {modularity_score}")

Modularity Score: 0.13335798625742812


In [40]:
modularity_score_louvain = calculate_modularity(graph, labels_louvain)
print(f"Louvain Modularity Score: {modularity_score_louvain}")

Louvain Modularity Score: 0.37729293693724686


# More graph-based metrics
### Conductance : Measures cluster boundary quality (lower = tighter clusters)

Formula: boundary_edges / (internal_edges + boundary_edges)
- Interpretation: < 0.3 is good, < 0.1 is excellent

### Coverage: Fraction of edges within clusters (higher = better)

Formula: internal_edges / total_edges
- Interpretation: > 0.7 is good, > 0.9 is excellent


### Internal Edge Density: Average density within clusters

Shows how tightly connected nodes are within their clusters
Higher values indicate more cohesive clusters

In [41]:
def compute_conductance(graph, labels):
    """Compute conductance for each cluster (lower is better)"""
    node_list = list(graph.nodes())
    conductances = []
    
    for label in np.unique(labels):
        cluster_nodes = set([node_list[i] for i, l in enumerate(labels) if l == label])
        
        # Count edges within and across cluster boundary
        internal_edges = 0
        boundary_edges = 0
        
        for u, v in graph.edges():
            if u in cluster_nodes and v in cluster_nodes:
                internal_edges += 1
            elif u in cluster_nodes or v in cluster_nodes:
                boundary_edges += 1
        
        # Conductance = boundary / min(internal, external)
        if internal_edges + boundary_edges > 0:
            conductance = boundary_edges / (internal_edges + boundary_edges)
        else:
            conductance = 1.0
        
        conductances.append(conductance)
    
    return np.mean(conductances)



In [42]:
def compute_coverage(graph, labels):
    """Compute coverage: fraction of edges within clusters"""
    node_list = list(graph.nodes())
    total_edges = graph.number_of_edges()
    internal_edges = 0
    
    for u, v in graph.edges():
        u_idx = node_list.index(u)
        v_idx = node_list.index(v)
        if labels[u_idx] == labels[v_idx]:
            internal_edges += 1
    
    return internal_edges / total_edges if total_edges > 0 else 0

In [43]:
def compute_edge_density_ratio(graph, labels):
    """Ratio of internal to external edge density"""
    node_list = list(graph.nodes())
    
    internal_density = []
    for label in np.unique(labels):
        cluster_nodes = [node_list[i] for i, l in enumerate(labels) if l == label]
        if len(cluster_nodes) > 1:
            subgraph = graph.subgraph(cluster_nodes)
            internal_density.append(nx.density(subgraph))
    
    return np.mean(internal_density) if internal_density else 0

In [44]:
conductance = compute_conductance(graph, labels)
coverage = compute_coverage(graph, labels)
edge_density = compute_edge_density_ratio(graph, labels)

print(f"Conductance: {conductance}")
print(f"Coverage: {coverage}")
print(f"Edge Density Ratio: {edge_density}")

Conductance: 0.7937416585563241
Coverage: 0.35494880546075086
Edge Density Ratio: 0.34346764346764347


In [45]:
conductance_louvain = compute_conductance(graph, labels_louvain)
coverage_louvain = compute_coverage(graph, labels_louvain)
edge_density_louvain = compute_edge_density_ratio(graph, labels_louvain)
print(f"Louvain Conductance: {conductance_louvain}")
print(f"Louvain Coverage: {coverage_louvain}")
print(f"Louvain Edge Density Ratio: {edge_density_louvain}")

Louvain Conductance: 0.6772851142834373
Louvain Coverage: 0.5460750853242321
Louvain Edge Density Ratio: 0.742051282051282


## Pairwise Comparisons

Using normalised mutual information and adjusted random index

### Normalized Mutual Information (NMI): Measures agreement between two clusterings

Range: 0-1 (1 = perfect agreement)

Use: Compare how similar different methods' results are

### Adjusted Rand Index (ARI):

Range: -1 to 1 (1 = perfect agreement, 0 = random)

Adjusts for chance agreement

In [46]:
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

In [47]:
# Compare spectral clustering with Louvain
nmi = normalized_mutual_info_score(labels, labels_louvain)
ari = adjusted_rand_score(labels, labels_louvain)
print(f"NMI between Spectral and Louvain: {nmi}")
print(f"ARI between Spectral and Louvain: {ari}")


NMI between Spectral and Louvain: 0.16859083378414486
ARI between Spectral and Louvain: 0.03137254901960784


## Node2Vec

In [None]:
from node2vec import Node2Vec
from sklearn.cluster import KMeans

node2vec = Node2Vec(
    graph,
    dimensions=64,        # Embedding size
    walk_length=30,       # Length of random walks
    num_walks=200,        # Number of walks per node
    p=1.0,                # Return parameter (likelihood of returning to previous node)
    q=1.0,                # In-out parameter (BFS vs DFS)
    workers=1,            # Set to 1 to avoid multiprocessing issues
    quiet=False           # Show progress
)

model = node2vec.fit(
    window=10,            # Context window size
    min_count=1,          # Minimum word count
    batch_words=4,        # Batch size
    epochs=10             # Training epochs
)

embeddings = {}
for node in graph.nodes():
    try:
        embeddings[node] = model.wv[str(node)]
    except KeyError:
        # Handle isolated nodes
        embeddings[node] = np.zeros(64)

nodes = sorted(embeddings.keys())
X = np.array([embeddings[node] for node in nodes])

# Perform K-means clustering
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
node2vec_labels = kmeans.fit_predict(X)


print(f"✓ Clustered zones into {n_clusters} groups")


Computing transition probabilities: 100%|██████████| 39/39 [00:00<00:00, 7482.98it/s]
Generating walks (CPU: 1): 100%|██████████| 200/200 [00:00<00:00, 1635.64it/s]


✓ Clustered zones into 5 groups


In [50]:
score_node2vec = silhouette_score(node_features, np.array(node2vec_labels))
print(score_node2vec)

-0.24008584847754988


In [52]:
from itertools import product
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
import numpy as np

def eval_config(G, dims, walk_len, n_walks, p, q, window, epochs, k_values, weight_key='transition_frequency'):
    n2v = Node2Vec(
        G, dimensions=dims, walk_length=walk_len, num_walks=n_walks,
        p=p, q=q, weight_key=weight_key, workers=1, quiet=True, seed=42
    )
    model = n2v.fit(window=window, min_count=1, batch_words=4, epochs=epochs)
    nodes = list(G.nodes())
    X = np.array([model.wv[str(n)] for n in nodes])
    X = normalize(X)  # cosine-friendly
    best = {'score': -1}
    for k in k_values:
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
        if len(set(labels)) <= 1: 
            continue
        score = silhouette_score(X, labels, metric='cosine')
        if score > best['score']:
            best = {'score': score, 'k': k, 'labels': labels, 'X': X, 'nodes': nodes}
    return best

dims_list = [128]
walk_list = [20, 30]
nwalks_list = [50, 100, 200]
p_list = [0.5, 1.0, 2.0]
q_list = [1.0, 2.0, 4.0]  # higher q = more local (often better for communities)
window_list = [5, 10]
epochs_list = [5, 10]
k_values = [5]

results = []
for dims, walk, nwalks, p, q, win, ep in product(dims_list, walk_list, nwalks_list, p_list, q_list, window_list, epochs_list):
    res = eval_config(graph, dims, walk, nwalks, p, q, win, ep, k_values, weight_key='transition_frequency')
    results.append((res['score'], {'dims': dims, 'walk': walk, 'nwalks': nwalks, 'p': p, 'q': q, 'win': win, 'ep': ep, 'k': res.get('k')}))
    print(f"score={res['score']:.3f} | dims={dims} walk={walk} nwalks={nwalks} p={p} q={q} win={win} ep={ep} k={res.get('k')}")

best = max(results, key=lambda x: x[0])
print("\nBest:", best)

score=0.419 | dims=128 walk=20 nwalks=50 p=0.5 q=1.0 win=5 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.286 | dims=128 walk=20 nwalks=50 p=0.5 q=1.0 win=5 ep=20 k=5
score=0.392 | dims=128 walk=20 nwalks=50 p=0.5 q=1.0 win=10 ep=10 k=5
score=0.279 | dims=128 walk=20 nwalks=50 p=0.5 q=1.0 win=10 ep=20 k=5
score=0.439 | dims=128 walk=20 nwalks=50 p=0.5 q=2.0 win=5 ep=10 k=5
score=0.311 | dims=128 walk=20 nwalks=50 p=0.5 q=2.0 win=5 ep=20 k=5
score=0.434 | dims=128 walk=20 nwalks=50 p=0.5 q=2.0 win=10 ep=10 k=5
score=0.331 | dims=128 walk=20 nwalks=50 p=0.5 q=2.0 win=10 ep=20 k=5
score=0.338 | dims=128 walk=20 nwalks=50 p=0.5 q=4.0 win=5 ep=10 k=5
score=0.270 | dims=128 walk=20 nwalks=50 p=0.5 q=4.0 win=5 ep=20 k=5
score=0.354 | dims=128 walk=20 nwalks=50 p=0.5 q=4.0 win=10 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.259 | dims=128 walk=20 nwalks=50 p=0.5 q=4.0 win=10 ep=20 k=5
score=0.465 | dims=128 walk=20 nwalks=50 p=1.0 q=1.0 win=5 ep=10 k=5
score=0.332 | dims=128 walk=20 nwalks=50 p=1.0 q=1.0 win=5 ep=20 k=5
score=0.439 | dims=128 walk=20 nwalks=50 p=1.0 q=1.0 win=10 ep=10 k=5
score=0.293 | dims=128 walk=20 nwalks=50 p=1.0 q=1.0 win=10 ep=20 k=5
score=0.415 | dims=128 walk=20 nwalks=50 p=1.0 q=2.0 win=5 ep=10 k=5
score=0.286 | dims=128 walk=20 nwalks=50 p=1.0 q=2.0 win=5 ep=20 k=5
score=0.389 | dims=128 walk=20 nwalks=50 p=1.0 q=2.0 win=10 ep=10 k=5
score=0.301 | dims=128 walk=20 nwalks=50 p=1.0 q=2.0 win=10 ep=20 k=5
score=0.450 | dims=128 walk=20 nwalks=50 p=1.0 q=4.0 win=5 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.282 | dims=128 walk=20 nwalks=50 p=1.0 q=4.0 win=5 ep=20 k=5
score=0.417 | dims=128 walk=20 nwalks=50 p=1.0 q=4.0 win=10 ep=10 k=5
score=0.336 | dims=128 walk=20 nwalks=50 p=1.0 q=4.0 win=10 ep=20 k=5
score=0.546 | dims=128 walk=20 nwalks=50 p=2.0 q=1.0 win=5 ep=10 k=5
score=0.366 | dims=128 walk=20 nwalks=50 p=2.0 q=1.0 win=5 ep=20 k=5
score=0.506 | dims=128 walk=20 nwalks=50 p=2.0 q=1.0 win=10 ep=10 k=5
score=0.335 | dims=128 walk=20 nwalks=50 p=2.0 q=1.0 win=10 ep=20 k=5
score=0.492 | dims=128 walk=20 nwalks=50 p=2.0 q=2.0 win=5 ep=10 k=5
score=0.332 | dims=128 walk=20 nwalks=50 p=2.0 q=2.0 win=5 ep=20 k=5
score=0.424 | dims=128 walk=20 nwalks=50 p=2.0 q=2.0 win=10 ep=10 k=5
score=0.370 | dims=128 walk=20 nwalks=50 p=2.0 q=2.0 win=10 ep=20 k=5
score=0.504 | dims=128 walk=20 nwalks=50 p=2.0 q=4.0 win=5 ep=10 k=5
score=0.341 | dims=128 walk=20 nwalks=50 p=2.0 q=4.0 win=5 ep=20 k=5
score=0.469 | dims=128 walk=20 nwalks=50 p=2.0 q=4.0 win=10 ep=10 k=5
score=0.350 | dims=128 walk

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.253 | dims=128 walk=20 nwalks=100 p=0.5 q=4.0 win=10 ep=20 k=5
score=0.343 | dims=128 walk=20 nwalks=100 p=1.0 q=1.0 win=5 ep=10 k=5
score=0.248 | dims=128 walk=20 nwalks=100 p=1.0 q=1.0 win=5 ep=20 k=5
score=0.337 | dims=128 walk=20 nwalks=100 p=1.0 q=1.0 win=10 ep=10 k=5
score=0.228 | dims=128 walk=20 nwalks=100 p=1.0 q=1.0 win=10 ep=20 k=5
score=0.339 | dims=128 walk=20 nwalks=100 p=1.0 q=2.0 win=5 ep=10 k=5
score=0.261 | dims=128 walk=20 nwalks=100 p=1.0 q=2.0 win=5 ep=20 k=5
score=0.318 | dims=128 walk=20 nwalks=100 p=1.0 q=2.0 win=10 ep=10 k=5
score=0.229 | dims=128 walk=20 nwalks=100 p=1.0 q=2.0 win=10 ep=20 k=5
score=0.320 | dims=128 walk=20 nwalks=100 p=1.0 q=4.0 win=5 ep=10 k=5
score=0.216 | dims=128 walk=20 nwalks=100 p=1.0 q=4.0 win=5 ep=20 k=5
score=0.343 | dims=128 walk=20 nwalks=100 p=1.0 q=4.0 win=10 ep=10 k=5
score=0.260 | dims=128 walk=20 nwalks=100 p=1.0 q=4.0 win=10 ep=20 k=5
score=0.347 | dims=128 walk=20 nwalks=100 p=2.0 q=1.0 win=5 ep=10 k=5
score=0.234 |

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.262 | dims=128 walk=20 nwalks=100 p=2.0 q=2.0 win=5 ep=20 k=5
score=0.310 | dims=128 walk=20 nwalks=100 p=2.0 q=2.0 win=10 ep=10 k=5
score=0.242 | dims=128 walk=20 nwalks=100 p=2.0 q=2.0 win=10 ep=20 k=5
score=0.365 | dims=128 walk=20 nwalks=100 p=2.0 q=4.0 win=5 ep=10 k=5
score=0.280 | dims=128 walk=20 nwalks=100 p=2.0 q=4.0 win=5 ep=20 k=5
score=0.351 | dims=128 walk=20 nwalks=100 p=2.0 q=4.0 win=10 ep=10 k=5
score=0.238 | dims=128 walk=20 nwalks=100 p=2.0 q=4.0 win=10 ep=20 k=5
score=0.252 | dims=128 walk=20 nwalks=200 p=0.5 q=1.0 win=5 ep=10 k=5
score=0.204 | dims=128 walk=20 nwalks=200 p=0.5 q=1.0 win=5 ep=20 k=5
score=0.213 | dims=128 walk=20 nwalks=200 p=0.5 q=1.0 win=10 ep=10 k=5
score=0.202 | dims=128 walk=20 nwalks=200 p=0.5 q=1.0 win=10 ep=20 k=5
score=0.264 | dims=128 walk=20 nwalks=200 p=0.5 q=2.0 win=5 ep=10 k=5
score=0.228 | dims=128 walk=20 nwalks=200 p=0.5 q=2.0 win=5 ep=20 k=5
score=0.275 | dims=128 walk=20 nwalks=200 p=0.5 q=2.0 win=10 ep=10 k=5
score=0.214 |

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.242 | dims=128 walk=20 nwalks=200 p=1.0 q=1.0 win=5 ep=10 k=5
score=0.224 | dims=128 walk=20 nwalks=200 p=1.0 q=1.0 win=5 ep=20 k=5
score=0.262 | dims=128 walk=20 nwalks=200 p=1.0 q=1.0 win=10 ep=10 k=5
score=0.202 | dims=128 walk=20 nwalks=200 p=1.0 q=1.0 win=10 ep=20 k=5
score=0.244 | dims=128 walk=20 nwalks=200 p=1.0 q=2.0 win=5 ep=10 k=5
score=0.231 | dims=128 walk=20 nwalks=200 p=1.0 q=2.0 win=5 ep=20 k=5
score=0.245 | dims=128 walk=20 nwalks=200 p=1.0 q=2.0 win=10 ep=10 k=5
score=0.209 | dims=128 walk=20 nwalks=200 p=1.0 q=2.0 win=10 ep=20 k=5
score=0.292 | dims=128 walk=20 nwalks=200 p=1.0 q=4.0 win=5 ep=10 k=5
score=0.253 | dims=128 walk=20 nwalks=200 p=1.0 q=4.0 win=5 ep=20 k=5
score=0.275 | dims=128 walk=20 nwalks=200 p=1.0 q=4.0 win=10 ep=10 k=5
score=0.206 | dims=128 walk=20 nwalks=200 p=1.0 q=4.0 win=10 ep=20 k=5
score=0.289 | dims=128 walk=20 nwalks=200 p=2.0 q=1.0 win=5 ep=10 k=5
score=0.240 | dims=128 walk=20 nwalks=200 p=2.0 q=1.0 win=5 ep=20 k=5
score=0.276 | 

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.274 | dims=128 walk=20 nwalks=200 p=2.0 q=2.0 win=10 ep=10 k=5
score=0.207 | dims=128 walk=20 nwalks=200 p=2.0 q=2.0 win=10 ep=20 k=5
score=0.284 | dims=128 walk=20 nwalks=200 p=2.0 q=4.0 win=5 ep=10 k=5
score=0.253 | dims=128 walk=20 nwalks=200 p=2.0 q=4.0 win=5 ep=20 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.267 | dims=128 walk=20 nwalks=200 p=2.0 q=4.0 win=10 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.229 | dims=128 walk=20 nwalks=200 p=2.0 q=4.0 win=10 ep=20 k=5
score=0.358 | dims=128 walk=30 nwalks=50 p=0.5 q=1.0 win=5 ep=10 k=5
score=0.252 | dims=128 walk=30 nwalks=50 p=0.5 q=1.0 win=5 ep=20 k=5
score=0.290 | dims=128 walk=30 nwalks=50 p=0.5 q=1.0 win=10 ep=10 k=5
score=0.242 | dims=128 walk=30 nwalks=50 p=0.5 q=1.0 win=10 ep=20 k=5
score=0.357 | dims=128 walk=30 nwalks=50 p=0.5 q=2.0 win=5 ep=10 k=5
score=0.258 | dims=128 walk=30 nwalks=50 p=0.5 q=2.0 win=5 ep=20 k=5
score=0.365 | dims=128 walk=30 nwalks=50 p=0.5 q=2.0 win=10 ep=10 k=5
score=0.261 | dims=128 walk=30 nwalks=50 p=0.5 q=2.0 win=10 ep=20 k=5
score=0.370 | dims=128 walk=30 nwalks=50 p=0.5 q=4.0 win=5 ep=10 k=5
score=0.284 | dims=128 walk=30 nwalks=50 p=0.5 q=4.0 win=5 ep=20 k=5
score=0.313 | dims=128 walk=30 nwalks=50 p=0.5 q=4.0 win=10 ep=10 k=5
score=0.272 | dims=128 walk=30 nwalks=50 p=0.5 q=4.0 win=10 ep=20 k=5
score=0.392 | dims=128 walk=30 nwalks=50 p=1.0 q=1.0 win=5 ep=10 k=5
score=0.257 | dims=128 wal

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.256 | dims=128 walk=30 nwalks=100 p=0.5 q=2.0 win=5 ep=20 k=5
score=0.288 | dims=128 walk=30 nwalks=100 p=0.5 q=2.0 win=10 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.229 | dims=128 walk=30 nwalks=100 p=0.5 q=2.0 win=10 ep=20 k=5
score=0.281 | dims=128 walk=30 nwalks=100 p=0.5 q=4.0 win=5 ep=10 k=5
score=0.272 | dims=128 walk=30 nwalks=100 p=0.5 q=4.0 win=5 ep=20 k=5
score=0.292 | dims=128 walk=30 nwalks=100 p=0.5 q=4.0 win=10 ep=10 k=5
score=0.263 | dims=128 walk=30 nwalks=100 p=0.5 q=4.0 win=10 ep=20 k=5
score=0.316 | dims=128 walk=30 nwalks=100 p=1.0 q=1.0 win=5 ep=10 k=5
score=0.240 | dims=128 walk=30 nwalks=100 p=1.0 q=1.0 win=5 ep=20 k=5
score=0.272 | dims=128 walk=30 nwalks=100 p=1.0 q=1.0 win=10 ep=10 k=5
score=0.218 | dims=128 walk=30 nwalks=100 p=1.0 q=1.0 win=10 ep=20 k=5
score=0.314 | dims=128 walk=30 nwalks=100 p=1.0 q=2.0 win=5 ep=10 k=5
score=0.266 | dims=128 walk=30 nwalks=100 p=1.0 q=2.0 win=5 ep=20 k=5
score=0.297 | dims=128 walk=30 nwalks=100 p=1.0 q=2.0 win=10 ep=10 k=5
score=0.244 | dims=128 walk=30 nwalks=100 p=1.0 q=2.0 win=10 ep=20 k=5
score=0.285 | dims=128 walk=30 nwalks=100 p=1.0 q=4.0 win=5 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.221 | dims=128 walk=30 nwalks=100 p=1.0 q=4.0 win=5 ep=20 k=5
score=0.258 | dims=128 walk=30 nwalks=100 p=1.0 q=4.0 win=10 ep=10 k=5
score=0.234 | dims=128 walk=30 nwalks=100 p=1.0 q=4.0 win=10 ep=20 k=5
score=0.314 | dims=128 walk=30 nwalks=100 p=2.0 q=1.0 win=5 ep=10 k=5
score=0.242 | dims=128 walk=30 nwalks=100 p=2.0 q=1.0 win=5 ep=20 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.322 | dims=128 walk=30 nwalks=100 p=2.0 q=1.0 win=10 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.252 | dims=128 walk=30 nwalks=100 p=2.0 q=1.0 win=10 ep=20 k=5
score=0.338 | dims=128 walk=30 nwalks=100 p=2.0 q=2.0 win=5 ep=10 k=5
score=0.266 | dims=128 walk=30 nwalks=100 p=2.0 q=2.0 win=5 ep=20 k=5
score=0.345 | dims=128 walk=30 nwalks=100 p=2.0 q=2.0 win=10 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.277 | dims=128 walk=30 nwalks=100 p=2.0 q=2.0 win=10 ep=20 k=5
score=0.323 | dims=128 walk=30 nwalks=100 p=2.0 q=4.0 win=5 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.259 | dims=128 walk=30 nwalks=100 p=2.0 q=4.0 win=5 ep=20 k=5
score=0.319 | dims=128 walk=30 nwalks=100 p=2.0 q=4.0 win=10 ep=10 k=5
score=0.239 | dims=128 walk=30 nwalks=100 p=2.0 q=4.0 win=10 ep=20 k=5
score=0.220 | dims=128 walk=30 nwalks=200 p=0.5 q=1.0 win=5 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.173 | dims=128 walk=30 nwalks=200 p=0.5 q=1.0 win=5 ep=20 k=5
score=0.220 | dims=128 walk=30 nwalks=200 p=0.5 q=1.0 win=10 ep=10 k=5
score=0.254 | dims=128 walk=30 nwalks=200 p=0.5 q=1.0 win=10 ep=20 k=5
score=0.257 | dims=128 walk=30 nwalks=200 p=0.5 q=2.0 win=5 ep=10 k=5
score=0.237 | dims=128 walk=30 nwalks=200 p=0.5 q=2.0 win=5 ep=20 k=5
score=0.248 | dims=128 walk=30 nwalks=200 p=0.5 q=2.0 win=10 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.239 | dims=128 walk=30 nwalks=200 p=0.5 q=2.0 win=10 ep=20 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.277 | dims=128 walk=30 nwalks=200 p=0.5 q=4.0 win=5 ep=10 k=5
score=0.228 | dims=128 walk=30 nwalks=200 p=0.5 q=4.0 win=5 ep=20 k=5
score=0.246 | dims=128 walk=30 nwalks=200 p=0.5 q=4.0 win=10 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.247 | dims=128 walk=30 nwalks=200 p=0.5 q=4.0 win=10 ep=20 k=5
score=0.237 | dims=128 walk=30 nwalks=200 p=1.0 q=1.0 win=5 ep=10 k=5
score=0.234 | dims=128 walk=30 nwalks=200 p=1.0 q=1.0 win=5 ep=20 k=5
score=0.238 | dims=128 walk=30 nwalks=200 p=1.0 q=1.0 win=10 ep=10 k=5
score=0.228 | dims=128 walk=30 nwalks=200 p=1.0 q=1.0 win=10 ep=20 k=5
score=0.263 | dims=128 walk=30 nwalks=200 p=1.0 q=2.0 win=5 ep=10 k=5
score=0.255 | dims=128 walk=30 nwalks=200 p=1.0 q=2.0 win=5 ep=20 k=5
score=0.262 | dims=128 walk=30 nwalks=200 p=1.0 q=2.0 win=10 ep=10 k=5
score=0.247 | dims=128 walk=30 nwalks=200 p=1.0 q=2.0 win=10 ep=20 k=5
score=0.258 | dims=128 walk=30 nwalks=200 p=1.0 q=4.0 win=5 ep=10 k=5
score=0.222 | dims=128 walk=30 nwalks=200 p=1.0 q=4.0 win=5 ep=20 k=5
score=0.246 | dims=128 walk=30 nwalks=200 p=1.0 q=4.0 win=10 ep=10 k=5
score=0.266 | dims=128 walk=30 nwalks=200 p=1.0 q=4.0 win=10 ep=20 k=5
score=0.274 | dims=128 walk=30 nwalks=200 p=2.0 q=1.0 win=5 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.225 | dims=128 walk=30 nwalks=200 p=2.0 q=1.0 win=5 ep=20 k=5
score=0.276 | dims=128 walk=30 nwalks=200 p=2.0 q=1.0 win=10 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.223 | dims=128 walk=30 nwalks=200 p=2.0 q=1.0 win=10 ep=20 k=5
score=0.262 | dims=128 walk=30 nwalks=200 p=2.0 q=2.0 win=5 ep=10 k=5
score=0.248 | dims=128 walk=30 nwalks=200 p=2.0 q=2.0 win=5 ep=20 k=5
score=0.274 | dims=128 walk=30 nwalks=200 p=2.0 q=2.0 win=10 ep=10 k=5
score=0.251 | dims=128 walk=30 nwalks=200 p=2.0 q=2.0 win=10 ep=20 k=5
score=0.272 | dims=128 walk=30 nwalks=200 p=2.0 q=4.0 win=5 ep=10 k=5


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.265 | dims=128 walk=30 nwalks=200 p=2.0 q=4.0 win=5 ep=20 k=5
score=0.277 | dims=128 walk=30 nwalks=200 p=2.0 q=4.0 win=10 ep=10 k=5
score=0.246 | dims=128 walk=30 nwalks=200 p=2.0 q=4.0 win=10 ep=20 k=5

Best: (0.54637610912323, {'dims': 128, 'walk': 20, 'nwalks': 50, 'p': 2.0, 'q': 1.0, 'win': 5, 'ep': 10, 'k': 5})


Best: (0.54637610912323, {'dims': 128, 'walk': 20, 'nwalks': 50, 'p': 2.0, 'q': 1.0, 'win': 5, 'ep': 10, 'k': 5}

In [54]:

dims_list = [16, 32]
walk_list = [10, 15, 20]
nwalks_list = [30, 40]
p_list = [0.5, 1.0, 2.0]
q_list = [1.0, 2.0, 4.0]  # higher q = more local (often better for communities)
window_list = [5, 10]
epochs_list = [5, 10]
k_values = [5]

results = []
for dims, walk, nwalks, p, q, win, ep in product(dims_list, walk_list, nwalks_list, p_list, q_list, window_list, epochs_list):
    res = eval_config(graph, dims, walk, nwalks, p, q, win, ep, k_values, weight_key='transition_frequency')
    results.append((res['score'], {'dims': dims, 'walk': walk, 'nwalks': nwalks, 'p': p, 'q': q, 'win': win, 'ep': ep, 'k': res.get('k')}))
    print(f"score={res['score']:.3f} | dims={dims} walk={walk} nwalks={nwalks} p={p} q={q} win={win} ep={ep} k={res.get('k')}")

best = max(results, key=lambda x: x[0])
print("\nBest:", best)

score=0.240 | dims=16 walk=10 nwalks=30 p=0.5 q=1.0 win=5 ep=5 k=5
score=0.554 | dims=16 walk=10 nwalks=30 p=0.5 q=1.0 win=5 ep=10 k=5
score=0.348 | dims=16 walk=10 nwalks=30 p=0.5 q=1.0 win=10 ep=5 k=5
score=0.536 | dims=16 walk=10 nwalks=30 p=0.5 q=1.0 win=10 ep=10 k=5
score=0.415 | dims=16 walk=10 nwalks=30 p=0.5 q=2.0 win=5 ep=5 k=5
score=0.566 | dims=16 walk=10 nwalks=30 p=0.5 q=2.0 win=5 ep=10 k=5
score=0.386 | dims=16 walk=10 nwalks=30 p=0.5 q=2.0 win=10 ep=5 k=5
score=0.560 | dims=16 walk=10 nwalks=30 p=0.5 q=2.0 win=10 ep=10 k=5
score=0.304 | dims=16 walk=10 nwalks=30 p=0.5 q=4.0 win=5 ep=5 k=5
score=0.525 | dims=16 walk=10 nwalks=30 p=0.5 q=4.0 win=5 ep=10 k=5
score=0.377 | dims=16 walk=10 nwalks=30 p=0.5 q=4.0 win=10 ep=5 k=5
score=0.541 | dims=16 walk=10 nwalks=30 p=0.5 q=4.0 win=10 ep=10 k=5
score=0.225 | dims=16 walk=10 nwalks=30 p=1.0 q=1.0 win=5 ep=5 k=5
score=0.580 | dims=16 walk=10 nwalks=30 p=1.0 q=1.0 win=5 ep=10 k=5
score=0.260 | dims=16 walk=10 nwalks=30 p=1.0 q=1

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


score=0.617 | dims=32 walk=10 nwalks=30 p=2.0 q=4.0 win=10 ep=10 k=5
score=0.382 | dims=32 walk=10 nwalks=40 p=0.5 q=1.0 win=5 ep=5 k=5
score=0.535 | dims=32 walk=10 nwalks=40 p=0.5 q=1.0 win=5 ep=10 k=5
score=0.367 | dims=32 walk=10 nwalks=40 p=0.5 q=1.0 win=10 ep=5 k=5
score=0.562 | dims=32 walk=10 nwalks=40 p=0.5 q=1.0 win=10 ep=10 k=5
score=0.494 | dims=32 walk=10 nwalks=40 p=0.5 q=2.0 win=5 ep=5 k=5
score=0.502 | dims=32 walk=10 nwalks=40 p=0.5 q=2.0 win=5 ep=10 k=5
score=0.534 | dims=32 walk=10 nwalks=40 p=0.5 q=2.0 win=10 ep=5 k=5
score=0.481 | dims=32 walk=10 nwalks=40 p=0.5 q=2.0 win=10 ep=10 k=5
score=0.467 | dims=32 walk=10 nwalks=40 p=0.5 q=4.0 win=5 ep=5 k=5
score=0.548 | dims=32 walk=10 nwalks=40 p=0.5 q=4.0 win=5 ep=10 k=5
score=0.500 | dims=32 walk=10 nwalks=40 p=0.5 q=4.0 win=10 ep=5 k=5
score=0.479 | dims=32 walk=10 nwalks=40 p=0.5 q=4.0 win=10 ep=10 k=5
score=0.327 | dims=32 walk=10 nwalks=40 p=1.0 q=1.0 win=5 ep=5 k=5
score=0.520 | dims=32 walk=10 nwalks=40 p=1.0 q=

Best: (0.657311737537384, {'dims': 32, 'walk': 10, 'nwalks': 40, 'p': 2.0, 'q': 4.0, 'win': 10, 'ep': 10, 'k': 5}

### Best visualization

In [None]:
node2vec = Node2Vec(
    graph,
    dimensions=32,        # Embedding size
    walk_length=10,       # Length of random walks
    num_walks=40,        # Number of walks per node
    p=2.0,                # Return parameter (likelihood of returning to previous node)
    q=4.0,                # In-out parameter (BFS vs DFS)
    workers=1,            # Set to 1 to avoid multiprocessing issues
    quiet=False,
    seed=42           
)

model = node2vec.fit(
    window=10,            # Context window size
    min_count=1,          # Minimum word count
    batch_words=4,        # Batch size
    epochs=10             # Training epochs
)

# Perform K-means clustering
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
node2vec_labels = kmeans.fit_predict(X)

nodes = list(graph.nodes())
X = np.array([model.wv[str(n)] for n in nodes])
X = normalize(X)

score = silhouette_score(X, node2vec_labels, metric='cosine')
print(score)

Computing transition probabilities: 100%|██████████| 39/39 [00:00<00:00, 12307.42it/s]
Generating walks (CPU: 1): 100%|██████████| 40/40 [00:00<00:00, 2686.93it/s]


0.657311737537384


In [83]:
import pickle
import numpy as np
from glob import glob
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize

# Find all graph files
graph_files = sorted(glob('./graphs/*.pkl'))
print(f"Found {len(graph_files)} graphs")

silhouette_scores = []

for i, file in enumerate(graph_files, start=1):
    with open(file, 'rb') as f:
        G = pickle.load(f)

    print(f"\nProcessing graph {i}/{len(graph_files)}: {file}")

    # Train Node2Vec on the current graph
    node2vec = Node2Vec(
        G,
        dimensions=32,
        walk_length=10,
        num_walks=40,
        p=2.0,
        q=4.0,
        workers=1,
        quiet=True,
        seed=42
    )

    model = node2vec.fit(
        window=10,
        min_count=1,
        batch_words=4,
        epochs=10
    )

    # Extract embeddings
    nodes = list(G.nodes())
    X = np.array([model.wv[str(n)] for n in nodes])
    X = normalize(X)

    # Perform K-means clustering
    n_clusters = 5
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)

    # Compute silhouette score (using cosine distance)
    score = silhouette_score(X, labels, metric='cosine')
    silhouette_scores.append(score)

    print(f"Silhouette score for graph {i}: {score:.4f}")

# Compute mean and standard deviation
mean_score = np.mean(silhouette_scores)
std_score = np.std(silhouette_scores)

print("\n=== Summary ===")
print(f"Mean Silhouette Score: {mean_score:.4f}")
print(f"Standard Deviation:    {std_score:.4f}")


Found 112 graphs

Processing graph 1/112: ./graphs/graph_match2499719_team1609.pkl
Silhouette score for graph 1: 0.6573

Processing graph 2/112: ./graphs/graph_match2499720_team1625.pkl
Silhouette score for graph 2: 0.6538

Processing graph 3/112: ./graphs/graph_match2499727_team1612.pkl
Silhouette score for graph 3: 0.6071

Processing graph 4/112: ./graphs/graph_match2499733_team1612.pkl
Silhouette score for graph 4: 0.6064

Processing graph 5/112: ./graphs/graph_match2499734_team1625.pkl
Silhouette score for graph 5: 0.6531

Processing graph 6/112: ./graphs/graph_match2499735_team1609.pkl
Silhouette score for graph 6: 0.5911

Processing graph 7/112: ./graphs/graph_match2499739_team1625.pkl
Silhouette score for graph 7: 0.6174

Processing graph 8/112: ./graphs/graph_match2499743_team1609.pkl
Silhouette score for graph 8: 0.5033

Processing graph 9/112: ./graphs/graph_match2499743_team1612.pkl


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Silhouette score for graph 9: 0.5606

Processing graph 10/112: ./graphs/graph_match2499749_team1609.pkl
Silhouette score for graph 10: 0.5280

Processing graph 11/112: ./graphs/graph_match2499754_team1612.pkl
Silhouette score for graph 11: 0.6237

Processing graph 12/112: ./graphs/graph_match2499754_team1625.pkl
Silhouette score for graph 12: 0.4923

Processing graph 13/112: ./graphs/graph_match2499760_team1609.pkl
Silhouette score for graph 13: 0.5713

Processing graph 14/112: ./graphs/graph_match2499763_team1612.pkl
Silhouette score for graph 14: 0.5297

Processing graph 15/112: ./graphs/graph_match2499767_team1625.pkl
Silhouette score for graph 15: 0.6068

Processing graph 16/112: ./graphs/graph_match2499769_team1609.pkl
Silhouette score for graph 16: 0.6002

Processing graph 17/112: ./graphs/graph_match2499773_team1612.pkl
Silhouette score for graph 17: 0.5806

Processing graph 18/112: ./graphs/graph_match2499774_team1625.pkl
Silhouette score for graph 18: 0.4850

Processing graph 

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Silhouette score for graph 30: 0.5220

Processing graph 31/112: ./graphs/graph_match2499822_team1609.pkl
Silhouette score for graph 31: 0.5783

Processing graph 32/112: ./graphs/graph_match2499822_team1625.pkl
Silhouette score for graph 32: 0.6383

Processing graph 33/112: ./graphs/graph_match2499828_team1612.pkl
Silhouette score for graph 33: 0.5118

Processing graph 34/112: ./graphs/graph_match2499830_team1609.pkl
Silhouette score for graph 34: 0.5439

Processing graph 35/112: ./graphs/graph_match2499834_team1625.pkl
Silhouette score for graph 35: 0.5653

Processing graph 36/112: ./graphs/graph_match2499835_team1612.pkl
Silhouette score for graph 36: 0.5354

Processing graph 37/112: ./graphs/graph_match2499839_team1609.pkl


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Silhouette score for graph 37: 0.5329

Processing graph 38/112: ./graphs/graph_match2499841_team1625.pkl
Silhouette score for graph 38: 0.5093

Processing graph 39/112: ./graphs/graph_match2499842_team1612.pkl
Silhouette score for graph 39: 0.5623

Processing graph 40/112: ./graphs/graph_match2499850_team1609.pkl
Silhouette score for graph 40: 0.5883

Processing graph 41/112: ./graphs/graph_match2499857_team1625.pkl
Silhouette score for graph 41: 0.5640

Processing graph 42/112: ./graphs/graph_match2499858_team1612.pkl
Silhouette score for graph 42: 0.5844

Processing graph 43/112: ./graphs/graph_match2499860_team1609.pkl
Silhouette score for graph 43: 0.5604

Processing graph 44/112: ./graphs/graph_match2499861_team1612.pkl
Silhouette score for graph 44: 0.5219

Processing graph 45/112: ./graphs/graph_match2499865_team1625.pkl
Silhouette score for graph 45: 0.5270

Processing graph 46/112: ./graphs/graph_match2499872_team1612.pkl
Silhouette score for graph 46: 0.5119

Processing graph

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Silhouette score for graph 50: 0.5248

Processing graph 51/112: ./graphs/graph_match2499887_team1612.pkl
Silhouette score for graph 51: 0.5613

Processing graph 52/112: ./graphs/graph_match2499889_team1612.pkl
Silhouette score for graph 52: 0.5508

Processing graph 53/112: ./graphs/graph_match2499890_team1609.pkl
Silhouette score for graph 53: 0.5280

Processing graph 54/112: ./graphs/graph_match2499895_team1625.pkl
Silhouette score for graph 54: 0.6236

Processing graph 55/112: ./graphs/graph_match2499899_team1609.pkl
Silhouette score for graph 55: 0.6441

Processing graph 56/112: ./graphs/graph_match2499899_team1612.pkl
Silhouette score for graph 56: 0.5940

Processing graph 57/112: ./graphs/graph_match2499904_team1625.pkl
Silhouette score for graph 57: 0.4746

Processing graph 58/112: ./graphs/graph_match2499911_team1609.pkl
Silhouette score for graph 58: 0.5701

Processing graph 59/112: ./graphs/graph_match2499913_team1612.pkl
Silhouette score for graph 59: 0.6658

Processing graph

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Silhouette score for graph 78: 0.4688

Processing graph 79/112: ./graphs/graph_match2499982_team1625.pkl
Silhouette score for graph 79: 0.5489

Processing graph 80/112: ./graphs/graph_match2499984_team1612.pkl
Silhouette score for graph 80: 0.6409

Processing graph 81/112: ./graphs/graph_match2499987_team1609.pkl
Silhouette score for graph 81: 0.5875

Processing graph 82/112: ./graphs/graph_match2499990_team1609.pkl
Silhouette score for graph 82: 0.5447

Processing graph 83/112: ./graphs/graph_match2499990_team1625.pkl
Silhouette score for graph 83: 0.5363

Processing graph 84/112: ./graphs/graph_match2499995_team1612.pkl
Silhouette score for graph 84: 0.5753

Processing graph 85/112: ./graphs/graph_match2499999_team1609.pkl
Silhouette score for graph 85: 0.5259

Processing graph 86/112: ./graphs/graph_match2500003_team1612.pkl
Silhouette score for graph 86: 0.4882

Processing graph 87/112: ./graphs/graph_match2500004_team1625.pkl
Silhouette score for graph 87: 0.5222

Processing graph

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Silhouette score for graph 101: 0.5014

Processing graph 102/112: ./graphs/graph_match2500065_team1625.pkl
Silhouette score for graph 102: 0.4966

Processing graph 103/112: ./graphs/graph_match2500068_team1612.pkl
Silhouette score for graph 103: 0.5463

Processing graph 104/112: ./graphs/graph_match2500072_team1612.pkl
Silhouette score for graph 104: 0.5035

Processing graph 105/112: ./graphs/graph_match2500073_team1609.pkl
Silhouette score for graph 105: 0.5597

Processing graph 106/112: ./graphs/graph_match2500078_team1625.pkl
Silhouette score for graph 106: 0.6222

Processing graph 107/112: ./graphs/graph_match2500080_team1609.pkl
Silhouette score for graph 107: 0.4664

Processing graph 108/112: ./graphs/graph_match2500082_team1612.pkl
Silhouette score for graph 108: 0.5951

Processing graph 109/112: ./graphs/graph_match2500085_team1625.pkl
Silhouette score for graph 109: 0.5591

Processing graph 110/112: ./graphs/graph_match2500091_team1609.pkl
Silhouette score for graph 110: 0.608

=== Summary ===

Mean Silhouette Score: 0.5565

Standard Deviation:    0.0504

In [85]:
import pickle
import numpy as np
from glob import glob
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from collections import defaultdict, Counter

# Process first 3 graphs for detailed analysis
graph_files = sorted(glob('./graphs/*.pkl'))[:3]
print(f"Analyzing {len(graph_files)} graphs in detail\n")

for graph_idx, file in enumerate(graph_files, start=1):
    with open(file, 'rb') as f:
        G = pickle.load(f)

    print(f"\n{'='*70}")
    print(f"GRAPH {graph_idx}: {file}")
    print(f"Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")
    print(f"{'='*70}")

    # Train Node2Vec
    node2vec = Node2Vec(
        G,
        dimensions=32,
        walk_length=10,
        num_walks=40,
        p=2.0,
        q=4.0,
        workers=1,
        quiet=True,
        seed=42
    )

    model = node2vec.fit(
        window=10,
        min_count=1,
        batch_words=4,
        epochs=10
    )

    # Extract embeddings
    nodes = list(G.nodes())
    X = np.array([model.wv[str(n)] for n in nodes])
    X = normalize(X)

    # Perform K-means clustering
    k = 5
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)

    print(f"\n{'='*70}")
    print(f"TACTICAL CLUSTER INTERPRETATION (k={k})")
    print(f"{'='*70}")

    # Analyze each cluster
    for cluster_id in range(k):
        mask = labels == cluster_id
        cluster_nodes = [nodes[i] for i in range(len(nodes)) if mask[i]]
        cluster_embeddings = X[mask]
        
        print(f"\n{'='*70}")
        print(f"CLUSTER {cluster_id} ({len(cluster_nodes)} nodes)")
        print(f"{'='*70}")
        
        # Collect node attributes
        node_attributes = defaultdict(list)
        for node in cluster_nodes:
            node_data = G.nodes[node]
            for key, value in node_data.items():
                node_attributes[key].append(value)
        
        # Print all available attributes for debugging
        print(f"\nAvailable attributes: {list(node_attributes.keys())}")
        
        # If you have zone information
        if 'zone' in node_attributes or 'zone_id' in node_attributes:
            zone_key = 'zone' if 'zone' in node_attributes else 'zone_id'
            zones = node_attributes[zone_key]
            unique_zones = set(zones)
            print(f"\nZones: {sorted(unique_zones)}")
            print(f"   Unique zones in cluster: {len(unique_zones)}")
        
        # Event characteristics - with type checking
        if 'event_count' in node_attributes or 'events' in node_attributes:
            print(f"\nAverage Activity:")
            event_key = 'event_count' if 'event_count' in node_attributes else 'events'
            events = node_attributes[event_key]
            # Filter numeric values only
            numeric_events = [e for e in events if isinstance(e, (int, float))]
            if numeric_events:
                print(f"   Event count: {np.mean(numeric_events):.1f} ± {np.std(numeric_events):.1f}")
        
        if 'unique_players' in node_attributes or 'player_count' in node_attributes:
            player_key = 'unique_players' if 'unique_players' in node_attributes else 'player_count'
            players = node_attributes[player_key]
            numeric_players = [p for p in players if isinstance(p, (int, float))]
            if numeric_players:
                print(f"   Unique players: {np.mean(numeric_players):.1f} ± {np.std(numeric_players):.1f}")
        
        # Event type distribution - with type checking
        event_type_keys = [k for k in node_attributes.keys() if 'event' in k.lower() or k in ['pass', 'shot', 'dribble', 'tackle', 'interception']]
        if event_type_keys:
            print(f"\nEvent Type Distribution:")
            event_totals = {}
            for event_type in event_type_keys:
                values = node_attributes[event_type]
                # Check if values are numeric
                if values and isinstance(values[0], (int, float)):
                    total = sum(values)
                    event_totals[event_type] = total
                elif values and isinstance(values[0], str):
                    # If strings, show distribution instead
                    counter = Counter(values)
                    print(f"   {event_type}: {dict(counter.most_common(3))}")
            
            if event_totals:
                total_events = sum(event_totals.values())
                if total_events > 0:
                    for event_type, total in sorted(event_totals.items(), key=lambda x: -x[1]):
                        pct = 100 * total / total_events
                        if pct > 5:
                            print(f"   {event_type:15s}: {pct:5.1f}%")
        
        # Player roles - with type checking
        role_keys = [k for k in node_attributes.keys() if k in ['GK', 'DEF', 'MID', 'FWD', 'goalkeeper', 'defender', 'midfielder', 'forward']]
        if role_keys:
            print(f"\nDominant Player Roles:")
            for role in role_keys:
                values = node_attributes[role]
                numeric_values = [v for v in values if isinstance(v, (int, float))]
                if numeric_values:
                    avg_pct = 100 * np.mean(numeric_values)
                    print(f"   {role:12s}: {avg_pct:5.1f}%")
        
        # Categorical attributes (for any string attributes)
        categorical_attrs = [k for k in node_attributes.keys() 
                           if node_attributes[k] and isinstance(node_attributes[k][0], str)]
        if categorical_attrs:
            print(f"\nCategorical Attributes:")
            for attr in categorical_attrs[:5]:  # Limit to first 5
                counter = Counter(node_attributes[attr])

Analyzing 3 graphs in detail


GRAPH 1: ./graphs/graph_match2499719_team1609.pkl
Nodes: 39, Edges: 293

TACTICAL CLUSTER INTERPRETATION (k=5)

CLUSTER 0 (2 nodes)

Available attributes: ['event_count', 'most_common_event', 'unique_players', 'zone_name', 'role_distribution']

Average Activity:
   Event count: 6.5 ± 4.5
   Unique players: 2.5 ± 1.5

Event Type Distribution:
   most_common_event: {'DUEL': 2}
   event_count    : 100.0%

Categorical Attributes:

CLUSTER 1 (16 nodes)

Available attributes: ['event_count', 'most_common_event', 'unique_players', 'zone_name', 'role_distribution']

Average Activity:
   Event count: 22.9 ± 13.8
   Unique players: 5.6 ± 2.5

Event Type Distribution:
   most_common_event: {'PASS': 13, 'DUEL': 3}
   event_count    : 100.0%

Categorical Attributes:

CLUSTER 2 (8 nodes)

Available attributes: ['event_count', 'most_common_event', 'unique_players', 'zone_name', 'role_distribution']

Average Activity:
   Event count: 30.5 ± 18.2
   Unique players: 7.1 ± 

### NMI comparisons 

NMI node2vec

In [87]:
import pickle
import numpy as np
from glob import glob
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import normalize
from itertools import combinations

# Find all graph files
graph_files = sorted(glob('./graphs/*.pkl'))
print(f"Found {len(graph_files)} graphs")

cluster_labels = []

for i, file in enumerate(graph_files, start=1):
    with open(file, 'rb') as f:
        G = pickle.load(f)

    # print(f"\nProcessing graph {i}/{len(graph_files)}: {file}")

    # Train Node2Vec on the current graph
    node2vec = Node2Vec(
        G,
        dimensions=32,
        walk_length=10,
        num_walks=40,
        p=2.0,
        q=4.0,
        workers=1,
        quiet=True,
        seed=42
    )

    model = node2vec.fit(
        window=10,
        min_count=1,
        batch_words=4,
        epochs=10
    )

    # Extract embeddings
    nodes = list(G.nodes())
    X = np.array([model.wv[str(n)] for n in nodes])
    X = normalize(X)

    # Cluster embeddings with KMeans
    n_clusters = 5
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)
    cluster_labels.append(labels)

# === Compute pairwise NMI scores between all graphs ===
nmi_scores = []

for (i, labels_i), (j, labels_j) in combinations(enumerate(cluster_labels), 2):
    # Only compare graphs with the same number of nodes
    if len(labels_i) == len(labels_j):
        nmi = normalized_mutual_info_score(labels_i, labels_j)
        nmi_scores.append(nmi)
        print(f"NMI between graph {i+1} and {j+1}: {nmi:.4f}")
    else:
        print(f"Skipping NMI for graphs {i+1} and {j+1} (different number of nodes)")

# === Compute mean and standard deviation ===
if nmi_scores:
    mean_nmi = np.mean(nmi_scores)
    std_nmi = np.std(nmi_scores)
    print("\n=== NMI Summary ===")
    print(f"Mean NMI: {mean_nmi:.4f}")
    print(f"Std NMI:  {std_nmi:.4f}")
else:
    print("\nNo NMI scores computed (graphs had different sizes).")


Found 112 graphs
NMI between graph 1 and 2: 0.5686
NMI between graph 1 and 3: 0.4113
NMI between graph 1 and 4: 0.4987
NMI between graph 1 and 5: 0.4654
NMI between graph 1 and 6: 0.5795
NMI between graph 1 and 7: 0.6256
NMI between graph 1 and 8: 0.3995
NMI between graph 1 and 9: 0.3727
NMI between graph 1 and 10: 0.5354
NMI between graph 1 and 11: 0.4360
NMI between graph 1 and 12: 0.6175
NMI between graph 1 and 13: 0.5126
NMI between graph 1 and 14: 0.5636
NMI between graph 1 and 15: 0.6132
NMI between graph 1 and 16: 0.5744
NMI between graph 1 and 17: 0.4991
NMI between graph 1 and 18: 0.6665
NMI between graph 1 and 19: 0.5252
NMI between graph 1 and 20: 0.4579
NMI between graph 1 and 21: 0.6051
NMI between graph 1 and 22: 0.5694
NMI between graph 1 and 23: 0.5015
NMI between graph 1 and 24: 0.3848
NMI between graph 1 and 25: 0.4787
NMI between graph 1 and 26: 0.5287
NMI between graph 1 and 27: 0.3638
NMI between graph 1 and 28: 0.6248
NMI between graph 1 and 29: 0.6935
NMI between

In [93]:
import os
import re
import pickle
import numpy as np
from glob import glob
from itertools import combinations
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import normalize

# === 1. Find all graph files ===
graph_files = sorted(glob('./graphs/*.pkl'))
print(f"Found {len(graph_files)} graphs")

# === 2. Group by team ===
team_graphs = {}
pattern = re.compile(r'team(\d+)')

for f in graph_files:
    match = pattern.search(os.path.basename(f))
    if match:
        team_id = match.group(1)
        team_graphs.setdefault(team_id, []).append(f)

print("\nTeams found:")
for team, files in team_graphs.items():
    print(f"  Team {team}: {len(files)} matches")

# === 3. Compute mean NMI per team ===
team_nmi_summary = {}

for team, files in team_graphs.items():
    print(f"\n=== Processing Team {team} ===")

    cluster_labels = []

    for file in sorted(files):
        with open(file, 'rb') as f:
            G = pickle.load(f)

        # Node2Vec training
        node2vec = Node2Vec(
            G,
            dimensions=32,
            walk_length=10,
            num_walks=40,
            p=2.0,
            q=4.0,
            workers=1,
            quiet=True,
            seed=42
        )

        model = node2vec.fit(
            window=10,
            min_count=1,
            batch_words=4,
            epochs=10
        )

        # Embeddings
        nodes = list(G.nodes())
        X = np.array([model.wv[str(n)] for n in nodes])
        X = normalize(X)

        # Cluster embeddings
        n_clusters = 5
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)
        cluster_labels.append(labels)

    # Compute pairwise NMI between matches of this team
    nmi_scores = []
    for (labels_i, labels_j) in combinations(cluster_labels, 2):
        if len(labels_i) == len(labels_j):
            nmi_scores.append(normalized_mutual_info_score(labels_i, labels_j))

    # Store mean/std NMI
    if nmi_scores:
        mean_nmi = np.mean(nmi_scores)
        std_nmi = np.std(nmi_scores)
        team_nmi_summary[team] = (mean_nmi, std_nmi)
        print(f"Mean NMI: {mean_nmi:.4f}, Std: {std_nmi:.4f}")
    else:
        print("Not enough comparable matches (different #nodes or only one match).")

# === 4. Print final summary ===
print("\n=== Overall NMI Summary ===")
for team, (mean_nmi, std_nmi) in team_nmi_summary.items():
    print(f"Team {team}: Mean NMI = {mean_nmi:.4f}, Std = {std_nmi:.4f}")


Found 112 graphs

Teams found:
  Team 1609: 38 matches
  Team 1625: 37 matches
  Team 1612: 37 matches

=== Processing Team 1609 ===
Mean NMI: 0.4700, Std: 0.0873

=== Processing Team 1625 ===
Mean NMI: 0.4883, Std: 0.0858

=== Processing Team 1612 ===
Mean NMI: 0.4357, Std: 0.0778

=== Overall NMI Summary ===
Team 1609: Mean NMI = 0.4700, Std = 0.0873
Team 1625: Mean NMI = 0.4883, Std = 0.0858
Team 1612: Mean NMI = 0.4357, Std = 0.0778


Random Clustering

In [96]:
import numpy as np
from itertools import combinations
from sklearn.metrics import normalized_mutual_info_score

# === Parameters ===
num_labelings = 40   # number of random clusterings
num_nodes = 41       # number of nodes
num_clusters = 5     # number of clusters per labeling

# === Generate random clusterings ===
labelings = [np.random.randint(0, num_clusters, size=num_nodes) for _ in range(num_labelings)]

# === Compute pairwise NMI between all clusterings ===
nmi_scores = []
for l1, l2 in combinations(labelings, 2):
    nmi = normalized_mutual_info_score(l1, l2)
    nmi_scores.append(nmi)

# === Compute mean and standard deviation ===
mean_nmi = np.mean(nmi_scores)
std_nmi = np.std(nmi_scores)

print(f"Mean NMI: {mean_nmi:.4f}")
print(f"Std NMI:  {std_nmi:.4f}")


Mean NMI: 0.1557
Std NMI:  0.0463


### Spectral & Louvain

In [98]:
import os
import re
import pickle
import numpy as np
from glob import glob
from itertools import combinations
from sklearn.metrics import normalized_mutual_info_score
from spectral_build_vizualization import discover_tactical_patterns
import networkx as nx
import community as community_louvain  # pip install python-louvain

# === 1. Find all graph files ===
graph_files = sorted(glob('./graphs/*.pkl'))
print(f"Found {len(graph_files)} graphs")

# === 2. Group by team ===
team_graphs = {}
pattern = re.compile(r'team(\d+)')

for f in graph_files:
    match = pattern.search(os.path.basename(f))
    if match:
        team_id = match.group(1)
        team_graphs.setdefault(team_id, []).append(f)

print("\nTeams found:")
for team, files in team_graphs.items():
    print(f"  Team {team}: {len(files)} matches")

# === 3. Compute mean NMI per team for spectral and Louvain ===
team_nmi_summary = {}

for team, files in team_graphs.items():
    print(f"\n=== Processing Team {team} ===")

    spectral_labels_list = []
    louvain_labels_list = []

    for file in sorted(files):
        with open(file, 'rb') as f:
            G = pickle.load(f)

        # --- Spectral clustering ---
        labels_spectral, _ = discover_tactical_patterns(G, k=5)
        spectral_labels_list.append(labels_spectral)

        # --- Louvain clustering ---
        G_undirected = G.to_undirected() if isinstance(G, nx.DiGraph) else G
        partition_louvain = community_louvain.best_partition(G_undirected)
        labels_louvain = np.array([partition_louvain[node] for node in G.nodes()])
        louvain_labels_list.append(labels_louvain)

    # --- Compute pairwise NMI ---
    def compute_pairwise_nmi(labels_list):
        nmi_scores = []
        for labels_i, labels_j in combinations(labels_list, 2):
            if len(labels_i) == len(labels_j):
                nmi_scores.append(normalized_mutual_info_score(labels_i, labels_j))
        if nmi_scores:
            return np.mean(nmi_scores), np.std(nmi_scores)
        else:
            return 0.0, 0.0

    mean_nmi_spectral, std_nmi_spectral = compute_pairwise_nmi(spectral_labels_list)
    mean_nmi_louvain, std_nmi_louvain = compute_pairwise_nmi(louvain_labels_list)

    team_nmi_summary[team] = {
        "spectral": (mean_nmi_spectral, std_nmi_spectral),
        "louvain": (mean_nmi_louvain, std_nmi_louvain)
    }

    print(f"Spectral - Mean NMI: {mean_nmi_spectral:.4f}, Std: {std_nmi_spectral:.4f}")
    print(f"Louvain  - Mean NMI: {mean_nmi_louvain:.4f}, Std: {std_nmi_louvain:.4f}")

# === 4. Print final summary ===
print("\n=== Overall NMI Summary ===")
for team, scores in team_nmi_summary.items():
    spectral = scores["spectral"]
    louvain = scores["louvain"]
    print(f"Team {team}:")
    print(f"  Spectral - Mean NMI = {spectral[0]:.4f}, Std = {spectral[1]:.4f}")
    print(f"  Louvain  - Mean NMI = {louvain[0]:.4f}, Std = {louvain[1]:.4f}")



Found 112 graphs

Teams found:
  Team 1609: 38 matches
  Team 1625: 37 matches
  Team 1612: 37 matches

=== Processing Team 1609 ===
Spectral - Mean NMI: 0.3542, Std: 0.0774
Louvain  - Mean NMI: 0.5530, Std: 0.0729

=== Processing Team 1625 ===
Spectral - Mean NMI: 0.3705, Std: 0.0756
Louvain  - Mean NMI: 0.5735, Std: 0.0774

=== Processing Team 1612 ===
Spectral - Mean NMI: 0.4128, Std: 0.0769
Louvain  - Mean NMI: 0.5609, Std: 0.0646

=== Overall NMI Summary ===
Team 1609:
  Spectral - Mean NMI = 0.3542, Std = 0.0774
  Louvain  - Mean NMI = 0.5530, Std = 0.0729
Team 1625:
  Spectral - Mean NMI = 0.3705, Std = 0.0756
  Louvain  - Mean NMI = 0.5735, Std = 0.0774
Team 1612:
  Spectral - Mean NMI = 0.4128, Std = 0.0769
  Louvain  - Mean NMI = 0.5609, Std = 0.0646


###