Skip to content

Commit 116f216

Browse files
authored
Improve square clustering (#17)
* Improve square clustering * Fix (pagerank_scipy and pagerank_numpy were removed)
1 parent 8de0be6 commit 116f216

File tree

2 files changed

+119
-75
lines changed

2 files changed

+119
-75
lines changed

graphblas_algorithms/algorithms/cluster.py

Lines changed: 112 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import numpy as np
2-
from graphblas import Matrix, Vector, binary, monoid, replace, select, unary
3-
from graphblas.semiring import any_times, plus_pair, plus_times
2+
from graphblas import Matrix, Vector, binary, monoid, replace, unary
3+
from graphblas.semiring import plus_first, plus_pair, plus_times
44

55
from graphblas_algorithms.classes.digraph import to_graph
66
from graphblas_algorithms.classes.graph import to_undirected_graph
@@ -244,100 +244,141 @@ def average_clustering(G, nodes=None, weight=None, count_zeros=True):
244244
return func(G, weighted=weighted, count_zeros=count_zeros, mask=mask)
245245

246246

247-
def square_clustering_core(G, node_ids=None):
248-
# node_ids argument is a bit different from what we do elsewhere.
249-
# Normally, we take a mask or vector in a `_core` function.
250-
# By accepting an iterable here, it could be of node ids or node keys.
251-
A, degrees = G.get_properties("A degrees+") # TODO" how to handle self-edges?
252-
if node_ids is None:
253-
# Can we do this better using SuiteSparse:GraphBLAS iteration?
254-
node_ids = A.reduce_rowwise(monoid.any).new(name="node_ids") # all nodes with edges
255-
C = unary.positionj(A).new(name="C")
256-
rv = Vector(float, A.nrows, name="square_clustering")
257-
row = Vector(A.dtype, A.ncols, name="row")
258-
M = Matrix(int, A.nrows, A.ncols, name="M")
259-
Q = Matrix(int, A.nrows, A.ncols, name="Q")
260-
for v in node_ids:
261-
# Th mask M indicates the u and w neighbors of v to "iterate" over
262-
row << A[v, :]
263-
M << row.outer(row, binary.pair)
264-
M << select.tril(M, -1)
265-
# To compute q_v(u, w), the number of common neighbors of u and w other than v (squares),
266-
# we first set the v'th column to zero, which lets us ignore v as a common neighbor.
267-
Q << binary.isne(C, v) # `isne` keeps the dtype as int
268-
# Q: count the number of squares for each u-w combination!
269-
Q(M.S, replace) << plus_times(Q @ Q.T)
270-
# Total squares for v
271-
squares = Q.reduce_scalar().get(0)
272-
if squares == 0:
273-
rv[v] = 0
274-
continue
275-
# Denominator is the total number of squares that could exist.
276-
# First contribution is degrees[u] + degrees[w] for each u-w combo.
277-
Q(M.S, replace) << degrees.outer(degrees, binary.plus)
278-
deg_uw = Q.reduce_scalar().new()
279-
# Then we subtract off # squares, 1 for each u and 1 for each w for all combos,
280-
# and 1 for each edge where u-w or w-u are connected (which would make triangles).
281-
Q << binary.pair(A & M) # Are u-w connected? Can skip if bipartite
282-
denom = deg_uw.get(0) - (squares + 2 * M.nvals + 2 * Q.nvals)
283-
rv[v] = squares / denom
284-
return rv
285-
286-
287-
def square_clustering_core_full(G):
288-
# Warning: only tested on undirected graphs
289-
# Read-only matrices we'll use throughout the calculation
247+
def single_square_clustering_core(G, idx):
290248
A, degrees = G.get_properties("A degrees+") # TODO" how to handle self-edges?
291-
D = degrees.diag(name="D")
292-
P2 = plus_pair(A @ A.T).new(mask=~D.S, name="P2")
293-
249+
deg = degrees.get(idx, 0)
250+
if deg <= 1:
251+
return 0
252+
# P2 from https://arxiv.org/pdf/2007.11111.pdf; we'll also use it as scratch
253+
v = A[idx, :].new(name="v")
254+
p2 = plus_pair(A.T @ v).new(name="p2")
255+
del p2[idx]
256+
# Denominator is thought of as the total number of squares that could exist.
257+
# We use the definition from https://arxiv.org/pdf/0710.0117v1.pdf (equation 2).
258+
#
259+
# (1) Subtract 1 for each edge where u-w or w-u are connected (which would make triangles)
260+
denom = -plus_first(p2 @ v).get(0)
294261
# Numerator: number of squares
295262
# Based on https://arxiv.org/pdf/2007.11111.pdf (sigma_12, c_4)
296-
Q = (P2 - 1).new(name="Q")
297-
Q << Q * P2
298-
squares = Q.reduce_rowwise().new(name="squares")
299-
squares(squares.V, replace=True) << squares // 2 # Drop zeros
263+
p2(binary.times) << p2 - 1
264+
squares = p2.reduce().get(0) // 2
265+
if squares == 0:
266+
return 0
267+
# (2) Subtract 1 for each u and 1 for each w for all combos: degrees * (degrees - 1)
268+
denom -= deg * (deg - 1)
269+
# (3) The main contribution to the denominator: degrees[u] + degrees[w] for each u-w combo.
270+
# This is the only positive term.
271+
denom += plus_times(v @ degrees).value * (deg - 1)
272+
# (4) Subtract the number of squares
273+
denom -= squares
274+
# And we're done!
275+
return squares / denom
276+
277+
278+
def square_clustering_core(G, node_ids=None):
279+
# Warning: only tested on undirected graphs.
280+
# Also, it may use a lot of memory, because we compute `P2 = A @ A.T`
281+
#
282+
# Pseudocode:
283+
# P2(~degrees.diag().S) = plus_pair(A @ A.T)
284+
# tri = first(P2 & A).reduce_rowwise()
285+
# squares = (P2 * (P2 - 1)).reduce_rowwise() / 2
286+
# uw_count = degrees * (degrees - 1)
287+
# uw_degrees = plus_times(A @ degrees) * (degrees - 1)
288+
# square_clustering = squares / (uw_degrees - uw_count - tri - squares)
289+
#
290+
A, degrees = G.get_properties("A degrees+") # TODO" how to handle self-edges?
291+
# P2 from https://arxiv.org/pdf/2007.11111.pdf; we'll also use it as scratch
292+
if node_ids is not None:
293+
v = Vector.from_values(node_ids, True, size=degrees.size)
294+
Asubset = binary.second(v & A).new(name="A_subset")
295+
else:
296+
Asubset = A
297+
A = A.T
298+
D = degrees.diag(name="D")
299+
P2 = plus_pair(Asubset @ A).new(mask=~D.S, name="P2")
300300

301301
# Denominator is thought of as the total number of squares that could exist.
302302
# We use the definition from https://arxiv.org/pdf/0710.0117v1.pdf (equation 2).
303-
# First three contributions will become negative in the final step.
303+
# denom = uw_degrees - uw_count - tri - squares
304304
#
305-
# (1) Subtract 1 for each u and 1 for each w for all combos: degrees * (degrees - 1)
306-
denom = (degrees - 1).new(name="denom")
307-
denom << denom * degrees
305+
# (1) Subtract 1 for each edge where u-w or w-u are connected (i.e., triangles)
306+
# tri = first(P2 & A).reduce_rowwise()
307+
D << binary.first(P2 & Asubset)
308+
neg_denom = D.reduce_rowwise().new(name="neg_denom")
309+
del D
308310

309-
# (2) Subtract the number of squares
310-
denom << binary.plus(denom & squares)
311+
# Numerator: number of squares
312+
# Based on https://arxiv.org/pdf/2007.11111.pdf (sigma_12, c_4)
313+
# squares = (P2 * (P2 - 1)).reduce_rowwise() / 2
314+
P2(binary.times) << P2 - 1
315+
squares = P2.reduce_rowwise().new(name="squares")
316+
del P2
317+
squares(squares.V, replace) << binary.cdiv(squares, 2) # Drop zeros
318+
319+
# (2) Subtract 1 for each u and 1 for each w for all combos: degrees * (degrees - 1)
320+
# uw_count = degrees * (degrees - 1)
321+
denom = (degrees - 1).new(mask=squares.S, name="denom")
322+
neg_denom(binary.plus) << degrees * denom
311323

312-
# (3) Subtract 1 for each edge where u-w or w-u are connected (which would make triangles)
313-
Q << binary.first(P2 & A)
314-
denom(binary.plus) << Q.reduce_rowwise()
324+
# (3) The main contribution to the denominator: degrees[u] + degrees[w] for each u-w combo.
325+
# uw_degrees = plus_times(A @ degrees) * (degrees - 1)
326+
# denom(binary.times) << plus_times(A @ degrees)
327+
denom(binary.times, denom.S) << plus_times(Asubset @ degrees)
315328

316-
# The main contribution to the denominator: degrees[u] + degrees[w] for each u-w combo.
317-
# This is the only positive term. We subtract all other terms from this one, hence rminus.
318-
Q(A.S, replace=True) << plus_pair(A @ P2.T)
319-
Q << any_times(Q @ D)
320-
denom(binary.rminus) << Q.reduce_rowwise()
329+
# (4) Subtract the number of squares
330+
denom(binary.minus) << binary.plus(neg_denom & squares)
321331

322332
# And we're done! This result does not include 0s
323333
return (squares / denom).new(name="square_clustering")
324334

325335

326-
def square_clustering(G, nodes=None):
336+
def _split(L, k):
337+
"""Split a list into approximately-equal parts"""
338+
N = len(L)
339+
start = 0
340+
for i in range(1, k):
341+
stop = (N * i + k - 1) // k
342+
if stop != start:
343+
yield L[start:stop]
344+
start = stop
345+
if stop != N:
346+
yield L[stop:]
347+
348+
349+
def _square_clustering_split(G, node_ids=None, *, nsplits):
350+
if node_ids is None:
351+
node_ids = G._A.reduce_rowwise(monoid.any).to_values()[0]
352+
result = None
353+
for chunk_ids in _split(node_ids, nsplits):
354+
res = square_clustering_core(G, chunk_ids)
355+
if result is None:
356+
result = res
357+
else:
358+
result << monoid.any(result | res)
359+
return result
360+
361+
362+
def square_clustering(G, nodes=None, *, nsplits=None):
327363
G = to_undirected_graph(G)
328364
if len(G) == 0:
329365
return {}
330366
elif nodes is None:
331367
# Should we use this one for subsets of nodes as well?
332-
result = square_clustering_core_full(G)
368+
if nsplits is None:
369+
result = square_clustering_core(G)
370+
else:
371+
result = _square_clustering_split(G, nsplits=nsplits)
333372
return G.vector_to_dict(result, fillvalue=0)
334373
elif nodes in G:
335374
idx = G._key_to_id[nodes]
336-
result = square_clustering_core(G, [idx])
337-
return result.get(idx)
375+
return single_square_clustering_core(G, idx)
338376
else:
339377
ids = G.list_to_ids(nodes)
340-
result = square_clustering_core(G, ids)
378+
if nsplits is None:
379+
result = square_clustering_core(G, ids)
380+
else:
381+
result = _square_clustering_split(G, ids, nsplits=nsplits)
341382
return G.vector_to_dict(result)
342383

343384

graphblas_algorithms/conftest.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,15 @@ def orig():
2626
and not isinstance(val, types.ModuleType)
2727
and key not in {"Graph", "DiGraph"}
2828
}
29-
replacements["pagerank_scipy"] = (nx.pagerank_scipy, ga.pagerank)
30-
replacements["pagerank_numpy"] = (nx.pagerank_numpy, ga.pagerank)
3129
for key, (orig_val, new_val) in replacements.items():
3230
setattr(orig, key, orig_val)
33-
if key not in {"pagerank_numpy"}:
34-
assert inspect.signature(orig_val) == inspect.signature(new_val), key
31+
if key not in {}:
32+
# Ignore keyword-only parameters (works until NetworkX has keyword-only arguments)
33+
sig = inspect.signature(new_val)
34+
sig = sig.replace(
35+
parameters=[x for x in sig.parameters.values() if x.kind != x.KEYWORD_ONLY]
36+
)
37+
assert inspect.signature(orig_val) == sig, key
3538
for name, module in sys.modules.items():
3639
if not name.startswith("networkx.") and name != "networkx":
3740
continue

0 commit comments

Comments
 (0)