|
@@ -1,6 +1,6 @@
 import numpy as np

-from graphblas import Matrix, Vector, binary, monoid, replace, select, unary
-from graphblas.semiring import any_times, plus_pair, plus_times
+from graphblas import Matrix, Vector, binary, monoid, replace, unary
+from graphblas.semiring import plus_first, plus_pair, plus_times

 from graphblas_algorithms.classes.digraph import to_graph
 from graphblas_algorithms.classes.graph import to_undirected_graph
@@ -244,100 +244,141 @@ def average_clustering(G, nodes=None, weight=None, count_zeros=True):
     return func(G, weighted=weighted, count_zeros=count_zeros, mask=mask)


-def square_clustering_core(G, node_ids=None):
-    # node_ids argument is a bit different from what we do elsewhere.
-    # Normally, we take a mask or vector in a `_core` function.
-    # By accepting an iterable here, it could be of node ids or node keys.
-    A, degrees = G.get_properties("A degrees+")  # TODO: how to handle self-edges?
-    if node_ids is None:
-        # Can we do this better using SuiteSparse:GraphBLAS iteration?
-        node_ids = A.reduce_rowwise(monoid.any).new(name="node_ids")  # all nodes with edges
-    C = unary.positionj(A).new(name="C")
-    rv = Vector(float, A.nrows, name="square_clustering")
-    row = Vector(A.dtype, A.ncols, name="row")
-    M = Matrix(int, A.nrows, A.ncols, name="M")
-    Q = Matrix(int, A.nrows, A.ncols, name="Q")
-    for v in node_ids:
-        # The mask M indicates the u and w neighbors of v to "iterate" over
-        row << A[v, :]
-        M << row.outer(row, binary.pair)
-        M << select.tril(M, -1)
-        # To compute q_v(u, w), the number of common neighbors of u and w other than v (squares),
-        # we first set the v'th column to zero, which lets us ignore v as a common neighbor.
-        Q << binary.isne(C, v)  # `isne` keeps the dtype as int
-        # Q: count the number of squares for each u-w combination!
-        Q(M.S, replace) << plus_times(Q @ Q.T)
-        # Total squares for v
-        squares = Q.reduce_scalar().get(0)
-        if squares == 0:
-            rv[v] = 0
-            continue
-        # Denominator is the total number of squares that could exist.
-        # First contribution is degrees[u] + degrees[w] for each u-w combo.
-        Q(M.S, replace) << degrees.outer(degrees, binary.plus)
-        deg_uw = Q.reduce_scalar().new()
-        # Then we subtract off # squares, 1 for each u and 1 for each w for all combos,
-        # and 1 for each edge where u-w or w-u are connected (which would make triangles).
-        Q << binary.pair(A & M)  # Are u-w connected? Can skip if bipartite
-        denom = deg_uw.get(0) - (squares + 2 * M.nvals + 2 * Q.nvals)
-        rv[v] = squares / denom
-    return rv
-
-
-def square_clustering_core_full(G):
-    # Warning: only tested on undirected graphs
-    # Read-only matrices we'll use throughout the calculation
+def single_square_clustering_core(G, idx):
     A, degrees = G.get_properties("A degrees+")  # TODO: how to handle self-edges?
-    D = degrees.diag(name="D")
-    P2 = plus_pair(A @ A.T).new(mask=~D.S, name="P2")
-
+    deg = degrees.get(idx, 0)
+    if deg <= 1:
+        return 0
+    # P2 from https://arxiv.org/pdf/2007.11111.pdf; we'll also use it as scratch
+    v = A[idx, :].new(name="v")
+    p2 = plus_pair(A.T @ v).new(name="p2")
+    del p2[idx]
+    # Denominator is thought of as the total number of squares that could exist.
+    # We use the definition from https://arxiv.org/pdf/0710.0117v1.pdf (equation 2).
+    #
+    # (1) Subtract 1 for each edge where u-w or w-u are connected (which would make triangles)
+    denom = -plus_first(p2 @ v).get(0)
     # Numerator: number of squares
     # Based on https://arxiv.org/pdf/2007.11111.pdf (sigma_12, c_4)
-    Q = (P2 - 1).new(name="Q")
-    Q << Q * P2
-    squares = Q.reduce_rowwise().new(name="squares")
-    squares(squares.V, replace=True) << squares // 2  # Drop zeros
+    p2(binary.times) << p2 - 1
+    squares = p2.reduce().get(0) // 2
+    if squares == 0:
+        return 0
+    # (2) Subtract 1 for each u and 1 for each w for all combos: degrees * (degrees - 1)
+    denom -= deg * (deg - 1)
+    # (3) The main contribution to the denominator: degrees[u] + degrees[w] for each u-w combo.
+    # This is the only positive term.
+    denom += plus_times(v @ degrees).value * (deg - 1)
+    # (4) Subtract the number of squares
+    denom -= squares
+    # And we're done!
+    return squares / denom
+
+
+def square_clustering_core(G, node_ids=None):
+    # Warning: only tested on undirected graphs.
+    # Also, it may use a lot of memory, because we compute `P2 = A @ A.T`
+    #
+    # Pseudocode:
+    # P2(~degrees.diag().S) = plus_pair(A @ A.T)
+    # tri = first(P2 & A).reduce_rowwise()
+    # squares = (P2 * (P2 - 1)).reduce_rowwise() / 2
+    # uw_count = degrees * (degrees - 1)
+    # uw_degrees = plus_times(A @ degrees) * (degrees - 1)
+    # square_clustering = squares / (uw_degrees - uw_count - tri - squares)
+    #
+    A, degrees = G.get_properties("A degrees+")  # TODO: how to handle self-edges?
+    # P2 from https://arxiv.org/pdf/2007.11111.pdf; we'll also use it as scratch
+    if node_ids is not None:
+        v = Vector.from_values(node_ids, True, size=degrees.size)
+        Asubset = binary.second(v & A).new(name="A_subset")
+    else:
+        Asubset = A
+    A = A.T
+    D = degrees.diag(name="D")
+    P2 = plus_pair(Asubset @ A).new(mask=~D.S, name="P2")

     # Denominator is thought of as the total number of squares that could exist.
     # We use the definition from https://arxiv.org/pdf/0710.0117v1.pdf (equation 2).
-    # First three contributions will become negative in the final step.
+    # denom = uw_degrees - uw_count - tri - squares
     #
-    # (1) Subtract 1 for each u and 1 for each w for all combos: degrees * (degrees - 1)
-    denom = (degrees - 1).new(name="denom")
-    denom << denom * degrees
+    # (1) Subtract 1 for each edge where u-w or w-u are connected (i.e., triangles)
+    # tri = first(P2 & A).reduce_rowwise()
+    D << binary.first(P2 & Asubset)
+    neg_denom = D.reduce_rowwise().new(name="neg_denom")
+    del D

-    # (2) Subtract the number of squares
-    denom << binary.plus(denom & squares)
+    # Numerator: number of squares
+    # Based on https://arxiv.org/pdf/2007.11111.pdf (sigma_12, c_4)
+    # squares = (P2 * (P2 - 1)).reduce_rowwise() / 2
+    P2(binary.times) << P2 - 1
+    squares = P2.reduce_rowwise().new(name="squares")
+    del P2
+    squares(squares.V, replace) << binary.cdiv(squares, 2)  # Drop zeros
+
+    # (2) Subtract 1 for each u and 1 for each w for all combos: degrees * (degrees - 1)
+    # uw_count = degrees * (degrees - 1)
+    denom = (degrees - 1).new(mask=squares.S, name="denom")
+    neg_denom(binary.plus) << degrees * denom

-    # (3) Subtract 1 for each edge where u-w or w-u are connected (which would make triangles)
-    Q << binary.first(P2 & A)
-    denom(binary.plus) << Q.reduce_rowwise()
+    # (3) The main contribution to the denominator: degrees[u] + degrees[w] for each u-w combo.
+    # uw_degrees = plus_times(A @ degrees) * (degrees - 1)
+    # denom(binary.times) << plus_times(A @ degrees)
+    denom(binary.times, denom.S) << plus_times(Asubset @ degrees)

-    # The main contribution to the denominator: degrees[u] + degrees[w] for each u-w combo.
-    # This is the only positive term. We subtract all other terms from this one, hence rminus.
-    Q(A.S, replace=True) << plus_pair(A @ P2.T)
-    Q << any_times(Q @ D)
-    denom(binary.rminus) << Q.reduce_rowwise()
+    # (4) Subtract the number of squares
+    denom(binary.minus) << binary.plus(neg_denom & squares)

     # And we're done! This result does not include 0s
     return (squares / denom).new(name="square_clustering")


-def square_clustering(G, nodes=None):
+def _split(L, k):
+    """Split a list into approximately-equal parts"""
+    N = len(L)
+    start = 0
+    for i in range(1, k):
+        stop = (N * i + k - 1) // k
+        if stop != start:
+            yield L[start:stop]
+        start = stop
+    if stop != N:
+        yield L[stop:]
+
+
+def _square_clustering_split(G, node_ids=None, *, nsplits):
+    if node_ids is None:
+        node_ids = G._A.reduce_rowwise(monoid.any).to_values()[0]
+    result = None
+    for chunk_ids in _split(node_ids, nsplits):
+        res = square_clustering_core(G, chunk_ids)
+        if result is None:
+            result = res
+        else:
+            result << monoid.any(result | res)
+    return result
+
+
+def square_clustering(G, nodes=None, *, nsplits=None):
     G = to_undirected_graph(G)
     if len(G) == 0:
         return {}
     elif nodes is None:
         # Should we use this one for subsets of nodes as well?
-        result = square_clustering_core_full(G)
+        if nsplits is None:
+            result = square_clustering_core(G)
+        else:
+            result = _square_clustering_split(G, nsplits=nsplits)
         return G.vector_to_dict(result, fillvalue=0)
     elif nodes in G:
         idx = G._key_to_id[nodes]
-        result = square_clustering_core(G, [idx])
-        return result.get(idx)
+        return single_square_clustering_core(G, idx)
     else:
         ids = G.list_to_ids(nodes)
-        result = square_clustering_core(G, ids)
+        if nsplits is None:
+            result = square_clustering_core(G, ids)
+        else:
+            result = _square_clustering_split(G, ids, nsplits=nsplits)
         return G.vector_to_dict(result)


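As a sanity check on the vectorized formula in the pseudocode above, the same quantities can be computed with plain NumPy on a small dense adjacency matrix and compared against networkx.square_clustering. This sketch is illustrative only and is not part of the commit; the names (P2, tri, squares, uw_count, uw_degrees) mirror the pseudocode comments, and the random-graph parameters are arbitrary.

import networkx as nx
import numpy as np

G = nx.gnp_random_graph(30, 0.2, seed=42)
A = nx.to_numpy_array(G)                    # dense 0/1 adjacency, no self-edges
deg = A.sum(axis=1)

P2 = A @ A.T                                # common-neighbor counts (length-2 paths)
np.fill_diagonal(P2, 0)                     # the mask=~D.S step drops the diagonal
tri = (P2 * A).sum(axis=1)                  # length-2 paths that close into triangles
squares = (P2 * (P2 - 1)).sum(axis=1) / 2   # number of squares per node
uw_count = deg * (deg - 1)
uw_degrees = (A @ deg) * (deg - 1)

denom = uw_degrees - uw_count - tri - squares
with np.errstate(divide="ignore", invalid="ignore"):
    c4 = np.where(squares > 0, squares / denom, 0.0)

expected = nx.square_clustering(G)
assert np.allclose(c4, [expected[v] for v in G])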
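The new nsplits keyword lets the all-nodes computation run over chunks of node ids instead of one large pass, which can lower peak memory since only part of P2 is materialized at a time. A minimal usage sketch, assuming square_clustering is importable from the package top level (the exact import path is an assumption) and that a networkx graph is accepted directly, as the to_undirected_graph call suggests:

import networkx as nx
from graphblas_algorithms import square_clustering  # import path assumed

G = nx.karate_club_graph()

one_pass = square_clustering(G)             # builds P2 for all nodes at once
chunked = square_clustering(G, nsplits=4)   # processes node ids in 4 chunks

assert one_pass.keys() == chunked.keys()
assert all(abs(one_pass[k] - chunked[k]) < 1e-12 for k in one_pass)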
|