**QuickSI**

This paper uses the idea of a **minimum spanning tree** to construct a matching order that reduces the search space.

Although subgraph matching usually has three steps, this paper focuses on the last two, that is, **matching order generation and the enumeration procedure**.

For the matching order generation part, the authors first calculate **weights** for each node and each edge in the query graph based on the **frequency** with which its type appears in the data graph (because the minimum spanning tree algorithm needs weights).

Then they perform an algorithm similar to Prim's algorithm to construct the matching order. At the beginning, they select an edge based on the **degree of the nodes** it links. They take the **minimum** one, because the **importance of the edge** between them would be larger. They put these two nodes in the matching order sequence.

After that, they check **each edge** linked to the nodes in the matching order sequence to get the next node. How do they select the next node (edge)? They first take the edges with the **minimum weight** (these appear the **fewest times** in the data graph, so they are easy to verify; this idea is similar to selecting the node with the **max degree**). Then they check the **indegree** of the new end node of each edge and keep the edges with the **maximum** one. This is reasonable, because we want to select a node that has **as many edges as possible** to the nodes we already put in the matching order (this helps reduce the search space). If there are still multiple candidate nodes (edges), we then take the edge whose end node has the **minimum degree** (that node will be selected as the next node in the matching order. Why minimum degree? Because we already made sure it has many edges to the nodes in the sequence; if it also has the minimum degree, then this node may be very important to the structure built so far). Finally, if there is still a tie, we randomly select one.

Performing these steps as many times as there are nodes, we get a valid matching order.

Then they use an enumeration procedure like **Ullmann's algorithm**, but with more conditions to reduce the search space.

In [79]:
!git clone https://github.com/RapidsAtHKUST/SubgraphMatching.git

fatal: destination path 'SubgraphMatching' already exists and is not an empty directory.


In [80]:
from collections import defaultdict
import random

In [81]:
class graph():
  """A labelled graph together with the per-run state used for matching.

  Attributes:
    graphid: identifier supplied by the caller.
    node2label: mapping from node to its label.
    node2degree: mapping from node to its degree.
    edges: adjacency mapping from node to the set of its neighbours.
    label2node: inverted index from label to the set of nodes carrying it.
    candidateset, phi, phiparent: per-run state, cleared by ``reset``.
  """

  def __init__(self, graphid, node2label, node2degree, edges):
    self.graphid = graphid
    self.node2label = node2label
    self.node2degree = node2degree
    self.edges = edges

    # Group the nodes by label so nodes with a given label can be looked up.
    self.label2node = defaultdict(set)
    for node, label in node2label.items():
      self.label2node[label].add(node)

    self.reset()

  def reset(self):
    """Discard the candidate sets, the matching order and its parent links."""
    self.candidateset = defaultdict(set)
    self.phi = []
    self.phiparent = {}

In [82]:
def get_graph(filepath, filename):
  """Parse one graph file and return it as a ``graph`` object.

  The file is read as whitespace-separated records:
    * a header line with three fields; the second and third are the number
      of nodes and the number of edges,
    * one four-field line per node: tag, node id, label, degree,
    * one three-field line per edge: tag, first node id, second node id.
  The leading tag of every record is ignored.

  Args:
    filepath: path of the file to read.
    filename: value stored as the ``graphid`` of the returned graph.

  Returns:
    A ``graph`` whose adjacency sets contain every edge in both directions.

  Raises:
    ValueError: if a record does not have the expected number of fields or
      a numeric field is not an integer.
  """
  node2label = {}
  node2degree = {}
  edges = defaultdict(set)

  # The context manager closes the file even when a record fails to parse.
  with open(filepath, "r", encoding="utf-8") as f:
    _, nodenum, edgenum = f.readline().split()

    for _ in range(int(nodenum)):
      _, nodeid, nodelabel, nodedegree = f.readline().split()
      node = int(nodeid)
      node2label[node] = int(nodelabel)
      node2degree[node] = int(nodedegree)

    for _ in range(int(edgenum)):
      _, node1, node2 = f.readline().split()
      first, second = int(node1), int(node2)
      # Store each edge under both endpoints.
      edges[first].add(second)
      edges[second].add(first)

  return graph(filename, node2label, node2degree, edges)

In [83]:
qcount = 0
gcount = 0

import os

# Load every query graph shipped with the test suite.
qs = []
qdir = "SubgraphMatching/test/query_graph"
for f in os.listdir(qdir):
  filepath = os.path.join(qdir, f)
  qs.append(get_graph(filepath, f))

# Load every data graph shipped with the test suite.
gs = []
gdir = "SubgraphMatching/test/data_graph"
for f in os.listdir(gdir):
  filepath = os.path.join(gdir, f)
  gs.append(get_graph(filepath, f))

print(len(qs))
print(len(gs))

# Read the reference answers; the context manager closes the file even if
# reading fails.
with open("SubgraphMatching/test/expected_output.res", "r", encoding="utf-8") as f:
  lines = f.readlines()

# Each record is "<query name>:<count>"; keys get the ".graph" suffix so they
# line up with the file names used as graph ids above.
expects = {}
for line in lines:
  line = line.strip()
  if not line:
    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
    continue
  name, times = line.split(":")
  expects[name + ".graph"] = int(times)
print(len(expects))

200
1
200


In [84]:
def _keep_best(edges, score, best=min):
  """Return the edges whose ``score`` equals the best (min or max) score."""
  scored = {edge: score(edge) for edge in edges}
  target = best(scored.values())
  return {edge for edge, value in scored.items() if value == target}


def _new_end(edge, visited):
  """Return the endpoint of a frontier ``edge`` that is not in ``visited``."""
  return edge[0] if edge[1] in visited else edge[1]


def _pick(edges):
  """Return the only remaining edge, or a random one when several tie."""
  pool = list(edges)
  return pool[0] if len(pool) == 1 else random.choice(pool)


def QuickSI_MOG(q, g):
  """Generate the QuickSI matching order for query ``q`` against data ``g``.

  Every directed query edge is weighted by how many data-graph edges carry
  the same ordered pair of endpoint labels. A spanning tree of the query is
  then grown Prim-style, and the order in which nodes join the tree becomes
  the matching order.

  The first edge is the one with the smallest weight, ties broken by the
  smallest sum of endpoint degrees, then at random. Each following edge must
  connect an ordered node to an unordered one and is chosen by, in order:
    1. smallest weight,
    2. largest number of query edges between the new node and the nodes
       already ordered,
    3. smallest degree of the new node,
    4. random choice.

  Args:
    q: query graph; reads ``node2label``, ``node2degree`` and ``edges``.
      The order is appended to ``q.phi`` and each node's tree parent is
      stored in ``q.phiparent`` (``-1`` for the first node). Both are
      expected to be empty on entry.
    g: data graph; reads ``node2label`` and ``edges``.

  Raises:
    ValueError: if the query graph is not connected, because no spanning
      tree (and therefore no parent for every node) exists.
  """
  # How often each ordered label pair occurs among the data-graph edges.
  pair_count = defaultdict(int)
  for node1, neighbours in g.edges.items():
    label1 = g.node2label[node1]
    for node2 in neighbours:
      pair_count[(label1, g.node2label[node2])] += 1

  # Weight of every directed query edge; self-loops cannot extend a spanning
  # tree, so they are left out.
  wedges = {}
  for node1, neighbours in q.edges.items():
    for node2 in neighbours:
      if node1 != node2:
        pair = (q.node2label[node1], q.node2label[node2])
        wedges[(node1, node2)] = pair_count.get(pair, 0)

  if not wedges:
    if len(q.node2label) > 1:
      raise ValueError("query graph must be connected to build a matching order")
    # A query without edges has at most one node, which is its own order.
    for node in q.node2label:
      q.phi.append(node)
      q.phiparent[node] = -1
    return

  seeds = _keep_best(wedges, wedges.get)
  seeds = _keep_best(
      seeds, lambda e: q.node2degree[e[0]] + q.node2degree[e[1]])
  first = _pick(seeds)

  q.phi.extend(first)
  q.phiparent[first[0]] = -1
  q.phiparent[first[1]] = first[0]
  visited = set(first)

  while len(visited) < len(q.node2label):
    # Edges with exactly one endpoint already in the order.
    frontier = {e for e in wedges if (e[0] in visited) != (e[1] in visited)}
    if not frontier:
      raise ValueError("query graph must be connected to build a matching order")

    frontier = _keep_best(frontier, wedges.get)
    # Prefer the new node with the most edges back into the ordered nodes;
    # this maximises the edge count of the induced subgraph.
    frontier = _keep_best(
        frontier,
        lambda e: len(visited.intersection(q.edges[_new_end(e, visited)])),
        max)
    frontier = _keep_best(
        frontier, lambda e: q.node2degree[_new_end(e, visited)])

    chosen = _pick(frontier)
    child = _new_end(chosen, visited)
    parent = chosen[0] if child == chosen[1] else chosen[1]

    q.phi.append(child)
    q.phiparent[child] = parent
    visited.add(child)
    

In [85]:
def QuickSI_EP(q, g, m, i, totalresult): # not equal to the original code
  """Enumerate the embeddings of query ``q`` in data graph ``g`` recursively.

  Query nodes are matched in the order given by ``q.phi``. The first node may
  map to any data node with the same label and at least the same degree.
  Every later node must additionally map to a neighbour of its tree parent's
  image, must not reuse an image, and must be adjacent in ``g`` to the image
  of every earlier query node it is adjacent to in ``q``.

  Args:
    q: query graph; reads ``phi``, ``phiparent``, ``node2label``,
      ``node2degree`` and ``edges``.
    g: data graph; reads ``label2node``, ``node2label``, ``node2degree`` and
      ``edges``.
    m: partial mapping from query node to data node. It is modified during
      the search and restored to its incoming content before returning.
    i: 1-based search depth; pass ``1`` together with an empty ``m``.
    totalresult: list that receives a copy of every complete mapping.
  """
  if i == len(q.phi) + 1:
    totalresult.append(m.copy())
    return

  # Next query node: the first one in the matching order without an image.
  u = next((node for node in q.phi if node not in m), -1)
  label = q.node2label[u]
  degree = q.node2degree[u]

  if i == 1:
    lc = {v for v in g.label2node[label] if g.node2degree[v] >= degree}
  else:
    parent = q.phiparent[u]
    # Query nodes ordered before u; the parent is skipped because candidates
    # are drawn from the neighbours of its image below.
    earlier = [node for node in q.phi[:q.phi.index(u)] if node != parent]
    # Images a candidate may not reuse, and images it must be adjacent to.
    # Both depend only on u, so they are computed once rather than per
    # candidate.
    taken = {m[node] for node in earlier}
    required = [m[node] for node in earlier
                if node in q.edges[u] or u in q.edges[node]]
    lc = {
        v for v in g.edges[m[parent]]
        if g.node2label[v] == label
        and g.node2degree[v] >= degree
        and v not in taken
        and all(w in g.edges[v] for w in required)
    }

  # m is back to its current content after every recursive call, so the set
  # of used images can be built once for the whole loop.
  used = set(m.values())
  for v in lc:
    if v not in used:
      m[u] = v
      QuickSI_EP(q, g, m, i + 1, totalresult)
      m.pop(u)

In [86]:
# Run QuickSI for every (data graph, query graph) pair and record the number
# of embeddings found for each query.
# NOTE(review): results are keyed by query id only, so with more than one data
# graph later graphs overwrite earlier counts — confirm this is intended.
queries = {}
for g in gs:
  for q in qs:
    # Clear the matching order left on q by a previous run.
    q.reset()
    QuickSI_MOG(q, g)
    totalresult = []
    # Start at depth 1 with an empty mapping; matches accumulate in totalresult.
    QuickSI_EP(q, g, {}, 1, totalresult)
    queries[q.graphid] = len(totalresult)

print(queries)

{'query_dense_16_91.graph': 3, 'query_dense_16_96.graph': 17, 'query_dense_16_87.graph': 12, 'query_dense_16_179.graph': 184, 'query_dense_16_155.graph': 16, 'query_dense_16_103.graph': 48, 'query_dense_16_134.graph': 17, 'query_dense_16_160.graph': 2688, 'query_dense_16_107.graph': 21, 'query_dense_16_57.graph': 3, 'query_dense_16_142.graph': 16, 'query_dense_16_71.graph': 9, 'query_dense_16_161.graph': 44, 'query_dense_16_152.graph': 432, 'query_dense_16_60.graph': 10, 'query_dense_16_158.graph': 2, 'query_dense_16_115.graph': 10, 'query_dense_16_81.graph': 124, 'query_dense_16_168.graph': 75, 'query_dense_16_5.graph': 4, 'query_dense_16_53.graph': 12, 'query_dense_16_124.graph': 2, 'query_dense_16_68.graph': 256, 'query_dense_16_56.graph': 4, 'query_dense_16_27.graph': 8, 'query_dense_16_141.graph': 20, 'query_dense_16_65.graph': 2, 'query_dense_16_136.graph': 1, 'query_dense_16_14.graph': 2, 'query_dense_16_22.graph': 9, 'query_dense_16_43.graph': 3, 'query_dense_16_84.graph': 8, '

In [87]:
# Compare every computed embedding count with the reference answer and print
# the name of each query that disagrees; print "correct" when none does.
flag = True
for name, expected in expects.items():
  if queries[name] != expected:
    print(name)
    flag = False
if flag:
  print("correct")

correct
