In [1]:
from time import perf_counter
from time import time

import duckdb
import numpy as np
import pandas as pd

from connectedComponentsLabeling import connectedComponentsLabeler as ccl

# Function to simulate a random graph

In [2]:
def simulateDataFrame(numberOfEdges=100*1000,numberOfNodes=10*1000,seed=None):
    """Simulate a random graph as an edge list.

    Both endpoints of every edge are drawn independently and uniformly from
    the integers ``[0, numberOfNodes)``, so self-loops and duplicate edges
    can occur.

    Parameters
    ----------
    numberOfEdges : int
        Number of rows (edges) in the returned frame.
    numberOfNodes : int
        Exclusive upper bound of the node ids.
    seed : int, optional
        When given, the endpoints are drawn from a dedicated
        ``numpy.random.RandomState(seed)`` so the same graph is produced on
        every call. When ``None`` (the default) NumPy's global random state
        is used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Two integer columns, ``a`` and ``b``, holding the two endpoints of
        each edge.
    """
    # The module-level np.random functions and a RandomState instance expose
    # the same randint signature, so one code path serves both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.DataFrame({
        'a': rng.randint(0, high=numberOfNodes, size=numberOfEdges),
        'b': rng.randint(0, high=numberOfNodes, size=numberOfEdges),
    })

# Test of duckpgq

In [3]:
# Size of the simulated graph used for the duckpgq benchmark below.
numberOfEdges = 40_000
numberOfNodes = 80_000

In [4]:
# Draw the random edge list that both algorithms are run on, then preview it.
pgq_edges_df = simulateDataFrame(
    numberOfNodes=numberOfNodes,
    numberOfEdges=numberOfEdges,
)
pgq_edges_df.head()

Unnamed: 0,a,b
0,38020,57299
1,1250,33407
2,49992,55488
3,68953,32832
4,64258,25499


In [5]:
# Open the DuckDB connection used for the property-graph part of the notebook.
db = duckdb.connect(':memory:')

In [6]:
# Copy the simulated edge list into a DuckDB table; the SQL refers to the
# pandas frame `pgq_edges_df` by its Python variable name.
# CREATE OR REPLACE (as already used for pgq_nodes) keeps this cell
# re-runnable: a plain CREATE TABLE raises once the table exists.
db.sql('''
create or replace table pgq_edges as
select *
from pgq_edges_df
''')

In [7]:
# duckpgq is distributed as a DuckDB community extension. LOAD only works
# once the extension has been installed in this environment, so install it
# first; the property-graph cells below need the extension loaded.
db.sql('INSTALL duckpgq FROM community;')
db.sql('LOAD duckpgq;')

In [8]:
# Build the vertex table: one row per node id that appears as either endpoint
# (column a or column b) of an edge in pgq_edges.
# NOTE(review): UNION already removes duplicates, so the outer DISTINCT is
# redundant (harmless, but it could be dropped).
db.sql('''
 create or replace table pgq_nodes as
 select distinct nodeId
 from (
         select a as nodeId from pgq_edges
         union
         select b as nodeId from pgq_edges
      );
''')

In [9]:
# Declare the property graph `pgq_graph` on top of the two tables:
#   - vertices come from pgq_nodes,
#   - edges come from pgq_edges, whose columns a (source) and b (destination)
#     both reference pgq_nodes.nodeId,
#   - the edge label `linkedWith` is the name the connected-components query
#     further down passes as its third argument.
# NOTE(review): this is a plain CREATE, so re-running the cell on the same
# connection presumably fails because the graph already exists - confirm.
db.sql('''
CREATE PROPERTY GRAPH pgq_graph
  VERTEX TABLES (
    pgq_nodes
  )
  EDGE TABLES (
    pgq_edges SOURCE KEY (a)      REFERENCES pgq_nodes (nodeId)
              DESTINATION KEY (b) REFERENCES pgq_nodes (nodeId)
    LABEL linkedWith
  );
''')

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ 0 rows  │
└─────────┘

In [23]:
%%time 

startTime=time()

pgq_cc=db.sql('''
SELECT * FROM weakly_connected_component(pgq_graph, pgq_nodes, linkedWith)
''').to_df()

pgqTime=time()-startTime

CPU times: user 33.5 s, sys: 156 ms, total: 33.6 s
Wall time: 33.4 s


In [28]:
%%time 

startTime=time()

yoba_cc=ccl(pgq_edges_df).getConnectedCompontents()

yobaTime=time()-startTime

CPU times: user 172 ms, sys: 13 ms, total: 185 ms
Wall time: 190 ms


# Check that the duckpgq and yoba methods lead to the same results (compared via their component-size distributions)

In [12]:
# Ten largest components in the duckpgq result, by number of distinct nodes.
# This uses the module-level duckdb.sql (not the `db` connection); the SQL
# refers to the pandas frame `pgq_cc` by its Python variable name.
duckdb.sql('''
select componentId, count(distinct nodeId) as ms_countOf_nodes
from pgq_cc
group by 1
order by 2 desc
limit 10
''')

┌─────────────┬──────────────────┐
│ componentId │ ms_countOf_nodes │
│    int64    │      int64       │
├─────────────┼──────────────────┤
│          11 │             2204 │
│          18 │              501 │
│         276 │              440 │
│          16 │              332 │
│          12 │              225 │
│         450 │              219 │
│         579 │              192 │
│         123 │              189 │
│          44 │              178 │
│          13 │              171 │
├─────────────┴──────────────────┤
│ 10 rows              2 columns │
└────────────────────────────────┘

In [13]:
# Ten largest components in the yoba result, by number of distinct nodes.
# Same query as for pgq_cc; the component ids themselves are not expected to
# match between the two methods, only the sizes are compared by eye here.
duckdb.sql('''
select componentId, count(distinct nodeId) as ms_countOf_nodes
from yoba_cc
group by 1
order by 2 desc
limit 10
''')

┌─────────────┬──────────────────┐
│ componentId │ ms_countOf_nodes │
│    int64    │      int64       │
├─────────────┼──────────────────┤
│       46545 │             2204 │
│       24866 │              501 │
│       22238 │              440 │
│       11488 │              332 │
│       23806 │              225 │
│       33393 │              219 │
│       12888 │              192 │
│       49032 │              189 │
│       40713 │              178 │
│       38071 │              171 │
├─────────────┴──────────────────┤
│ 10 rows              2 columns │
└────────────────────────────────┘

In [14]:
# Component-size distribution of the yoba result: for every component size
# (number of distinct nodes), how many components have that size, largest
# size first.
# NOTE(review): the inner ORDER BY does not affect the result, because the
# outer query regroups and reorders the rows.
# NOTE(review): this query is duplicated for pgq_cc in the next cell; a small
# helper taking the frame as a parameter would remove the copy.
yoba_ccSizeDistribution=duckdb.sql('''
select ms_countOf_nodes, count(*) as ms_countOf_components
from (
        select componentId, count(distinct nodeId) as ms_countOf_nodes
        from yoba_cc
        group by 1
        order by 2 desc
     )
group by 1
order by 1 desc
''').to_df()

In [15]:
# Component-size distribution of the duckpgq result: for every component size
# (number of distinct nodes), how many components have that size, largest
# size first.
# NOTE(review): the inner ORDER BY does not affect the result, because the
# outer query regroups and reorders the rows.
pgq_ccSizeDistribution=duckdb.sql('''
select ms_countOf_nodes, count(*) as ms_countOf_components
from (
        select componentId, count(distinct nodeId) as ms_countOf_nodes
        from pgq_cc
        group by 1
        order by 2 desc
     )
group by 1
order by 1 desc
''').to_df()

In [16]:
# Preview the largest component sizes found by yoba.
yoba_ccSizeDistribution.head()

Unnamed: 0,ms_countOf_nodes,ms_countOf_components
0,2204,1
1,501,1
2,440,1
3,332,1
4,225,1


In [17]:
# Preview the largest component sizes found by duckpgq.
pgq_ccSizeDistribution.head()

Unnamed: 0,ms_countOf_nodes,ms_countOf_components
0,2204,1
1,501,1
2,440,1
3,332,1
4,225,1


In [18]:
# True when the two size-distribution frames are identical.
# NOTE(review): this compares only how many components of each size exist,
# not which nodes are grouped together; equal distributions are necessary but
# not sufficient for the two methods to have produced the same partition.
pgq_ccSizeDistribution.equals(yoba_ccSizeDistribution)

True

In [19]:
# Speed ratio: how many times longer the duckpgq run took than the yoba run,
# using the elapsed times recorded in the two timing cells above.
pgqTime/yobaTime

227.32582192367641