In [1]:
from connectedComponentsLabeling import connectedComponentsLabeler as ccl
from time import time
import pandas as pd
import numpy as np
import duckdb

# Function to simulate a random graph

In [2]:
def simulateDataFrame(numberOfEdges=100*1000, numberOfNodes=10*1000, seed=None):
    """Simulate a random graph as an edge-list DataFrame.

    Each edge is produced by drawing its two endpoints independently and
    uniformly from the integer node ids ``0 .. numberOfNodes - 1``.  Because
    the draws are independent, self-loops and duplicate edges can occur, and
    some node ids may never appear in any edge.

    Parameters
    ----------
    numberOfEdges : int
        Number of rows (edges) to generate.
    numberOfNodes : int
        Exclusive upper bound for the node ids.
    seed : int or None, optional
        When given, the edges are drawn from a dedicated generator seeded
        with this value, so repeated calls return identical frames.  When
        ``None`` (default) the global NumPy random stream is used, exactly
        as before, so an earlier ``np.random.seed(...)`` call still applies.

    Returns
    -------
    pandas.DataFrame
        Frame with integer columns ``'a'`` and ``'b'`` holding the two
        endpoints of every edge.
    """
    if seed is None:
        # Keep the historical behaviour: draw from the global NumPy stream.
        sources = np.random.randint(0, high=numberOfNodes, size=numberOfEdges)
        targets = np.random.randint(0, high=numberOfNodes, size=numberOfEdges)
    else:
        # Private generator: reproducible and independent of global state.
        rng = np.random.default_rng(seed)
        sources = rng.integers(0, high=numberOfNodes, size=numberOfEdges)
        targets = rng.integers(0, high=numberOfNodes, size=numberOfEdges)
    return pd.DataFrame({'a': sources, 'b': targets})

# Test of duckpgq

In [3]:
# Size of the simulated benchmark graph.  There are fewer edges than
# candidate node ids, so the graph is sparse and splits into many components.
numberOfEdges = 40_000
numberOfNodes = 80_000

In [4]:
# Seed the global NumPy stream so that Restart & Run All regenerates the same
# edge list (and therefore the same components) on every run.
np.random.seed(42)

# Edge list shared by both connected-components implementations under test.
pgq_edges_df = simulateDataFrame(numberOfEdges=numberOfEdges,
                                 numberOfNodes=numberOfNodes)
pgq_edges_df.head()

Unnamed: 0,a,b
0,44768,49475
1,40553,20556
2,72275,46153
3,72854,70597
4,50531,59389


In [5]:
# In-memory DuckDB database that hosts the duckpgq tables and property graph.
db = duckdb.connect(database=':memory:')

In [6]:
# Copy the simulated edge list (the pgq_edges_df DataFrame, referenced by
# name inside the SQL) into a DuckDB table.  "create or replace" keeps the
# cell re-runnable: a plain "create table" fails once the table exists.
db.sql('''
create or replace table pgq_edges as
select *
from pgq_edges_df
''')

In [7]:
# duckpgq is a DuckDB community extension: it has to be installed before it
# can be loaded.  Without the INSTALL step LOAD fails on any environment
# where the extension was never installed.
db.sql('INSTALL duckpgq FROM community;')
db.sql('LOAD duckpgq;')

In [8]:
# Vertex table: every node id that appears as either endpoint of an edge.
# UNION (as opposed to UNION ALL) already removes duplicates, so no extra
# "select distinct" wrapper is required.
db.sql('''
 create or replace table pgq_nodes as
 select a as nodeId from pgq_edges
 union
 select b as nodeId from pgq_edges;
''')

In [9]:
# Declare the property graph on top of the two tables: pgq_nodes supplies the
# vertices and every pgq_edges row is an edge from column a to column b,
# labelled linkedWith.  OR REPLACE keeps the cell re-runnable; a plain CREATE
# fails when pgq_graph already exists.
db.sql('''
CREATE OR REPLACE PROPERTY GRAPH pgq_graph
  VERTEX TABLES (
    pgq_nodes
  )
  EDGE TABLES (
    pgq_edges SOURCE KEY (a)      REFERENCES pgq_nodes (nodeId)
              DESTINATION KEY (b) REFERENCES pgq_nodes (nodeId)
    LABEL linkedWith
  );
''')

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ 0 rows  │
└─────────┘

In [10]:
%%time 

# Wall-clock timing of duckpgq's weakly_connected_component on pgq_graph.
# .to_df() sits inside the timed region, so the measurement includes
# materialising the result as a DataFrame.
startTime = time()

pgq_cc = db.sql('''
SELECT * FROM weakly_connected_component(pgq_graph, pgq_nodes, linkedWith);
''').to_df()

pgqTime = time() - startTime

CPU times: user 34.4 s, sys: 109 ms, total: 34.5 s
Wall time: 34.3 s


In [11]:
%%time 

# Same wall-clock measurement for the connectedComponentsLabeler ("yoba")
# implementation, which is handed the edge-list DataFrame directly.
startTime = time()

yoba_cc = ccl(pgq_edges_df).getConnectedCompontents()

yobaTime = time() - startTime

CPU times: user 132 ms, sys: 4.99 ms, total: 137 ms
Wall time: 136 ms


# Check that the duckpgq and yoba methods lead to the same results (compared by component sizes, since the component IDs differ between the two methods)

In [12]:
# Ten largest duckpgq components, ranked by their number of distinct nodes.
# The query runs on duckdb's module-level default connection (not `db`) and
# refers to the pgq_cc DataFrame by its Python variable name.
duckdb.sql('''
select componentId, count(distinct nodeId) as ms_countOf_nodes
from pgq_cc
group by 1
order by 2 desc
limit 10
''')

┌─────────────┬──────────────────┐
│ componentId │ ms_countOf_nodes │
│    int64    │      int64       │
├─────────────┼──────────────────┤
│          42 │             1879 │
│          57 │              651 │
│         258 │              344 │
│         609 │              303 │
│          99 │              300 │
│          53 │              294 │
│         161 │              280 │
│           8 │              238 │
│         211 │              231 │
│         326 │              229 │
├─────────────┴──────────────────┤
│ 10 rows              2 columns │
└────────────────────────────────┘

In [13]:
# Ten largest yoba components, ranked by their number of distinct nodes.
# The query runs on duckdb's module-level default connection (not `db`) and
# refers to yoba_cc by its Python variable name.
# NOTE(review): presumably yoba_cc is a DataFrame with componentId / nodeId
# columns, as the query requires -- confirm against connectedComponentsLabeler.
duckdb.sql('''
select componentId, count(distinct nodeId) as ms_countOf_nodes
from yoba_cc
group by 1
order by 2 desc
limit 10
''')

┌─────────────┬──────────────────┐
│ componentId │ ms_countOf_nodes │
│    int64    │      int64       │
├─────────────┼──────────────────┤
│         570 │             1879 │
│       15623 │              651 │
│       26862 │              344 │
│       14072 │              303 │
│         603 │              300 │
│       24937 │              294 │
│       45661 │              280 │
│        9010 │              238 │
│       29990 │              231 │
│       38024 │              229 │
├─────────────┴──────────────────┤
│ 10 rows              2 columns │
└────────────────────────────────┘

In [14]:
# Component-size distribution for the yoba result: for every component size
# (number of distinct nodes) count how many components have that size,
# largest sizes first.  The inner query is left unsorted on purpose: the
# outer GROUP BY discards any inner ordering, so sorting there is wasted work.
yoba_ccSizeDistribution = duckdb.sql('''
select ms_countOf_nodes, count(*) as ms_countOf_components
from (
        select componentId, count(distinct nodeId) as ms_countOf_nodes
        from yoba_cc
        group by 1
     )
group by 1
order by 1 desc
''').to_df()

In [15]:
# Component-size distribution for the duckpgq result: for every component
# size (number of distinct nodes) count how many components have that size,
# largest sizes first.  The inner query is left unsorted on purpose: the
# outer GROUP BY discards any inner ordering, so sorting there is wasted work.
pgq_ccSizeDistribution = duckdb.sql('''
select ms_countOf_nodes, count(*) as ms_countOf_components
from (
        select componentId, count(distinct nodeId) as ms_countOf_nodes
        from pgq_cc
        group by 1
     )
group by 1
order by 1 desc
''').to_df()

In [16]:
# Preview the five largest component sizes found by the yoba method.
yoba_ccSizeDistribution.head(n=5)

Unnamed: 0,ms_countOf_nodes,ms_countOf_components
0,1879,1
1,651,1
2,344,1
3,303,1
4,300,1


In [17]:
# Preview the five largest component sizes found by duckpgq.
pgq_ccSizeDistribution.head(n=5)

Unnamed: 0,ms_countOf_nodes,ms_countOf_components
0,1879,1
1,651,1
2,344,1
3,303,1
4,300,1


In [18]:
# Enforce the check instead of only displaying it: a mismatch now raises and
# stops a Run All, rather than leaving a silent "False" in the output.
pd.testing.assert_frame_equal(pgq_ccSizeDistribution, yoba_ccSizeDistribution)

# Still show the boolean result as the cell output.
pgq_ccSizeDistribution.equals(yoba_ccSizeDistribution)

True

In [19]:
# How many times longer the duckpgq run took than the yoba run.
pgqTime / yobaTime

252.94862079923593