In [1]:
from time import perf_counter, time

import duckdb
import numpy as np
import pandas as pd

from connectedComponentsLabeling import connectedComponentsLabeler as ccl

# Function to simulate a random graph

In [2]:
def simulateDataFrame(numberOfEdges=100*1000,numberOfNodes=10*1000,seed=None):
    """Simulate a random graph as an edge-list DataFrame.

    Parameters
    ----------
    numberOfEdges : int
        Number of rows (edges) to generate.
    numberOfNodes : int
        Exclusive upper bound for node ids; both endpoints of every edge
        are drawn independently from ``[0, numberOfNodes)``.
    seed : int or None, optional
        When given, the endpoints are drawn from a dedicated
        ``np.random.RandomState(seed)`` so the same graph is produced on
        every call. When ``None`` (default) the global ``np.random`` state
        is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Two integer columns, ``'a'`` and ``'b'``, one row per edge.
        Self-loops and duplicate edges are possible because the endpoints
        are sampled independently.
    """
    # Both objects expose the same randint(low, high=..., size=...) API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.DataFrame({'a': rng.randint(0, high=numberOfNodes, size=numberOfEdges),
                         'b': rng.randint(0, high=numberOfNodes, size=numberOfEdges)})

# Test of duckpgq

In [3]:
# Size of the simulated graph used for the benchmark below:
# the exclusive upper bound for node ids and the number of edges to draw.
numberOfNodes = 80_000
numberOfEdges = 40_000

In [4]:
# Seed NumPy's global generator so the simulated graph (and therefore every
# result further down) is identical on each Restart & Run All.
np.random.seed(42)

# Random edge list with columns 'a' and 'b'; preview the first rows.
pgq_edges_df=simulateDataFrame(numberOfEdges=numberOfEdges,numberOfNodes=numberOfNodes)
pgq_edges_df.head()

Unnamed: 0,a,b
0,392,21686
1,39485,23332
2,15981,61465
3,23302,55114
4,32591,41948


In [5]:
# In-memory DuckDB connection that holds the graph tables for the duckpgq test.
db = duckdb.connect(database=':memory:')

In [6]:
# Materialise the simulated edge list (the pandas frame pgq_edges_df) as a
# DuckDB table. CREATE OR REPLACE keeps this cell re-runnable, matching the
# way pgq_nodes is created.
db.sql('''
create or replace table pgq_edges as
select *
from pgq_edges_df
''')

In [7]:
# duckpgq is a DuckDB community extension: it has to be installed before it
# can be loaded. INSTALL should be skipped by DuckDB when the extension is
# already present locally, so this stays cheap on repeated runs.
db.sql('INSTALL duckpgq FROM community;')
db.sql('LOAD duckpgq;')

In [8]:
# Vertex table: every node id that appears as either endpoint of an edge.
# UNION (without ALL) already removes duplicates, so no extra DISTINCT
# wrapper is needed.
db.sql('''
 create or replace table pgq_nodes as
 select a as nodeId from pgq_edges
 union
 select b as nodeId from pgq_edges;
''')

In [9]:
# Property graph definition: pgq_nodes is the vertex table, pgq_edges the
# edge table (a -> source, b -> destination, both referencing
# pgq_nodes.nodeId), exposed under the label linkedWith.
createGraphQuery = '''
CREATE PROPERTY GRAPH pgq_graph
  VERTEX TABLES (
    pgq_nodes
  )
  EDGE TABLES (
    pgq_edges SOURCE KEY (a)      REFERENCES pgq_nodes (nodeId)
              DESTINATION KEY (b) REFERENCES pgq_nodes (nodeId)
    LABEL linkedWith
  );
'''
db.sql(createGraphQuery)

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ 0 rows  │
└─────────┘

In [10]:
%%time 

startTime=time()

pgq_cc=db.sql('''
SELECT * FROM weakly_connected_component(pgq_graph, pgq_nodes, linkedWith);
''').to_df()

pgqTime=time()-startTime

CPU times: user 1min 13s, sys: 129 ms, total: 1min 13s
Wall time: 36.9 s


In [11]:
%%time 

startTime=time()

yoba_cc=ccl(pgq_edges_df).getConnectedCompontents()

yobaTime=time()-startTime

CPU times: user 141 ms, sys: 12 ms, total: 153 ms
Wall time: 151 ms


# Check that duckpgq and yoba methods lead to the same results

In [12]:
# Ten largest duckpgq components, measured by number of distinct nodes.
# The query reads the pandas DataFrame pgq_cc by name.
pgqTopComponentsQuery = '''
select componentId, count(distinct nodeId) as ms_countOf_nodes
from pgq_cc
group by 1
order by 2 desc
limit 10
'''
duckdb.sql(pgqTopComponentsQuery)

┌─────────────┬──────────────────┐
│ componentId │ ms_countOf_nodes │
│    int64    │      int64       │
├─────────────┼──────────────────┤
│          32 │             1353 │
│         108 │              379 │
│         104 │              297 │
│           5 │              272 │
│           3 │              252 │
│         253 │              245 │
│         966 │              235 │
│           8 │              232 │
│          16 │              219 │
│         206 │              214 │
├─────────────┴──────────────────┤
│ 10 rows              2 columns │
└────────────────────────────────┘

In [13]:
# Ten largest yoba components, measured by number of distinct nodes.
# The query reads the pandas DataFrame yoba_cc by name.
yobaTopComponentsQuery = '''
select componentId, count(distinct nodeId) as ms_countOf_nodes
from yoba_cc
group by 1
order by 2 desc
limit 10
'''
duckdb.sql(yobaTopComponentsQuery)

┌─────────────┬──────────────────┐
│ componentId │ ms_countOf_nodes │
│    int64    │      int64       │
├─────────────┼──────────────────┤
│       41089 │             1353 │
│       35303 │              379 │
│       15980 │              297 │
│       43560 │              272 │
│       17163 │              252 │
│       11038 │              245 │
│       49982 │              235 │
│       47956 │              232 │
│       35600 │              219 │
│       34417 │              214 │
├─────────────┴──────────────────┤
│ 10 rows              2 columns │
└────────────────────────────────┘

In [14]:
# Component-size distribution for yoba: for each component size (number of
# distinct nodes), how many components have that size; largest sizes first.
yobaSizeDistributionQuery = '''
select ms_countOf_nodes, count(*) as ms_countOf_components
from (
        select componentId, count(distinct nodeId) as ms_countOf_nodes
        from yoba_cc
        group by 1
        order by 2 desc
     )
group by 1
order by 1 desc
'''
yoba_ccSizeDistribution = duckdb.sql(yobaSizeDistributionQuery).to_df()

In [15]:
# Component-size distribution for duckpgq: for each component size (number of
# distinct nodes), how many components have that size; largest sizes first.
pgqSizeDistributionQuery = '''
select ms_countOf_nodes, count(*) as ms_countOf_components
from (
        select componentId, count(distinct nodeId) as ms_countOf_nodes
        from pgq_cc
        group by 1
        order by 2 desc
     )
group by 1
order by 1 desc
'''
pgq_ccSizeDistribution = duckdb.sql(pgqSizeDistributionQuery).to_df()

In [16]:
# Preview the first five rows of the yoba size distribution (largest sizes).
yoba_ccSizeDistribution.head(n=5)

Unnamed: 0,ms_countOf_nodes,ms_countOf_components
0,1353,1
1,379,1
2,297,1
3,272,1
4,252,1


In [17]:
# Preview the first five rows of the duckpgq size distribution (largest sizes).
pgq_ccSizeDistribution.head(n=5)

Unnamed: 0,ms_countOf_nodes,ms_countOf_components
0,1353,1
1,379,1
2,297,1
3,272,1
4,252,1


In [18]:
# Both methods must yield the same component-size distribution. Assert it so
# a mismatch stops Run All instead of silently displaying False, then show
# the boolean as the cell output.
sizeDistributionsMatch = pgq_ccSizeDistribution.equals(yoba_ccSizeDistribution)
assert sizeDistributionsMatch, 'duckpgq and yoba component-size distributions differ'
sizeDistributionsMatch

True

In [19]:
# Ratio of the two measured runtimes: duckpgq seconds per yoba second.
speedupFactor = pgqTime / yobaTime
speedupFactor

243.6736096442972