In [1]:
# Load IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to project code (e.g. `src.data.utils`) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Display the value of every top-level expression in a cell, not just the last.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import os
from urllib.parse import quote

import connectorx as cx
import pandas as pd

import src.data.utils as du

## With dupes

Joining data from three medium-sized tables.

The "v1" index setup consists of two multicolumn B-tree indices on the lookup table: one on (`source`, `target`) and one on (`source_id`, `target_id`).

| `return_type` | lib | index on lookup | time (seconds) |
| --- | --- | --- | --- |
| None | pgAdmin | no | 53 |
| `arrow` | `connectorx` | no | 52 |
| `pandas` | `connectorx` | no | 115 |
| `pandas` | `pandas=1.3.5` | no | 72 |
| None | pgAdmin | v1 | 15 |
| `arrow` | `connectorx` | v1 | 10 |
| `pandas` | `connectorx` | v1 | 15 |
| `pandas` | `pandas=1.3.5` | v1 | 17 |

In [4]:
# Benchmark query ("with dupes"): keep every Companies House company (right
# outer join) and attach the names of any Data Hub / Export Wins records that
# the lookup table links to it. No de-duplication is applied, so a company
# with several lookup rows produces several output rows.
sql = """
    select
        ch.id,
        ch.company_name as ch_name,
        dh.name as dh_name,
        ew.company_name as ew_name
    from (
        select 
            *
        from
            _user_eaf4fd9a.lookup lookup
        where
            lookup.source = 'companieshouse_companies'
            and lookup.target in (
                'dit_data_hub__companies',
                'dit_export_wins__wins_dataset'
            )
    ) lookup
    right outer join companieshouse.companies ch on
        lookup.source_id = ch.id::text
        and lookup.source = 'companieshouse_companies'
    left join dit.data_hub__companies dh on
        lookup.target_id = dh.id::text
        and lookup.target = 'dit_data_hub__companies'
    left join dit.export_wins__wins_dataset ew on
        lookup.target_id = ew.id::text
        and lookup.target = 'dit_export_wins__wins_dataset'  
"""

In [9]:
%%time

df = cx.read_sql(
    conn = f"postgres://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}",
    query = sql,
    return_type = "arrow"
)

CPU times: user 5.07 s, sys: 375 ms, total: 5.44 s
Wall time: 10.6 s


In [6]:
%%time

df = cx.read_sql(
    conn = f"postgres://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}",
    query = sql,
    return_type = "pandas"
)

CPU times: user 5.39 s, sys: 673 ms, total: 6.07 s
Wall time: 17.3 s


In [7]:
%%time

with du.sql_engine.connect() as connection:
    df = pd.read_sql(
        sql, 
        connection
    )

CPU times: user 10.7 s, sys: 1.44 s, total: 12.2 s
Wall time: 19 s


In [8]:
# Summarise the most recently loaded result: row count, column dtypes and
# (shallow) memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5379033 entries, 0 to 5379032
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   id       object
 1   ch_name  object
 2   dh_name  object
 3   ew_name  object
dtypes: object(4)
memory usage: 164.2+ MB


## Without dupes

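This time only the top result is kept, i.e. the row with the highest `match_probability` for each `source_id`/`target`/`target_cluster` combination. Same data as above. There is no non-indexed benchmark for this query, sorry.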

The "v1" index setup consists of two multicolumn B-tree indices on the lookup table: one on (`source`, `target`) and one on (`source_id`, `target_id`).

| `return_type` | lib | index on lookup | time (seconds) |
| --- | --- | --- | --- |
| None | pgAdmin | v1 | x |
| `arrow` | `connectorx` | v1 | x |
| `pandas` | `connectorx` | v1 | x |
| `pandas` | `pandas=1.3.5` | v1 | x |

I terminated the first attempt after 5 mins. Too slow to be viable -- I need a new approach.

Why not join as above (quick) but bring in the `match_probability`, then take the max from the results?

In [13]:
# Benchmark query ("without dupes"): for each Companies House company keep only
# the highest-`match_probability` lookup row per (source_id, target,
# target_cluster), then attach the matched Data Hub / Export Wins names.
#
# The matched ids are aggregated into one array PER TARGET TABLE. Aggregating
# `target` and `target_id` into two independent arrays loses the pairing
# between them, so a Data Hub row could be joined on an id that actually
# belongs to an Export Wins lookup row (and vice versa).
# `case ... end` yields NULL for rows of the other target; `= any(...)` never
# evaluates to true for those NULL elements, so they cannot produce a match.
#
# The dedupe subquery already restricts `source` and `target`, so the filter
# is not repeated around the aggregation.
sql = """
    select
        ch.id,
        ch.company_name as ch_name,
        dh.name as dh_name,
        ew.company_name as ew_name
    from (
        select
            source,
            source_id,
            array_agg(
                case
                    when target = 'dit_data_hub__companies' then target_id
                end
            ) as dh_target_id,
            array_agg(
                case
                    when target = 'dit_export_wins__wins_dataset' then target_id
                end
            ) as ew_target_id
        from (
            select distinct on (
                lookup.source_id,
                lookup.target,
                lookup.target_cluster
            )
                *
            from
                _user_eaf4fd9a.lookup lookup
            where
                lookup.source = 'companieshouse_companies'
                and lookup.target in (
                    'dit_data_hub__companies',
                    'dit_export_wins__wins_dataset'
                )
            order by
                lookup.source_id,
                lookup.target,
                lookup.target_cluster,
                lookup.match_probability desc
        ) lookup
        group by
            source,
            source_id
    ) lookup
    right join companieshouse.companies ch on
        lookup.source_id = ch.id::text
        and lookup.source = 'companieshouse_companies'
    left join dit.data_hub__companies dh on
        dh.id::text = any(lookup.dh_target_id)
    left join dit.export_wins__wins_dataset ew on
        ew.id::text = any(lookup.ew_target_id)
"""

In [None]:
%%time

df = cx.read_sql(
    conn = f"postgres://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}",
    query = sql,
    return_type = "arrow"
)

In [None]:
%%time

df = cx.read_sql(
    conn = f"postgres://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}",
    query = sql,
    return_type = "pandas"
)

In [None]:
%%time

with du.sql_engine.connect() as connection:
    df = pd.read_sql(
        sql, 
        connection
    )

In [None]:
# Summarise the most recently loaded result: row count, column dtypes and
# (shallow) memory usage.
df.info()