In [3]:
# Reload imported modules automatically before each cell is executed, so edits
# to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [19]:
import os
from urllib.parse import quote

import connectorx as cx
import pandas as pd

import src.data.utils as du

## With dupes

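Joining data from three medium-sized tables. The table below records how long the query took with each client library and return type, with and without an index on the lookup table.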

| `return_type` | lib | index on lookup | time (seconds) |
| --- | --- | --- | --- |
| None | pgAdmin | no | 53 |
| `arrow` | `connectorx` | no | 52 |
| `pandas` | `connectorx` | no | 115 |
| `pandas` | `pandas=1.3.5` | no | 72 |
| None | pgAdmin | yes | 15 |
| `arrow` | `connectorx` | yes | 7 |
| `pandas` | `connectorx` | yes | 15 |
| `pandas` | `pandas=1.3.5` | yes | 17 |

In [20]:
# Query for the "with dupes" benchmark. The lookup table is filtered to rows
# whose source is Companies House and whose target is Data Hub or Export Wins.
# The right outer join keeps every Companies House company, matched or not,
# and the two left joins attach the Data Hub / Export Wins name that each
# lookup row points at. A company with several lookup rows therefore appears
# once per lookup row in the result.
sql = """
    select
        ch.id,
        ch.company_name as ch_name,
        dh.name as dh_name,
        ew.company_name as ew_name
    from (
        select 
            *
        from
            _user_eaf4fd9a.lookup lookup
        where
            lookup.source = 'companieshouse_companies'
            and lookup.target in (
                'dit_data_hub__companies',
                'dit_export_wins__wins_dataset'
            )
    ) lookup
    right outer join companieshouse.companies ch on
        lookup.source_id = ch.id::text
        and lookup.source = 'companieshouse_companies'
    left join dit.data_hub__companies dh on
        lookup.target_id = dh.id::text
        and lookup.target = 'dit_data_hub__companies'
    left join dit.export_wins__wins_dataset ew on
        lookup.target_id = ew.id::text
        and lookup.target = 'dit_export_wins__wins_dataset'  
"""

In [25]:
%%time

df = cx.read_sql(
    conn = f"postgres://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}",
    query = sql,
    return_type = "arrow"
)

CPU times: user 3.56 s, sys: 123 ms, total: 3.68 s
Wall time: 8.66 s


In [26]:
%%time

df = cx.read_sql(
    conn = f"postgres://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}",
    query = sql,
    return_type = "pandas"
)

CPU times: user 4.03 s, sys: 424 ms, total: 4.46 s
Wall time: 15.2 s


In [27]:
%%time

with du.sql_engine.connect() as connection:
    df = pd.read_sql(
        sql, 
        connection
    )

CPU times: user 9.63 s, sys: 844 ms, total: 10.5 s
Wall time: 17.3 s


In [28]:
# Summarise the most recently loaded `df`: row count, columns, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5379036 entries, 0 to 5379035
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   id       object
 1   ch_name  object
 2   dh_name  object
 3   ew_name  object
dtypes: object(4)
memory usage: 164.2+ MB


## Without dupes

Duplicates are removed by keeping only the top result for each combination of source ID, target and target cluster.

In [29]:
# Query for the "without dupes" benchmark. `distinct on` keeps one lookup row
# per (source_id, target, target_cluster); the surviving rows are collapsed
# with array_agg into one row per (source, source_id) and then joined to the
# Companies House, Data Hub and Export Wins tables.
# FIXME: the recorded run of this query fails with
# "column lookup.match_probability does not exist". The last `order by` key
# must be replaced with a ranking column that exists on the lookup table
# -- TODO confirm against the table schema.
# NOTE(review): `target` and `target_id` are aggregated into two separate
# arrays and each join tests them independently with any(), so an id is not
# tied to the target it was recorded against -- confirm this is intended.
# NOTE(review): the outer `where` repeats the filter already applied in the
# inner select.
sql = """
    select
        ch.id,
        ch.company_name as ch_name,
        dh.name as dh_name,
        ew.company_name as ew_name
    from (
        select
            source,
            source_id,
            array_agg(target) as target, 
            array_agg(target_id) as target_id
        from (
            select distinct on (
                lookup.source_id, 
                lookup.target,
                lookup.target_cluster
            )
                *
            from
                _user_eaf4fd9a.lookup lookup
            where
                lookup.source = 'companieshouse_companies'
                and lookup.target in (
                    'dit_data_hub__companies',
                    'dit_export_wins__wins_dataset'
                )
            order by
                lookup.source_id, 
                lookup.target,
                lookup.target_cluster,
                lookup.match_probability desc
        ) lookup
        where
            lookup.source = 'companieshouse_companies'
            and lookup.target in (
                'dit_data_hub__companies',
                'dit_export_wins__wins_dataset'
            )
        group by
            source,
            source_id
    ) lookup
    right join companieshouse.companies ch on
        lookup.source_id = ch.id::text 
        and lookup.source = 'companieshouse_companies'
    left join dit.data_hub__companies dh on
        dh.id::text = any(lookup.target_id)
        and 'dit_data_hub__companies' = any(lookup.target)
    left join dit.export_wins__wins_dataset ew on
        ew.id::text = any(lookup.target_id)
        and 'dit_export_wins__wins_dataset' = any(lookup.target);
"""

In [30]:
%%time

df = cx.read_sql(
    conn = f"postgres://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}",
    query = sql,
    return_type = "arrow"
)

RuntimeError: db error: ERROR: column lookup.match_probability does not exist

In [None]:
%%time

df = cx.read_sql(
    conn = f"postgres://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}",
    query = sql,
    return_type = "pandas"
)

In [None]:
%%time

with du.sql_engine.connect() as connection:
    df = pd.read_sql(
        sql, 
        connection
    )

In [None]:
# Summarise the most recently loaded `df`: row count, columns, dtypes and memory use.
df.info()