In [None]:
import numpy as np
import polars as pl  # previously aliased `pa`, which the pyarrow import below overwrote
import pyarrow as pa
import pyarrow.parquet as pq

from matchbox.common.hash import hash_data

# Build a large synthetic cluster table and write it to 'large.parquet'.
#
# Columns (see `cluster_schema`):
#   cluster_id   - sequential row number, 0 .. total_rows - 1
#   probability  - random integer in [0, 100)
#   cluster_hash - hash_data() of a random id in [0, max_source_id)
#   leaves       - list of `leaves_per_cluster` hash_data() values of random ids
#
# NOTE(review): hash_data is assumed to return bytes, since both hash columns
# are declared as binary - confirm against matchbox.common.hash.

SEED = 42  # fixed seed so the generated file is the same on every run
max_source_id = 1_000_000
total_rows = 10_000_000
leaves_per_cluster = 5

rng = np.random.default_rng(SEED)

cluster_schema = pa.schema(
    [
        ("cluster_id", pa.uint32()),
        ("probability", pa.uint8()),
        ("cluster_hash", pa.binary()),
        ("leaves", pa.list_(pa.binary())),
    ]
)

# Numeric columns are created directly in their target dtype, so Arrow does
# not have to iterate Python ints or cast from int64 afterwards.
cluster_ids = np.arange(total_rows, dtype=np.uint32)
probabilities = rng.integers(0, 100, size=total_rows, dtype=np.uint8)

# One vectorised draw per column instead of one NumPy call per row.
cluster_hash_ids = rng.integers(0, max_source_id, size=total_rows)
leaf_ids = rng.integers(
    0, max_source_id, size=(total_rows, leaves_per_cluster), dtype=np.uint32
)

df = pa.Table.from_pydict(
    {
        "cluster_id": pa.array(cluster_ids),
        "probability": pa.array(probabilities),
        "cluster_hash": pa.array(
            [hash_data(int(scid)) for scid in cluster_hash_ids],
            type=pa.binary(),
        ),
        "leaves": pa.array(
            [[hash_data(int(leaf)) for leaf in row] for row in leaf_ids],
            type=pa.list_(pa.binary()),
        ),
    },
    schema=cluster_schema,
)

pq.write_table(df, 'large.parquet')

In [6]:
import numpy as np
import polars as pl  # previously aliased `pa`, which the pyarrow import below overwrote
import pyarrow as pa
import pyarrow.parquet as pq

from matchbox.common.hash import hash_data

import random, string
import uuid

# Build a synthetic key table in memory.
#
# Columns:
#   source_id - random integer in [0, 10)
#   hash      - hash_data() of the row number
#   key       - a random UUID4 rendered as a string
#
# NOTE(review): uuid4 values cannot be seeded, so this table differs on
# every run regardless of NumPy's random state.

total_rows = 100_000_000

key_schema = pa.schema(
    [
        ("source_id", pa.uint32()),
        ("hash", pa.binary()),
        ("key", pa.string()),
    ]
)

df = pa.Table.from_pydict(
    {
        # Draw directly as uint32 to avoid materialising an int64 array of
        # total_rows elements that would then be cast down.
        "source_id": pa.array(
            np.random.randint(0, 10, total_rows, dtype=np.uint32)
        ),
        "hash": pa.array([hash_data(scid) for scid in range(total_rows)]),
        "key": pa.array([str(uuid.uuid4()) for _ in range(total_rows)]),
    },
    schema=key_schema,
)
# NOTE: the table is not written to disk in this cell.

In [7]:
# Persist the in-memory `df` table as a Parquet file named 'keys.parquet'.
pq.write_table(df, where='keys.parquet')

In [1]:
import pyarrow.parquet as pq
# Load 'large.parquet' from the working directory into a pyarrow Table.
table = pq.read_table(source='large.parquet')

In [2]:
import polars as pl

In [3]:
# Convert the Arrow `table` into a polars object via pl.from_arrow.
# NOTE(review): this rebinds `df`, which the generator cells bind to a
# pyarrow Table - after this cell `df` is a polars object instead.
df = pl.from_arrow(table)

In [7]:
import polars as pl

from matchbox.server.base import (
    MatchboxDBAdapter,
    MatchboxServerSettings,
    get_backend_settings,
    settings_to_backend,
)

# Resolve the settings class for the configured backend type, instantiate it,
# and build the backend adapter from those settings. (The original cell made
# this lookup twice and threw the first result away.)
SettingsClass = get_backend_settings(MatchboxServerSettings().backend_type)
SETTINGS = SettingsClass()
BACKEND = settings_to_backend(SETTINGS)


# Dump two empty frames under id 123, then load them back. The load call is
# the cell's last expression, so its return value is what the notebook shows.
# NOTE(review): the meaning of the first argument (123) is not visible here -
# confirm against the adapter's s3_dump / s3_load definitions.
BACKEND.s3_dump(123, pl.DataFrame(), pl.DataFrame())
BACKEND.s3_load(123)

INFO  [alembic.runtime.migration] Context impl PostgresqlImpl.
INFO  [alembic.runtime.migration] Will assume transactional DDL.


(shape: (0, 0)
 ┌┐
 ╞╡
 └┘,
 shape: (0, 0)
 ┌┐
 ╞╡
 └┘)

In [1]:
from matchbox.client.helpers.index import index
from matchbox.common.factories.sources import (
    FeatureConfig,
    LinkedSourcesTestkit,
    SourceConfig,
    SuffixRule,
    linked_sources_factory,
)
from sqlalchemy import create_engine, text


import os

from sqlalchemy.engine import URL

# Warehouse connection details. Each value can be overridden through the
# environment variable named below; the fallbacks are the values this cell
# previously hard-coded, so behaviour is unchanged when nothing is set.
user = os.environ.get("WAREHOUSE_USER", "warehouse_user")
password = os.environ.get("WAREHOUSE_PASSWORD", "warehouse_password")
host = os.environ.get("WAREHOUSE_HOST", "localhost")
database = os.environ.get("WAREHOUSE_DATABASE", "warehouse")
port = int(os.environ.get("WAREHOUSE_PORT", "7654"))

# URL.create escapes special characters in the credentials, which plain
# string interpolation into a URL does not.
postgres_warehouse = create_engine(
    URL.create(
        "postgresql+psycopg",
        username=user,
        password=password,
        host=host,
        port=port,
        database=database,
    )
)

n_true_entities = 100


# Create feature configurations
features = {
    "company_name": FeatureConfig(
        name="company_name",
        base_generator="company",
    ),
    "crn": FeatureConfig(
        name="crn",
        base_generator="bothify",
        parameters=(("text", "???-###-???-###"),),
    ),
    "duns": FeatureConfig(
        name="duns",
        base_generator="numerify",
        parameters=(("text", "########"),),
    ),
    "cdms": FeatureConfig(
        name="cdms",
        base_generator="numerify",
        parameters=(("text", "ORG-########"),),
    ),
}

# Create source configurations that match our test fixtures
source_configs = (
    SourceConfig(
        full_name="e2e.crn",
        engine=postgres_warehouse,
        features=(
            features["company_name"].add_variations(
                SuffixRule(suffix=" Limited"),
                SuffixRule(suffix=" UK"),
                SuffixRule(suffix=" Company"),
            ),
            features["crn"],
        ),
        drop_base=True,
        n_true_entities=n_true_entities,
        repetition=0,  # No duplicates within the variations
    ),
    SourceConfig(
        full_name="e2e.duns",
        engine=postgres_warehouse,
        features=(
            features["company_name"],
            features["duns"],
        ),
        n_true_entities=n_true_entities // 2,  # Half the companies
        repetition=0,
    ),
    SourceConfig(
        full_name="e2e.cdms",
        engine=postgres_warehouse,
        features=(
            features["crn"],
            features["cdms"],
        ),
        n_true_entities=n_true_entities,
        repetition=1,  # Duplicate all rows
    ),
)

linked_testkit = linked_sources_factory(
    source_configs=source_configs,
    seed=42,  # For reproducibility
)

# The three sources above all live in the `e2e` schema; make sure it exists
# before any table is written.
with postgres_warehouse.connect() as conn:
    conn.execute(text("CREATE SCHEMA IF NOT EXISTS e2e;"))
    conn.commit()

# Setup code - Create tables in warehouse
for source_testkit in linked_testkit.sources.values():
    source_testkit.to_warehouse(engine=postgres_warehouse)

# Index every source only after all tables have been written; the two loops
# are kept separate so that ordering is preserved.
for source_testkit in linked_testkit.sources.values():
    source = source_testkit.source
    index(
        full_name=source.address.full_name,
        db_pk="pk",  # Primary key in our test data
        engine=postgres_warehouse,
        columns=[col.model_dump() for col in source.columns],
    )
