# Disjoint

This is the main development notebook for test-data generation and benchmark analysis. The goal is to see how algorithm design and data distribution affect the runtime cost of finding the unique sets in a collection of sets that may contain duplicates.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import dispy

In [20]:
def gen_rng_set(gen: np.random.Generator, n_sets: int, set_size: int) -> np.ndarray:
    """Draw a random 0/1 matrix with one row per set.

    Args:
        gen: NumPy random generator to draw from (its state is advanced).
        n_sets: Number of rows to generate.
        set_size: Number of columns in every row.

    Returns:
        Integer array of shape ``(n_sets, set_size)`` whose entries are 0 or 1.
        Rows are drawn independently, so duplicate rows are possible.
    """
    return gen.integers(2, size=(n_sets, set_size))


def gen_uniq_rng_set(
    n_sets: int,
    set_size: int,
    seed: int = 42,
    iterations: float = 1e5,
    *,
    unique: bool = False,
) -> np.ndarray:
    """Generate a seeded random 0/1 matrix, optionally with all rows distinct.

    By default the rows are NOT deduplicated, despite the function name: the
    first batch already contains ``n_sets`` rows, so it is returned exactly as
    drawn. That default is kept so existing callers and previously recorded
    results do not change. Pass ``unique=True`` to get distinct rows:
    duplicates are dropped (first occurrence kept, draw order preserved) and
    the matrix is topped up with fresh draws until ``n_sets`` distinct rows
    exist.

    Args:
        n_sets: Number of rows requested.
        set_size: Number of columns in every row.
        seed: Seed for ``np.random.default_rng``; the same seed gives the same
            matrix.
        iterations: Maximum number of draw rounds; truncated with ``int()``.
        unique: When true, guarantee that all returned rows are distinct.

    Returns:
        Integer array of shape ``(n_sets, set_size)`` with entries 0 or 1.
        If ``int(iterations) < 1`` no round runs and an empty
        ``(0, set_size)`` int32 array is returned.

    Raises:
        ValueError: ``unique=True`` and ``n_sets`` exceeds ``2 ** set_size``,
            the number of distinct 0/1 rows of that width.
        RuntimeError: ``unique=True`` and the iteration budget ran out before
            ``n_sets`` distinct rows were collected.
    """
    dst = np.empty((0, set_size), dtype=np.int32)

    if unique and n_sets > 2 ** set_size:
        raise ValueError(
            f"cannot draw {n_sets} distinct binary rows of width {set_size}: "
            f"only {2 ** set_size} exist"
        )

    gen = np.random.default_rng(seed=seed)
    for _ in range(int(iterations)):
        # Only request the rows that are still missing.
        batch = gen_rng_set(gen, n_sets=n_sets - len(dst), set_size=set_size)
        dst = np.vstack((dst, batch))

        # Zero or one row is trivially duplicate-free, so skip the work.
        if unique and len(dst) > 1:
            # np.unique returns rows sorted; indexing with the sorted
            # first-occurrence positions restores the original draw order.
            _, first_seen = np.unique(dst, axis=0, return_index=True)
            dst = dst[np.sort(first_seen)]

        if len(dst) == n_sets:
            break

    if unique and len(dst) != n_sets:
        raise RuntimeError(
            f"collected only {len(dst)} of {n_sets} distinct rows "
            f"in {int(iterations)} iterations"
        )

    return dst

# Benchmark input: 1000 rows of 10 binary entries each, drawn with the default seed.
np_sets = gen_uniq_rng_set(n_sets=1000, set_size=10)


In [21]:
# Reference answer from NumPy: the distinct rows and how often each one occurs.
uniques, cnts = np.unique(np_sets, return_counts=True, axis=0)
print(uniques.shape[0])

627


In [22]:
# Hand the NumPy matrix to dispy; the result is what dispy.unique consumes below.
# NOTE(review): presumably one subset per row of np_sets — confirm against dispy.create_subsets.
tb_sets = dispy.create_subsets(np_sets)

In [23]:
# Number of distinct sets according to dispy, for comparison with the np.unique count.
print(len(dispy.unique(tb_sets)))

627


In [24]:
%%timeit
# Baseline timing: NumPy row-wise deduplication of the raw matrix.
np.unique(np_sets, axis=0)

2.83 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
%%timeit
# dispy timing on the already-converted tb_sets; the create_subsets call runs
# in an earlier cell, so its cost is not part of this measurement.
dispy.unique(tb_sets)

499 µs ± 84.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
