In [1]:
from src.pandas_pattern_generator import PandasPatternGenerator
from src.lsh import LSHashMap, hamming_dist, norm_vectors
from src.bloom_count import bloom

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask.dataframe.hyperloglog import compute_hll_array
import itertools

In [2]:
# Precision argument handed to embed() when sketches are built in later cells.
BITS = 8
# Number of generated patterns; pattern ids run from 1 to NPats inclusive.
NPats = 4

# NOTE(review): the meaning of the two constructor arguments (1000, 10) is
# defined in src.pandas_pattern_generator and not visible here -- confirm
# before tuning them.
ppg = PandasPatternGenerator(1000, 10)

In [3]:
def embed(s: pd.Series, bits: int = 8) -> np.ndarray:
    """Build a sketch vector for a series by joining two summaries.

    The result is the array returned by dask's ``compute_hll_array(s, bits)``
    followed by the array returned by the project's
    ``bloom(s, 2**(bits-3), 2**3)``.

    Parameters
    ----------
    s : pd.Series
        Values to summarise.
    bits : int, default 8
        Passed unchanged to ``compute_hll_array`` and used to size the first
        numeric argument of ``bloom`` (``2**(bits-3)``).
        NOTE(review): dask's HyperLogLog helper appears to accept only a
        limited range of ``bits`` -- confirm before using other values.

    Returns
    -------
    np.ndarray
        ``np.concatenate`` of the two helper outputs, HyperLogLog part first.
        NOTE(review): assumes both helpers return 1-D arrays -- confirm.
    """
    hll_part = compute_hll_array(s, bits)
    # NOTE(review): the semantics of bloom's two size arguments live in
    # src.bloom_count; 2**3 is presumably a fixed row/hash count -- confirm.
    bloom_part = bloom(s, 2**(bits-3), 2**3)
    return np.concatenate([hll_part, bloom_part])


In [4]:
# Pattern ids are 1-based and run up to NPats inclusive.
pattern_ids = range(1, NPats + 1)

# One sketch per generated pattern, stacked row-wise and then normalised.
embeds = norm_vectors(
    np.asarray([embed(ppg.generate_pattern(pid), BITS) for pid in pattern_ids])
)

# Overlap for every ordered pair of pattern ids, laid out as a square matrix
# and transposed so that overlaps[i][j] == ppg.get_overlap(j + 1, i + 1).
overlaps = np.asarray(
    [ppg.get_overlap(first, second) for first in pattern_ids for second in pattern_ids]
).reshape(NPats, NPats).T

  res = (vs - means) / stds


In [5]:
# Show the normalised embedding matrix (one row per generated pattern).
embeds

array([[ 0.        ,  0.57735027,  0.        , ...,  1.57950795,
         1.64266008,  1.65353476],
       [ 0.        ,  0.57735027,  0.        , ...,  0.14359163,
        -0.18850198, -0.09186304],
       [ 0.        , -1.73205081,  0.        , ..., -0.86154979,
        -0.40393281, -0.6430413 ],
       [ 0.        ,  0.57735027,  0.        , ..., -0.86154979,
        -1.05022529, -0.91863042]])

In [6]:
# Show the pairwise overlap matrix reported by ppg.get_overlap.
overlaps

array([[1000,  500,  334,  250],
       [ 500,  500,  167,  250],
       [ 334,  167,  334,   84],
       [ 250,  250,   84,  250]])

In [7]:
# Bucket the normalised pattern embeddings with the project's LSH map.
# NOTE(review): the second argument (32) is presumably the hash width in
# bits -- confirm in src.lsh.
d = LSHashMap(embeds, 32)
# Display the bins: each key maps to the list of row indices stored under it.
d.bins

{309138089: [2], 533999120: [1], 2156763807: [3], 3979535718: [0]}

In [8]:
# Hamming distance between every ordered pair of bin keys. Rows and columns
# follow the iteration order of d.keys(), not the order of the input rows.
bin_keys = tuple(d.keys())
n_bins = len(d.bins)
h_dists = np.asarray(
    [hamming_dist(left, right) for left in bin_keys for right in bin_keys]
).reshape(n_bins, n_bins)
h_dists

array([[ 0, 17, 12, 27],
       [17,  0, 19, 20],
       [12, 19,  0, 25],
       [27, 20, 25,  0]])

---

In [31]:
# Five hand-built series of length 1000:
#   0: all zeros
#   1: 750 zeros followed by 250 ones
#   2: 500 zeros followed by 500 ones
#   3: the integers 0..999
#   4: the integers 500..1499 (shares 500..999 with series 3)
pds = [
    pd.Series([0]*1000),
    pd.Series([0]*750 + [1]*250),
    pd.Series([0]*500 + [1]*500),
    pd.Series(np.arange(1000)),
    pd.Series(np.arange(500, 1500)),
]

# Raw sketches, then the normalised version used for comparison.
arrs = np.asarray([embed(series, BITS) for series in pds])
narrs = norm_vectors(arrs)

# Cosine similarity for every ordered pair of normalised sketches. The
# vector lengths are computed once per row instead of once per pair.
norms = [np.linalg.norm(vec) for vec in narrs]
sims = [
    [np.dot(u, v) / (len_u * len_v) for v, len_v in zip(narrs, norms)]
    for u, len_u in zip(narrs, norms)
]

np.array(sims)

array([[ 1.        ,  0.97650562,  0.91853852, -0.92946651, -0.92664867],
       [ 0.97650562,  1.        ,  0.98040213, -0.94670575, -0.94903287],
       [ 0.91853852,  0.98040213,  1.        , -0.92806457, -0.93052106],
       [-0.92946651, -0.94670575, -0.92806457,  1.        ,  0.79850605],
       [-0.92664867, -0.94903287, -0.93052106,  0.79850605,  1.        ]])

In [33]:
# Bucket the normalised sketches of the hand-built series.
# NOTE(review): the second argument (8) is presumably the hash width in
# bits -- confirm in src.lsh.
d = LSHashMap(narrs, 8)
# Display the bins: each key maps to the list of row indices stored under it.
d.bins

{59: [0, 1], 123: [2], 132: [3], 228: [4]}

In [34]:
# Hamming distance between every ordered pair of bin keys. The matrix has one
# row/column per bin (in d.keys() order), so rows that share a bin collapse
# into a single entry.
bin_keys = tuple(d.keys())
n_bins = len(d.bins)
h_dists = np.asarray(
    [hamming_dist(left, right) for left in bin_keys for right in bin_keys]
).reshape(n_bins, n_bins)
h_dists

array([[0, 1, 7, 7],
       [1, 0, 8, 6],
       [7, 8, 0, 2],
       [7, 6, 2, 0]])

In [35]:
N = 10**3

# Nested integer ranges 0..N*k-1 for growing k: every series contains all
# values of the shorter ones before it.
pds = [pd.Series(np.arange(N*scale)) for scale in (1, 2, 4, 10)]

# Raw sketches, then the normalised version used for comparison.
arrs = np.asarray([embed(series, BITS) for series in pds])
narrs = norm_vectors(arrs)

# Cosine similarity for every ordered pair of normalised sketches. The
# vector lengths are computed once per row instead of once per pair.
norms = [np.linalg.norm(vec) for vec in narrs]
sims = [
    [np.dot(u, v) / (len_u * len_v) for v, len_v in zip(narrs, norms)]
    for u, len_u in zip(narrs, norms)
]

np.array(sims)

array([[ 1.        ,  0.61774776, -0.35228962, -0.8753627 ],
       [ 0.61774776,  1.        , -0.04936458, -0.85924114],
       [-0.35228962, -0.04936458,  1.        , -0.01883305],
       [-0.8753627 , -0.85924114, -0.01883305,  1.        ]])

In [38]:
# Bucket the normalised sketches of the nested-range series.
# NOTE(review): the second argument (32) is presumably the hash width in
# bits -- confirm in src.lsh.
d = LSHashMap(narrs, 32)
# Display the bins: each key maps to the list of row indices stored under it.
d.bins

{13322487: [1], 289620093: [0], 3689830307: [2], 4004822784: [3]}

In [39]:
# Hamming distance between every ordered pair of bin keys. Rows and columns
# follow the iteration order of d.keys(), not the order of the input series.
bin_keys = tuple(d.keys())
n_bins = len(d.bins)
h_dists = np.asarray(
    [hamming_dist(left, right) for left in bin_keys for right in bin_keys]
).reshape(n_bins, n_bins)
h_dists

array([[ 0,  8, 17, 27],
       [ 8,  0, 19, 29],
       [17, 19,  0, 16],
       [27, 29, 16,  0]])

In [40]:
N = 10**5

# Sliding windows of N consecutive integers, shifted by 0%, 20%, 50%, 80%
# and 100% of N, so the overlap with the first window shrinks step by step.
offsets = [0, int(N*0.20), int(N*0.50), int(N*0.80), N]
pds = [pd.Series(np.arange(start, N + start)) for start in offsets]

# Raw sketches, then the normalised version used for comparison.
arrs = np.asarray([embed(series, BITS) for series in pds])
narrs = norm_vectors(arrs)

# Cosine similarity for every ordered pair of normalised sketches. The
# vector lengths are computed once per row instead of once per pair.
norms = [np.linalg.norm(vec) for vec in narrs]
sims = [
    [np.dot(u, v) / (len_u * len_v) for v, len_v in zip(narrs, norms)]
    for u, len_u in zip(narrs, norms)
]

np.array(sims)

array([[ 1.        ,  0.30777888, -0.3184776 , -0.60875666, -0.56948444],
       [ 0.30777888,  1.        , -0.09509274, -0.58381856, -0.59714992],
       [-0.3184776 , -0.09509274,  1.        , -0.0601579 , -0.27921485],
       [-0.60875666, -0.58381856, -0.0601579 ,  1.        ,  0.3539794 ],
       [-0.56948444, -0.59714992, -0.27921485,  0.3539794 ,  1.        ]])

In [41]:
# Bucket the normalised sketches of the sliding-window series.
# NOTE(review): the second argument (32) is presumably the hash width in
# bits -- confirm in src.lsh.
d = LSHashMap(narrs, 32)
# Display the bins: each key maps to the list of row indices stored under it.
d.bins

{748931278: [2],
 901142650: [4],
 1515984801: [0],
 2737007482: [3],
 3428693509: [1]}

In [42]:
# Hamming distance between every ordered pair of bin keys. Rows and columns
# follow the iteration order of d.keys(), not the order of the input series.
bin_keys = tuple(d.keys())
n_bins = len(d.bins)
h_dists = np.asarray(
    [hamming_dist(left, right) for left in bin_keys for right in bin_keys]
).reshape(n_bins, n_bins)
h_dists

array([[ 0, 12, 26, 14, 21],
       [12,  0, 22, 12, 25],
       [26, 22,  0, 24, 11],
       [14, 12, 24,  0, 25],
       [21, 25, 11, 25,  0]])