In [1]:
from src.pandas_pattern_generator import PandasPatternGenerator
from src.lsh import LSHashMap, hamming_dist, norm_vectors
from src.bloom_count import bloom
from src.pattern_overlap import pattern_overlap

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dask.dataframe.hyperloglog import compute_hll_array
import itertools


In [2]:
# Notebook-wide settings. NPats is the number of pattern ids compared below;
# BITS is not consumed by any live statement visible in this notebook view.
BITS = 8
NPats = 8

# NOTE(review): the meaning of the two constructor arguments is defined in
# src.pandas_pattern_generator — confirm there before changing them.
ppg = PandasPatternGenerator(10000, 10)

# Query the generator for every ordered pair (first, second) of pattern ids in
# 1..NPats, fold the flat result list into an NPats x NPats grid, and transpose.
overlaps = np.asarray(
    [
        ppg.get_overlap(first, second)
        for first, second in itertools.product(range(1, NPats + 1), repeat=2)
    ]
).reshape(NPats, NPats).T

In [3]:
# Display the transposed NPats x NPats grid of ppg.get_overlap results.
overlaps

array([[10000,  5000,  3334,  2500,  2000,  1667,  1429,  1250],
       [ 5000,  5000,  1667,  2500,  1000,  1667,   715,  1250],
       [ 3334,  1667,  3334,   834,   667,  1667,   477,   417],
       [ 2500,  2500,   834,  2500,   500,   834,   358,  1250],
       [ 2000,  1000,   667,   500,  2000,   334,   286,   250],
       [ 1667,  1667,  1667,   834,   334,  1667,   239,   417],
       [ 1429,   715,   477,   358,   286,   239,  1429,   179],
       [ 1250,  1250,   417,  1250,   250,   417,   179,  1250]])

In [10]:
# Baseline patterns produced by the generator, one per pattern id 1..NPats.
patterns = np.asarray([ppg.generate_pattern(x) for x in range(1, NPats + 1)])

# When the flag is set, the generated patterns above are replaced by the
# hand-built series below.
# NOTE(review): despite the flag's name, these series differ in length
# (N*x elements each, followed by two single-element series) — confirm intent.
sameLength = True
if sameLength:
    N = 10**4

    pds = [
        pd.Series(np.arange(N * x)) for x in [1, 2, 4, 10, 20]
    ] + [pd.Series([1]), pd.Series([1])]

    # The series have different lengths, so np.asarray(pds) would have to build
    # a ragged array; NumPy >= 1.24 rejects that with a ValueError. Fill a 1-D
    # object array element by element so each slot holds one whole Series.
    patterns = np.empty(len(pds), dtype=object)
    for idx, series in enumerate(pds):
        patterns[idx] = series

# Build the overlap helper over the chosen patterns and query it.
# NOTE(review): LSHwidth and max_ham_distance semantics live in
# src.pattern_overlap — presumably the hash width in bits and the largest
# Hamming distance at which two bins count as neighbours; verify there.
po = pattern_overlap(patterns, LSHwidth=32)
overlaps_close, neighbor_sets = po.get_overlaps(max_ham_distance=10)

print('neighbor sets = \n', neighbor_sets)
print('overlaps = \n', overlaps_close)

neighbor sets = 
 {(0, 1), (3, 4), (1, 2, 3), (2, 3, 4), (5, 6), (0, 1, 2)}
overlaps = 
 [[ 10000   7881      0      0      0      0      0]
 [  7881  20000  14834      0      0      0      0]
 [     0  14834  40000  26444      0      0      0]
 [     0      0  26444 100000  85618      0      0]
 [     0      0      0  85618 200000      0      0]
 [     0      0      0      0      0      1      0]
 [     0      0      0      0      0      0      1]]


In [11]:
# Hamming distance for every ordered pair of keys stored in the LSH map,
# collected as a flat list in row-major (first key outer, second key inner) order.
pairwise_hamming = [
    hamming_dist(key_a, key_b)
    for key_a, key_b in itertools.product(po.lsh.keys(), repeat=2)
]

# Fold the flat list into a square table with one row and column per LSH bin.
n_bins = len(po.lsh.bins)
h_dists = np.asarray(pairwise_hamming).reshape(n_bins, n_bins)
h_dists

array([[ 0, 10,  9, 15, 17, 25],
       [10,  0, 17, 21,  7, 29],
       [ 9, 17,  0,  6, 24, 18],
       [15, 21,  6,  0, 28, 14],
       [17,  7, 24, 28,  0, 22],
       [25, 29, 18, 14, 22,  0]])

In [6]:

# Pairwise similarity of the embeddings stored in po.embs (one row per pattern).
# Despite the "sin" in the names, both tables are built from dot products:
#   sinproj2[i, j] = e_i . e_j                        (raw dot product)
#   sinproj[i, j]  = e_i . e_j / (|e_i| * |e_j|)      (cosine similarity)
# The variable names are kept so later cells that reference them still work.
embs = np.asarray(po.embs, dtype=float)

# Each row norm is needed for every pair it takes part in; compute it once
# instead of once per (i, j) pair.
emb_norms = np.linalg.norm(embs, axis=1)

# The Gram matrix holds all pairwise dot products in a single matrix product.
sinproj2 = embs @ embs.T
# A zero-norm row yields nan entries here, exactly as a per-pair division would.
sinproj = sinproj2 / np.outer(emb_norms, emb_norms)

print(sinproj)
print()
print(sinproj2)

[[ 1.          0.76370156  0.3469626  -0.47634643 -0.74414614  0.27396644
   0.27396644]
 [ 0.76370156  1.          0.70658228 -0.10312097 -0.45544072 -0.1284388
  -0.1284388 ]
 [ 0.3469626   0.70658228  1.          0.47524061  0.09977501 -0.63959435
  -0.63959435]
 [-0.47634643 -0.10312097  0.47524061  1.          0.88767145 -0.95670059
  -0.95670059]
 [-0.74414614 -0.45544072  0.09977501  0.88767145  1.         -0.80194914
  -0.80194914]
 [ 0.27396644 -0.1284388  -0.63959435 -0.95670059 -0.80194914  1.
   1.        ]
 [ 0.27396644 -0.1284388  -0.63959435 -0.95670059 -0.80194914  1.
   1.        ]]

[[ 1.          0.76370156  0.3469626  -0.47634643 -0.74414614  0.27396644
   0.27396644]
 [ 0.76370156  1.          0.70658228 -0.10312097 -0.45544072 -0.1284388
  -0.1284388 ]
 [ 0.3469626   0.70658228  1.          0.47524061  0.09977501 -0.63959435
  -0.63959435]
 [-0.47634643 -0.10312097  0.47524061  1.          0.88767145 -0.95670059
  -0.95670059]
 [-0.74414614 -0.45544072  0.09977501

In [7]:
# Display the dimensions (rows, columns) of the embedding matrix held by po.
po.embs.shape

(7, 512)

---