# DBP experiment
1. Run the tp-tp model and the tp-dbp model.
2. Set k to a smaller number and verify my guess.
3. Use the edge dataset to evaluate the kNN property.

* parametric_umap, tp-tp -> encoder_original
* parametric_umap_DBP,tp-tp -> encoder_core
* parametric_umap_DBP, tp-dbp -> encoder_dbp

## create edges

In [1]:
import numpy as np
# Read the cached .npy arrays from the current working directory.
train_data = np.load("train_data.npy")
train_pred_labels = np.load("train_pred_labels.npy")
test_data = np.load("test_data.npy")
test_pred_labels = np.load("test_pred_labels.npy")
# Border-centre rows and their labels; these are appended after the training rows
# when the fitting set is assembled in the next cell.
border_center = np.load("border_center.npy")
border_center_labels= np.load("border_center_labels.npy")

In [2]:
# Not referenced again inside this cell.
batch_size = 20

# Row budget per source. `augmentation_num` is kept for reference only:
# augmentation data is currently left out of the concatenation below.
train_num = 50000
augmentation_num = 10000
border_num = 5000

# Fitting set = last `train_num` training rows followed by the first
# `border_num` border-centre rows, stacked along axis 0.
fitting_data = np.concatenate(
    [train_data[-train_num:], border_center[:border_num]],
    axis=0,
)
fitting_data.shape

(55000, 2048)

In [3]:
# Settings for the approximate nearest-neighbour search, derived from the
# number of rows in the fitting set.
n_samples = fitting_data.shape[0]

# random-projection forest size
n_trees = 5 + int(round(n_samples ** 0.5 / 20.0))
# upper bound on NN-descent iterations (never fewer than 5)
n_iters = max(5, int(round(np.log2(n_samples))))
# distance used for neighbour search
metric = "euclidean"
# k of the k-neighbour graph
n_neighbors = 15

In [4]:
from pynndescent import NNDescent
# Build the approximate nearest-neighbour index over the fitting set.
# Every sample is flattened to a single row first. np.prod is used because the
# np.product alias is deprecated and no longer exists in NumPy 2.0.
nnd = NNDescent(
    fitting_data.reshape((len(fitting_data), np.prod(np.shape(fitting_data)[1:]))),
    n_neighbors=n_neighbors,
    metric=metric,
    n_trees=n_trees,
    n_iters=n_iters,
    max_candidates=60,
    verbose=True
)

Wed Jan 20 21:09:17 2021 Building RP forest with 17 trees
Wed Jan 20 21:09:21 2021 NN descent for 16 iterations
	 1  /  16
	 2  /  16
	 3  /  16
	Stopping threshold met -- exiting after 3 iterations


In [5]:
from sklearn.utils import check_random_state
from umap.umap_ import fuzzy_simplicial_set
# Neighbour ids and the matching distances, as exposed by the NNDescent index.
knn_indices, knn_dists = nnd.neighbor_graph
# Passing None means no explicit seed is supplied here.
random_state = check_random_state(None)

In [6]:
# Turn the precomputed kNN result into UMAP's fuzzy graph. The neighbour ids and
# distances are passed in explicitly, and return_dists=True asks for the
# distance output as the fourth return value.
fuzzy_kwargs = dict(
    X=fitting_data,
    n_neighbors=n_neighbors,
    metric=metric,
    random_state=random_state,
    knn_indices=knn_indices,
    knn_dists=knn_dists,
    return_dists=True,
)
umap_graph, sigmas, rhos, dists = fuzzy_simplicial_set(**fuzzy_kwargs)

In [7]:
# COO form gives direct access to the row / col / data triplets.
graph = umap_graph.tocoo()
graph.sum_duplicates()
graph.eliminate_zeros()

# Edges weaker than 1/500 of the strongest edge are zeroed and then dropped.
weight_floor = graph.data.max() / float(500)
too_weak = graph.data < weight_floor
graph.data[too_weak] = 0.0
graph.eliminate_zeros()
graph

<55000x55000 sparse matrix of type '<class 'numpy.float32'>'
	with 1099774 stored elements in COOrdinate format>

In [8]:
# One entry per stored edge: source node id, target node id, edge weight.
head = graph.row
tail = graph.col
weight = graph.data
(len(head), len(tail))

(1099774, 1099774)

In [9]:
# The fitting set stacks `train_num` training rows first and border-centre rows
# after them, so node ids below `train_num` are tp nodes. Using the constant
# instead of a repeated literal keeps the masks right if the split size changes.
tp_head = head < train_num
tp_tail = tail < train_num
# Edges whose two endpoints are both tp nodes.
tp_tp = tp_head & tp_tail
np.sum(tp_tp)

1003630

In [82]:
# Halves of two stored-entry counts; 1003630 is the tp-tp count printed above.
# NOTE(review): the dbp-dbp cell prints 85116, not 85118 — confirm which value was meant.
1003630/2,85118/2

(501815.0, 42559.0)

In [10]:
# Node ids at or above `train_num` index the border-centre rows appended after
# the training rows, i.e. the dbp nodes. `train_num` replaces the repeated
# literal so the masks follow the split size.
dbp_head = head >= train_num
dbp_tail = tail >= train_num
# Edges whose two endpoints are both dbp nodes.
dbp_dbp = dbp_head & dbp_tail
np.sum(dbp_dbp)

85116

In [11]:
# Mixed edges in either direction: tp -> dbp or dbp -> tp.
tp_to_dbp = tp_head & dbp_tail
dbp_to_tp = dbp_head & tp_tail
tp_dbp = tp_to_dbp | dbp_to_tp
np.sum(tp_dbp)

11028

In [12]:
# Rebinds tp_dbp to the directed subset only (head is tp, tail is dbp); from
# here on the name no longer includes dbp -> tp entries.
tp_dbp = (tp_head & dbp_tail)
np.sum(tp_dbp)

5514

In [13]:
# Positions (into head / tail / weight) of the tp -> dbp edges, as a 1-D array.
# np.flatnonzero always returns a 1-D result, whereas argwhere followed by
# squeeze collapses to a 0-d array when exactly one edge matches, which would
# break len(edges) and the loops over edges further down.
edges = np.flatnonzero(tp_dbp)
edges

array([1009144, 1009145, 1009146, ..., 1099657, 1099658, 1099674],
      dtype=int64)

In [14]:
# Distinct source and target node ids among the selected edges, next to the
# total number of selected edges.
heads = set(head[edges])
tails = set(tail[edges])
(len(heads), len(tails), len(edges))

(710, 2213, 5514)

## load different autoencoder

In [15]:
# define encoder
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
# Enable memory growth on the first GPU only when one is present; indexing an
# empty device list would raise IndexError on a CPU-only machine.
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [23]:
import os
# Saved-model directory for this experiment. os.path.join picks the right
# separator on every platform; the previous literal hard-coded a Windows
# backslash and depended on "\d" not being a recognised escape sequence.
save_location = os.path.join("parametric_umap_models", "dbp_exp")
# load model
# load encoder
encoder_output = os.path.join(save_location, "encoder_core_tpdp")
# Fail loudly when the model is missing: silently skipping the load would leave
# `load_encoder` undefined, or still bound to a previously loaded encoder.
if not os.path.exists(encoder_output):
    raise FileNotFoundError("Keras encoder model not found at {}".format(encoder_output))
load_encoder = tf.keras.models.load_model(encoder_output)
print("Keras encoder model loaded from {}".format(encoder_output))

# # load decoder
# decoder_output = os.path.join(save_location, "decoder_original")
# if os.path.exists(decoder_output):
#     load_decoder = tf.keras.models.load_model(decoder_output)
#     print("Keras decoder model loaded from {}".format(decoder_output))

Keras encoder model loaded from parametric_umap_models\dbp_exp\encoder_core_tpdp


In [17]:
# Encode the border-centre rows and the training rows with the loaded encoder.
# .numpy() is called directly: the extra .cpu() hop is deprecated (it is what
# emits the "Use tf.identity instead" warning) and is not needed to get a NumPy
# array. Assumes load_encoder returns an eager TensorFlow tensor.
low_center = load_encoder(border_center).numpy()
low_train = load_encoder(train_data).numpy()

Instructions for updating:
Use tf.identity instead.


In [6]:
## nndescent?
from sklearn.neighbors import KDTree
# One KD-tree over the border-centre rows in the original space ...
high_tree = KDTree(border_center) 
# ... and one over their encoded counterparts.
low_tree = KDTree(low_center)

In [7]:
# Number of border-centre neighbours fetched per training row. No visible cell
# assigns N, so it is set here to keep the cell runnable on a fresh kernel.
# 10 matches the recorded histogram below, whose largest overlap is 10 —
# NOTE(review): confirm this is the value originally used.
N = 10
_, high_ind = high_tree.query(train_data, k=N)
_, low_ind = low_tree.query(low_train, k=N)

In [8]:
# For every training row, count the neighbour ids returned by both the
# high-dimensional and the low-dimensional query.
border_pres = np.array(
    [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(train_data))],
    dtype=np.float64,
)

In [13]:
# Histogram of the overlap counts: entry j is the number of rows sharing exactly j neighbours.
np.bincount(border_pres.astype("int64"))

array([32185,  9015,  2494,  1806,  3116,  1093,   162,    66,    49,
          13,     1], dtype=int64)

The cells below evaluate on the training edges only.

In [24]:
# core tpdp
# High-dimensional endpoints of every selected edge: row i of `tp` is the head
# of edge i and row i of `dbp` is its tail. The index arrays are no longer
# stored under the same names as the data they select.
# NOTE(review): a tail node can occur in several edges, so `dbp` may contain
# repeated rows and the KD-tree ids below refer to edge positions.
tp = fitting_data[head[edges]]
dbp = fitting_data[tail[edges]]
# .numpy() is called directly; the deprecated .cpu() hop is not needed.
low_center = load_encoder(dbp).numpy()
low_train = load_encoder(tp).numpy()
## nndescent?
from sklearn.neighbors import KDTree
high_tree = KDTree(dbp)
low_tree = KDTree(low_center)

In [19]:
# N=10
# Ten nearest dbp rows of every tp row, in the original and in the encoded space.
_, high_ind = high_tree.query(tp, k=10)
_, low_ind = low_tree.query(low_train, k=10)
# Size of the overlap between the two neighbour lists, per edge.
border_pres = np.array(
    [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))],
    dtype=np.float64,
)
(border_pres.mean(), border_pres.max(), border_pres.min())

(2.044811320754717, 10.0, 0.0)

In [20]:
# N=20
_, high_ind = high_tree.query(tp, k=20)
_, low_ind = low_tree.query(low_train, k=20)
# Shared-neighbour count per edge, then its mean / max / min.
border_pres = np.fromiter(
    (len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))),
    dtype=np.float64,
    count=len(tp),
)
(border_pres.mean(), border_pres.max(), border_pres.min())

(5.7338534107402035, 20.0, 0.0)

In [21]:
# N=30
_, high_ind = high_tree.query(tp, k=30)
_, low_ind = low_tree.query(low_train, k=30)
# Overlap between the high- and low-dimensional neighbour lists of each edge.
overlap_sizes = [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))]
border_pres = np.array(overlap_sizes, dtype=np.float64)
(border_pres.mean(), border_pres.max(), border_pres.min())

(9.789550072568941, 30.0, 0.0)

In [22]:
# N=50
_, high_ind = high_tree.query(tp, k=50)
_, low_ind = low_tree.query(low_train, k=50)
# Number of neighbour ids common to both spaces, one value per edge.
border_pres = np.array(
    [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))],
    dtype=np.float64,
)
(border_pres.mean(), border_pres.max(), border_pres.min())

(19.642960812772134, 50.0, 0.0)

In [22]:
_, high_ind = high_tree.query(tp, k=100) 
_, low_ind = low_tree.query(low_train, k=100) 
# N=100 (both queries above use k=100)
# NOTE(review): the output recorded under this cell is identical to the k=50
# cell's, so it looks stale — re-run to confirm.
border_pres = np.zeros(len(tp))
for i in range(len(tp)):
    # size of the overlap between the two neighbour lists of edge i
    border_pres[i] = len(np.intersect1d(high_ind[i],low_ind[i]))
border_pres.mean(),border_pres.max(),border_pres.min()

(19.642960812772134, 50.0, 0.0)

In [19]:
# core tptp tpdp
# N=10
_, high_ind = high_tree.query(tp, k=10)
_, low_ind = low_tree.query(low_train, k=10)
# Per-edge count of neighbour ids found by both queries.
border_pres = np.fromiter(
    (len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))),
    dtype=np.float64,
    count=len(tp),
)
(border_pres.mean(), border_pres.max(), border_pres.min())

(1.327348567283279, 10.0, 0.0)

In [20]:
_, high_ind = high_tree.query(tp, k=20)
_, low_ind = low_tree.query(low_train, k=20)
# N=20
# Shared-neighbour count per edge. The stray "64" that followed the for-colon
# has been removed; with it the indented loop body was a syntax error.
border_pres = np.zeros(len(tp))
for i in range(len(tp)):
    border_pres[i] = len(np.intersect1d(high_ind[i], low_ind[i]))
border_pres.mean(), border_pres.max(), border_pres.min()

(3.6824446862531737, 19.0, 0.0)

In [21]:
# N=30
_, high_ind = high_tree.query(tp, k=30)
_, low_ind = low_tree.query(low_train, k=30)
# How many of the 30 neighbour ids agree between the two spaces, per edge.
overlap_sizes = [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))]
border_pres = np.array(overlap_sizes, dtype=np.float64)
(border_pres.mean(), border_pres.max(), border_pres.min())

(6.708741385564019, 30.0, 0.0)

In [22]:
# N=50
_, high_ind = high_tree.query(tp, k=50)
_, low_ind = low_tree.query(low_train, k=50)
# Overlap size between the two 50-neighbour lists of every edge.
border_pres = np.array(
    [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))],
    dtype=np.float64,
)
(border_pres.mean(), border_pres.max(), border_pres.min())

(19.642960812772134, 50.0, 0.0)

In [22]:
# N=100
_, high_ind = high_tree.query(tp, k=100)
_, low_ind = low_tree.query(low_train, k=100)
# Per-edge number of neighbour ids present in both result lists.
border_pres = np.fromiter(
    (len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))),
    dtype=np.float64,
    count=len(tp),
)
(border_pres.mean(), border_pres.max(), border_pres.min())

(32.050961189698945, 86.0, 0.0)

In [51]:
# N=100
# Recompute the per-edge overlap from the current high_ind / low_ind and show
# its histogram together with the mean.
border_pres = np.array(
    [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))],
    dtype=np.float64,
)
(np.bincount(border_pres.astype("int64")), border_pres.mean())

(array([ 94,   8, 110,  35,  80,  29,  54,   8,  46, 147,  91,  66,  75,
         68, 135,  54,  84, 142, 102,  95, 110, 133,  38, 132, 185, 184,
        119, 122, 163,  75,  79,  72,  72,  69,  32,  28,  54,  40, 162,
         75,  83, 159, 108,  98,  81,  31,  53, 177, 113,  21,  88,  23,
         57,  68,  20,   6,  44,  48,  51, 102,  48,  74,  76,   6,  34,
         78,  85,   1,  25,  30,  39,  21,  13,   0,  12,  14,   2,   0,
          1,   0,  23,   0,   0,   0,   0,  12,   2,  15], dtype=int64),
 32.7645670720639)

In [66]:
import os
# Saved-model directory for this experiment. os.path.join picks the right
# separator on every platform; the previous literal hard-coded a Windows
# backslash and depended on "\d" not being a recognised escape sequence.
save_location = os.path.join("parametric_umap_models", "dbp_exp")
# load model
# load encoder
encoder_output = os.path.join(save_location, "encoder_core")
# Fail loudly when the model is missing: silently skipping the load would keep
# `load_encoder` bound to whichever encoder was loaded before.
if not os.path.exists(encoder_output):
    raise FileNotFoundError("Keras encoder model not found at {}".format(encoder_output))
load_encoder = tf.keras.models.load_model(encoder_output)
print("Keras encoder model loaded from {}".format(encoder_output))

# # load decoder
# decoder_output = os.path.join(save_location, "decoder_core")
# if os.path.exists(decoder_output):
#     load_decoder = tf.keras.models.load_model(decoder_output)
#     print("Keras decoder model loaded from {}".format(decoder_output))

Keras encoder model loaded from parametric_umap_models\dbp_exp\encoder_core


In [15]:
# Encode the border-centre rows and the training rows with the current encoder.
# .numpy() is called directly; the deprecated .cpu() hop is not needed to get a
# NumPy array. Assumes load_encoder returns an eager TensorFlow tensor.
low_center = load_encoder(border_center).numpy()
low_train = load_encoder(train_data).numpy()

In [16]:
## nndescent?
from sklearn.neighbors import KDTree
# KD-trees over the border-centre rows: original space first, encoded space second.
high_tree = KDTree(border_center) 
low_tree = KDTree(low_center)

In [17]:
# N nearest border-centre rows of every training row, in both spaces.
# NOTE(review): this cell does not assign N; it must already exist in the kernel.
_, high_ind = high_tree.query(train_data, k=N) 
_, low_ind = low_tree.query(low_train, k=N) 

In [18]:
# Shared-neighbour count for every training row.
border_pres = np.fromiter(
    (len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(train_data))),
    dtype=np.float64,
    count=len(train_data),
)

In [19]:
# Histogram of the overlap counts: entry j is the number of rows sharing exactly j neighbours.
np.bincount(border_pres.astype("int64"))

array([22767, 10797,  6282,  5752,  2907,  1162,   306,    15,    12],
      dtype=int64)

In [67]:
# High-dimensional endpoints of every selected edge: row i of `tp` is the head
# of edge i and row i of `dbp` is its tail. The index arrays are no longer
# stored under the same names as the data they select.
tp = fitting_data[head[edges]]
dbp = fitting_data[tail[edges]]
# .numpy() is called directly; the deprecated .cpu() hop is not needed.
low_center = load_encoder(dbp).numpy()
low_train = load_encoder(tp).numpy()
## nndescent?
from sklearn.neighbors import KDTree
high_tree = KDTree(dbp)
low_tree = KDTree(low_center)
# NOTE(review): this cell does not assign N; it must already exist in the kernel.
_, high_ind = high_tree.query(tp, k=N)
_, low_ind = low_tree.query(low_train, k=N)

In [68]:
# N=10
# Per-edge overlap between the two neighbour lists, shown as a histogram plus mean.
overlap_sizes = [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))]
border_pres = np.array(overlap_sizes, dtype=np.float64)
(np.bincount(border_pres.astype("int64")), border_pres.mean())

(array([3612,  424,  395,  289,  231,   72,  180,  118,   99,   65,   24],
       dtype=int64),
 1.250317662007624)

In [54]:
# N=100
# Same summary as the N=10 cell, computed from whatever high_ind / low_ind
# currently hold.
border_pres = np.fromiter(
    (len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))),
    dtype=np.float64,
    count=len(tp),
)
(np.bincount(border_pres.astype("int64")), border_pres.mean())

(array([ 75,  55,  31,  62,  55, 103, 108,  64, 139,  86,  72, 138,  60,
         69, 129,  34, 146,  81, 120,  89,  78, 157, 118, 158,  95,  36,
        150, 123, 161,  79, 116, 106, 107,  77,  46,  72, 129, 134,  52,
        113, 167,  70, 114,  41,  66,  71,  77,  69,  99,  73,  79,  60,
         61,  41,  61,  45,  44,  48,  60,  13,  75,  10,  13,  37,  28,
         23,  37,   1,  19,  12,   3,  10,   0,   0,  30,   0,   0,   2,
          6,   0,  21], dtype=int64),
 30.011254311127246)

In [77]:
import os
# Saved-model directory for this experiment. os.path.join picks the right
# separator on every platform; the previous literal hard-coded a Windows
# backslash and depended on "\d" not being a recognised escape sequence.
save_location = os.path.join("parametric_umap_models", "dbp_exp")
# load model
# load encoder
encoder_output = os.path.join(save_location, "encoder_dbp")
# Fail loudly when the model is missing: silently skipping the load would keep
# `load_encoder` bound to whichever encoder was loaded before.
if not os.path.exists(encoder_output):
    raise FileNotFoundError("Keras encoder model not found at {}".format(encoder_output))
load_encoder = tf.keras.models.load_model(encoder_output)
print("Keras encoder model loaded from {}".format(encoder_output))

# # load decoder
# decoder_output = os.path.join(save_location, "decoder_dbp")
# if os.path.exists(decoder_output):
#     load_decoder = tf.keras.models.load_model(decoder_output)
#     print("Keras decoder model loaded from {}".format(decoder_output))

Keras encoder model loaded from parametric_umap_models\dbp_exp\encoder_dbp


In [21]:
# Encode the border-centre rows and the training rows with the current encoder.
# .numpy() is called directly; the deprecated .cpu() hop is not needed to get a
# NumPy array. Assumes load_encoder returns an eager TensorFlow tensor.
low_center = load_encoder(border_center).numpy()
low_train = load_encoder(train_data).numpy()

In [22]:
## nndescent?
from sklearn.neighbors import KDTree
# KD-trees over the border-centre rows: original space first, encoded space second.
high_tree = KDTree(border_center) 
low_tree = KDTree(low_center)

In [23]:
# N nearest border-centre rows of every training row, in both spaces.
# NOTE(review): this cell does not assign N; it must already exist in the kernel.
_, high_ind = high_tree.query(train_data, k=N) 
_, low_ind = low_tree.query(low_train, k=N) 

In [24]:
# Shared-neighbour count for every training row.
overlap_sizes = [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(train_data))]
border_pres = np.array(overlap_sizes, dtype=np.float64)

In [25]:
# Histogram of the overlap counts: entry j is the number of rows sharing exactly j neighbours.
np.bincount(border_pres.astype("int64"))

array([33452,  9499,  5792,   465,   302,   184,   117,    94,    66,
          28,     1], dtype=int64)

In [78]:
# High-dimensional endpoints of every selected edge: row i of `tp` is the head
# of edge i and row i of `dbp` is its tail. The index arrays are no longer
# stored under the same names as the data they select.
tp = fitting_data[head[edges]]
dbp = fitting_data[tail[edges]]
# .numpy() is called directly; the deprecated .cpu() hop is not needed.
low_center = load_encoder(dbp).numpy()
low_train = load_encoder(tp).numpy()
## nndescent?
from sklearn.neighbors import KDTree
high_tree = KDTree(dbp)
low_tree = KDTree(low_center)
# NOTE(review): this cell does not assign N; it must already exist in the kernel.
_, high_ind = high_tree.query(tp, k=N)
_, low_ind = low_tree.query(low_train, k=N)

In [79]:
# N=10
# Per-edge overlap between the two neighbour lists, shown as a histogram plus mean.
border_pres = np.array(
    [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))],
    dtype=np.float64,
)
(np.bincount(border_pres.astype("int64")), border_pres.mean())

(array([2925,  455,  521,  313,  313,  228,  236,  146,   65,  227,   80],
       dtype=int64),
 1.92938827373389)

In [57]:
# N=100
# Same summary, computed from whatever high_ind / low_ind currently hold.
overlap_sizes = [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))]
border_pres = np.array(overlap_sizes, dtype=np.float64)
(np.bincount(border_pres.astype("int64")), border_pres.mean())

(array([ 88,  27,  39,  63,  60,  90,  67,  63,  19,  86,  42, 121,  56,
         48,  14,  47,  17,  61,  84,  88,  75,  87,   9,  62, 108,  55,
        118,  34,  61,  82,  59,  82, 133, 110,  15,  71,  82,  25,  87,
         68,  27, 110,  83,  74,  76, 108,  28,  49,  42,   5,  16,  73,
         37,  88,  74,  57,  23,  36,  46,  75,  64,  96,  72,  36,  90,
         41, 116, 108,  73,  57,  77,  98,  39,  71,  68,  61,  41,  84,
         28,  44,  29,  47,  27,  60,  46,  14,  30,  13,  33,  20,  14,
         19,  17,   0,   0,  17,   0,  21,   5,   3], dtype=int64),
 43.164821201669994)

In [73]:
import os
# Saved-model directory for this experiment. os.path.join picks the right
# separator on every platform; the previous literal hard-coded a Windows
# backslash and depended on "\d" not being a recognised escape sequence.
save_location = os.path.join("parametric_umap_models", "dbp_exp")
# load model
# load encoder
encoder_output = os.path.join(save_location, "encoder_withouttp")
# Fail loudly when the model is missing: silently skipping the load would keep
# `load_encoder` bound to whichever encoder was loaded before.
if not os.path.exists(encoder_output):
    raise FileNotFoundError("Keras encoder model not found at {}".format(encoder_output))
load_encoder = tf.keras.models.load_model(encoder_output)
print("Keras encoder model loaded from {}".format(encoder_output))

# # load decoder
# decoder_output = os.path.join(save_location, "decoder_dbp")
# if os.path.exists(decoder_output):
#     load_decoder = tf.keras.models.load_model(decoder_output)
#     print("Keras decoder model loaded from {}".format(decoder_output))

Keras encoder model loaded from parametric_umap_models\dbp_exp\encoder_withouttp


In [30]:
# Encode the border-centre rows and the training rows with the current encoder.
# .numpy() is called directly; the deprecated .cpu() hop is not needed to get a
# NumPy array. Assumes load_encoder returns an eager TensorFlow tensor.
low_center = load_encoder(border_center).numpy()
low_train = load_encoder(train_data).numpy()

In [31]:
## nndescent?
from sklearn.neighbors import KDTree
# KD-trees over the border-centre rows: original space first, encoded space second.
high_tree = KDTree(border_center) 
low_tree = KDTree(low_center)

In [32]:
# N nearest border-centre rows of every training row, in both spaces.
# NOTE(review): this cell does not assign N; it must already exist in the kernel.
_, high_ind = high_tree.query(train_data, k=N) 
_, low_ind = low_tree.query(low_train, k=N) 

In [33]:
# Shared-neighbour count for every training row.
border_pres = np.array(
    [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(train_data))],
    dtype=np.float64,
)

In [34]:
# Histogram of the overlap counts: entry j is the number of rows sharing exactly j neighbours.
np.bincount(border_pres.astype("int64"))

array([42618,  6516,   237,   148,   140,   118,    89,    93,    39,
           2], dtype=int64)

In [37]:
# Number of rows whose overlap count is zero (total rows minus rows with a non-zero count).
len(border_pres)-np.count_nonzero(border_pres)

42618

In [74]:
# High-dimensional endpoints of every selected edge: row i of `tp` is the head
# of edge i and row i of `dbp` is its tail. The index arrays are no longer
# stored under the same names as the data they select.
tp = fitting_data[head[edges]]
dbp = fitting_data[tail[edges]]
# .numpy() is called directly; the deprecated .cpu() hop is not needed.
low_center = load_encoder(dbp).numpy()
low_train = load_encoder(tp).numpy()
## nndescent?
from sklearn.neighbors import KDTree
high_tree = KDTree(dbp)
low_tree = KDTree(low_center)
# NOTE(review): this cell does not assign N; it must already exist in the kernel.
_, high_ind = high_tree.query(tp, k=N)
_, low_ind = low_tree.query(low_train, k=N)

In [75]:
# N=10
# Per-edge overlap between the two neighbour lists, shown as a histogram plus mean.
border_pres = np.fromiter(
    (len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))),
    dtype=np.float64,
    count=len(tp),
)
(np.bincount(border_pres.astype("int64")), border_pres.mean())

(array([2385,  368,  479,  450,  433,  339,  403,  158,  249,  181,   64],
       dtype=int64),
 2.5209656925031765)

In [61]:
# N=100
# Same summary, computed from whatever high_ind / low_ind currently hold.
border_pres = np.array(
    [len(np.intersect1d(high_ind[row], low_ind[row])) for row in range(len(tp))],
    dtype=np.float64,
)
(np.bincount(border_pres.astype("int64")), border_pres.mean())

(array([ 68,  82,  57,  63,  16,  36,  47,   1,  25,  21,  42,  23,  32,
         24,  34,   8,   7,  32,  58,  11,  50,  18,  80,  44,   9,  41,
         16,  31,  32,  32,  54,  40,  72,  81,  64,  37,  51,  46,  43,
         46,  64,  63,  65,  66,  44,  51,  44,  78,  54,  29,   9,  78,
        108,  15,  55,  60,  24,  35,  71,  55,  63, 126, 148,  82, 129,
        117,  42, 108, 106,  59, 110,  78,  73,  27, 101, 100,  53,  86,
         70,  70,  74,  86,  95,  77, 107,  93,  63, 176,  71, 143,  58,
         24,  22,  30], dtype=int64),
 54.51025594481757)

# create the edge dataset

In [111]:
# Shuffled copy of the edge ids. In the cells below, pairing head[edges[i]] with
# tail[neg[i]] gives a mismatched pair to compare against the real edge.
# A fixed seed makes the shuffle, and the distance statistics derived from it,
# reproducible across kernel restarts.
rng = np.random.default_rng(0)
neg = rng.permutation(edges)

(5.379653172588765, 2.761048049037231)

In [124]:
# tp-tp
# Encode the whole fitting set with the current encoder. .numpy() is called
# directly; the deprecated .cpu() hop is not needed.
# NOTE(review): no visible cell reloads load_encoder right before this one —
# confirm that the encoder named in the label above is the one loaded.
low_fitting_data = load_encoder(fitting_data).numpy()

In [125]:
# Embedded distance between the two endpoints of every selected edge, followed
# by the mean and standard deviation of those distances.
dists = np.array(
    [
        np.linalg.norm(low_fitting_data[head[edges[pos]]] - low_fitting_data[tail[edges[pos]]])
        for pos in range(len(edges))
    ],
    dtype=np.float64,
)
(dists.mean(), np.std(dists, axis=0))

(2.6621539374880987, 2.76190855788689)

In [126]:
# Same statistic for mismatched pairs: the head of edge i against the tail of
# the shuffled edge neg[i].
dists = np.array(
    [
        np.linalg.norm(low_fitting_data[head[edges[pos]]] - low_fitting_data[tail[neg[pos]]])
        for pos in range(len(edges))
    ],
    dtype=np.float64,
)
(dists.mean(), np.std(dists, axis=0))

(11.881431530302024, 8.13159089576581)

In [128]:
# tp-tp core
# Encode the whole fitting set with the current encoder. .numpy() is called
# directly; the deprecated .cpu() hop is not needed.
# NOTE(review): no visible cell reloads load_encoder right before this one —
# confirm that the encoder named in the label above is the one loaded.
low_fitting_data = load_encoder(fitting_data).numpy()

In [129]:
# Embedded length of every selected edge, then mean and standard deviation.
edge_lengths = [
    np.linalg.norm(low_fitting_data[head[edges[pos]]] - low_fitting_data[tail[edges[pos]]])
    for pos in range(len(edges))
]
dists = np.array(edge_lengths, dtype=np.float64)
(dists.mean(), np.std(dists, axis=0))

(3.240253884779291, 3.6968340137081337)

In [130]:
# Embedded distance for mismatched pairs (head of edge i, tail of shuffled edge
# neg[i]), then mean and standard deviation.
pair_lengths = [
    np.linalg.norm(low_fitting_data[head[edges[pos]]] - low_fitting_data[tail[neg[pos]]])
    for pos in range(len(edges))
]
dists = np.array(pair_lengths, dtype=np.float64)
(dists.mean(), np.std(dists, axis=0))

(15.441977167531428, 10.332535596078399)

In [132]:
# tp-dbp
# Encode the whole fitting set with the current encoder. .numpy() is called
# directly; the deprecated .cpu() hop is not needed.
# NOTE(review): no visible cell reloads load_encoder right before this one —
# confirm that the encoder named in the label above is the one loaded.
low_fitting_data = load_encoder(fitting_data).numpy()

In [133]:
# Embedded distance between the endpoints of each selected edge, summarised by
# mean and standard deviation.
dists = np.fromiter(
    (
        np.linalg.norm(low_fitting_data[head[edges[pos]]] - low_fitting_data[tail[edges[pos]]])
        for pos in range(len(edges))
    ),
    dtype=np.float64,
    count=len(edges),
)
(dists.mean(), np.std(dists, axis=0))

(0.6543270221647088, 0.41393987605949595)

In [134]:
# Embedded distance for mismatched pairs (head of edge i, tail of shuffled edge
# neg[i]), summarised by mean and standard deviation.
dists = np.fromiter(
    (
        np.linalg.norm(low_fitting_data[head[edges[pos]]] - low_fitting_data[tail[neg[pos]]])
        for pos in range(len(edges))
    ),
    dtype=np.float64,
    count=len(edges),
)
(dists.mean(), np.std(dists, axis=0))

(4.838728493410867, 2.3704286107589785)

In [136]:
# tp-dbp+dbp-dbp
# Encode the whole fitting set with the current encoder. .numpy() is called
# directly; the deprecated .cpu() hop is not needed.
# NOTE(review): no visible cell reloads load_encoder right before this one —
# confirm that the encoder named in the label above is the one loaded.
low_fitting_data = load_encoder(fitting_data).numpy()

In [137]:
# Embedded length of every selected edge, then mean and standard deviation.
dists = np.array(
    [
        np.linalg.norm(low_fitting_data[head[edges[pos]]] - low_fitting_data[tail[edges[pos]]])
        for pos in range(len(edges))
    ],
    dtype=np.float64,
)
(dists.mean(), np.std(dists, axis=0))

(0.5080731251336978, 0.5900279612300334)

In [138]:
# Same statistic for mismatched pairs: head of edge i against the tail of the
# shuffled edge neg[i].
pair_lengths = [
    np.linalg.norm(low_fitting_data[head[edges[pos]]] - low_fitting_data[tail[neg[pos]]])
    for pos in range(len(edges))
]
dists = np.array(pair_lengths, dtype=np.float64)
(dists.mean(), np.std(dists, axis=0))

(5.379653172588765, 2.761048049037231)