In [1]:
from scipy.spatial.distance import cdist
from models.INode import INode
from utils.dendrogram_purity import dendrogram_purity,expected_dendrogram_purity
from utils.deltasep_utils import create_dataset
from utils.add_nne import addNNE,aNNE_similarity
from utils.Graphviz import Graphviz


In [2]:
import numpy as np
from graphviz import Source

In [3]:
def add_nne_data(dataset,n,psi,t):
    """Append an aNNE (ik) value to each of the first ``n`` points.

    The vectors (element 0) of the first ``n`` points are compared pairwise
    with Euclidean distance, and the resulting distance matrix is handed to
    ``aNNE_similarity``.  Row ``i`` of the matrix it returns is appended to
    point ``i``.  The points are modified in place.

    Args:
        dataset - a list of points; each point is a list whose first
            element is the feature vector.
        n - the number of leading points used to build the aNNE matrix.
        psi - parameter of ik, forwarded to aNNE_similarity.
        t - parameter of ik, forwarded to aNNE_similarity.

    Returns:
        A tuple (oneHot, subIndexSet, dataset): the first two values are
        returned by aNNE_similarity unchanged, and dataset is the same list
        object that was passed in, with its first n points extended.
    """
    sample = dataset[:n]
    vectors = [point[0] for point in sample]

    pairwise = cdist(vectors, vectors, 'euclidean')
    oneHot, subIndexSet, aNNEMetrix = aNNE_similarity(pairwise, psi, t)

    # Index explicitly so a too-short matrix still raises, as it always has.
    for row in range(len(sample)):
        sample[row].append(aNNEMetrix[row])

    return oneHot, subIndexSet, dataset
  



In [18]:
def create_trees_w_purity_check(n,psi,t,dataset,max_points=100,exact_dist_thres=10):
    """Build one tree online over the leading points of ``dataset``.

    The first ``n`` points receive their ik value from ``add_nne_data``.
    Every later point that still has exactly three fields gets an ik value
    computed by ``addNNE`` against those first ``n`` vectors.  The points
    are then inserted one by one into a fresh ``INode`` tree.

    Despite the historical name, a single tree is built and no dendrogram
    purity check is performed here.

    The caller's points are not modified: the function works on shallow
    copies, so calling it again on the same ``dataset`` (e.g. re-running a
    notebook cell) starts from the same three-field points every time.

    Args:
        n - the number of leading points used to build the aNNE matrix.
        psi - parameter of ik, forwarded to add_nne_data.
        t - parameter of ik, forwarded to add_nne_data.
        dataset - a list of points with which to build the tree; each point
            is a list whose first element is the feature vector.
        max_points - how many leading points to insert (default 100, the
            previously hard-coded value); None inserts every point.
        exact_dist_thres - value passed to INode(exact_dist_thres=...)
            (default 10, the previously hard-coded value).

    Returns:
        The root of the tree after the last insertion.
    """
    # Shallow-copy each point so the appends below never leak into the
    # caller's data; the vectors themselves are shared, not copied.
    points = [list(pt) for pt in dataset]

    oneHot, subIndexSet, data = add_nne_data(points, n, psi, t)
    # Vectors of the first n points, used by addNNE as the reference set.
    met = [pt[0] for pt in data[:n]]

    root = INode(exact_dist_thres=exact_dist_thres)

    # data[:None] is the whole list, so max_points=None means "no cap".
    for pt in data[:max_points]:
        # Three fields means add_nne_data has not attached an ik value yet.
        if len(pt) == 3:
            pt.append(addNNE(met, pt[0], oneHot, subIndexSet))
        root = root.insert(pt, collapsibles=None, L=float('inf'))
    return root

In [5]:
# Settings handed to create_dataset below.
# NOTE(review): `size` looks like a per-cluster point count - confirm
# against utils.deltasep_utils.create_dataset.
dimensions = [5]
size, num_clus = 50, 3

# `dataset` is rebound on every pass, so only the dataset for the last
# entry of `dimensions` is kept.
for dim in dimensions:
    print("TESTING DIMENSIONS == %d" % dim)
    dataset = create_dataset(dim, size, num_clusters=num_clus)

TESTING DIMENSIONS == 5


In [13]:
# Sanity check: number of points currently bound to `dataset`.
len(dataset)

4601

In [6]:
def load_data(filename):
    """Lazily read points from a tab-separated file.

    Each non-blank line is expected to hold a point id, a label and then
    the feature values, separated by tabs.  Blank lines (for example a
    trailing empty line at the end of the file) are skipped instead of
    raising IndexError.

    Args:
        filename - path of the tab-separated file to read.

    Yields:
        One list per point, [vec, l, pid]: vec is a numpy array of the
        feature values as floats, l is the label string and pid is the
        point id string.

    Raises:
        IndexError - if a non-blank line has fewer than two fields.
        ValueError - if a feature value cannot be converted to float.
    """
    with open(filename, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on an empty or whitespace-only line.
                continue
            splits = stripped.split('\t')
            pid, l = splits[0], splits[1]
            vec = np.array([float(x) for x in splits[2:]])
            yield [vec, l, pid]

In [12]:
# Materialise the load_data generator into a list so the points can be
# indexed, sliced and shuffled; this rebinds `dataset`.
dataset = list(load_data("data/spambase.tsv"))

In [14]:
# Arguments for create_trees_w_purity_check / add_nne_data.
# n: number of leading points used to build the aNNE matrix.
n=50
# psi, t: parameters of ik, forwarded to aNNE_similarity.
psi=10
t = 200

In [15]:
# Shuffle in place with a locally seeded generator so the insertion order
# (and with it the first n points fed to add_nne_data) is the same on every
# run, without touching NumPy's global random state.
SHUFFLE_SEED = 42
np.random.default_rng(SHUFFLE_SEED).shuffle(dataset)

In [16]:
# Build the tree over the current `dataset` using the n / psi / t settings.
root = create_trees_w_purity_check(n,psi,t,dataset)

In [11]:
# Sanity check: number of points currently bound to `dataset`.
len(dataset)

150

In [8]:
import time, datetime

In [9]:
# Build the tree over the current `dataset`. The commented-out lines time
# the build with the `time` module; uncomment all three to use them.
#sts = time.time()
root = create_trees_w_purity_check(n,psi,t,dataset)
#ets = time.time()
#print(ets-sts)

In [17]:
# Report the expected dendrogram purity of the tree rooted at `root`.
print(expected_dendrogram_purity(root))

0.5854285714285716


In [None]:


# NOTE(review): `dataset` is reloaded here but not used by the rendering
# below - confirm whether this reset is intentional.
dataset = list(load_data("data/spambase.tsv"))

# Render the tree rooted at `root` to a PNG via Graphviz.
gv = Graphviz()
tree = gv.graphviz_tree(root)
src = Source(tree)
# Pass the output folder through `directory` instead of gluing it on with a
# hard-coded backslash, which is only a path separator on Windows.
src.render('tree' + str(1) + '.gv', directory='treeResult', view=True, format='png')