In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# allFeat = np.load('/tf/notebooks/clusteringTree/stl_feat/feats_2048.npy')
# gtAll = np.load('/tf/notebooks/clusteringTree/stl_feat/labels_2048.npy')-1

allFeat = np.load('/tf/notebooks/STL-10/danielFeat.npy')
gtAll = np.load('/tf/notebooks/STL-10/danielGt.npy')



In [2]:
%load_ext autoreload
%autoreload

from sklearn.cluster import KMeans
from clusteringX2 import normIt
from clusteringX2 import clusterModel
from clusteringX2 import train_step3


def clustering(feat, numClass=60, thres=0.1):
    feat, m = normIt(feat)
    
    # initilization
    kmeans = KMeans(n_clusters=numClass, random_state=0).fit(feat)
    labelK = kmeans.predict(feat)
    means =np.zeros([numClass, feat.shape[1]])

    for i in range(numClass):
        mask = labelK == i
        means[i,:] = np.mean(feat[mask,:], axis =0)

    sigInit = np.zeros([numClass])
    for i in range(numClass):
        mask = np.logical_not(labelK == i)
        sigInit[i] = np.mean(np.matmul(feat[mask,:], np.transpose(means[i,:])))
    model = clusterModel(means.transpose(), sigInit)
    train_step3(model, feat.astype("float32"), iter =100)
    
    
    
    c = model(feat.astype("float32"))
    s = np.max(c, axis=1)
    t = np.mean(s)
    mask = s > t*0.5
    train_step3(model, feat[mask,:].astype("float32"), iter =100)
    
    
    c = model(feat.astype("float32"))
    s = np.max(c, axis=1)
    t = np.mean(s)
    mask = s > t*0.5
    train_step3(model, feat[mask,:].astype("float32"), iter =100)

 
    c = model(feat.astype("float32"))
    labels = np.argmax(c, axis =1)
    mask_ = np.zeros(numClass, dtype=bool)
        
    for i in range(numClass):
        if np.sum(labels==i)>0:
            mask_[i] = 1
    w = model.w.numpy()
    sig = model.sig.numpy()
    w = w[:,mask_]
    sig = sig[mask_]
            

    return w, sig, m, mask



In [3]:
class Node:
    def __init__(self, mask=None, w=None, sig=None, m=None):
        self.mask = mask
        self.w = w
        self.sig = sig
        self.m = m
            
class Layer:
    def __init__(self):
        self.nodeList = []
        
class Tree:
    def __init__(self):
        self.numLayers = 0
        self.listOfLayers = []
        self.numNodes = 0
        self.listOfNodes = []
        
    def addNode(self, c):
        self.listOfNodes.append(c)
        self.numNodes = self.numNodes + 1
        
    def addLayer(self, c):
        self.listOfLayers.append(c)
        self.numLayers = self.numLayers + 1

    
def startTree(allFeat, numClass=2):
    tree = Tree()    
    w, sig, m, inMask = clustering(allFeat, numClass)
    mask = np.ones(allFeat.shape[0], dtype=bool)
    #mask[np.logical_not(inMask)] = 0
    n = Node(mask, w, sig, m)

    tree.addNode(n)
    newLayer = Layer()
    newLayer.nodeList.append(tree.numNodes-1)
    tree.addLayer(newLayer)
    
    return tree

    
#mask = np.logical_or(gtAll==3,  gtAll==4)
mask = gtAll<10

feat = allFeat[mask,:]
gt = gtAll[mask]    
gtCoarseAll = np.logical_or.reduce( [gtAll == 0, gtAll == 2, gtAll == 8, gtAll == 9]).astype(int)   
gtCoarse = gtCoarseAll[mask]

tree = startTree(feat[::2])


In [4]:
def dataProjection(feat, node):
    x, _ = normIt(feat, node.m) 
    return np.matmul(x, node.w) - node.sig


    

def extendTree(tree, allFeat, numClass=2, clusteringThreshold=100):
    newLayer = Layer()
    lastLayer = tree.listOfLayers[-1]
    for nodeIndex in lastLayer.nodeList:
        node = tree.listOfNodes[nodeIndex]
        curFeat = allFeat[node.mask,:]
        score = dataProjection(curFeat, node)
        #plt.scatter(score[:,0], score[:,1], alpha=0.1)
        
        clusterInd = np.argmax(score, axis =1)
        curInd = np.where(node.mask)[0]

        for i in range(score.shape[1]):
            subMask = clusterInd == i
            if sum(subMask) > clusteringThreshold:
                
                newMask =  np.zeros(allFeat.shape[0], dtype=bool)
                newMask[curInd[subMask]] = 1
                

                w, sig, m, inMask = clustering(allFeat[newMask,:], numClass)
                #subInd = np.where(newMask)[0]
                #newMask[subInd[np.logical_not(inMask)]] = 0

                if w.shape[1]>=2:
                    n = Node(newMask, w, sig, m)
                    tree.addNode(n)
                    newLayer.nodeList.append(tree.numNodes-1)
    tree.addLayer(newLayer)


        


for k in range(100):
    extendTree(tree, feat[::2])



In [5]:
def projectTree2(feat, tree, maxClus=2, unit_vector=True, scale2data=True, verboise=True):
    
    projectionSpace = np.zeros([feat.shape[0], tree.numNodes*maxClus])
    
    cur = 0
    
    for i, n in enumerate(tree.listOfNodes):
        
        f = dataProjection(feat, n)
        numDim = f.shape[1]
        if unit_vector:
            f_ = np.concatenate([f, 10*np.ones([f.shape[0],1])], axis =1)
            f_ = f_/np.linalg.norm(f_, axis =1, keepdims=True)
            f = f_[:,:numDim]
        if scale2data:
            f = f*np.sum(n.mask)/n.mask.size 
        projectionSpace[:, cur:cur+numDim] = f
        cur = cur+numDim
        
        if verboise == True:
            print(i, n)
    projectionSpace = projectionSpace[:,:cur]
    return projectionSpace
        
    

In [6]:
print('Create Feature')

projectionSpace = projectTree2(feat, tree, unit_vector=True, scale2data=True)
pp = projectionSpace.copy()
pp_s, _ = normIt(pp)

print('num dimenions:', pp_s.shape[1])

Create Feature
0 <__main__.Node object at 0x7fa2ddac37b8>
1 <__main__.Node object at 0x7fa233551518>
2 <__main__.Node object at 0x7fa24071ba58>
3 <__main__.Node object at 0x7fa2303ddb70>
4 <__main__.Node object at 0x7fa2304c1cf8>
5 <__main__.Node object at 0x7fa233551710>
6 <__main__.Node object at 0x7fa239efcba8>
7 <__main__.Node object at 0x7fa239e73400>
8 <__main__.Node object at 0x7fa239cacf98>
9 <__main__.Node object at 0x7fa239d986a0>
10 <__main__.Node object at 0x7fa239d7fda0>
11 <__main__.Node object at 0x7fa239afba90>
12 <__main__.Node object at 0x7fa239ac95f8>
13 <__main__.Node object at 0x7fa239a1b898>
14 <__main__.Node object at 0x7fa239944e10>
15 <__main__.Node object at 0x7fa239872940>
16 <__main__.Node object at 0x7fa23996b470>
17 <__main__.Node object at 0x7fa2397dfd30>
18 <__main__.Node object at 0x7fa2397b6cf8>
19 <__main__.Node object at 0x7fa23976f550>
20 <__main__.Node object at 0x7fa239713240>
21 <__main__.Node object at 0x7fa23960d898>
22 <__main__.Node object at

In [7]:
projectionSpace = projectTree2(feat, tree, unit_vector=True, scale2data=True)
pp = projectionSpace.copy()
pp_s, _ = normIt(pp)

print('num dimenions:', pp_s.shape[1])

0 <__main__.Node object at 0x7fa2ddac37b8>
1 <__main__.Node object at 0x7fa233551518>
2 <__main__.Node object at 0x7fa24071ba58>
3 <__main__.Node object at 0x7fa2303ddb70>
4 <__main__.Node object at 0x7fa2304c1cf8>
5 <__main__.Node object at 0x7fa233551710>
6 <__main__.Node object at 0x7fa239efcba8>
7 <__main__.Node object at 0x7fa239e73400>
8 <__main__.Node object at 0x7fa239cacf98>
9 <__main__.Node object at 0x7fa239d986a0>
10 <__main__.Node object at 0x7fa239d7fda0>
11 <__main__.Node object at 0x7fa239afba90>
12 <__main__.Node object at 0x7fa239ac95f8>
13 <__main__.Node object at 0x7fa239a1b898>
14 <__main__.Node object at 0x7fa239944e10>
15 <__main__.Node object at 0x7fa239872940>
16 <__main__.Node object at 0x7fa23996b470>
17 <__main__.Node object at 0x7fa2397dfd30>
18 <__main__.Node object at 0x7fa2397b6cf8>
19 <__main__.Node object at 0x7fa23976f550>
20 <__main__.Node object at 0x7fa239713240>
21 <__main__.Node object at 0x7fa23960d898>
22 <__main__.Node object at 0x7fa2395a6860

In [8]:
tree2 = startTree(pp_s[::2])


for k in range(100):
    extendTree(tree2, pp_s[::2])


In [9]:
projectionSpace2 = projectTree2(pp_s, tree2, unit_vector=True, scale2data=True)
pp2 = projectionSpace2.copy()
pp_s2, _ = normIt(pp2)

print('num dimenions:', pp_s2.shape[1])

0 <__main__.Node object at 0x7fa252bf14a8>
1 <__main__.Node object at 0x7fa23814aba8>
2 <__main__.Node object at 0x7fa2380e37f0>
3 <__main__.Node object at 0x7fa237f5f898>
4 <__main__.Node object at 0x7fa237ef0128>
5 <__main__.Node object at 0x7fa237f77e80>
6 <__main__.Node object at 0x7fa237e98cc0>
7 <__main__.Node object at 0x7fa237dc7860>
8 <__main__.Node object at 0x7fa237cf0d68>
9 <__main__.Node object at 0x7fa237c1e940>
10 <__main__.Node object at 0x7fa237bccb38>
11 <__main__.Node object at 0x7fa237ae7f60>
12 <__main__.Node object at 0x7fa237a2ae10>
13 <__main__.Node object at 0x7fa2379d2e10>
14 <__main__.Node object at 0x7fa252bf1550>
15 <__main__.Node object at 0x7fa237890e80>
16 <__main__.Node object at 0x7fa23775f080>
17 <__main__.Node object at 0x7fa237706c50>
18 <__main__.Node object at 0x7fa2376247f0>
19 <__main__.Node object at 0x7fa2375d3f28>
20 <__main__.Node object at 0x7fa237509f28>
21 <__main__.Node object at 0x7fa237436a90>
22 <__main__.Node object at 0x7fa237362f98

In [10]:
print('Create UMAP Feature')

! pip3 install umap.learn
import umap



f_norm, _ = normIt(feat)
embedding = umap.UMAP()
fUmap = embedding.fit_transform(f_norm)

Create UMAP Feature
Processing /root/.cache/pip/wheels/d0/f8/d5/8e3af3ee957feb9b403a060ebe72f7561887fef9dea658326e/umap_learn-0.3.10-cp36-none-any.whl
Installing collected packages: umap.learn
Successfully installed umap.learn
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../usr/local/lib/python3.6/dist-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../usr/local/lib/python3.6/dist-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
The k

## Label Spreading evaluation

In [40]:
import random
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
# setup
lModel = LabelSpreading(gamma=100)
gt_use = gt.copy()
#gt_use = gtCoarse.copy()
#########


samplePerClass = 1
numClass = int(max(gt_use)+1)
ind = []
for i in range(numClass):
    index = list(np.where(gt_use==i)[0])
    if len(index) == 0:
        continue
    index = random.sample(index, samplePerClass);
    ind = ind + (index)
#ind = random.sample(range(pp_s.shape[0]), numClass*samplePerClass);


In [41]:
print('Ours')


lModel.fit(pp_s[ind], gt_use[ind])
label = lModel.predict(pp_s)
score = np.max(lModel.predict_proba(pp_s), axis=1)


print('Precision:', np.sum(label==gt_use)/label.size)
print('map:', average_precision_score(label==gt_use, score))

Ours
Precision: 0.825
map: 0.9168310383725005


In [45]:
from sklearn.decomposition import PCA

print('PCA')

f_norm, _ = normIt(feat)

pca = PCA(n_components=pp.shape[1])
f_pca = pca.fit_transform(f_norm)



lModel.fit(f_pca[ind], gt_use[ind])
label = lModel.predict(f_pca)
score = np.max(lModel.predict_proba(f_pca), axis=1)

print('Precision:', np.sum(label==gt_use)/label.size)
print('map:', average_precision_score(label==gt_use, score))

PCA
Precision: 0.4896153846153846


  probabilities /= normalizer
  probabilities /= normalizer


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [43]:
print('Naive')



lModel.fit(f_norm[ind], gt_use[ind])
label = lModel.predict(f_norm)
score = np.max(lModel.predict_proba(f_norm), axis=1)


print('Precision:', np.sum(label==gt_use)/label.size)
print('map:', average_precision_score(label==gt_use, score))

Naive


  probabilities /= normalizer
  probabilities /= normalizer


Precision: 0.13723076923076924


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
print('UMAP')

lModel.fit(fUmap[ind], gt_use[ind])
label = lModel.predict(fUmap)
score = np.max(lModel.predict_proba(fUmap), axis=1)

print('Precision:', np.sum(label==gt_use)/label.size)
print('map:', average_precision_score(label==gt_use, score))

## Test Feature Goodness

In [50]:
from sklearn.metrics import pairwise_distances

def test_feat(feat, labels, nBest= 1000):
    num_feat = feat.shape[0]
    d = pairwise_distances(feat)
    ind = np.argsort(d, axis =1)
    
    err = np.zeros(num_feat)
    
    for i in range(num_feat):
        l = labels[ind[i,:nBest+1]]      
        err[i] = nBest - (sum((l-l[0]) == 0))+1
                          
    return err



nBest = 10

err = test_feat(pp_s2[1::2], gt[1::2], nBest=nBest)
print('average error of  %f from %d nearest neighbors'  %(np.mean(err), nBest))

print('num dimenions:', pp_s2.shape[1])



average error of  0.675846 from 10 nearest neighbors
num dimenions: 180


In [51]:



err = test_feat(pp_s[1::2], gtAll[1::2], nBest=nBest)
print('average error of  %f from %d nearest neighbors'  %(np.mean(err), nBest))

print('num dimenions:', pp.shape[1])



average error of  0.663846 from 10 nearest neighbors
num dimenions: 182


In [52]:

f_norm, m_ = normIt(feat[::2])

pca = PCA(n_components=pp.shape[1])
f_pca = pca.fit_transform(f_norm)

err = test_feat(f_pca[1::2], gt[1::2], nBest=nBest)
print('average error of  %f from %d nearest neighbors'  %(np.mean(err), nBest))

print('num dimenions:', pp.shape[1])




average error of  0.713538 from 10 nearest neighbors
num dimenions: 182


In [None]:
nBest = 1

af, _ = normIt(allFeat, m_)
f_pca = pca.transform(af)

err = test_feat(f_pca[1::2], gtAll[1::2], nBest=nBest)
print('average error of  %f from %d nearest neighbors'  %(np.mean(err), nBest))

print('num dimenions:', pp.shape[1])
