### Tests on Top1

In [1]:
# standard imports
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy.spatial.distance import cdist,pdist,squareform
from sklearn import datasets, metrics
from sklearn.cluster import DBSCAN
from sklearn.neighbors import kneighbors_graph
from scipy.stats import pearsonr
from sklearn.manifold import MDS
%matplotlib inline

In [2]:
#my code
#import custom functions and classes
import myclusters2
import mymetrics2
import DBCV

In [3]:
%%bash
# Print every line of mymetrics2.py that mentions "hexdump".
awk '/hexdump/' mymetrics2.py

#       hexdump -v -e '376/4 "%06f "' -e '"\n"' rmsdm_hex.dat > rmsdm.dat


### Load data and calculate size-independent $\rho$

In [7]:
#data dir
% cd /scratch/cherry_picking/top1_data
#%cd /home/gmancini/Work/top1_data

/scratch/cherry_picking/top1_data


In [8]:
# Raw RMSD matrix, read as plain text from the data directory; show its shape.
RawDist = np.loadtxt(fname="2drms-raw.dat")
np.shape(RawDist)

(3599, 3599)

In [9]:
# RMSD matrix of the filtered data (called "PCA/filt" further down); show its shape.
FDist = np.loadtxt(fname="2drms-fil.dat")
np.shape(FDist)

(3599, 3599)

In [10]:
# Second column of rgyr_raw.dat (presumably the radius of gyration of each
# structure -- confirm against the file), combined with the raw RMSD matrix
# into the size-independent rho matrix.
rgyr_raw = np.loadtxt(fname="rgyr_raw.dat")[:, 1]
rho_raw = myclusters2.size_indep_rho(RawDist, rgyr_raw)
np.shape(rho_raw)

(3599, 3599)

In [11]:
# Same construction as for the raw data, using the second column of rgyr_f.dat
# and the filtered RMSD matrix.
rgyr_f = np.loadtxt(fname="rgyr_f.dat")[:, 1]
rho_f = myclusters2.size_indep_rho(FDist, rgyr_f)
np.shape(rho_f)

(3599, 3599)

### Settings

Settings for the percentage of points (used to derive minPTS), the noise handling, and the distance metric.

In [12]:
# Parameters shared by every DBSCAN run below.
spacedef = "precomputed"   # metric passed to scikit-learn: inputs are distance matrices
usenoise = "ignore"        # noise option forwarded to mymetrics2.cluster_eval
ndata = len(RawDist)
# minPTS candidates: 0.5 % to 3 % of the data set in 0.5 % steps, truncated to int.
percent = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0]) / 100.0
minPTS = (ndata * percent).astype(int)
print(minPTS)
# NF argument of DBCV.DBCV.  NOTE(review): the origin of 565 is not shown in this notebook.
nfeatures = 2 * 565

[ 17  35  53  71  89 107]


### DBSCAN: raw

In [13]:
# DBSCAN on the raw RMSD matrix.  For every minPTS candidate: derive eps from the
# nearest-neighbour distances, cluster, relabel the clusters through
# mymetrics2.assign_centroid_label and print the validation indices, both plain
# and corrected for the fraction of outliers.
print("Using raw RMSD matrix")
for mp in minPTS:
    # Rows: Sil, DBI, Dunn, psF.  Columns: plain index, index corrected by `norm`.
    # Re-created on every pass, so only the table of the last pass survives the loop.
    VI = np.zeros((4,2))

    ################
    # eps = mean of the non-zero entries of the mp-nearest-neighbour distance graph.
    # kneighbors_graph returns a sparse matrix; densify it unconditionally so the
    # indexing below always works on a plain array.
    G = kneighbors_graph(RawDist,mp,mode='distance',metric=spacedef).toarray()
    epsilon = np.mean(G[np.nonzero(G)])
    print("\n############################\n")
    print("minPTS, epsilon ",mp,epsilon)
    ###############
    estimator = DBSCAN(eps=epsilon, min_samples=mp,metric=spacedef)
    init_clusters = estimator.fit_predict(RawDist)
    centroids = mymetrics2.core2centers(D=RawDist,clusters=init_clusters)
    Nclust = len(centroids)
    # Nothing to validate when a single label comes back.
    if Nclust==1: continue
    clusters  = mymetrics2.assign_centroid_label(init_clusters,centroids)
    # DBSCAN labels outliers with -1.
    nout = np.count_nonzero(init_clusters==-1)
    size = [np.count_nonzero(clusters==i) for i in centroids]
    Outcome = dict(zip(centroids,size))
    print("Clusters, outliers:",Nclust,Outcome,nout)
    for cent in centroids:
        points = clusters==cent
        # Square block of distances between the members of this cluster.
        intra = RawDist[points,:][:,points]
        # Mean and standard deviation of that block (the label says "Mean"
        # because np.mean is what is computed here).
        medd = np.mean(intra)
        st = np.std(intra)
        print("Mean distance in cluster ",cent," = ",medd,st)
    #################
    # NOTE(review): DBCV is scored on rho_raw although the clustering itself used
    # RawDist -- confirm this is intended.
    dens_score = DBCV.DBCV(clusters=clusters,metric=spacedef,NF=nfeatures,D=rho_raw)
    dbcv = dens_score.calc_score(meas='kernel')
    #----------------
    sil = metrics.silhouette_score(RawDist,clusters,metric=spacedef)
    myeval = mymetrics2.cluster_eval(metric=spacedef,clusters=clusters,D=RawDist)
    DBI = myeval(noise=usenoise,method="DBI")
    Dunn = myeval(noise=usenoise,method="Dunn",inter="allav",intra="allav")
    psf,wss = myeval(noise=usenoise,method="psF",centroid=True)
    # Fraction of the data set that was not flagged as outlier.
    norm = (ndata-nout)/ndata
    print("%8s %8s %8s %8s %8s" % ("Sil","DBI","Dunn","psF","DBCV"))
    print("%9.5f %9.5f %9.5f %9.5f %9.5f"%(sil,DBI,Dunn,psf,dbcv))
    # Second row: same indices corrected by `norm`; DBCV is printed unchanged.
    print("%9.5f %9.5f %9.5f %9.5f %9.5f"%(sil*norm,DBI/norm,Dunn*norm,psf*norm,dbcv))
    VI[0,0] = sil
    VI[0,1] = sil*norm
    VI[1,0] = DBI
    VI[1,1] = DBI/norm
    VI[2,0] = Dunn
    VI[2,1] = Dunn*norm
    VI[3,0] = psf
    VI[3,1] = psf*norm
    # Pearson correlation between the plain and the corrected column.
    correl = pearsonr(VI[:,0],VI[:,1])
    print("Correl: ",correl)
# Table of the last minPTS value only (all zeros if that pass was skipped).
print(VI)

Using raw RMSD matrix

############################

minPTS, epsilon  17 0.126150300018
Clusters, outliers: 13 {831: 32, 3075: 1407, 1860: 52, 389: 37, 1736: 207, 1161: 11, 1970: 21, 274: 29, 954: 140, 1947: 45, 1340: 147, -1: 1414, 1541: 57} 1414
Median distance in cluster  274  =  0.132874249703 0.0308240146615
Median distance in cluster  389  =  0.133424347699 0.0277201001197
Median distance in cluster  831  =  0.137301732422 0.0319057181788
Median distance in cluster  954  =  0.162021900102 0.0327697539081
Median distance in cluster  1161  =  0.119350991736 0.0415298887489
Median distance in cluster  1736  =  0.178500067353 0.0432194949033
Median distance in cluster  1340  =  0.156573419686 0.027275158378
Median distance in cluster  1541  =  0.144754931979 0.0302276939758
Median distance in cluster  1860  =  0.147096986686 0.0299178755458
Median distance in cluster  1947  =  0.140393407407 0.0280076440475
Median distance in cluster  1970  =  0.125504848073 0.0314378570332
Median di

In [None]:
# Spot check: raw RMSD between data points 1192 and 1340 (1340 appears as a
# cluster label in the output of the raw DBSCAN run above).
RawDist[1192,1340]

In [None]:
# Disabled experiment: 3-component MDS embedding of the raw RMSD matrix.
# The code is wrapped in a string literal, so none of it is executed.
"""myd = RawDist
sc = MDS(3,verbose=1,dissimilarity="precomputed")
Y1 = sc.fit_transform(myd)"""

### DBSCAN: $\rho_{raw}$

In [None]:
# DBSCAN on the size-independent rho matrix of the raw data.  For every minPTS
# candidate: derive eps from the nearest-neighbour distances, cluster, relabel
# the clusters through mymetrics2.assign_centroid_label and print the
# validation indices, both plain and corrected for the fraction of outliers.
print("Using size independent rho")
for mp in minPTS:

    ################
    # eps = mean of the non-zero entries of the mp-nearest-neighbour distance graph.
    # kneighbors_graph returns a sparse matrix; densify it unconditionally so the
    # indexing below always works on a plain array.
    G = kneighbors_graph(rho_raw,mp,mode='distance',metric=spacedef).toarray()
    epsilon = np.mean(G[np.nonzero(G)])
    print("\n############################\n")
    print("minPTS, epsilon ",mp,epsilon)
    ###############
    estimator = DBSCAN(eps=epsilon, min_samples=mp,metric=spacedef)
    init_clusters = estimator.fit_predict(rho_raw)
    centroids = mymetrics2.core2centers(D=rho_raw,clusters=init_clusters)
    Nclust = len(centroids)
    # Nothing to validate when a single label comes back.
    if Nclust==1: continue
    clusters  = mymetrics2.assign_centroid_label(init_clusters,centroids)
    # DBSCAN labels outliers with -1.
    nout = np.count_nonzero(init_clusters==-1)
    size = [np.count_nonzero(clusters==i) for i in centroids]
    Outcome = dict(zip(centroids,size))
    print("Clusters, outliers:",Nclust,Outcome,nout)
    #################
    dens_score = DBCV.DBCV(clusters=clusters,metric=spacedef,NF=nfeatures,D=rho_raw)
    dbcv = dens_score.calc_score(meas='kernel')
    #----------------
    sil = metrics.silhouette_score(rho_raw,clusters,metric=spacedef)
    myeval = mymetrics2.cluster_eval(metric=spacedef,clusters=clusters,D=rho_raw)
    DBI = myeval(noise=usenoise,method="DBI")
    Dunn = myeval(noise=usenoise,method="Dunn",inter="allav",intra="allav")
    psf,wss = myeval(noise=usenoise,method="psF",centroid=True)
    # Fraction of the data set that was not flagged as outlier.
    norm = (ndata-nout)/ndata
    print("%8s %8s %8s %8s %8s" % ("Sil","DBI","Dunn","psF","DBCV"))
    print("%9.5f %9.5f %9.5f %9.5f %9.5f"%(sil,DBI,Dunn,psf,dbcv))
    # Second row: same indices corrected by `norm`; DBCV is printed unchanged.
    print("%9.5f %9.5f %9.5f %9.5f %9.5f"%(sil*norm,DBI/norm,Dunn*norm,psf*norm,dbcv))

### DBSCAN: filt

In [None]:
# DBSCAN on the filtered RMSD matrix.  For every minPTS candidate: derive eps
# from the nearest-neighbour distances, cluster, relabel the clusters through
# mymetrics2.assign_centroid_label and print the validation indices, both plain
# and corrected for the fraction of outliers.
print("Using PCA/filt RMSD matrix")
for mp in minPTS:

    ################
    # eps = mean of the non-zero entries of the mp-nearest-neighbour distance graph.
    # kneighbors_graph returns a sparse matrix; densify it unconditionally so the
    # indexing below always works on a plain array.
    G = kneighbors_graph(FDist,mp,mode='distance',metric=spacedef).toarray()
    epsilon = np.mean(G[np.nonzero(G)])
    print("\n############################\n")
    print("minPTS, epsilon ",mp,epsilon)
    ###############
    estimator = DBSCAN(eps=epsilon, min_samples=mp,metric=spacedef)
    init_clusters = estimator.fit_predict(FDist)
    centroids = mymetrics2.core2centers(D=FDist,clusters=init_clusters)
    Nclust = len(centroids)
    # Nothing to validate when a single label comes back.
    if Nclust==1: continue
    clusters  = mymetrics2.assign_centroid_label(init_clusters,centroids)
    # DBSCAN labels outliers with -1.
    nout = np.count_nonzero(init_clusters==-1)
    size = [np.count_nonzero(clusters==i) for i in centroids]
    Outcome = dict(zip(centroids,size))
    print("Clusters, outliers:",Nclust,Outcome,nout)
    #################
    # The raw-RMSD run scores DBCV on the rho matrix of its own data set; do the
    # same here with the filtered rho, so the score refers to the data that was
    # actually clustered.  NOTE(review): revert to rho_raw if scoring every
    # clustering in the raw space was deliberate.
    dens_score = DBCV.DBCV(clusters=clusters,metric=spacedef,NF=nfeatures,D=rho_f)
    dbcv = dens_score.calc_score(meas='kernel')
    #----------------
    sil = metrics.silhouette_score(FDist,clusters,metric=spacedef)
    myeval = mymetrics2.cluster_eval(metric=spacedef,clusters=clusters,D=FDist)
    DBI = myeval(noise=usenoise,method="DBI")
    Dunn = myeval(noise=usenoise,method="Dunn",inter="allav",intra="allav")
    psf,wss = myeval(noise=usenoise,method="psF",centroid=True)
    # Fraction of the data set that was not flagged as outlier.
    norm = (ndata-nout)/ndata
    print("%8s %8s %8s %8s %8s" % ("Sil","DBI","Dunn","psF","DBCV"))
    print("%9.5f %9.5f %9.5f %9.5f %9.5f"%(sil,DBI,Dunn,psf,dbcv))
    # Second row: same indices corrected by `norm`; DBCV is printed unchanged.
    print("%9.5f %9.5f %9.5f %9.5f %9.5f"%(sil*norm,DBI/norm,Dunn*norm,psf*norm,dbcv))

### DBSCAN: $\rho_{filt}$

In [None]:
# DBSCAN on the size-independent rho matrix of the filtered data.  For every
# minPTS candidate: derive eps from the nearest-neighbour distances, cluster,
# relabel the clusters through mymetrics2.assign_centroid_label and print the
# validation indices, both plain and corrected for the fraction of outliers.
print("Using size independent rho")
for mp in minPTS:

    ################
    # eps = mean of the non-zero entries of the mp-nearest-neighbour distance graph.
    # kneighbors_graph returns a sparse matrix; densify it unconditionally so the
    # indexing below always works on a plain array.
    G = kneighbors_graph(rho_f,mp,mode='distance',metric=spacedef).toarray()
    epsilon = np.mean(G[np.nonzero(G)])
    print("\n############################\n")
    print("minPTS, epsilon ",mp,epsilon)
    ###############
    estimator = DBSCAN(eps=epsilon, min_samples=mp,metric=spacedef)
    init_clusters = estimator.fit_predict(rho_f)
    centroids = mymetrics2.core2centers(D=rho_f,clusters=init_clusters)
    Nclust = len(centroids)
    # Nothing to validate when a single label comes back.
    if Nclust==1: continue
    clusters  = mymetrics2.assign_centroid_label(init_clusters,centroids)
    # DBSCAN labels outliers with -1.
    nout = np.count_nonzero(init_clusters==-1)
    size = [np.count_nonzero(clusters==i) for i in centroids]
    Outcome = dict(zip(centroids,size))
    print("Clusters, outliers:",Nclust,Outcome,nout)
    #################
    # Score DBCV on the same matrix that was clustered (rho_f), in line with
    # every other index computed in this cell.
    dens_score = DBCV.DBCV(clusters=clusters,metric=spacedef,NF=nfeatures,D=rho_f)
    dbcv = dens_score.calc_score(meas='kernel')
    #----------------
    sil = metrics.silhouette_score(rho_f,clusters,metric=spacedef)
    myeval = mymetrics2.cluster_eval(metric=spacedef,clusters=clusters,D=rho_f)
    DBI = myeval(noise=usenoise,method="DBI")
    Dunn = myeval(noise=usenoise,method="Dunn",inter="allav",intra="allav")
    psf,wss = myeval(noise=usenoise,method="psF",centroid=True)
    # Fraction of the data set that was not flagged as outlier.
    norm = (ndata-nout)/ndata
    print("%8s %8s %8s %8s %8s" % ("Sil","DBI","Dunn","psF","DBCV"))
    print("%9.5f %9.5f %9.5f %9.5f %9.5f"%(sil,DBI,Dunn,psf,dbcv))
    # Second row: same indices corrected by `norm`; DBCV is printed unchanged.
    print("%9.5f %9.5f %9.5f %9.5f %9.5f"%(sil*norm,DBI/norm,Dunn*norm,psf*norm,dbcv))