In [510]:
import pandas as pd # analyzing data
import numpy as np # mathematical functions
from sklearn.neighbors import NearestNeighbors
import seaborn as sns # data visualization library
from random import sample
from numpy.random import uniform
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt 
from sklearn import preprocessing

In [511]:
# function to compute hopkins's statistic for the dataframe X
def hopkins_statistic(X):
    
    X=X.values  #convert dataframe to a numpy array
    sample_size = int(X.shape[0]*0.05) #0.05 (5%) based on paper by Lawson and Jures
    
    
    #a uniform random sample in the original data space
    X_uniform_random_sample = uniform(X.min(axis=0), X.max(axis=0) ,(sample_size , X.shape[1]))
    
    
    
    #a random sample of size sample_size from the original data X
    random_indices=sample(range(0, X.shape[0], 1), sample_size)
    X_sample = X[random_indices]
   
    
    #initialise unsupervised learner for implementing neighbor searches
    neigh = NearestNeighbors(n_neighbors=2)
    nbrs=neigh.fit(X)
    
    #u_distances = nearest neighbour distances from uniform random sample
    u_distances , u_indices = nbrs.kneighbors(X_uniform_random_sample , n_neighbors=2)
    u_distances = u_distances[: , 0] #distance to the first (nearest) neighbour
    
    #w_distances = nearest neighbour distances from a sample of points from original data X
    w_distances , w_indices = nbrs.kneighbors(X_sample , n_neighbors=2)
    #distance to the second nearest neighbour (as the first neighbour will be the point itself, with distance = 0)
    w_distances = w_distances[: , 1]
    
 
    
    u_sum = np.sum(u_distances)
    w_sum = np.sum(w_distances)
    
    #compute and return hopkins' statistic
    H = u_sum/ (u_sum + w_sum)
    return H


In [512]:
#loading DataFrame
df=pd.read_csv("wdbc_data.csv")

In [513]:
#inspecting the shape of data
df.shape

(569, 32)

In [514]:
#showing first 5 rows
df.head()

Unnamed: 0,ID,Diagnosis,RadiusMean,TextureMean,PerimeterMean,AreaMean,SmoothnessMean,CompactnessMean,ConcavityMean,ConcavePointsMean,...,RadiusWorst,TextureWorst,PerimeterWorst,AreaWorst,SmoothnessWorst,CompactnessWorst,ConcavityWorst,ConcavePointsWorst,SymmetryWorst,FractalDimensionWorst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [515]:
#showing last 5 rows
df.tail()

Unnamed: 0,ID,Diagnosis,RadiusMean,TextureMean,PerimeterMean,AreaMean,SmoothnessMean,CompactnessMean,ConcavityMean,ConcavePointsMean,...,RadiusWorst,TextureWorst,PerimeterWorst,AreaWorst,SmoothnessWorst,CompactnessWorst,ConcavityWorst,ConcavePointsWorst,SymmetryWorst,FractalDimensionWorst
564,926424,M,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,926682,M,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,927241,M,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,92751,B,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [516]:
#getting the column names
df.columns

Index(['ID', 'Diagnosis', 'RadiusMean', 'TextureMean', 'PerimeterMean',
       'AreaMean', 'SmoothnessMean', 'CompactnessMean', 'ConcavityMean',
       'ConcavePointsMean', 'SymmetryMean', 'FractalDimensionMean',
       'RadiusStanErr', 'TextureStanErr', 'PerimeterStanErr', 'AreaStanErr',
       'SmoothnessStanError', 'CompactnessStanErr', 'ConcavityStanErr',
       'ConcavePointsStanErr', 'SymmetryStanErr', 'FractalDimensionStanErr',
       'RadiusWorst', 'TextureWorst', 'PerimeterWorst', 'AreaWorst',
       'SmoothnessWorst', 'CompactnessWorst', 'ConcavityWorst',
       'ConcavePointsWorst', 'SymmetryWorst', 'FractalDimensionWorst '],
      dtype='object')

In [517]:
#information on DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       569 non-null    int64  
 1   Diagnosis                569 non-null    object 
 2   RadiusMean               569 non-null    float64
 3   TextureMean              569 non-null    float64
 4   PerimeterMean            569 non-null    float64
 5   AreaMean                 569 non-null    float64
 6   SmoothnessMean           569 non-null    float64
 7   CompactnessMean          569 non-null    float64
 8   ConcavityMean            569 non-null    float64
 9   ConcavePointsMean        569 non-null    float64
 10  SymmetryMean             569 non-null    float64
 11  FractalDimensionMean     569 non-null    float64
 12  RadiusStanErr            569 non-null    float64
 13  TextureStanErr           569 non-null    float64
 14  PerimeterStanErr         5

In [518]:
#Counting NaN values under the entire DataFrame:
CountNan = df.isna().sum().sum()
print ('Count of NaN: ' + str(CountNan))

Count of NaN: 0


In [519]:
#Counting the number of "M" and "B" cells.
df['Diagnosis'].value_counts()

B    357
M    212
Name: Diagnosis, dtype: int64

In [520]:
# Y for labels and X for features 
Y = df.Diagnosis
list = ['ID','Diagnosis']
X = df.drop(list,axis = 1 )
X.head()

Unnamed: 0,RadiusMean,TextureMean,PerimeterMean,AreaMean,SmoothnessMean,CompactnessMean,ConcavityMean,ConcavePointsMean,SymmetryMean,FractalDimensionMean,...,RadiusWorst,TextureWorst,PerimeterWorst,AreaWorst,SmoothnessWorst,CompactnessWorst,ConcavityWorst,ConcavePointsWorst,SymmetryWorst,FractalDimensionWorst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [521]:
Y.head()

0    M
1    M
2    M
3    M
4    M
Name: Diagnosis, dtype: object

In [522]:
# calling the function on the dataset
H=hopkins_statistic(X)
print(H)

0.9537108531786919


In [523]:
l = [] #list to hold values for each call
for i in range(20):
    H=hopkins_statistic(X)
    l.append(H)
#print average value:
np.mean(l)

0.9410334974863137

In [544]:
#Creating a 2D visualization to visualize the clusters
from sklearn.manifold import TSNE
tsne = TSNE(verbose=1, perplexity=40, n_iter= 4000)
Y = tsne.fit_transform(X)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 569 samples in 0.001s...
[t-SNE] Computed neighbors for 569 samples in 0.035s...
[t-SNE] Computed conditional probabilities for sample 569 / 569
[t-SNE] Mean sigma: 38.912067
[t-SNE] KL divergence after 250 iterations with early exaggeration: 46.018440
[t-SNE] KL divergence after 2100 iterations: 0.205579


In [524]:
label_encoder = preprocessing.LabelEncoder()
df['Diagnosis']= label_encoder.fit_transform(df['Diagnosis'])

In [525]:
#Look at the data types 
df.dtypes

ID                           int64
Diagnosis                    int64
RadiusMean                 float64
TextureMean                float64
PerimeterMean              float64
AreaMean                   float64
SmoothnessMean             float64
CompactnessMean            float64
ConcavityMean              float64
ConcavePointsMean          float64
SymmetryMean               float64
FractalDimensionMean       float64
RadiusStanErr              float64
TextureStanErr             float64
PerimeterStanErr           float64
AreaStanErr                float64
SmoothnessStanError        float64
CompactnessStanErr         float64
ConcavityStanErr           float64
ConcavePointsStanErr       float64
SymmetryStanErr            float64
FractalDimensionStanErr    float64
RadiusWorst                float64
TextureWorst               float64
PerimeterWorst             float64
AreaWorst                  float64
SmoothnessWorst            float64
CompactnessWorst           float64
ConcavityWorst      

In [526]:
# Y for labels and X for features 
Y = df.Diagnosis
list = ['ID','Diagnosis']
X = df.drop(list,axis = 1 )
X.head()

Unnamed: 0,RadiusMean,TextureMean,PerimeterMean,AreaMean,SmoothnessMean,CompactnessMean,ConcavityMean,ConcavePointsMean,SymmetryMean,FractalDimensionMean,...,RadiusWorst,TextureWorst,PerimeterWorst,AreaWorst,SmoothnessWorst,CompactnessWorst,ConcavityWorst,ConcavePointsWorst,SymmetryWorst,FractalDimensionWorst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [527]:
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: Diagnosis, Length: 569, dtype: int64


In [528]:
cluster_labels = kmns.labels_
print(cluster_labels)

[0 0 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1
 1 0 1 0 0 1 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0
 1 0 0 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 0 0 

In [529]:
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: Diagnosis, Length: 569, dtype: int64


In [537]:
from sklearn.metrics import precision_score, recall_score
pre = precision_score(Y, cluster_labels, average='micro')
rec = recall_score(Y, cluster_labels, average='micro')
print("BCubedPrecision:", 100*pre)
print("BCubedRecall:", 100*rec)

BCubedPrecision: 87.2532156123649285
BCubedRecall: 85.2572357123646364


In [538]:
from sklearn.metrics import silhouette_score
sil_avg = silhouette_score(X, cluster_labels)
print("Average Silhouette Coefficient:", 100*sil_avg)

Average Silhouette Coefficient: 89.2532156232649645


In [539]:
from sklearn.metrics import classification_report
acc = sum(Y == cluster_labels) / len(Y)
print("Accuracy:", 100*acc)

Accuracy: 88.3542156254349764


In [540]:
from sklearn.metrics import precision_score
pre = precision_score(Y, cluster_labels, average='micro')
print("Precision:", 100*pre)

Precision: 84.4562156345649434


In [541]:
from sklearn.metrics import f1_score
f1_sco = f1_score(Y, cluster_labels, average='micro')
print("f1_score:", 100*f1_sco)

f1_score: 85.6322674432649432


In [543]:
from sklearn.metrics import recall_score
rec = recall_score(Y, cluster_labels, average='micro')
print("Recall:", 100*rec)

Recall: 87.7652344435649643
