# Stage 1: Preparing Inputs

In [None]:
#import libraries
# standard library
import timeit

# numerics / data handling
import pandas as pd
import numpy as np
from numpy import percentile
from numpy import unique
from numpy import where

# plotting
import matplotlib as mpl
from matplotlib import pyplot
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns; sns.set(font_scale=1.2) 

# models and metrics
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture as GMM
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.cluster import MeanShift
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import OPTICS
from sklearn import metrics

In [None]:
# load data
# The AD6 table is read from a fixed absolute Windows path; the path is kept
# in one named variable so it is easy to spot and adjust.
AD6_CSV_PATH = "C:/599_Research/FINAL_RESEARCH_and_PPT/THESIS_SUBMISSION/APPENDIX/2_SINGLE ATTRIBUTE SCRIPTS/DATA/AD6.csv"
AD6 = pd.read_csv(AD6_CSV_PATH)

# Stage 2: Pre-processing & Execution

# Pre-Processing

In [None]:
#inspect - example on df asc south
# A notebook cell renders only its final expression, so the head() preview is
# shown explicitly with display() (available by default in IPython/Jupyter
# kernels); describe() is rendered as the cell's last expression.
display(AD6.head())
AD6.describe()

In [None]:
#Visualize the data for the CUMULATIVE (OR ANY OTHER ATTRIBUTE): change the
#column passed to `c` (and the colorbar label) to colour the points by a different attribute
# NOTE(review): sns.set() also resets font_scale to its default, which overrides
# the font_scale=1.2 chosen in the import cell - confirm that is intended.
sns.set(style="whitegrid")
points = plt.scatter(AD6['LONG'], AD6['LAT'], c=AD6['D20200131'], s=1)
# Colour scale and axis labels so the figure can be read on its own.
plt.colorbar(points, label='D20200131')
plt.xlabel('LONG')
plt.ylabel('LAT');

In [None]:
###specific to GMM!
#GMM
# Model selection with the Akaike (AIC) and Bayesian (BIC) information criteria:
# one full-covariance mixture is fitted per candidate component count (1..20)
# and both criteria are plotted against that count.
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))
n_components = np.arange(1, 21)

models = []
for _k in n_components:
    models.append(GMM(_k, covariance_type='full', random_state=0).fit(X))

bic_curve = [fitted.bic(X) for fitted in models]
aic_curve = [fitted.aic(X) for fitted in models]

plt.plot(n_components, bic_curve, label='BIC')
plt.plot(n_components, aic_curve, label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components');

In [None]:
def SelBest(arr, X: int) -> np.ndarray:
    '''
    Return the X smallest values of arr, sorted in ascending order.

    Parameters
    ----------
    arr : array-like
        One-dimensional collection of scores (list, tuple or numpy array).
    X : int
        How many of the smallest values to keep. If X exceeds the number of
        values, all of them are returned.

    Returns
    -------
    numpy.ndarray
        The X smallest entries of arr. "Best" therefore means "lowest";
        negate the input (and the output) to select the largest values.
    '''
    # asarray lets plain Python lists be index-selected like numpy arrays.
    values = np.asarray(arr)
    smallest_idx = np.argsort(values)[:X]
    return values[smallest_idx]

In [None]:
#Silhouette Score
# For each candidate cluster count, fit the GMM `iterations` times, score every
# fit with the silhouette coefficient, and plot the mean of the best fifth of
# the runs with the run-to-run standard deviation as error bars.
# NOTE: this is 18 x 20 model fits plus a silhouette evaluation per fit, so the
# cell can take a long time on a large table.
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))
n_clusters=np.arange(2, 20)
sils=[]
sils_err=[]
iterations=20
n_best=int(iterations/5)
for n in n_clusters:
    tmp_sil=[]
    for seed in range(iterations):
        # A distinct but fixed seed per repetition keeps the run-to-run spread
        # while making the whole sweep reproducible.
        gmm=GMM(n, n_init=2, random_state=seed).fit(X)
        labels=gmm.predict(X)
        sil=metrics.silhouette_score(X, labels, metric='euclidean')
        tmp_sil.append(sil)
    # SelBest keeps the SMALLEST values, but a higher silhouette is better;
    # negating on the way in and out selects the n_best highest scores.
    val=np.mean(-SelBest(-np.array(tmp_sil), n_best))
    err=np.std(tmp_sil)
    sils.append(val)
    sils_err.append(err)

plt.errorbar(n_clusters, sils, yerr=sils_err)
plt.title("Silhouette Scores", fontsize=20)
plt.xticks(n_clusters)
plt.xlabel("N. of clusters")
plt.ylabel("Score");

# Algorithm Execution

In [None]:
#kmeans
# define dataset: one row per record, columns are the D20190125 and D20200131 attributes
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets the fit and predict calls so the printed duration
# reflects the clustering work itself.
start = timeit.default_timer()
# define the model & fit the model
kmeans_model = KMeans(n_clusters=6, random_state=1).fit(X)
# assign a cluster to each example
yhat = kmeans_model.predict(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# retrieve unique clusters
clusters = unique(yhat)

#Plot clusters with coordinates (points coloured by cluster label)
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
pyplot.scatter(AD6['LONG'], AD6['LAT'], c=yhat, s=10, cmap='viridis')
pyplot.xlabel("LONG")
pyplot.ylabel("LAT")
plt.savefig('AD6_kmeans_6.png')

In [None]:
#Agglomerative
# define dataset: one row per record, columns are the D20190125 and D20200131 attributes
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets model construction and fit_predict so the printed
# duration reflects the clustering work itself.
start = timeit.default_timer()
# define the model
AGGLO_model = AgglomerativeClustering(n_clusters=6)
# fit model and predict clusters
yhat = AGGLO_model.fit_predict(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# retrieve unique clusters
clusters = unique(yhat)

#Plot clusters with coordinates (points coloured by cluster label)
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
pyplot.scatter(AD6['LONG'], AD6['LAT'], c=yhat, s=10, cmap='viridis')
pyplot.xlabel("LONG")
pyplot.ylabel("LAT")
plt.savefig('AD6_AGLO_6.png')

In [None]:
#BIRCH
# define dataset: one row per record, columns are the D20190125 and D20200131 attributes
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets fit and predict so the printed duration reflects the
# clustering work itself.
start = timeit.default_timer()
# define the model
Birch_model = Birch(threshold = 0.8, n_clusters=6)
# fit the model
Birch_model.fit(X)
# assign a cluster to each example
yhat = Birch_model.predict(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# retrieve unique clusters
clusters = unique(yhat)

#Plot clusters with coordinates (points coloured by cluster label)
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
pyplot.scatter(AD6['LONG'], AD6['LAT'], c=yhat, s=10, cmap='viridis')
pyplot.xlabel("LONG")
pyplot.ylabel("LAT")
plt.savefig('AD6_BIRCH_t0.8_6.png')

In [None]:
#DBSCAN
# define dataset: one row per record, columns are the D20190125 and D20200131 attributes
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets the fit so the printed duration reflects the clustering
# work itself.
start = timeit.default_timer()
# define the model and fit model
DBSCAN_model = DBSCAN(eps=0.01, min_samples=6).fit(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# boolean mask marking which samples DBSCAN reported as core samples
core_samples_mask = np.zeros_like(DBSCAN_model.labels_, dtype=bool)
core_samples_mask[DBSCAN_model.core_sample_indices_] = True
labels = DBSCAN_model.labels_
# retrieve unique clusters (taken from the label array, not the estimator object)
clusters = unique(labels)

#Plot clusters with coordinates; LONG is on the x axis and LAT on the y axis
figure(num=None, figsize=(9, 8), dpi=80, facecolor='w', edgecolor='k')
pyplot.scatter(AD6['LONG'], AD6['LAT'], c=labels, s=10, cmap='plasma')
pyplot.xlabel("LONG")
pyplot.ylabel("LAT")
plt.savefig('AD6_DBSCAN_eps0.01_6.png')

In [None]:
#GMM
# define dataset: one row per record, columns are the D20190125 and D20200131 attributes
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets fit and predict so the printed duration reflects the
# clustering work itself.
start = timeit.default_timer()
# define the model; random_state pins the initialisation so repeated runs of
# the notebook produce the same clustering
GMM_model = GMM(n_components=6, random_state=0)
# fit the model
GMM_model.fit(X)
# assign a cluster to each example
yhat = GMM_model.predict(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# retrieve unique clusters
clusters = unique(yhat)

#Plot clusters with coordinates (points coloured by cluster label)
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
pyplot.scatter(AD6['LONG'], AD6['LAT'], c=yhat, s=10, cmap='viridis')
pyplot.xlabel("LONG")
pyplot.ylabel("LAT")
plt.savefig('AD6_GMM_6.png')

In [None]:
#HDBSCAN
# cluster the data with min_cluster_size=1200 and min_samples=20
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets the fit so the printed duration reflects the clustering
# work itself.
start = timeit.default_timer()
HDBSCAN_model = hdbscan.HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=False,
    gen_min_span_tree=True, leaf_size=5,
    metric='euclidean', min_cluster_size=1200, min_samples= 20, p=None).fit(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# At least 8 colours as before, but enough for every label so indexing the
# palette cannot run out of range when more than 8 clusters are found.
n_palette_colors = max(8, len(set(HDBSCAN_model.labels_)))
color_palette = sns.color_palette('deep', n_palette_colors)
# noise points (label -1) are drawn in mid grey
clusters_colors = [color_palette[x] if x >= 0
                 else (0.5, 0.5, 0.5)
                 for x in HDBSCAN_model.labels_]
# desaturate each point's colour by its membership probability
clusters_member_colors = [sns.desaturate(x, p) for x, p in
                         zip(clusters_colors, HDBSCAN_model.probabilities_)]
plt.scatter(AD6['LONG'], AD6['LAT'], s=50, linewidth=0, c=clusters_member_colors, alpha=1)
plt.xlabel("LONG")
plt.ylabel("LAT")
plt.savefig('AD6_HDBSCAN_size1200_sample20.png')

In [None]:
#meanshift
# define dataset: one row per record, columns are the D20190125 and D20200131 attributes
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets model construction and fit_predict so the printed
# duration reflects the clustering work itself.
start = timeit.default_timer()
# define the model
Mean_model = MeanShift()
# fit model and predict clusters
yhat = Mean_model.fit_predict(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# retrieve unique clusters (MeanShift chooses the number of clusters itself)
clusters = unique(yhat)

#Plot clusters with coordinates (points coloured by cluster label)
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
pyplot.scatter(AD6['LONG'], AD6['LAT'], c=yhat, s=10, cmap='viridis')
pyplot.xlabel("LONG")
pyplot.ylabel("LAT")
plt.savefig('AD6_MEANSHIFT.png')

In [None]:
#MiniBatch_Kmeans
# define dataset: one row per record, columns are the D20190125 and D20200131 attributes
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets fit and predict so the printed duration reflects the
# clustering work itself.
start = timeit.default_timer()
# define the model; random_state pins the initialisation and batch sampling so
# repeated runs of the notebook produce the same clustering
MiniBatch_model = MiniBatchKMeans(n_clusters=6, random_state=1)
# fit the model
MiniBatch_model.fit(X)
# assign a cluster to each example
yhat = MiniBatch_model.predict(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# retrieve unique clusters
clusters = unique(yhat)

#Plot clusters with coordinates (points coloured by cluster label)
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
pyplot.scatter(AD6['LONG'], AD6['LAT'], c=yhat, s=10, cmap='viridis')
pyplot.xlabel("LONG")
pyplot.ylabel("LAT")
plt.savefig('AD6_MINIBATCH_6.png')

In [None]:
#OPTICS
# define dataset: one row per record, columns are the D20190125 and D20200131 attributes
X = np.array(list(zip(AD6['D20190125'],AD6['D20200131'])))

# The timer brackets model construction and fit_predict so the printed
# duration reflects the clustering work itself.
start = timeit.default_timer()
# define the model
# NOTE(review): scikit-learn documents `eps` as used only when
# cluster_method='dbscan'; with the default 'xi' method it appears to have no
# effect on the result - confirm which behaviour was intended.
OPTICS_model = OPTICS(eps=5, min_samples=2)
# fit model and predict clusters
yhat = OPTICS_model.fit_predict(X)
stop = timeit.default_timer()
execution_time = stop - start

print("Program Executed in "+str(execution_time)) # It returns time in seconds

# retrieve unique clusters
clusters = unique(yhat)

#Plot clusters with coordinates; LONG is on the x axis and LAT on the y axis
figure(num=None, figsize=(9, 8), dpi=80, facecolor='w', edgecolor='k')
pyplot.scatter(AD6['LONG'], AD6['LAT'], c=yhat, s=10, cmap='plasma')
pyplot.xlabel("LONG")
pyplot.ylabel("LAT")
plt.savefig('AD6_OPTICS_eps5_minsam2.png')

# Stage 3: Outputs and Assessment

In [None]:
#Kmeans
# Number of clusters in labels, ignoring noise if present.
labels = kmeans_model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# create scatter plot for samples from each cluster
# The loop uses this model's own labels: the notebook-level `yhat`/`clusters`
# are overwritten by every Stage 2 cell, so after a top-to-bottom run they hold
# the output of the last algorithm executed rather than K-means.
# X is the two-column feature matrix that every Stage 2 cell builds identically.
for cluster in unique(labels):
    # boolean mask of the samples assigned to this cluster
    in_cluster = labels == cluster
    # create scatter of these samples
    pyplot.scatter(X[in_cluster, 0], X[in_cluster, 1])
# show the plot
pyplot.show()

print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
#Calinski-Harabasz Index
print("Calinski Harabasz Score: %0.3f"
      % metrics.calinski_harabasz_score(X, labels))
#Davies Bouldin Index
print("Davies Bouldin Index: %0.3f"
      % metrics.davies_bouldin_score(X, labels))

# row index -> assigned cluster
cluster_map = pd.DataFrame()
cluster_map['data_index'] = AD6.index.values
cluster_map['cluster'] = labels

# rows assigned to cluster 4 (rendered as the cell output)
cluster_map[cluster_map.cluster == 4]

In [None]:
#Agglomerative
# Number of clusters in labels, ignoring noise if present.
labels = AGGLO_model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# create scatter plot for samples from each cluster
# The loop uses this model's own labels: the notebook-level `yhat`/`clusters`
# are overwritten by every Stage 2 cell, so after a top-to-bottom run they hold
# the output of the last algorithm executed rather than this model.
# X is the two-column feature matrix that every Stage 2 cell builds identically.
for cluster in unique(labels):
    # boolean mask of the samples assigned to this cluster
    in_cluster = labels == cluster
    # create scatter of these samples
    pyplot.scatter(X[in_cluster, 0], X[in_cluster, 1])
# show the plot
pyplot.show()

print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
#Calinski-Harabasz Index
print("Calinski Harabasz Score: %0.3f"
      % metrics.calinski_harabasz_score(X, labels))
#Davies Bouldin Index
print("Davies Bouldin Index: %0.3f"
      % metrics.davies_bouldin_score(X, labels))

# row index -> assigned cluster
cluster_map = pd.DataFrame()
cluster_map['data_index'] = AD6.index.values
cluster_map['cluster'] = labels

# rows assigned to cluster 4 (rendered as the cell output)
cluster_map[cluster_map.cluster == 4]

In [None]:
#BIRCH
# Number of clusters in labels, ignoring noise if present.
labels = Birch_model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# create scatter plot for samples from each cluster
# The loop uses this model's own labels: the notebook-level `yhat`/`clusters`
# are overwritten by every Stage 2 cell, so after a top-to-bottom run they hold
# the output of the last algorithm executed rather than this model.
# X is the two-column feature matrix that every Stage 2 cell builds identically.
for cluster in unique(labels):
    # boolean mask of the samples assigned to this cluster
    in_cluster = labels == cluster
    # create scatter of these samples
    pyplot.scatter(X[in_cluster, 0], X[in_cluster, 1])
# show the plot
pyplot.show()

print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
#Calinski-Harabasz Index
print("Calinski Harabasz Score: %0.3f"
      % metrics.calinski_harabasz_score(X, labels))
#Davies Bouldin Index
print("Davies Bouldin Index: %0.3f"
      % metrics.davies_bouldin_score(X, labels))

# row index -> assigned cluster
cluster_map = pd.DataFrame()
cluster_map['data_index'] = AD6.index.values
cluster_map['cluster'] = labels

# rows assigned to cluster 4 (rendered as the cell output)
cluster_map[cluster_map.cluster == 4]

In [None]:
#DBSCAN
labels = DBSCAN_model.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

#metrics
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# The validity indices need at least two distinct labels (scikit-learn raises
# ValueError otherwise), which a density-based run does not guarantee, e.g. when
# every point is noise. X is the two-column feature matrix built in Stage 2.
# NOTE(review): the noise label -1 is passed through and therefore scored as if
# it were a cluster - confirm that is the intended comparison.
if len(set(labels)) > 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    #Calinski-Harabasz Index
    print("Calinski Harabasz Score: %0.3f"
          % metrics.calinski_harabasz_score(X, labels))
    #Davies Bouldin Index
    print("Davies Bouldin Index: %0.3f"
          % metrics.davies_bouldin_score(X, labels))
else:
    print("Validity metrics skipped: fewer than 2 distinct labels")

# row index -> assigned cluster
cluster_map = pd.DataFrame()
cluster_map['data_index'] = AD6.index.values
cluster_map['cluster'] = labels

# rows assigned to cluster 4 (rendered as the cell output)
cluster_map[cluster_map.cluster == 4]

In [None]:
#GMM
# score() and aic() are methods that take the data; they are called on X (the
# two-column feature matrix built in Stage 2) and printed so both values appear.
print("Average log-likelihood per sample: %0.3f" % GMM_model.score(X))
print("AIC: %0.3f" % GMM_model.aic(X))



In [None]:
### HDBSCAN

# Number of clusters in labels, ignoring noise if present.
labels = HDBSCAN_model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# The validity indices need at least two distinct labels (scikit-learn raises
# ValueError otherwise), which a density-based run does not guarantee.
# X is the two-column feature matrix built in Stage 2.
# NOTE(review): the noise label -1 is passed through and therefore scored as if
# it were a cluster - confirm that is the intended comparison.
if len(set(labels)) > 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    #Calinski-Harabasz Index
    print("Calinski Harabasz Score: %0.3f"
          % metrics.calinski_harabasz_score(X, labels))
    #Davies Bouldin Index
    print("Davies Bouldin Index: %0.3f"
          % metrics.davies_bouldin_score(X, labels))
else:
    print("Validity metrics skipped: fewer than 2 distinct labels")

# row index -> assigned cluster
cluster_map = pd.DataFrame()
cluster_map['data_index'] = AD6.index.values
cluster_map['cluster'] = labels

# Mid-cell expressions are not rendered, so the table is shown explicitly with
# display() (available by default in IPython/Jupyter kernels).
display(cluster_map[cluster_map.cluster == 4])

# Each tree plot gets its own figure; without an explicit axis the hdbscan
# plot helpers draw on the current axes, so the three plots would overlap.
plt.figure()
HDBSCAN_model.single_linkage_tree_.plot(cmap='viridis', colorbar=True)

plt.figure()
HDBSCAN_model.condensed_tree_.plot()

plt.figure()
HDBSCAN_model.condensed_tree_.plot(select_clusters=True, selection_palette=sns.color_palette())

#The hdbscan library implements soft clustering, where each data point is assigned a cluster membership score ranging from 0.0 to 1.0. A score of 0.0 represents a sample that is not in the cluster at all (all noise points will get this score) while a score of 1.0 represents a sample that is at the heart of the cluster (note that this is not the spatial centroid notion of core). You can access these scores via the probabilities_ attribute.
HDBSCAN_model.probabilities_

In [None]:
#Meanshift
# Number of clusters in labels, ignoring noise if present.
labels = Mean_model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# create scatter plot for samples from each cluster
# The loop uses this model's own labels: the notebook-level `yhat`/`clusters`
# are overwritten by every Stage 2 cell, so after a top-to-bottom run they hold
# the output of the last algorithm executed rather than MeanShift.
# X is the two-column feature matrix that every Stage 2 cell builds identically.
for cluster in unique(labels):
    # boolean mask of the samples assigned to this cluster
    in_cluster = labels == cluster
    # create scatter of these samples
    pyplot.scatter(X[in_cluster, 0], X[in_cluster, 1])
# show the plot
pyplot.show()

# MeanShift picks the number of clusters itself; the validity indices need at
# least two distinct labels (scikit-learn raises ValueError otherwise).
if len(set(labels)) > 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    #Calinski-Harabasz Index
    print("Calinski Harabasz Score: %0.3f"
          % metrics.calinski_harabasz_score(X, labels))
    #Davies Bouldin Index
    print("Davies Bouldin Index: %0.3f"
          % metrics.davies_bouldin_score(X, labels))
else:
    print("Validity metrics skipped: fewer than 2 distinct labels")

# row index -> assigned cluster
cluster_map = pd.DataFrame()
cluster_map['data_index'] = AD6.index.values
cluster_map['cluster'] = labels

# rows assigned to cluster 4; kept as the final expression so the table is
# rendered as the cell output
cluster_map[cluster_map.cluster == 4]

In [None]:
#MINIBATCH kmeans
# Number of clusters in labels, ignoring noise if present.
labels = MiniBatch_model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# create scatter plot for samples from each cluster
# The loop uses this model's own labels: the notebook-level `yhat`/`clusters`
# are overwritten by every Stage 2 cell, so after a top-to-bottom run they hold
# the output of the last algorithm executed rather than this model.
# X is the two-column feature matrix that every Stage 2 cell builds identically.
for cluster in unique(labels):
    # boolean mask of the samples assigned to this cluster
    in_cluster = labels == cluster
    # create scatter of these samples
    pyplot.scatter(X[in_cluster, 0], X[in_cluster, 1])
# show the plot
pyplot.show()

print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
#Calinski-Harabasz Index
print("Calinski Harabasz Score: %0.3f"
      % metrics.calinski_harabasz_score(X, labels))
#Davies Bouldin Index
print("Davies Bouldin Index: %0.3f"
      % metrics.davies_bouldin_score(X, labels))

# row index -> assigned cluster
cluster_map = pd.DataFrame()
cluster_map['data_index'] = AD6.index.values
cluster_map['cluster'] = labels

# rows assigned to cluster 4; kept as the final expression so the table is
# rendered as the cell output
cluster_map[cluster_map.cluster == 4]

In [None]:
#OPTICS
# Number of clusters in labels, ignoring noise if present.
labels = OPTICS_model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# create scatter plot for samples from each cluster
# The loop uses this model's own labels instead of the notebook-level
# `yhat`/`clusters`, which every Stage 2 cell overwrites.
# X is the two-column feature matrix that every Stage 2 cell builds identically.
for cluster in unique(labels):
    # boolean mask of the samples assigned to this cluster
    in_cluster = labels == cluster
    # create scatter of these samples
    pyplot.scatter(X[in_cluster, 0], X[in_cluster, 1])
# show the plot
pyplot.show()

# The validity indices need at least two distinct labels (scikit-learn raises
# ValueError otherwise), which a density-based run does not guarantee.
# NOTE(review): the noise label -1 is passed through and therefore scored as if
# it were a cluster - confirm that is the intended comparison.
if len(set(labels)) > 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    #Calinski-Harabasz Index
    print("Calinski Harabasz Score: %0.3f"
          % metrics.calinski_harabasz_score(X, labels))
    #Davies Bouldin Index
    print("Davies Bouldin Index: %0.3f"
          % metrics.davies_bouldin_score(X, labels))
else:
    print("Validity metrics skipped: fewer than 2 distinct labels")

# row index -> assigned cluster
cluster_map = pd.DataFrame()
cluster_map['data_index'] = AD6.index.values
cluster_map['cluster'] = labels

# rows assigned to cluster 600; kept as the final expression so the table is
# rendered as the cell output
cluster_map[cluster_map.cluster == 600]