In [1]:
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier

from ipywidgets import interactive

from collections import defaultdict

import hdbscan
import folium
import re

import plotly.express as px

In [3]:
coordinate = pd.read_excel('data/coordinate_date.xlsx', sheet_name = 'coordinate')

# Coordinates Analysis

In [4]:
coordinate

Unnamed: 0,x,y,z
0,48.590397,196.532365,24.887314
1,47.804610,194.297258,22.819531
2,46.856430,192.127937,20.676256
3,45.126050,191.717317,20.161303
4,59.612597,192.378014,24.926745
...,...,...,...
967,73.890712,150.978578,16.996818
968,72.991700,149.538802,16.565456
969,68.569732,136.482727,14.473480
970,70.007875,135.813398,15.124269


In [5]:
px.scatter(coordinate ,x= coordinate['x'], y = coordinate['y'])

## DBSCAN

## 2D Clustering - Top View

In [6]:
X = np.array(coordinate[['x', 'y']], dtype='float64')

In [7]:
model = DBSCAN(eps= 2.7, min_samples=5).fit(X) #  eps = 2.7
class_predictions = model.labels_

coordinate['CLUSTERS_DBSCAN_2d'] = class_predictions
coordinate['CLUSTERS_DBSCAN_2d'] = coordinate['CLUSTERS_DBSCAN_2d'].astype('object')

In [8]:
print(f'Number of clusters found: {len(np.unique(class_predictions))}')
print(f'Number of outliers found: {len(class_predictions[class_predictions==-1])}')

print(f'Silhouette ignoring outliers: {silhouette_score(X[class_predictions!=-1], class_predictions[class_predictions!=-1])}')

no_outliers = 0
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')

Number of clusters found: 15
Number of outliers found: 94
Silhouette ignoring outliers: -0.24834182096630353
Silhouette outliers as singletons: -0.6003513770457716


In [9]:
px.scatter(coordinate ,
           x= coordinate['x'], y = coordinate['y'], color = coordinate['CLUSTERS_DBSCAN_2d'])

In [10]:
px.scatter_3d(coordinate ,x= coordinate['x'], y = coordinate['y'], z = coordinate['z'],
             color = 'CLUSTERS_DBSCAN_2d')

 ### 3D Clustering

In [11]:
XYZ = np.array(coordinate[['x', 'y', 'z']], dtype='float64')

In [12]:
model_3d = DBSCAN(eps= 2.7, min_samples=5).fit(XYZ) #  eps = 2.7
class_predictions_3d = model_3d.labels_

coordinate['CLUSTERS_DBSCAN_3d'] = class_predictions_3d
coordinate['CLUSTERS_DBSCAN_3d'] = coordinate['CLUSTERS_DBSCAN_3d'].astype('object')

In [13]:
print(f'Number of clusters found: {len(np.unique(class_predictions_3d))}')
print(f'Number of outliers found: {len(class_predictions_3d[class_predictions_3d==-1])}')

print(f'Silhouette ignoring outliers: {silhouette_score(X[class_predictions_3d!=-1], class_predictions_3d[class_predictions_3d!=-1])}')

no_outliers = 0
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions_3d)])
print(f'Silhouette outliers as singletons: {silhouette_score(XYZ, no_outliers)}')

Number of clusters found: 28
Number of outliers found: 183
Silhouette ignoring outliers: -0.15365671576236484
Silhouette outliers as singletons: -0.46792015422148386


In [14]:
px.scatter(coordinate ,
           x= coordinate['x'], y = coordinate['y'], color = coordinate['CLUSTERS_DBSCAN_3d'])

In [15]:
px.scatter_3d(coordinate ,x= coordinate['x'], y = coordinate['y'], z = coordinate['z'],
             color = 'CLUSTERS_DBSCAN_3d')

In [16]:
coordinate.to_csv('3d_2d_dbscan_clustering.csv')