# DBSCAN

- 밀도 기반 군집화
- k-means는 독특한 기하구조를 잡아주지 못하지만 밀도기반은 가능

## 절차
- 기준 params설정 (epsilon, n)
- 점을 돌면서 핵심점 찾기
- 핵심점 끼리 겹치면 확장 아니면 개별로 분류
- 어디에도 속하지 않으면 잡음점(noise)

### 용어
- 핵심점: 설정한 기준을 만족하는 점


In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

# load_iris().keys()
data = pd.DataFrame(load_iris().data, columns=load_iris().feature_names)
target = load_iris().target

In [2]:
from sklearn.cluster import DBSCAN

dbs = DBSCAN(eps=0.5, min_samples=4, metric='euclidean')

db_res = dbs.fit_predict(data)
print('noise:',round(len(db_res[db_res == -1])/len(db_res)*100, 3),'%')
print(db_res)

noise: 8.667 %
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0
  0  0  1  1  1  1  1  1  1  2  1  1  2  1  1  1  1  1  1  1 -1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  2  1  1
  1  1  2  1  1  1  1  1  1 -1 -1  1 -1 -1  1  1  1  1  1  1  1 -1 -1  1
  1  1 -1  1  1  1  1  1  1  1  1 -1  1  1 -1 -1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1]


In [3]:
## 설정에 따라 달라지는 값 보기
## 실코딩은 알고리즘 문제 풀듯이 풀면 될 듯
## 비지도 학습의 특징 확연히 느낄 예제!!

In [4]:
data['clustering_score'] = db_res

In [5]:
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),clustering_score
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,1
146,6.3,2.5,5.0,1.9,1
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1


In [6]:
from sklearn.metrics import silhouette_samples


score_samples = silhouette_samples(data, db_res)

sils = pd.DataFrame([db_res, score_samples]).T
sils.columns = ['clustering', 'silhouette']

In [7]:
sils.groupby('clustering')['silhouette'].mean()

clustering
-1.0    0.160322
 0.0    0.783291
 1.0    0.429790
 2.0    0.835507
Name: silhouette, dtype: float64

In [8]:
db_res

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1,  1,  2,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  2,  1,  1,  1,
        1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
      dtype=int64)