# 비원형 데이터의 군집화 - DBSCAN
---
- 분할적 군집화 방식
- 밀도(데이터의 밀집) 기반 군집화 => 미리 군집 수 지정할 필요가 없음
- 다양한 형태의 데이터에서 군집화 가능
- 이상치(노이즈) 데이터 식별 및 제거도 가능 => 어느 군집에도 속하지 않는 데이터는 라벨 -1로 표시됨
- 군집을 정하는 기준
    * 임의의 점(Point)을 중심으로 지정된 거리(eps) 이내의 영역에 지정된 개수(min_samples) 이상의 데이터가 존재하는지 여부
    * 하이퍼파라미터 => 거리(eps), 데이터 수(min_samples)

In [18]:
from sklearn.datasets import load_iris
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

## [1] 데이터 로딩

In [2]:
# Inspect which fields the object returned by load_iris() exposes.
load_iris().keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [16]:
# Load the data as pandas objects: return_X_y=True returns a (data, target)
# pair instead of a Bunch, and as_frame=True makes the data a DataFrame.
irisDF, y=load_iris(return_X_y=True, as_frame=True)
irisDF

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## [2] 데이터 전처리

In [19]:
# Standardize the features. fit_transform learns the per-column statistics
# and applies them in one call; `scaler` stays bound and fitted for reuse.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(irisDF)

In [21]:
# Standardized values of the column at index 2 (petal length, going by the
# column order of the DataFrame displayed after loading).
x_scaled[:, 2]

array([-1.34022653, -1.34022653, -1.39706395, -1.2833891 , -1.34022653,
       -1.16971425, -1.34022653, -1.2833891 , -1.34022653, -1.2833891 ,
       -1.2833891 , -1.22655167, -1.34022653, -1.51073881, -1.45390138,
       -1.2833891 , -1.39706395, -1.34022653, -1.16971425, -1.2833891 ,
       -1.16971425, -1.2833891 , -1.56757623, -1.16971425, -1.05603939,
       -1.22655167, -1.22655167, -1.2833891 , -1.34022653, -1.22655167,
       -1.22655167, -1.2833891 , -1.2833891 , -1.34022653, -1.2833891 ,
       -1.45390138, -1.39706395, -1.34022653, -1.39706395, -1.2833891 ,
       -1.39706395, -1.39706395, -1.39706395, -1.22655167, -1.05603939,
       -1.34022653, -1.22655167, -1.34022653, -1.2833891 , -1.34022653,
        0.53540856,  0.42173371,  0.64908342,  0.13754657,  0.47857113,
        0.42173371,  0.53540856, -0.26031542,  0.47857113,  0.08070915,
       -0.14664056,  0.25122143,  0.13754657,  0.53540856, -0.08980313,
        0.36489628,  0.42173371,  0.194384  ,  0.42173371,  0.08

## [3] 군집화

In [22]:
# Create the clustering estimator: eps is the neighborhood radius around a
# point, and min_samples is the number of samples required within that
# radius for the point to count as a core point.
dbscan=DBSCAN(eps=0.6, min_samples=8)

In [27]:
# Cluster the standardized iris features (unsupervised clustering, not
# classification). fit() returns the estimator itself, so irisDBS refers to
# the same object as dbscan.
irisDBS=dbscan.fit(x_scaled)

In [30]:
# Cluster label assigned to each sample; -1 marks samples treated as noise.
irisDBS.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1, -1,
       -1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
      dtype=int64)

In [32]:
# components_ is a copy of the core samples found while fitting, with shape
# (n_core_samples, n_features); it is not the data grouped by cluster label.
irisDBS.components_.shape

(88, 4)

In [24]:

# Wrap the standardized array in a DataFrame.
# NOTE(review): this rebinds irisDF, so the original feature names are
# replaced by default integer column labels, and re-running the scaling cell
# afterwards would fit on already-scaled data — consider a separate name.
irisDF=pd.DataFrame(x_scaled)

In [25]:
# Display the DataFrame built from the standardized values.
irisDF

Unnamed: 0,0,1,2,3
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832
