# BIG DATA ANALYTICS: Clustering using Scikit-learn
- Scikit-learn 라이브러리가 제공하는 K-means, DBSCAN 알고리즘을 활용해 보겠습니다.
---

In [None]:
import sys
!{sys.executable} -m pip install sklearn

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


# 1. K-means 클러스터링

## 1-1. 1 차원 클러스터링
여기서 차원은 feature(특성)의 개수를 의미합니다.

In [None]:
# Six one-feature samples, stored as a column: one row per sample.
X = np.array([1, 2, 4, 8, 9, 11]).reshape(-1, 1)

In [None]:
# Lay the one-feature samples out along the x-axis; every point gets y = 0.
plt.scatter(X[:, 0], np.zeros(len(X)), label='True Position')

In [None]:
# Split the one-feature samples into two clusters (seed fixed at 0).
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)

# Cluster index assigned to each training sample.
print("Labels: ", kmeans.labels_)

# Cluster index predicted for two values that were not in the training data.
new_points = [[10], [1.5]]
print("Predicted Labels: ", kmeans.predict(new_points))

# Coordinates of the fitted cluster centres.
print("Centroids: \n", kmeans.cluster_centers_)

In [None]:
# Colour each sample by its cluster label, then overlay the centroids in black.
centers = kmeans.cluster_centers_
plt.scatter(X[:, 0], np.zeros(len(X)), c=kmeans.labels_, cmap='rainbow')
plt.scatter(centers[:, 0], np.zeros(len(centers)), color='black')

## 1-2. 2 차원 클러스터링

In [None]:
# Six two-feature samples, one row per sample.
X = np.array([
    [1, 2],
    [1.5, 4],
    [2, 0],
    [8, 2],
    [8.5, 4],
    [10, 0],
])

# Split the samples into two clusters (seed fixed at 0).
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)

# Report training labels, predictions for two unseen points, and the centres.
new_points = [[0, 0], [12, 3]]
print("Labels: ", kmeans.labels_)
print("Predicted Labels: ", kmeans.predict(new_points))
print("Centroids: \n", kmeans.cluster_centers_)

In [None]:
# Raw two-feature samples: first column on x, second column on y.
plt.scatter(*X.T, label='True Position', s=50)

In [None]:
# Colour each sample by its cluster label, then overlay the centroids in black.
centers = kmeans.cluster_centers_
plt.scatter(*X.T, c=kmeans.labels_, cmap='rainbow', s=50)
plt.scatter(*centers.T, color='black', s=50)

## 1-3. 3 차원 클러스터링

In [None]:
from mpl_toolkits.mplot3d import axes3d, Axes3D 

In [None]:
# Nine three-feature samples, one row per sample.
X = np.array([
    [1, 2, 1],
    [1, 3, 1.5],
    [1.5, 2.5, 1.8],
    [10, 10.4, 13],
    [10.5, 13.2, 14],
    [12, 14.4, 15],
    [20, 20.1, 22],
    [19, 24, 18],
    [23, 23, 25],
])

# Split the samples into three clusters (seed fixed at 0).
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)

# Report training labels, predictions for three unseen points, and the centres.
new_points = [[4, 4, 4], [12, 12, 12], [25, 25, 25]]
print("Labels: ", kmeans.labels_)
print("Predicted Labels: ", kmeans.predict(new_points))
print("Centroids: \n", kmeans.cluster_centers_)

In [None]:
# Draw the clustered samples and their centroids on one 3-D axes.
plt.figure(figsize=(10, 10))
ax = plt.axes(projection='3d')

# Samples, coloured by their assigned cluster label.
ax.scatter(
    X[:, 0], X[:, 1], X[:, 2],
    c=kmeans.labels_, cmap='rainbow', s=100, label='Samples',
)

# Fitted cluster centres, drawn in black.
centers = kmeans.cluster_centers_
ax.scatter(
    centers[:, 0], centers[:, 1], centers[:, 2],
    color='black', s=100, label='Centroids',
)

# Both scatters carry a label, so the legend has entries to display
# (calling legend() with no labelled artists only produces a warning).
ax.legend()

# 2. 밀도 기반 클러스터링 (DBSCAN)

In [None]:
from sklearn.datasets import make_circles, make_moons

# Number of points generated for each toy dataset.
n_samples = 1000

# Seed NumPy's global RNG once; neither generator call passes random_state,
# so the order of the two calls below determines the data each one produces.
np.random.seed(2)

# Two concentric circles (inner/outer scale factor 0.5) with noise.
X1, y1 = make_circles(n_samples=n_samples, noise=0.09, factor=0.5)
# Two interleaving half-moons with noise.
X2, y2 = make_moons(n_samples=n_samples, noise=0.1)


In [None]:
# Show the dimensions of the generated circles dataset.
np.shape(X1)

In [None]:
# Raw datasets side by side: circles on the left, moons on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
ax1.scatter(*X1.T)
ax2.scatter(*X2.T)

In [None]:
# Fit a separate two-cluster K-means model to each dataset.
kmeans1 = KMeans(n_clusters=2, random_state=0).fit(X1)
kmeans2 = KMeans(n_clusters=2, random_state=0).fit(X2)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: circles dataset coloured by kmeans1's labels, centroids in black.
ax1.scatter(X1[:, 0], X1[:, 1], c=kmeans1.labels_, cmap='rainbow', s=10)
ax1.scatter(kmeans1.cluster_centers_[:, 0], kmeans1.cluster_centers_[:, 1],
            color='black', s=50)

# Right panel: moons dataset coloured by kmeans2's labels. Each panel must use
# the labels of the model fitted on that panel's data, not the other model's.
ax2.scatter(X2[:, 0], X2[:, 1], c=kmeans2.labels_, cmap='rainbow', s=10)
ax2.scatter(kmeans2.cluster_centers_[:, 0], kmeans2.cluster_centers_[:, 1],
            color='black', s=50)

In [None]:
from sklearn.cluster import DBSCAN

# Fit one DBSCAN model per dataset with a neighbourhood radius (eps) of 0.1.
dbscans1 = DBSCAN(eps=0.1)
dbscans1.fit(X1)
dbscans2 = DBSCAN(eps=0.1)
dbscans2.fit(X2)

# Show each dataset coloured by the labels its own DBSCAN model assigned.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
ax1.scatter(*X1.T, c=dbscans1.labels_, cmap='rainbow', s=10)
ax2.scatter(*X2.T, c=dbscans2.labels_, cmap='rainbow', s=10)