# 군집화

In [None]:
import os
import pandas as pd
import numpy as np
import hds
from plt_rcs import *

In [None]:
# Show the current working directory.
os.getcwd()

In [None]:
# Switch to the data folder. The path is relative, so it is resolved against
# whatever the working directory is when this cell runs; running the cell a
# second time resolves it against the new location.
os.chdir('../../data')

In [None]:
# List the entries of the current working directory in sorted order.
sorted(os.listdir())

In [None]:
# Load the pickled objects. The result is later passed to globals().update(),
# so it is expected to be a mapping of name -> object.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
objs = pd.read_pickle('Cereal.pkl')

- 딕셔너리 형태의 pkl 파일을 key를 변수명으로 한 변수로 한번에 등록

In [None]:
# Register every key of objs as a global variable bound to its value.
globals().update(objs)

In [None]:
# Bind the three objects used in the rest of the notebook explicitly from the
# loaded mapping, instead of re-assigning names that only exist if they were
# injected through globals().update().
df, df_scaled, pca_score = objs['df'], objs['df_scaled'], objs['pca_score']

## 계층적 군집화 모델 학습

In [None]:
from scipy.cluster.hierarchy import linkage

In [None]:
# Build the hierarchical-clustering linkage matrix from the standardized data:
# single linkage, Euclidean distance, optimal leaf ordering switched on.
hc = linkage(df_scaled, method='single', metric='euclidean', optimal_ordering=True)

## 덴드로그램 시각화

In [None]:
from scipy.cluster.hierarchy import dendrogram

In [None]:
# Draw the dendrogram of the linkage matrix on a 12x4 figure, labelling the
# leaves with the index of df.
plt.figure(figsize=(12, 4))
dendrogram(hc, labels=df.index, orientation='top')
plt.show()

## 계층적 군집화 시각화 함수 생성

In [None]:
def plot_dendrogram(y, method, labels=None):
    """Fit a hierarchical clustering on ``y`` and draw its dendrogram.

    Args:
        y: Observations passed to scipy's ``linkage``.
        method: Linkage method name passed to ``linkage`` (for example
            ``'complete'``, ``'average'``, ``'centroid'`` or ``'ward'``).
        labels: Optional leaf labels. When omitted, the index of ``y`` is
            used if ``y`` is a pandas DataFrame; otherwise no labels are
            passed and scipy's default leaf labels apply.
    """
    if labels is None and isinstance(y, pd.DataFrame):
        # Take the labels from the data being clustered rather than from the
        # global ``df``, so the labels always match the rows of ``y``.
        labels = list(y.index)
    hc = linkage(
        y=y, method=method, metric='euclidean',
        optimal_ordering=True
    )
    plt.figure(figsize=(12, 4))
    dendrogram(Z=hc, orientation='top', labels=labels)
    plt.show()

In [None]:
# Dendrogram of the standardized data using complete linkage.
plot_dendrogram(y=df_scaled, method='complete')

In [None]:
# Dendrogram of the standardized data using average linkage.
plot_dendrogram(y=df_scaled, method='average')

In [None]:
# Dendrogram of the standardized data using centroid linkage.
plot_dendrogram(y=df_scaled, method='centroid')

In [None]:
# Dendrogram of the standardized data using Ward linkage.
plot_dendrogram(y=df_scaled, method='ward')

## k-means 군집화 모델 학습

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Create the k-means clustering model: 8 clusters, k-means++ initialisation,
# fixed random_state so repeated runs use the same seed.
model = KMeans(n_clusters=8, init='k-means++', random_state=0)

In [None]:
# Fit the clustering model on the standardized data.
model.fit(X=df_scaled)

## 결과 확인

In [None]:
# Get the cluster label assigned to each observation and display it.
# The commented array below is the recorded output of this cell.
cluster_labels = model.predict(X=df_scaled)
cluster_labels
# array([4, 6, 4, 4, 0, 0, 0, 6, 5, 5, 0, 5, 0, 6, 0, 2, 2, 0, 0, 6, 3, 2,
#        0, 2, 0, 0, 3, 1, 1, 0, 0, 0, 5, 5, 6, 0, 0, 0, 7, 7, 2, 5, 0, 3,
#        6, 6, 6, 5, 0, 6, 5, 6, 1, 7, 3, 3, 5, 4, 1, 6, 3, 2, 2, 3, 3, 3,
#        0, 5, 3, 7, 1, 7, 2, 0, 5, 5, 0], dtype=int32)

In [None]:
# Count the observations in each cluster, ordered by cluster label.
# The commented table below is the recorded output of this cell.
pd.Series(data=cluster_labels).value_counts().sort_index()
# 0    22
# 1     5
# 2     8
# 3    10
# 4     4
# 5    12
# 6    11
# 7     5
# Name: count, dtype: int64

## 군집 내 거리 제곱합

In [None]:
# Set the default figure size to 4x4 for the figures created after this cell.
plt.rc(group='figure', figsize=(4, 4))

In [None]:
# Within-cluster sum of squared distances of the fitted 8-cluster model.
# The commented value below is the recorded output of this cell.
model.inertia_
# 215.61245219621284

In [None]:
# NOTE(review): presumably plots the within-cluster sum of squares for a range
# of cluster counts up to k=10 — confirm against the hds package.
hds.plot.wcss(X=df_scaled, k=10)

## 실루엣 계수

In [None]:
# Work on a copy so the standardized data itself is not modified.
df1 = df_scaled.copy()

In [None]:
# Attach the cluster labels as a new 'cluster' column and preview the first rows.
df1['cluster'] = cluster_labels
df1.head()

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Mean silhouette coefficient over all observations for the 8-cluster labels.
# The commented value below is the recorded output of this cell.
silhouette_score(X=df_scaled, labels=cluster_labels)
# 0.3075777347145458

In [None]:
# NOTE(review): presumably plots the silhouette score for a range of cluster
# counts up to k=10 — confirm against the hds package.
hds.plot.silhouette(X=df_scaled, k=10)

## 최적의 k를 적용한 k-means 군집화 모델 학습

In [None]:
# Set the chosen k (6 clusters) on the existing model and refit it on the
# standardized data.
model.set_params(n_clusters=6).fit(X=df_scaled)

In [None]:
# Add the cluster labels of the refitted model to df as the 'k-means' column.
df['k-means'] = model.predict(X=df_scaled)

In [None]:
# Count the observations in each k-means cluster, ordered by cluster label.
# The commented table below is the recorded output of this cell.
df['k-means'].value_counts().sort_index()
# k-means
# 0    24
# 1     7
# 2    18
# 3    15
# 4    10
# 5     3
# Name: count, dtype: int64

In [None]:
# Mean silhouette coefficient for the labels of the refitted 6-cluster model.
# The commented value below is the recorded output of this cell.
silhouette_score(X=df_scaled, labels=df['k-means'])
# 0.2750794137284007

## 군집별 특징 확인

In [None]:
# Move the current index into a regular column and replace it with the
# default integer index.
df = df.reset_index()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Mean of every numeric column per k-means cluster, rounded to 3 decimals.
df.groupby(by='k-means').mean(numeric_only=True).round(3)

## 군집별 특징 시각화

In [None]:
# Scatter the observations on the PC1/PC2 columns of pca_score, coloured by
# k-means cluster.
sns.scatterplot(
    data=pca_score, x='PC1', y='PC2',
    s=50, alpha=0.5,
    # Pass the labels as a plain array so they are matched to the rows of
    # pca_score by position. A Series would be aligned on its index, and the
    # index of df was replaced by reset_index() earlier.
    hue=df['k-means'].to_numpy(), palette='Set1'
)
plt.title(label='k-means with PCA', fontweight='bold')
# Dashed grey reference lines through the origin.
plt.axvline(x=0, color='0.5', linewidth=0.5, linestyle='--')
plt.axhline(y=0, color='0.5', linewidth=0.5, linestyle='--')
# Place the legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='k-means')
plt.show()

## t-SNE 변환

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Create the t-SNE model: 2 output dimensions, perplexity 15, fixed random_state.
tsne = TSNE(n_components=2, perplexity=15, random_state=0)

In [None]:
# Run the t-SNE transformation on the standardized data.
X_tsne = tsne.fit_transform(X=df_scaled)

In [None]:
# Store the t-SNE result as a DataFrame with columns tSNE1 and tSNE2.
# No index is passed, so the frame gets the default integer index.
df_tsne = pd.DataFrame(data=X_tsne, columns=['tSNE1', 'tSNE2'])

In [None]:
# Preview the first rows of the t-SNE coordinates.
df_tsne.head()

In [None]:
# Scatter the observations on the two t-SNE coordinates, coloured by k-means
# cluster.
sns.scatterplot(
    data=df_tsne, x='tSNE1', y='tSNE2',
    s=50, alpha=0.5,
    # Pass the labels as a plain array so they are matched to the rows of
    # df_tsne by position rather than by index. A Series would only line up
    # while df happens to carry the same default integer index as df_tsne.
    hue=df['k-means'].to_numpy(), palette='Set1'
)
plt.title(label='k-means with t-SNE(perplexity: 15)', fontweight='bold')
# Dashed grey reference lines through the origin.
plt.axvline(x=0, color='0.5', linewidth=0.5, linestyle='--')
plt.axhline(y=0, color='0.5', linewidth=0.5, linestyle='--')
# Place the legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='k-means')
plt.show()

## DBSCAN 군집화

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Create the DBSCAN model with eps=2.0 and min_samples=5.
dbscan = DBSCAN(eps=2.0, min_samples=5)

In [None]:
# Fit DBSCAN on the standardized data and store the resulting labels in df
# as the 'dbscan' column.
df['dbscan'] = dbscan.fit_predict(X=df_scaled)

In [None]:
# Count the observations per DBSCAN label, ordered by label.
# The commented table below is the recorded output of this cell.
df['dbscan'].value_counts().sort_index()
# dbscan
# -1    22
#  0    50
#  1     5
# Name: count, dtype: int64

## DBSCAN 시각화

In [None]:
# Scatter the observations on the two t-SNE coordinates, coloured by DBSCAN
# label.
sns.scatterplot(
    data=df_tsne, x='tSNE1', y='tSNE2',
    s=50, alpha=0.5,
    # Pass the labels as a plain array so they are matched to the rows of
    # df_tsne by position rather than by index. A Series would only line up
    # while df happens to carry the same default integer index as df_tsne.
    hue=df['dbscan'].to_numpy(), palette='Set1'
)
plt.title(label='DBSCAN with t-SNE(perplexity: 15)', fontweight='bold')
# Dashed grey reference lines through the origin.
plt.axvline(x=0, color='0.5', linewidth=0.5, linestyle='--')
plt.axhline(y=0, color='0.5', linewidth=0.5, linestyle='--')
# Place the legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='DBSCAN')
plt.show()