In [52]:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, silhouette_samples

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [45]:
# Read the dataset from the Excel export and discard the 'Unnamed: 0' column
# (presumably an index column written out when the file was saved -- confirm).
# Chaining keeps the cell free of in-place mutation.
mlbDF = pd.read_excel("./backup/knn_dataset.xlsx").drop(columns=['Unnamed: 0'])

In [49]:
# Split the frame into the 'Pos' label column and the feature columns.
mlb_target = mlbDF['Pos']

# Later cells append 'gmm_cluster' / 'kmeans_cluster' columns to mlbDF.
# Exclude them here (ignoring them when absent) so that re-running this cell
# after clustering does not feed earlier cluster assignments back in as
# features. On a fresh run this is identical to dropping only 'Pos'.
mlb_features = mlbDF.drop(columns=['Pos', 'gmm_cluster', 'kmeans_cluster'],
                          errors='ignore')

# Standardize the features; `scaler` stays available for later cells and
# `mlb_data` becomes the scaled array returned by the scaler.
scaler = StandardScaler()
mlb_data = scaler.fit_transform(mlb_features)

In [54]:
def visualize_cluster_plot(clusterobj, dataframe, label_name, iscenter=True):
    """Scatter-plot clustered points, one marker/legend entry per cluster label.

    Parameters
    ----------
    clusterobj : object
        Fitted clustering estimator. Only its ``cluster_centers_`` attribute is
        read, and only when ``iscenter`` is true; estimators without that
        attribute must be plotted with ``iscenter=False``.
    dataframe : pandas.DataFrame
        Must contain the columns ``'ftr1'`` and ``'ftr2'`` (used as x and y)
        and the column named by ``label_name``.
    label_name : str
        Column of ``dataframe`` holding the integer cluster labels. The label
        ``-1`` is treated as noise.
    iscenter : bool, default True
        Whether to draw each cluster's center, annotated with its label.

    Returns
    -------
    None
        The figure is drawn on the current pyplot figure and shown.
    """
    # Touch cluster_centers_ only when centers are requested, so estimators
    # lacking the attribute still work with iscenter=False.
    centers = clusterobj.cluster_centers_ if iscenter else None

    unique_labels = np.unique(dataframe[label_name].values)
    markers = ['o', 's', '^', 'x', '*']
    isNoise = False

    for label in unique_labels:
        label_cluster = dataframe[dataframe[label_name] == label]
        is_noise_label = label == -1
        if is_noise_label:
            cluster_legend = 'Noise'
            isNoise = True
        else:
            cluster_legend = 'Cluster ' + str(label)

        # Cycle through the marker list so more than len(markers) clusters do
        # not raise IndexError. Labels 0-4 keep their original markers, and
        # the noise label -1 still maps to the last marker ('*').
        marker = markers[label % len(markers)]

        plt.scatter(x=label_cluster['ftr1'], y=label_cluster['ftr2'], s=70,
                    edgecolor='k', marker=marker, label=cluster_legend)

        # Noise points have no center of their own; indexing centers[-1] would
        # silently redraw the last real cluster's center labelled "-1".
        if iscenter and not is_noise_label:
            center_x_y = centers[label]
            # Large white disc as a backdrop ...
            plt.scatter(x=center_x_y[0], y=center_x_y[1], s=250, color='white',
                        alpha=0.9, edgecolor='k', marker=marker)
            # ... with the cluster number drawn on top of it.
            plt.scatter(x=center_x_y[0], y=center_x_y[1], s=70, color='k',
                        edgecolor='k', marker='$%d$' % label)

    # Place the legend at 'upper center' when a noise entry is present,
    # otherwise at 'upper right'.
    legend_loc = 'upper center' if isNoise else 'upper right'

    plt.legend(loc=legend_loc)
    plt.show()

In [58]:
from sklearn.mixture import GaussianMixture

# Fit a 4-component Gaussian mixture on the scaled features, then assign each
# row to a component.
gmm = GaussianMixture(n_components=4, random_state=0)
gmm.fit(mlb_data)
gmm_cluster_labels = gmm.predict(mlb_data)

# Store the assignments on the frame and report the silhouette score.
mlbDF['gmm_cluster'] = gmm_cluster_labels
gmm_silhouette = silhouette_score(mlb_data, gmm_cluster_labels)
print('실루엣 스코어는 : {0:.3f}'.format(gmm_silhouette))

# Re-attach the label column before tabulating cluster counts per 'Pos' value.
mlbDF['Pos'] = mlb_target

mlb_result = mlbDF.groupby(['Pos'])['gmm_cluster'].value_counts()
print(mlb_result)

실루엣 스코어는 : 0.143
Pos  gmm_cluster
0    1              23
     2              16
     3              11
     0               8
1    2              22
     3              18
     1              16
     0              12
2    1              22
     2              20
     0              13
     3              12
3    1              24
     2              10
     3               9
     0               8
Name: gmm_cluster, dtype: int64


In [60]:
# Fit 4-cluster k-means on the scaled features.
# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# in scikit-learn 1.4 (a single run with k-means++), so leaving it implicit
# makes the clustering -- and the scores below -- depend on the installed
# version.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300,
                random_state=0).fit(mlb_data)
kmeans_cluster_labels = kmeans.predict(mlb_data)

# Store the assignments on the frame and report the silhouette score.
mlbDF['kmeans_cluster'] = kmeans_cluster_labels
print('실루엣 스코어는 : {0:.3f}'.format(silhouette_score(mlb_data, kmeans_cluster_labels)))

# Tabulate how each 'Pos' value is distributed over the k-means clusters.
mlb_result = mlbDF.groupby(['Pos'])['kmeans_cluster'].value_counts()
print(mlb_result)

실루엣 스코어는 : 0.243
Pos  kmeans_cluster
0    0                 21
     3                 18
     1                 16
     2                  3
1    3                 28
     1                 24
     2                 16
2    2                 24
     3                 24
     1                 19
3    2                 25
     3                 15
     1                 11
Name: kmeans_cluster, dtype: int64
