## sklearn.mixture.GaussianMixture

* _class_ sklearn.mixture.GaussianMixture(_n_components=1_, _*_, _covariance_type='full'_, _tol=0.001_, _reg_covar=1e-06_, _max_iter=100_, _n_init=1_, _init_params='kmeans'_, _weights_init=None_, _means_init=None_, _precisions_init=None_, _random_state=None_, _warm_start=False_, _verbose=0_, _verbose_interval=10_)[[source]](https://github.com/scikit-learn/scikit-learn/blob/8c9c1f27b/sklearn/mixture/_gaussian_mixture.py#L457)[¶](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture "Permalink to this definition")

** Gaussian Mixture.

Representation of a Gaussian mixture model probability distribution. This class allows to estimate the parameters of a Gaussian mixture distribution.

Read more in the  [User Guide](https://scikit-learn.org/stable/modules/mixture.html#gmm).

In [5]:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

iris = load_iris()
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# 보다 편리한 데이터 handling 위해 dataframe 으로 변환
iris_df = pd.DataFrame(data=iris.data, columns=feature_names)
iris_df['target'] = iris.target

In [6]:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=3, random_state=0).fit(iris.data)
gmm_cluster_labels = gmm.predict(iris.data)

# 클러스터링 결과를 iris_df의 'gmm_cluster' 컬럼명으로 저장

iris_df['gmm_cluster'] = gmm_cluster_labels
iris_df['target'] = iris.target

# target 값에 따라 gmm_cluster 값이 어떻게 매핑되는지 확인

iris_result = iris_df.groupby(['target'])['gmm_cluster'].value_counts()
print(iris_result)

target  gmm_cluster
0       0              50
1       2              45
        1               5
2       1              50
Name: gmm_cluster, dtype: int64


붓꽃 데이터 KMeans 군집화 결과

In [7]:
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0).fit(iris.data)
kmeans_cluster_labels = kmeans.predict(iris.data)
iris_df['kmeans_cluster'] = kmeans_cluster_labels
iris_result = iris_df.groupby(['target'])['kmeans_cluster'].value_counts()
print(iris_result)

target  kmeans_cluster
0       1                 50
1       0                 48
        2                  2
2       2                 36
        0                 14
Name: kmeans_cluster, dtype: int64




Kmeans 와 GMM 비교

In [8]:
# 클러스터 결과를 담은 dataframe과 사이킷런의 cluster 객체등을 인자로 받아 클러스터링 결과를 시각화하는 함수

def visualize_cluster_plot(clusterobj, dataframe, label_name, iscenter=True):
    if iscenter:
        centers = clusterobj.cluster_centers_
    
    unique_labels = np.unique(dataframe[label_name].values)
    markers=['o, 's', '^', 'x', '*']
    isNoise=False
    
    for label in unique_labels:
             label_cluster = dataframe[dataframe[labe_name]==label]
             if label == -1:
                 cluster_legend = 'Noise'
                 isNoise=True
             else:
                 cluster_legend = 'Cluster'+str(label)
             
             pit.scatter(x=label_cluster['ftr1'], y=label_cluster['ftr2'], s=70,
                        edgecolor='k', marker=markers[label], label=cluster_legend)
             
             if iscenter:
                 center_x_y = centers[label]
                 plt.scatter(x=center_x_y[0], y=center_x_y[1]. s=250, color='white',
                            alpha=0.9, edgecolor='k', marker=markers[label])
                 plt.scatter(x=center_x_y[0], y=center_x_y[1]. s=70, color='k',
                            edgecolor='k', marker='$%d$' % label)
    if isNoise:
             legend_loc='upper center'
    else:
             legend_loc='upper right'
             
    plt.legend(loc=legend_loc)
    plt.show()
             

SyntaxError: unterminated string literal (detected at line 8) (1392822123.py, line 8)

In [9]:
from sklearn.datasets import make_blods

# make_blobs()로 300개의 데이터 셋, 3개의 cluster 셋, cluster_std=0.5를 만듬
X, y = make_blods(n_samples=300, n_feature=2, centers=3, cluster_std=0.5, random_state=0)

# 길게 늘어난 타원형의 데이터 셋을 생성하기 위해 변환
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X_aniso = np.dot(X, transformation)
# feature 데이터 셋과 make_blods()의 y 결과 값을 Dataframe으로 저장
cluster_df = pd.DataFrame(data=X_aniso, columns=['ftr1', 'ftr2'])
cluster_df['target'] = y
# 생성된 데이터 셋을 target 별로 다른 marker 로 표시하여 시각화 함
visualize_cluster_plot(None, cluster_df, 'target', iscenter=False)

ImportError: cannot import name 'make_blods' from 'sklearn.datasets' (C:\Users\seopa\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\datasets\__init__.py)