<a href="https://colab.research.google.com/github/tomonari-masada/course2023-stats1/blob/main/clustering_with_gaussian_mixtures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 混合正規分布を使ったクラスタリング

In [None]:
from itertools import permutations

import matplotlib.pyplot as plt
import numpy as np
from scipy.special import softmax
from sklearn.datasets import make_blobs

## 単変量正規分布の場合

### 人工データの生成

In [None]:
# Settings for the synthetic univariate data set.
n_samples = 1000
n_clusters = 2
n_features = 1
cluster_std = 0.3 # Kept small to make the problem easier to solve.

# Draw the samples X together with their ground-truth cluster labels y.
# random_state is fixed so the same data set is produced on every run.
X, y = make_blobs(
    n_samples=n_samples,
    centers=n_clusters,
    n_features=n_features,
    cluster_std=cluster_std,
    random_state=0)

In [None]:
# Display the first ten rows of X.
X[:10]

In [None]:
# Display the first ten ground-truth labels.
y[:10]

In [None]:
# Overlay one semi-transparent histogram per ground-truth label.
for label in (0, 1):
  plt.hist(X[y == label].ravel(), bins=50, lw=0, alpha=0.5)

### responsibilityの初期化

In [None]:
# Random initial responsibilities: each row is a softmax output, so it sums to 1.
# The generator is seeded (like make_blobs above) so the EM result is reproducible.
q = softmax(np.random.default_rng(0).standard_normal((n_samples, n_clusters)), axis=-1)

In [None]:
# Display the current responsibilities q.
q

### Mステップ

* 授業資料の$\mu_k, \sigma_k^2, \theta_k$の更新式をそのまま実装している。

In [None]:
def M_step(X, q):
  """Re-estimate the mixture parameters from the responsibilities.

  Args:
    X: Observations as a column, shape (n_samples, 1), so that it
      broadcasts against q.
    q: Responsibilities, shape (n_samples, n_clusters).

  Returns:
    Tuple (mu, var, theta), each of shape (n_clusters,): the
    responsibility-weighted mean, the responsibility-weighted variance
    and the mixing proportion of every cluster.
  """
  # Total responsibility per cluster; shared by all three updates.
  weight_per_cluster = q.sum(0)
  mu = (q * X).sum(0) / weight_per_cluster
  squared_dev = (X - mu) ** 2
  var = (q * squared_dev).sum(0) / weight_per_cluster
  theta = weight_per_cluster / q.shape[0]
  return mu, var, theta

### Eステップ

* クラスタごとの条件付き対数尤度の計算式

In [None]:
def log_conditional_likelihood(X, mu, var):
  """Return the log-density of a normal distribution evaluated at X.

  Args:
    X: Points at which to evaluate the density.
    mu: Mean(s) of the normal distribution.
    var: Variance(s) of the normal distribution.

  Returns:
    -0.5 * log(2 * pi * var) - (X - mu)^2 / (2 * var), computed
    element-wise with NumPy broadcasting across the arguments.
  """
  log_normalizer = -0.5 * np.log(2 * np.pi * var)
  scaled_sq_dist = (X - mu) ** 2 / (2 * var)
  return log_normalizer - scaled_sq_dist

* 授業資料の$q_{i,k}$の更新式をそのまま実装している。

In [None]:
def E_step(X, mu, var, theta):
  """Recompute the responsibilities under the current parameters.

  Args:
    X: Observations as a column, shape (n_samples, 1).
    mu: Cluster means, shape (n_clusters,).
    var: Cluster variances, shape (n_clusters,).
    theta: Mixing proportions, shape (n_clusters,).

  Returns:
    Responsibilities of shape (n_samples, n_clusters); the softmax over
    the last axis makes every row sum to 1.
  """
  log_prior = np.log(theta)
  log_density = log_conditional_likelihood(X, mu, var)
  # Normalizing in log space via softmax avoids exponentiating tiny densities.
  return softmax(log_prior + log_density, axis=-1)

### EMアルゴリズムの実行

In [None]:
# Alternate the M and E steps for a fixed 10000 iterations (no convergence check).
for _ in range(10000):
  mu, var, theta = M_step(X, q)  # parameters from the current responsibilities
  q = E_step(X, mu, var, theta)  # responsibilities from the updated parameters

In [None]:
# Display the responsibilities after running EM.
q

In [None]:
# Display the estimated mixing proportions.
theta

### クラスタリングの評価

In [None]:
# EM numbers its clusters arbitrarily, so a perfect clustering can come out
# with the labels swapped relative to y. Count the correctly assigned samples
# under the best one-to-one relabeling of the predicted clusters.
assigned = q.argmax(-1)
max(
    (np.asarray(relabel)[assigned] == y).sum()
    for relabel in permutations(range(n_clusters))
)

## 多変量正規分布の場合
* ただし、分散共分散行列は対角成分のみ非ゼロとする。

### 人工データの生成

In [None]:
# Settings for the synthetic three-feature data set.
n_samples = 1000
n_clusters = 2
n_features = 3
cluster_std = 0.5 # Kept small to make the problem easier to solve.

# Draw the samples X together with their ground-truth cluster labels y.
# random_state is fixed so the same data set is produced on every run.
X, y = make_blobs(
    n_samples=n_samples,
    centers=n_clusters,
    n_features=n_features,
    cluster_std=cluster_std,
    random_state=0)

In [None]:
# Display the first ten rows of X.
X[:10]

In [None]:
# Display the first ten ground-truth labels.
y[:10]

In [None]:
# 3-D scatter of the generated samples, one scatter call per ground-truth label.
# Only X and y are needed here; no clustering result is involved yet.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

for k in range(n_clusters):
  flag = y == k
  ax.scatter(X[flag,0], X[flag,1], X[flag,2], alpha=0.1)

### responsibilityの初期化

In [None]:
# Random initial responsibilities: each row is a softmax output, so it sums to 1.
# The generator is seeded (like make_blobs above) so the EM result is reproducible.
q = softmax(np.random.default_rng(0).standard_normal((n_samples, n_clusters)), axis=-1)

### Eステップ

In [None]:
def E_step_multi(X, mu, var, theta):
  """Recompute responsibilities for a diagonal-covariance Gaussian mixture.

  With a diagonal covariance the joint log-density is the sum of the
  per-feature univariate log-densities, so those are accumulated on top of
  the log mixing proportions before normalizing.

  Args:
    X: Observations, shape (n_samples, n_features).
    mu: Cluster means, shape (n_clusters, n_features).
    var: Cluster variances, shape (n_clusters, n_features).
    theta: Mixing proportions, shape (n_clusters,).

  Returns:
    Responsibilities of shape (n_samples, n_clusters); every row sums to 1.
  """
  per_feature_log_density = (
      log_conditional_likelihood(X[:, d].reshape(-1, 1), mu[:, d], var[:, d])
      for d in range(X.shape[-1])
  )
  # Starting the sum at log(theta) keeps the same left-to-right accumulation.
  q_logit = sum(per_feature_log_density, np.log(theta))
  return softmax(q_logit, axis=-1)

### EMアルゴリズムの実行

In [None]:
# Per-cluster, per-feature means and variances (diagonal covariance);
# the columns are overwritten in place by the loop below.
mu = np.zeros((n_clusters, n_features))
var = np.zeros((n_clusters, n_features))

# Alternate the M and E steps for a fixed 1000 iterations (no convergence check).
for _ in range(1000):
  # M step: apply the univariate update to one feature column at a time.
  # theta depends only on q, so each pass over d yields the same value.
  for d in range(n_features):
    mu[:,d], var[:,d], theta = M_step(X[:,d].reshape(-1,1), q)
  # E step: responsibilities from the updated parameters.
  q = E_step_multi(X, mu, var, theta)

In [None]:
# EM numbers its clusters arbitrarily, so a perfect clustering can come out
# with the labels swapped relative to y. Count the correctly assigned samples
# under the best one-to-one relabeling of the predicted clusters.
assigned = q.argmax(-1)
max(
    (np.asarray(relabel)[assigned] == y).sum()
    for relabel in permutations(range(n_clusters))
)

In [None]:
# Display the responsibilities after running EM.
q

In [None]:
# Display the estimated cluster means (one row per cluster, one column per feature).
mu

In [None]:
# Display the estimated cluster variances (one row per cluster, one column per feature).
var

In [None]:
# 3-D scatter of the samples grouped by their most responsible cluster,
# with each estimated cluster mean drawn as a large black marker.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
markers = ["x", "+"]

prediction = q.argmax(-1)
for k in range(n_clusters):
  members = X[prediction == k]
  ax.scatter(members[:, 0], members[:, 1], members[:, 2], alpha=0.1)
  center = mu[k]
  ax.scatter(center[0], center[1], center[2], color="black", marker=markers[k], s=300)