<a href="https://colab.research.google.com/github/tomonari-masada/course2021-stats1/blob/main/normal_distribution_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 単変量正規分布の最尤推定によるirisデータセットの分類
* 訓練データを使って、正規分布のパラメータを最尤推定。
 * 各特徴量は独立に単変量正規分布に従うと仮定。
* テストデータについて、コンポーネントごとに対数事前確率（混合比率の対数）と対数尤度の和を計算。
 * この和（対数同時確率）を最大にするコンポーネントを予測結果として出力。

In [None]:
import numpy as np
import matplotlib.pyplot as plt

%config InlineBackend.figure_formats = {'png', 'retina'}

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the iris dataset and pull out the feature matrix, labels and names.
iris = load_iris()
X, y = iris.data, iris.target
target_names, feature_names = iris.target_names, iris.feature_names

# Hold out 20% of the samples for testing; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Standardize with statistics learned from the training split only, then
# apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Dump every (scaled feature vector, label) pair of the training split.
training_pairs = zip(X_train_scaled, y_train)
print(list(training_pairs))

In [None]:
# Maximum-likelihood estimates for every class k:
#   mu_ML[k]    - per-feature mean of the class's training samples
#   sigma_ML[k] - per-feature standard deviation (divide-by-N, i.e. the ML form)
#   theta_ML[k] - fraction of training samples that belong to the class
mu_ML, sigma_ML, theta_ML = [], [], []
for k, target_name in enumerate(target_names):
  print(target_name)
  in_class = y_train == k
  X_k = X_train_scaled[in_class]
  class_mean = X_k.mean(axis=0)
  squared_dev = (X_k - class_mean) ** 2
  mu_ML.append(class_mean)
  sigma_ML.append(np.sqrt(squared_dev.mean(axis=0)))
  theta_ML.append(in_class.sum() / in_class.size)

# Stack the per-class results into arrays indexed by class first.
mu_ML = np.asarray(mu_ML)
sigma_ML = np.asarray(sigma_ML)
theta_ML = np.asarray(theta_ML)

In [None]:
# Bar chart of the estimated class proportions; the trailing semicolon keeps
# the notebook from echoing the returned container.
plt.bar(x=target_names, height=theta_ML);

In [None]:
# Display the estimated means: one row per class, one column per feature.
mu_ML

In [None]:
# Display the estimated standard deviations: one row per class, one column
# per feature.
sigma_ML

In [None]:
from scipy.stats import norm

# Evaluation grid for the fitted densities: -4.00 to 4.00 in steps of 0.01.
x_axis = 0.01 * np.arange(-400, 401)

# One figure per feature, overlaying the fitted normal density of each class.
for j, feature_name in enumerate(feature_names):
  for k, target_name in enumerate(target_names):
    density = norm.pdf(x_axis, loc=mu_ML[k, j], scale=sigma_ML[k, j])
    plt.plot(x_axis, density, label=target_name)
  plt.title(feature_name)
  plt.legend()
  plt.show()
  plt.close()

In [None]:
# Classify every test instance with the fitted model and report accuracy.
#
# For each class the score is log(theta_k) plus the sum over features of the
# univariate normal log-density, i.e. the log of the joint probability of the
# class and the instance under the independent-features assumption. The
# predicted class is the one with the highest score.

# The log priors do not depend on the instance, so compute them once.
log_prior = np.log(theta_ML)

accuracy = 0  # running count of correct predictions (divided by N at the end)
for n, (x, label) in enumerate(zip(X_test_scaled, y_test)):
  # x (one value per feature) broadcasts against the (class, feature)
  # parameter arrays, so a single call scores all classes; summing over
  # axis 1 adds up the per-feature log-densities of each class.
  log_joint = log_prior + norm.logpdf(x, loc=mu_ML, scale=sigma_ML).sum(axis=1)
  predicted = np.argmax(log_joint)
  accuracy += predicted == label
  print(f'No.{n} prediction={target_names[predicted]} ground truth={target_names[label]}')
print(f'test accuracy : {accuracy / len(y_test):.3f}')