# 머신러닝 - 나이브베이즈 (Naive Bayes)

**2019-2023 FinanceData.KR**


## 예제 - 와인 품종 데이터셋
https://nbviewer.org/fd9564d7fa5f042e5f5c50d01b1589ee


In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Load the wine dataset and hold out 30% of the samples for evaluation.
wine = load_wine()
x_train, x_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.3,
    random_state=1,  # fixed seed so the split is the same on every run
)

# Fit Gaussian naive Bayes on the training split, then score it on the
# held-out split (the score is the last expression so a notebook displays it).
model = GaussianNB().fit(x_train, y_train)
model.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out wine samples and print the classification report.
y_pred = model.predict(x_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
    )
)

## BernoulliNB, GaussianNB, MultinomialNB 비교

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

### Bernoulli Naive Bayes (BernoulliNB)
* 주어진 데이터가 이진(binary) 형태로 구성되어 있는 경우에 사용
* 각 특성의 값은 0 또는 1
* 예: 텍스트 데이터의 단어 등장 여부
* 아래 예제처럼 연속형 값을 입력하면 `binarize` 임계값(기본값 0.0)을 기준으로 0/1로 변환한 뒤 학습


In [None]:
from sklearn.naive_bayes import BernoulliNB

# Generate a synthetic dataset: 1000 samples, two features,
# both informative and none redundant.
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    random_state=0,
)

In [None]:
# Visualise the generated samples, coloured by class label.
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    s=10,
    cmap=ListedColormap(['red', 'blue']),
)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("Generated Data")
plt.show()

In [None]:
# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Train the Bernoulli naive Bayes classifier on the training split.
clf_bern = BernoulliNB().fit(X_train, y_train)

# Predict labels for the test split.
y_pred = clf_bern.predict(X_test)

In [None]:
# Visualise the decision regions.
# The grid spans the full data range with a one-unit margin on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.01),
    np.arange(y_min, y_max, 0.01),
)

# Classify every grid node, then fold the flat predictions back into the grid shape.
Z = clf_bern.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

# Shade the predicted regions and overlay the test samples in their true colours.
plt.contourf(xx, yy, Z, cmap=ListedColormap(['red', 'blue']), alpha=0.3)
plt.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=y_test,
    s=10,
    cmap=ListedColormap(['red', 'blue']),
)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("BernoulliNB Classification")
plt.show()

In [None]:
# Class labels the fitted BernoulliNB model distinguishes.
clf_bern.classes_

In [None]:
# Per-class sample counts recorded by the model during fitting.
clf_bern.class_count_

In [None]:
# Exponentiate the stored log priors to view the class prior probabilities.
np.exp(clf_bern.class_log_prior_)

In [None]:
# Feature counts accumulated per class during fitting; bound to `fc` and then
# displayed as the cell output.
fc = clf_bern.feature_count_
fc

In [None]:
# Exponentiate the stored per-class feature log-probabilities to get them back
# on the probability scale; bound to `theta` and then displayed.
theta = np.exp(clf_bern.feature_log_prob_)
theta

In [None]:
# Class probabilities the model predicts for the single sample (-4, 4).
clf_bern.predict_proba([[-4, 4]])

### Gaussian Naive Bayes (GaussianNB)

각 특성이 클래스별로 정규(가우시안) 분포를 따른다고 가정하며, 연속형 특성 데이터를 기반으로 클래스를 예측하는 데 사용

In [None]:
# Generate 1000 two-feature blob samples with a fixed seed.
X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    random_state=0,
)

In [None]:
# Visualise the generated samples, coloured by class label (three colours).
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    s=10,
    cmap=ListedColormap(['red', 'blue', 'green']),
)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("Generated Data")
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB

# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Train the Gaussian naive Bayes classifier. The fit call is kept as the
# final expression so a notebook still displays the fitted estimator.
clf_norm = GaussianNB()
clf_norm.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the test split, then compute the accuracy against the
# true labels (last expression, so a notebook displays it).
y_pred = clf_norm.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# average=None requests one F1 score per class instead of a single aggregate.
f1_score(
    y_true=y_test,
    y_pred=y_pred,
    average=None,
)

In [None]:
# Visualise the decision regions.
# The grid spans the full data range with a one-unit margin on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.01),
    np.arange(y_min, y_max, 0.01),
)

# Classify every grid node, then fold the flat predictions back into the grid shape.
Z = clf_norm.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

# Shade the predicted regions and overlay the test samples in their true colours.
plt.contourf(xx, yy, Z, cmap=ListedColormap(['red', 'blue', 'green']), alpha=0.3)
plt.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=y_test,
    s=10,
    cmap=ListedColormap(['red', 'blue', 'green']),
)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("GaussianNB Classification")
plt.show()

In [None]:
clf_norm.classes_   # classes that y can take

In [None]:
clf_norm.class_count_   # number of samples for each y value

In [None]:
clf_norm.class_prior_   # (prior) probability of each y value

In [None]:
clf_norm.theta_ # expected value (mean) μ of the Gaussian (normal) distribution

In [None]:
clf_norm.var_ # variance σ² of the Gaussian (normal) distribution

### Multinomial Naive Bayes (MultinomialNB)

In [None]:
# Generate a two-class, two-feature dataset with one cluster per class.
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=0,
)

# MultinomialNB only accepts non-negative input values, so shift the whole
# array by its global minimum; afterwards the smallest entry is exactly 0.
X_min = X.min()
X = X - X_min

In [None]:
# Visualise the shifted samples, coloured by class label.
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    s=10,
    cmap=ListedColormap(['red', 'blue']),
)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("Generated Data")
plt.show()

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Train the multinomial naive Bayes classifier on the training split.
clf_mult = MultinomialNB().fit(X_train, y_train)

# Predict labels for the test split.
y_pred = clf_mult.predict(X_test)

In [None]:
# Class labels the fitted MultinomialNB model distinguishes.
clf_mult.classes_

In [None]:
# The `alpha` hyperparameter of the model; MultinomialNB() was constructed
# without arguments, so this shows the library's default value.
clf_mult.alpha

In [None]:
# Class probabilities the model predicts for the two samples (1, 4) and (7, 6).
clf_mult.predict_proba([[1, 4], [7, 6]])

In [None]:
# Visualise the decision regions.
# MultinomialNB is fitted on non-negative features only (the data was shifted
# so that its minimum is 0). A plain "min - 1" margin would push the grid into
# negative feature values, i.e. outside the domain the model is defined on, so
# the lower edge of the grid is clamped at 0. The upper edge keeps the
# one-unit margin.
x_min, x_max = max(X[:, 0].min() - 1, 0.0), X[:, 0].max() + 1
y_min, y_max = max(X[:, 1].min() - 1, 0.0), X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

# Classify every grid node, then fold the flat predictions back into the grid shape.
Z = clf_mult.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Shade the predicted regions and overlay the test samples in their true colours.
plt.contourf(xx, yy, Z, alpha=0.3, cmap=ListedColormap(['red', 'blue']))
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=ListedColormap(['red', 'blue']), s=10)
plt.title("MultinomialNB Classification")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

----
**2019-2023 FinanceData.KR**
