#  Gaussian Naive Bayes


[Gaussian Naive Bayes](http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf)

高斯模型假设数据每一维特征都服从高斯分布.


${\displaystyle P(x|y)={1 \over \sigma {\sqrt {2\pi }}}\,e^{-{(x-\mu )^{2} \over 2\sigma ^{2}}}} $

所以我们只需要求得每一个特征的$\sigma_i,\mu_i$,再使用每一个特征的$\sigma_i,\mu_i$去计算高斯值,即可得到条件概率.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from scipy.stats import multivariate_normal

#### Scipy建立在Numpy的基础之上，用于数值运算.具有很多高效操作,如数值积分、优化、统计、信号处理，以及图像处理功能。

#### １加载数据

In [2]:
def create_data():
    """Build a small two-feature dataset from the iris data.

    The iris samples are placed in a DataFrame together with their target
    labels, and only the first 100 rows are kept. Of the columns, only the
    first two features (sepal length, sepal width) and the label survive.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a 2-D array with the
        two feature columns, ``labels`` is the matching 1-D label array.
    """
    iris = load_iris()

    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['label'] = iris.target
    # Replace the verbose sklearn column names with short ones.
    frame.columns = [
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
        'label',
    ]

    # Columns 0 and 1 are the two features, column -1 is the label.
    subset = np.array(frame.iloc[:100, [0, 1, -1]])
    features = subset[:, :2]
    labels = subset[:, -1]
    return features, labels

In [4]:
# X holds the two feature columns and y the labels returned by create_data().
X,y = create_data()

####  2 用sklearn将原始数据集划分为训练样本和测试样本

In [16]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [17]:
X_train[:4]

array([[5.7, 2.8],
       [5.6, 3. ],
       [5. , 3.4],
       [4.4, 3.2]])

#### 4 建立高斯模型

In [18]:
def NaiveBayes_Gaussian(X_train,y_train):
    """Estimate the parameters of a Gaussian naive Bayes model.

    For every distinct label in ``y_train`` the function computes the class
    prior and, per feature, the mean and the variance of the samples that
    belong to that class.

    Args:
        X_train: 2-D array of shape (n_samples, n_features).
        y_train: array of class labels, one per row of ``X_train``.

    Returns:
        A ``(pri_prob, mu, sigma)`` tuple where ``pri_prob`` has one prior per
        class, and ``mu`` / ``sigma`` have shape (n_classes, n_features).
        Note that ``sigma`` holds variances (``np.var``), not standard
        deviations. Rows follow the sorted order of ``np.unique(y_train)``.
    """
    n_samples, n_features = X_train.shape
    classes = np.unique(y_train)
    n_classes = classes.shape[0]

    pri_prob = np.zeros(n_classes)
    mu = np.zeros((n_classes, n_features))
    sigma = np.zeros((n_classes, n_features))

    for idx, label in enumerate(classes):
        # Row indices of the training samples carrying this label.
        rows = np.nonzero(y_train == label)[0]
        members = X_train[rows]

        pri_prob[idx] = rows.shape[0] / n_samples
        mu[idx] = np.mean(members, axis=0)
        sigma[idx] = np.var(members, axis=0)

    return pri_prob, mu, sigma

In [20]:
NaiveBayes_Gaussian(X_train,y_train)

(array([0.475, 0.525]), array([[4.97368421, 3.40263158],
        [5.92380952, 2.77857143]]), array([[0.12351801, 0.14551939],
        [0.2646712 , 0.09977891]]))

#### ５预测函数

In [21]:
def predict(pri_prob,mu,sigma,X_test):
    """Predict the most probable class index for every test sample.

    For each class ``k`` the Gaussian likelihood of the samples is evaluated
    with mean ``mu[k]`` and the per-feature variances ``sigma[k]`` (passed as
    a 1-D ``cov``, i.e. a diagonal covariance), then weighted by the class
    prior ``pri_prob[k]``. The class with the largest score wins.

    Args:
        pri_prob: sequence of class priors, one entry per class.
        mu: array of shape (n_classes, n_features) with per-class means.
        sigma: array of shape (n_classes, n_features) with per-class variances.
        X_test: 2-D array of shape (n_samples, n_features).

    Returns:
        1-D integer array of length n_samples holding, for each sample, the
        row index of the winning class in ``pri_prob`` / ``mu`` / ``sigma``.
        These are positions, not the original label values.
    """
    n_samples, _ = X_test.shape
    # One score column per class. The class count comes from the priors, not
    # from the number of features; the two are equal only by coincidence.
    n_classes = len(pri_prob)
    pridict = np.zeros((n_samples, n_classes))

    for k in range(n_classes):
        likelihood = multivariate_normal.pdf(X_test, mean=mu[k], cov=sigma[k])
        pridict[:, k] = likelihood * pri_prob[k]

    return np.argmax(pridict, axis=1)

In [22]:
# Fit the model here so that pri_prob, mu and sigma are defined before they
# are passed to predict(); no earlier cell assigns them.
pri_prob,mu,sigma = NaiveBayes_Gaussian(X_train,y_train)
predict_label = predict(pri_prob=pri_prob,mu=mu,sigma=sigma,X_test=X_test)
print("predict labels is: ",predict_label)

predict labels is:  [1 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0]


#### ６评分

In [23]:
def score(pri_prob,mu,sigma,X_test,y_test):
    """Print the fraction of test samples that are predicted correctly.

    Args:
        pri_prob: class priors forwarded to ``predict``.
        mu: per-class feature means forwarded to ``predict``.
        sigma: per-class feature variances forwarded to ``predict``.
        X_test: 2-D array of test samples.
        y_test: array of expected values, compared element-wise with the
            output of ``predict``.

    Returns:
        None. The accuracy is only printed.
    """
    predicted = predict(pri_prob, mu, sigma, X_test)
    n_correct = np.sum(predicted == y_test)
    accuracy = n_correct / y_test.shape[0]
    print("correct rate is :{}".format(accuracy))

In [24]:
score(pri_prob,mu,sigma,X_test,y_test)

correct rate is :1.0


### 通过sklearn 调用高斯模型

In [25]:
from sklearn.naive_bayes import GaussianNB

In [35]:
# priors=None lets the estimator derive class priors from the training data;
# var_smoothing is the small variance term sklearn adds for numerical stability.
clf = GaussianNB(priors=None, var_smoothing=1e-09)

In [36]:
clf.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [37]:
clf.score(X_test,y_test)

1.0