In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Container

%matplotlib inline
%config InlineBackend.figure_format='svg'

  


In [2]:
# Build the iris dataset
def create_data():
    """Load iris and return the features and labels of its first 100 rows.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the four measurement columns and
        ``y`` holds the label column. Both are slices of a single NumPy array
        built from the first 100 rows of the data frame.
    """
    column_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

    bunch = load_iris()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['label'] = bunch.target
    frame.columns = column_names

    # Keep only the first 100 samples.
    first_hundred = np.array(frame.iloc[:100, :])

    # Every column but the last is a feature; the last column is the label.
    features = first_hundred[:, :-1]
    labels = first_hundred[:, -1]
    return features, labels

In [3]:
# Build the feature matrix and label vector, then hold out 30% of the samples
# for evaluation. The fixed random_state keeps the split the same on every run.
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

### GaussianNB

$$E(X)=\sum_{i=1}^{\infty}x_ip_i$$

$$D(X)=\sum_{i=1}^{\infty}\left(x_i-E(X)\right)^2p_i$$

$$P(x_i \mid y_k)=\frac{1}{\sqrt{2\pi\sigma^{2}_{y_k}}}\exp\left(-\frac{(x_i-\mu_{y_k})^2}{2\sigma^2_{y_k}}\right)$$

In [4]:
# Define the Gaussian naive Bayes model
class NaiveBayes:
    """Gaussian naive Bayes classifier for numeric features.

    ``fit`` estimates, for every class label, a prior (the relative frequency
    of that label in the training data) and a per-feature mean and population
    standard deviation. ``predict`` returns the label with the highest
    posterior score for one sample; the score is accumulated in log space so
    that very small densities cannot underflow to zero and produce ties.

    Attributes:
        model: ``{label: [(mean, stdev), ...]}`` with one pair per feature,
            or ``None`` before ``fit`` has been called.
        priors: ``{label: P(label)}`` estimated by ``fit``, or ``None``.
    """

    # Lower bound applied to a standard deviation when a density is evaluated.
    # A feature that is constant within a class has stdev 0, which would
    # otherwise divide by zero and turn every score into NaN.
    MIN_STDEV = 1e-9

    def __init__(self):
        self.model = None
        self.priors = None

    # Mean
    @staticmethod
    def mean(X):
        """Return the arithmetic mean of the values in ``X``."""
        return sum(X) / float(len(X))

    # Standard deviation
    def stdev(self, X):
        """Return the population standard deviation (divides by ``len(X)``)."""
        avg = self.mean(X)
        return np.sqrt(sum(np.power(x - avg, 2) for x in X) / float(len(X)))

    def _gaussian_log_probability(self, x, mean, stdev):
        """Return the log of the normal density N(mean, stdev**2) at ``x``."""
        stdev = np.maximum(stdev, self.MIN_STDEV)
        return (
            -0.5 * np.log(2 * np.pi)
            - np.log(stdev)
            - np.power(x - mean, 2) / (2 * np.power(stdev, 2))
        )

    # Probability density function
    def gaussian_probability(self, x, mean, stdev):
        """Return the normal density N(mean, stdev**2) evaluated at ``x``.

        ``stdev`` is floored at ``MIN_STDEV`` so a zero standard deviation
        yields a finite (very peaked) density instead of NaN/inf.
        """
        return np.exp(self._gaussian_log_probability(x, mean, stdev))

    # process train set
    def summarize(self, train_data):
        """Return ``[(mean, stdev), ...]``, one pair per feature column.

        Args:
            train_data: iterable of equally long feature rows.
        """
        return [(self.mean(column), self.stdev(column)) for column in zip(*train_data)]

    # Compute the per-class means and standard deviations
    def fit(self, X, y):
        """Estimate class priors and per-class feature statistics.

        Args:
            X: iterable of feature rows.
            y: iterable of labels, one per row of ``X``.

        Returns:
            str: the status message ``'gaussianNB train done!'``.

        Raises:
            ValueError: if no training samples are supplied.
        """
        rows_by_label = {}
        for features, label in zip(X, y):
            rows_by_label.setdefault(label, []).append(features)

        if not rows_by_label:
            raise ValueError("cannot fit NaiveBayes on empty training data")

        n_samples = sum(len(rows) for rows in rows_by_label.values())

        self.model = {
            label: self.summarize(rows)
            for label, rows in rows_by_label.items()
        }
        # P(label): without it the classifier compares likelihoods only and
        # is biased on imbalanced training sets.
        self.priors = {
            label: len(rows) / float(n_samples)
            for label, rows in rows_by_label.items()
        }

        return 'gaussianNB train done!'

    def _joint_log_likelihood(self, input_data):
        """Return ``{label: log P(label) + sum_i log P(x_i | label)}``.

        Raises:
            AttributeError: if the model has not been fitted. The type matches
                what an unfitted model raised before; only the message is new.
        """
        if self.model is None:
            raise AttributeError(
                "NaiveBayes model is not fitted; call fit(X, y) first"
            )

        scores = {}
        for label, summaries in self.model.items():
            # A model assigned by hand without priors falls back to a uniform
            # weight of 1, i.e. a plain likelihood comparison.
            prior = self.priors.get(label, 1.0) if self.priors else 1.0
            log_score = np.log(prior)
            for i, (mean, stdev) in enumerate(summaries):
                log_score += self._gaussian_log_probability(input_data[i], mean, stdev)
            scores[label] = log_score
        return scores

    # Compute the probabilities
    def calculate_probabilities(self, input_data):
        """Return ``{label: P(label) * prod_i P(x_i | label)}`` for one sample.

        The values are unnormalised posterior scores; they do not sum to 1.

        Args:
            input_data: a single sample, indexable by feature position.
        """
        return {
            label: np.exp(log_score)
            for label, log_score in self._joint_log_likelihood(input_data).items()
        }

    # Return the label
    def predict(self, X_test):
        """Return the most probable label for a single sample ``X_test``."""
        scores = self._joint_log_likelihood(X_test)
        return max(scores, key=scores.get)

    def score(self, X_test, y_test):
        """Return the accuracy: the fraction of samples predicted correctly.

        Args:
            X_test: sized iterable of samples.
            y_test: iterable of true labels, one per sample.

        Raises:
            ZeroDivisionError: if ``X_test`` is empty.
        """
        right = sum(
            1 for sample, target in zip(X_test, y_test)
            if self.predict(sample) == target
        )
        return right / float(len(X_test))

In [5]:
# Create an untrained classifier; its `model` attribute stays None until fit() runs.
clf = NaiveBayes()

In [6]:
# Estimate the per-class, per-feature mean and standard deviation from the training split.
clf.fit(X_train,y_train)

'gaussianNB train done!'

In [7]:
# Classify one sample; predict() takes a single feature vector, not a batch.
print(clf.predict([4.4,  3.2,  1.3,  0.2]))

0.0


In [8]:
# Accuracy on the held-out split: fraction of test samples whose predicted label equals the true label.
clf.score(X_test,y_test)

1.0