# Naive Bayes

#### 1.1 [贝叶斯定理](https://zh.wikipedia.org/wiki/%E8%B4%9D%E5%8F%B6%E6%96%AF%E5%AE%9A%E7%90%86):

贝叶斯定理是关于随机事件A和B的条件概率的一则定理:

${\displaystyle P(A|B)={\frac {P(A)\times P(B|A)}{P(B)}}}$

其中$P(A|B)$是指在事件B发生的情况下事件A发生的概率。

在贝叶斯定理中,每个名词都有约定俗成的名称:

- $P(A|B)$是已知B发生后A的条件概率,也由于得自B的取值而被称作A的后验概率.
- $P(A)$是A的先验概率(或边缘概率),之所以称为"先验"是因为它不考虑任何B方面的因素.
- $P(B|A)$是已知A发生后B的条件概率,也由于得自A的取值而被称作B的后验概率.
- $P(B)$是B的先验概率或边缘概率.

也就是说:

后验概率 = (似然性(Likelihood)*先验概率)/标准化常量

更一般化的情况,假设$\{A_i\}$是事件集合里的部分集合,对于任意的$A_i$,贝氏定理可用下式表示:

$P(A_{i}|B)={\frac {P(B|A_{i})\,P(A_{i})}{\sum _{j}P(B|A_{j})\,P(A_{j})}}$

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
def LoadData():
    """
    Load data set 
    
    Return:
    ------
        data: DataFrame.
        
    Note:
    ----
        last column: labels
    """
    datasets = np.array([[1,1,1,1,1,2,2,2,2,2,3,3,3,3,3],
                  ['S','M','M','S','S','S','M','M','L','L','L','M','M','L','L'],
                  [-1,-1,1,1,-1,-1,-1,1,1,1,1,1,1,1,-1]]).T
    columns = np.array(['X1','X2','Y'])

    data = pd.DataFrame(data=datasets,columns=columns)
    return data

In [6]:
data = LoadData()
data

Unnamed: 0,X1,X2,Y
0,1,S,-1
1,1,M,-1
2,1,M,1
3,1,S,1
4,1,S,-1
5,2,S,-1
6,2,M,-1
7,2,M,1
8,2,L,1
9,2,L,1


### NB的代码

In [90]:
def NaiveBayes(data,test_X,gamma,is_print=False):
    """
    Build Naive Bayes
    
    Parameters:
    ----------
        data: training set.
        test_X: testing set.
        gamma: Laplace smoothing
        is_print: is print label category and pos posterior probability.
    """
    
    # 获取labels
    labels = data.iloc[:,-1]
    # 获取labels的类别个数K
    labels_ = np.array(list(set(labels)))
    len_labels_ = labels_.shape[0]
    
    # 先验概率
    pri_prob= np.zeros((len_labels_))
    
    #>>>print(pri_prob)
    #>>>[0. 0.]
    # 预测值保留数组
    m,n = test_X.shape
    cache_predict = np.zeros((m,len_labels_))
    
    # 计算先验概率
    for i in range(len_labels_):
        P_y = (labels[labels == labels_[i]].size + gamma) / (labels.size + len_labels_ * gamma)
        pri_prob[i] = P_y
    
    # 计算条件概率
    for i in range(m):
        for j in range(len_labels_):
            Conditional_Prob = 1
            for k in range(n):
        
                data_label = data[labels==labels_[j]] # 该标签下的所有数据
                future_k = data_label.iloc[:,k]  # test_X 的第k个特征列
                molecule = data_label[future_k == test_X[i,k]].shape[0] + gamma # 分子部分
                Sj = len(set(future_k))
                denominator = data_label.shape[0] + (Sj * gamma) # 分母部分
                
                Conditional_Prob *= molecule /denominator # 计算条件概率
                
            Pos_proba = pri_prob[j] * Conditional_Prob # 计算后验概率
            
            cache_predict[i,j] += Pos_proba
        
    # if ture,print labels_ and predict probability array.
    if is_print:
        print(labels_)
        print(cache_predict)
        
    best_predict_index = np.argmax(cache_predict,axis=1)
    return labels_[best_predict_index]

In [91]:
test_x = np.array([['2','S']])
predic_label = NaiveBayes(data=data,test_X=test_x,gamma=1,is_print=True)
print('Predict label is: ',predic_label)

['-1' '1']
[[0.06100218 0.03267974]]
Predict label is:  ['-1']


### 不带拉普拉斯平滑项的贝叶斯

In [234]:
#计算的是先验概率
def cal_pre_prob(data):
    y = data.iloc[:,-1]
    y_size = y.size
    prob_y = {}
    for i in y:
        if i not in prob_y:
            prob_y[i] = 1
        else:
            prob_y[i] += 1
    for key,value in prob_y.items():
        prob_y[key] = value / y.size
    return prob_y

In [235]:
cal_pre_prob(data)

{'-1': 0.4, '1': 0.6}

In [199]:
test_x = ['2','S']

In [244]:
#计算条件概率
def clac_condition_prob(data,test_x):
    columns_x = data.columns[:-1]
    columns_len = len(columns_x)
    all_size = data.index.size
    labels = data.iloc[:,-1]
    labels_ = np.array(list(set(labels)))
    len_labels_ = labels_.shape[0]
    cond_Proba = 1
    for i in range(columns_len):
        #看有几个2和几个S
        
        for j in range(len_labels_):
            data_label = data[labels==labels_[j]]
            denominator = data_label.shape[0] #分母的部分
            cond_prob = (data[columns_x[i]] == test_x[i]).sum() / denominator
        
        cond_Proba *= cond_prob
    return cond_Proba

In [245]:
clac_condition_prob(data,test_x)

0.24691358024691357

In [248]:
def split_x(data,test_x):
    y = data.iloc[:,-1]
    prob_y = cal_pre_prob(data)
    unique_y = pd.unique(y)
    predict_y = []
    for i in unique_y:
        pri_prob = prob_y[i]
        
        #>>>print(pri_prob)
        #>>>7
        #   10
        
        s_data = data[y == i]
        cond_Proba = clac_condition_prob(s_data,test_x) #拆分后的计算条件概率
        houyan = pri_prob * cond_Proba
        predict_y.append([i,houyan])
        
    print(predict_y)
    predict = sorted(predict_y,key = lambda z:z[1],reverse = True)[0][0]
    return predict

In [249]:
split_x(data,test_x)

[['-1', 0.06666666666666667], ['1', 0.02222222222222222]]


'-1'

### 带拉普拉斯平滑项的贝叶斯

In [82]:
#计算先验概率，将最后一列计数包含拉普拉斯平滑
def cal_pre_prob(data,gamma):
    y = data.iloc[:,-1] #取出最后一列y的标签
    K = len(pd.unique(y)) #K代表y的类别
    all_y_size = y.size + (K * gamma)
    prob_y = {}
    for i in y:
        if i not in prob_y:
            prob_y[i] = 1
        else:
            prob_y[i] += 1 
        
    for key,value in prob_y.items():
        prob_y[key] = (value + gamma) / all_y_size
    return prob_y

In [83]:
cal_pre_prob(data,1)

{'-1': 0.4117647058823529, '1': 0.5882352941176471}

In [241]:
#计算条件概率
def clac_condition_prob(data,test_x,gamma):
    colums = data.columns[:-1] 
    
    #>>> print(colums)
    #>>> ['X1', 'X2']
    
    all_size = data.index.size #计数最后一列
    
    #>>> print(all_size)
    #>>> 15
    cond_Proba = 1 
    for i in range(len(colums)): #循环几个特征 要用下面拆分后的数据算
        Sj = len(pd.unique(data[colums[i]]))
        cond_prob = (data[colums[i]] == test_x[i]).sum() + gamma / (all_size + Sj * gamma)
        cond_Proba *= cond_prob
    return cond_Proba

In [85]:
def split_data(data,test_x):
    unique_y = pd.unique(data.iloc[:,-1]) #去重
    #>>>print(unique_y)
    #>>>['-1' '1']
    prob_y = cal_pre_prob(data,1)
    predict_y = []
    for i in unique_y:
        pri_prob = prob_y[i]
        
        #>>>print(pri_prob)
        #>>>7
        #   10
        
        s_data = data[data.iloc[:,-1] == i]
        cond_Proba = clac_condition_prob(s_data,test_x,1) #拆分后的计算条件概率
        houyan = pri_prob * cond_Proba
        predict_y.append([i,houyan])
        
    print(predict_y)
    predict = sorted(predict_y,key = lambda z:z[1],reverse = True)[0][0]
    return predict

In [86]:
split_data(data,test_x)

[['-1', 2.7044299201161945], ['1', 1.9648692810457515]]


'-1'

 ### pip3 install -i https://pypi.douban.com/simple scikit-learn     

### 通过sklearn 调用Native bayes 

In [92]:
from sklearn.model_selection import train_test_split

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [96]:
import numpy as np
a,b = X_train,y_train
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(a,b)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
print(clf.predict(X_test))


[1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1.]


In [99]:
clf.score(X_test,y_test)

0.9

In [98]:
clf.score(X_train,y_train)

0.875

In [133]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])
    return data[:,:2], data[:,-1]
X, y = create_data()

###  GaussionNB   sklearn

In [129]:
import numpy as np
X = np.array(X_train)
Y = np.array(y_train)
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X, Y)
GaussianNB(priors=None, var_smoothing=1e-09)
print(clf.predict(X_test))

clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
GaussianNB(priors=None, var_smoothing=1e-09)
print(clf_pf.predict(X_test))


[1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1.]
[1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1.]


In [131]:
clf_pf.score(X_test,y_test)

1.0

In [136]:
clf.score(X_train,y_train)

1.0

### ComplementNB  sklearn

In [143]:
import numpy as np
X,y = X_train,y_train
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(X, y)
ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
print(clf.predict(X_test))


[1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1.]


In [145]:
clf_pf.score(X_test,y_test)

1.0

In [146]:
clf_pf.score(X_train,y_train)

1.0

### 文本识别

In [149]:
#加载数据集
def loadDataSet():
    
    postingList = [['my','dog','has','flea','problems','help','please'],
                  ['maybe','not','take','him','to','dog','park','stupid'],
                  ['my','dalmation','is','so','cute','I','love','him'],
                  ['stop','posting','stupid','worthless','grabage'],
                  ['mr','licks','ate','my','steak','how','to','stop','him'],
                  ['quit','buying','worthless','dog','food','stupid']]
    classVec = np.array([0,1,0,1,0,1]) # 1 is absive,0 not
    
    return postingList,classVec

In [178]:
postingList,classVec = loadDataSet()
postingList

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
 ['stop', 'posting', 'stupid', 'worthless', 'grabage'],
 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

In [151]:
def create_Vocabulary(postingList):
    #第一步：去重
    #这种去重效率低
    #Voc_list = []
    #for words in postingList:
        #for word in words:
            #if word not in Voc_list:
                #Voc_list.append(word)
    Voc_list = set([])
    for words in postingList:
        Voc_list = Voc_list | set(words)
    #去重后的文本
    return list(Voc_list)
    
    

In [180]:
Vocabulary = create_Vocabulary(postingList)

In [183]:
def words2Vec(Vocabulary,postingList):
    m,n = len(postingList),len(Vocabulary)
    #转换成零向量
    Words_Vec = np.zeros((m,n))
    for i in range(m):
        for word in postingList[i]:
            try:
                index = Vocabulary.index(word)
                Words_Vec[i,index] = 1
            except:
                print('这个{}不在我们的词集中'.format(word))
            
    return Words_Vec        

In [184]:
Words_Vec = words2Vec(Vocabulary,postingList)
Words_Vec#不能保证位置一样

array([[0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0.,
        0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.]])

In [185]:
def Model(X,y,X_test,Vocabulary,gamma):
    
    
    uniuqe_y = np.unique(y)
    K = len(uniuqe_y)
     
    all_y_size = y.shape[0] + (K * gamma)
    #all_y_size = 8
    prob_b = {}
    for i in y:
        if i not in prob_b:
            prob_b[i] = 1
        else:
            prob_b[i] += 1
    for key,value in prob_b.items():
        prob_b[key] = (value + gamma) / all_y_size 
    
    #>>>print(prob_b)
    #>>>{0: 0.5, 1: 0.5}
    predict_ = []
    # 计算先验概率
    
    #P_1 =( y.sum()+gamma) /(y.shape[0]+K*gamma)
    #P_1 = 0.5
    
    #P_0 = 1 - P_1
    #P_y = {0:P_0,1:P_1}
    test_x = words2Vec(Vocabulary,X_test)
    #print(P_1)
    # 计算条件概率
    for label in uniuqe_y:
        Sj = np.unique(X[label]).shape[0]
        pri_P_y = prob_b[label]
        Index = np.where(y==label)[0]
        split_X = X[Index]
        #Sj = 2
        cond_fenmu = split_X.shape[0] + (Sj * gamma)
        
        sum_feature = np.sum(test_x == split_X,axis=0) + gamma
        cond_prob = np.product(sum_feature / cond_fenmu) 
        
        houyan = pri_P_y * cond_prob
        predict_.append([label,houyan])
    print(predict_)
    predict = sorted(predict_,key=lambda z:z[1],reverse=True)[0][0]
    return predict
        

In [186]:
X_test = [['my','SB']]
Model(X=Words_Vec,y=classVec,X_test=X_test,Vocabulary=Vocabulary,gamma=1)

这个SB不在我们的词集中
[[0, 8.374976495501263e-07], [1, 1.9606712234552562e-07]]


0

In [45]:
classVec.sum() / classVec.shape[0]

0.5

In [48]:
index = np.where(classVec == 0)[0]
index

array([0, 2, 4])