# 朴素贝叶斯

- 条件概率
- 贝叶斯准则
![](data/beiyesi.png)
![](data/beiyesi2.jpg)

## 文本分类

In [11]:
def load_data_set():
    """
    创建数据集,都是假的 fake data set 
    :return: 单词列表posting_list, 所属类别class_vec
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'gar e'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    class_vec = [0, 1, 0, 1, 0, 1]  # 1 is 侮辱性的文字, 0 is not
    return posting_list, class_vec

In [12]:
# def createVocabList(dataset):
#     '''
#     不重复词列表
#     '''
#     vocabSet =[]
#     for document in dataset:
#         vocabSet +=document
#     return list(set(vocabSet))
def createVocabList(data_set):
    """
    获取所有单词的集合
    :param data_set: 数据集
    :return: 所有单词的集合(即不含重复元素的单词列表)
    """
    vocab_set = set([])  # create empty set
    for item in data_set:
        # | 求两个集合的并集
        vocab_set = vocab_set | set(item)
    return list(vocab_set)

In [13]:
def setOfWords2Vec(vocabList,inputSet):
    '''
    文档中出现词汇表中单词标记1，否则0
    vocabList：词汇表
    inputSet：文档
    '''
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if(word in vocabList):
            returnVec[vocabList.index(word)] =1
        else:
            print('the word : %s is not in my vocabulary!' % word)
    return returnVec

In [14]:
plist,_ = load_data_set()
myVocabList = createVocabList(plist)
print(myVocabList)
setOfWords2Vec(myVocabList,plist[0])

['dalmation', 'stupid', 'stop', 'my', 'to', 'love', 'mr', 'take', 'gar e', 'steak', 'help', 'food', 'has', 'licks', 'not', 'I', 'how', 'him', 'please', 'is', 'quit', 'dog', 'problems', 'flea', 'so', 'buying', 'worthless', 'park', 'maybe', 'cute', 'posting', 'ate']


[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [25]:
import numpy as np 
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAb = sum(trainCategory) / float(numTrainDocs)
    #防止为0
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if(trainCategory[i] ==1 ):
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    #防止小数想成溢出
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect,p1Vect,pAb

In [26]:
listPosts,listClasses = load_data_set()
myVocabList = createVocabList(listPosts)

In [27]:
trainMat = []
for postinDoc in listPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))

In [28]:
p0V,p1V,pAb = trainNB0(trainMat,listClasses)

In [29]:
#每个单词属于侮辱性文档的概率
p1V

array([-3.04452244, -1.65822808, -2.35137526, -3.04452244, -2.35137526,
       -3.04452244, -3.04452244, -2.35137526, -2.35137526, -3.04452244,
       -3.04452244, -2.35137526, -3.04452244, -3.04452244, -2.35137526,
       -3.04452244, -3.04452244, -2.35137526, -3.04452244, -3.04452244,
       -2.35137526, -1.94591015, -3.04452244, -3.04452244, -3.04452244,
       -2.35137526, -1.94591015, -2.35137526, -2.35137526, -3.04452244,
       -2.35137526, -3.04452244])

In [24]:
p0V

array([0.04166667, 0.        , 0.04166667, 0.125     , 0.04166667,
       0.04166667, 0.04166667, 0.        , 0.        , 0.04166667,
       0.04166667, 0.        , 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.04166667, 0.08333333, 0.04166667, 0.04166667,
       0.        , 0.04166667, 0.04166667, 0.04166667, 0.04166667,
       0.        , 0.        , 0.        , 0.        , 0.04166667,
       0.        , 0.04166667])

In [37]:
# 因为概率使用log,所以概率相乘就变成了概率相加
#log(a*b) = log(a) + log(b)
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if(p1>p0):
        return 1
    else:
        return 0

In [40]:
def testingNB():
    listPosts,listClasses = load_data_set()
    myVocabList = createVocabList(listPosts)
    trainMat = []
    for postinDoc in listPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNB0(trainMat,listClasses)
    testEntry =['dog','my','stupid']
    thisdoc = np.array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,'classified as :',classifyNB(thisdoc,p0V,p1V,pAb))

In [41]:
testingNB()

['dog', 'my', 'stupid'] classified as : 1


## 词袋模型
* 目前为止，我们将每个词是否出现作为一个特征，成为 *词级模型* (set of words model)
* 如果一个词在文档中不止出现一次，用 *词袋模型* (bag of words model)

In [42]:
def bagOfWord2VecMN(vocabList,inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        returnVec[vocabList.index(word)] +=1
    return returnVec