# 使用朴素贝叶斯进行文档分类

## 准备数据：从文本中构建词向量

In [1]:
def loadDataSet():
    """Return a small hand-labelled corpus for the naive Bayes examples.

    Returns:
        tuple: ``(postingList, classVec)``. ``postingList`` is a list of six
        tokenized posts (each a list of word strings); ``classVec`` is the
        parallel list of labels, where 1 marks abusive text and 0 marks
        normal speech.
    """
    # Keep each label next to its post so the two lists cannot drift apart.
    labelled_posts = (
        (0, ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    )
    postingList = [words for _, words in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return postingList, classVec

In [2]:
# Build a list of the distinct words that occur anywhere in the documents.
def createVocabList(dataSet):
    """Collect every distinct token found in ``dataSet``.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: The unique tokens. The list is built from a set, so its
        order is arbitrary.
    """
    # One n-ary union de-duplicates across all documents at once.
    return list(set().union(*dataSet))

In [3]:
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: List of vocabulary words; position defines the vector slot.
        inputSet: Iterable of words making up the document.

    Returns:
        list: A list of ``len(vocabList)`` ints; slot ``i`` is 1 when
        ``vocabList[i]`` occurs in ``inputSet`` and 0 otherwise.

    Words that are not in the vocabulary are reported on stdout and ignored.
    """
    # Build the word -> slot map once so each lookup is a hash probe instead
    # of two linear scans (`in` followed by `.index`). `setdefault` keeps the
    # first position of a repeated vocabulary word, matching `list.index`.
    slot_of = {}
    for position, vocab_word in enumerate(vocabList):
        slot_of.setdefault(vocab_word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        slot = slot_of.get(word)
        if slot is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            # Presence only: repeated words still yield 1.
            returnVec[slot] = 1
    return returnVec

In [4]:
# Load the sample posts and their class labels.
listOPosts, listClasses = loadDataSet()

In [5]:
# Build the vocabulary of distinct words across all sample posts.
myVocabList = createVocabList(listOPosts)

### 得到没有排序的词汇表

In [6]:
myVocabList

['him',
 'cute',
 'quit',
 'my',
 'so',
 'flea',
 'problem',
 'to',
 'is',
 'stop',
 'please',
 'has',
 'how',
 'posting',
 'dog',
 'steak',
 'love',
 'not',
 'help',
 'buying',
 'garbage',
 'maybe',
 'food',
 'park',
 'worthless',
 'dalmation',
 'stupid',
 'licks',
 'I',
 'mr',
 'ate',
 'take']

In [7]:
# Encode the first post (index 0) as a 0/1 presence vector over the vocabulary.
setOfWords2Vec(myVocabList, listOPosts[0])

[0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [8]:
# Encode the fourth post (index 3) as a 0/1 presence vector over the vocabulary.
setOfWords2Vec(myVocabList, listOPosts[3])

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0]

## 训练算法：从词向量计算概率

In [9]:
import numpy as np

In [10]:
# trainMatrix: one word vector per document; trainCategory: the label of each document.
def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class word probabilities without any smoothing.

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            document.
        trainCategory: Sequence of labels parallel to ``trainMatrix``; a
            label equal to 1 selects class 1, anything else selects class 0.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)``. ``p0Vect`` and ``p1Vect`` are
        NumPy arrays holding, per word, the class's summed count divided by
        the class's total count. ``pAbusive`` is ``sum(trainCategory)``
        divided by the number of documents.
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    # With 0/1 labels this is the fraction of documents in class 1.
    pAbusive = sum(trainCategory) / float(doc_total)

    # Per-class numerators (word counts) and denominators (total counts).
    counts = {0: np.zeros(vocab_size), 1: np.zeros(vocab_size)}
    totals = {0: 0.0, 1: 0.0}
    for row_idx, word_vec in enumerate(trainMatrix):
        bucket = 1 if trainCategory[row_idx] == 1 else 0
        counts[bucket] += word_vec
        totals[bucket] += sum(word_vec)

    # Element-wise division yields p(w|c) for every vocabulary word.
    p1Vect = counts[1] / totals[1]
    p0Vect = counts[0] / totals[0]
    return p0Vect, p1Vect, pAbusive

In [11]:
# Reload the sample posts and their class labels.
listOPosts, listClasses = loadDataSet()

In [12]:
# Rebuild the vocabulary of distinct words across all sample posts.
myVocabList = createVocabList(listOPosts)

In [13]:
# Will hold one word vector per post.
trainMat = []

In [14]:
# Convert every post to a presence vector and append it to the training matrix.
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [15]:
# Train: per-class word probability vectors plus the class-1 prior.
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [16]:
pAb

0.5

In [17]:
p0V

array([0.08333333, 0.04166667, 0.        , 0.125     , 0.04166667,
       0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.04166667, 0.04166667, 0.        , 0.04166667,
       0.04166667, 0.04166667, 0.        , 0.04166667, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04166667, 0.        , 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.        ])

In [18]:
p1V

array([0.05263158, 0.        , 0.05263158, 0.        , 0.        ,
       0.        , 0.        , 0.05263158, 0.        , 0.05263158,
       0.        , 0.        , 0.        , 0.05263158, 0.10526316,
       0.        , 0.        , 0.05263158, 0.        , 0.05263158,
       0.05263158, 0.05263158, 0.05263158, 0.05263158, 0.10526316,
       0.        , 0.15789474, 0.        , 0.        , 0.        ,
       0.        , 0.05263158])

### 找到所有概率中的最大值，该值出现在下标为26的位置，大小为0.15789474，对应的单词是'stupid'

In [19]:
# Look up the vocabulary word at the first index where p1V reaches its maximum.
myVocabList[np.where(p1V == np.max(p1V))[0][0]]

'stupid'

## 测试算法：根据现实情况修改分类器

### 朴素贝叶斯分类函数

In [20]:
from math import log

In [21]:
# trainMatrix: one word vector per document; trainCategory: the label of each document.
def trainNB0(trainMatrix, trainCategory):
    """Estimate smoothed, log-space per-class word probabilities.

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            document.
        trainCategory: Sequence of labels parallel to ``trainMatrix``; a
            label equal to 1 selects class 1, anything else selects class 0.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)``. ``p0Vect`` and ``p1Vect`` are
        NumPy arrays of natural-log word probabilities for class 0 and
        class 1. ``pAbusive`` is ``sum(trainCategory)`` divided by the number
        of documents.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # With 0/1 labels this is the fraction of documents in class 1.
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # A single zero probability would zero out the whole product, so every
    # word count starts at 1 and every denominator starts at 2.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i, wordVec in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            # Accumulate per-word counts and the total count for class 1.
            p1Num += wordVec
            p1Denom += sum(wordVec)
        else:
            p0Num += wordVec
            p0Denom += sum(wordVec)

    # Work in log space to avoid underflow: ln(a*b) = ln(a) + ln(b).
    # np.log is applied to the whole array at once instead of looping over
    # elements in Python.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

In [22]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger naive Bayes log score.

    Args:
        vec2Classify: Word vector of the document to classify.
        p0Vec: Per-word values for class 0 (expected to be log
            probabilities such as those returned by ``trainNB0``).
        p1Vec: Per-word values for class 1, same layout as ``p0Vec``.
        pClass1: Prior probability of class 1.

    Returns:
        int: 1 when the class-1 score is strictly larger, otherwise 0.
    """
    # Summing the weighted log terms plays the role of multiplying
    # p(w0|c) * p(w1|c) * ...; p(w) is shared by both classes, so the
    # normalizing division is skipped.
    log_score_class1 = sum(vec2Classify * p1Vec) + log(pClass1)
    log_score_class0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if log_score_class1 > log_score_class0 else 0

In [23]:
def testingNB():
    """Train on the sample posts and print the predicted class of two test entries."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    train_rows = [setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(train_rows), np.array(labels))
    # Classify each probe document in turn and report the result.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as : ', classifyNB(thisDoc, p0V, p1V, pAb))

In [24]:
# Run the end-to-end demo: train on the sample posts and classify two test entries.
testingNB()

['love', 'my', 'dalmation'] classified as :  0
['stupid', 'garbage'] classified as :  1


可以得出，第一个评论是正常言论，第二个评论是含有侮辱性词语的

## 朴素贝叶斯词袋模型

In [25]:
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over ``vocabList``.

    Args:
        vocabList: List of vocabulary words; position defines the vector slot.
        inputSet: Iterable of words making up the document.

    Returns:
        list: A list of ``len(vocabList)`` ints; slot ``i`` is the number of
        times ``vocabList[i]`` occurs in ``inputSet``. Words that are not in
        the vocabulary are silently ignored.
    """
    # Build the word -> slot map once so each lookup is a hash probe instead
    # of two linear scans (`in` followed by `.index`). `setdefault` keeps the
    # first position of a repeated vocabulary word, matching `list.index`.
    slot_of = {}
    for position, vocab_word in enumerate(vocabList):
        slot_of.setdefault(vocab_word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        slot = slot_of.get(word)
        if slot is not None:
            # Bag-of-words: every occurrence counts.
            returnVec[slot] += 1
    return returnVec