# 基于概率论的分类方法：朴素贝叶斯

## 数据集以及词集到向量的转换

In [1]:
def loadDataSet():
    """Return the hard-coded toy corpus used throughout this notebook.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenised posts (each a list of words) and ``classVec`` is the
        parallel list of 0/1 labels, one per post.
    """
    # Keep each label next to its post so the pairing is visible at a glance.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec


def createVocabList(dataSet):
    """Collect every distinct token that appears in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token exactly once. The order comes
        from iterating a set, so it is arbitrary and must not be relied on.
    """
    # One union call over all documents avoids rebuilding the accumulated
    # set from scratch for every document.
    return list(set().union(*dataSet))


def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 word-presence vector over the vocabulary.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of (hashable) words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word occurs
        in the document, 0 otherwise. Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    # Build the word -> position lookup once rather than scanning the list
    # twice per word. setdefault keeps the first position of a repeated
    # vocabulary word, which is what list.index would have returned.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word: {!s} is not in my Vocabulary!".format(word))
        else:
            returnVec[idx] = 1
    return returnVec

In [2]:
# Load the six sample posts and their parallel 0/1 labels.
listOPosts, listClasses = loadDataSet()

In [3]:
# Build the vocabulary (every distinct word across all posts) and show it.
myVocabList = createVocabList(listOPosts)
print(myVocabList)

['maybe', 'ate', 'buying', 'park', 'stupid', 'has', 'help', 'to', 'I', 'not', 'garbage', 'mr', 'posting', 'licks', 'food', 'how', 'please', 'worthless', 'take', 'flea', 'him', 'dalmation', 'my', 'love', 'stop', 'problems', 'steak', 'dog', 'quit', 'cute', 'is', 'so']


In [4]:
# Encode the first post as a 0/1 presence vector over the vocabulary.
print(setOfWords2Vec(myVocabList, listOPosts[0]))

[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0]


In [5]:
# Same encoding for the fourth post (index 3).
print(setOfWords2Vec(myVocabList, listOPosts[3]))

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


## 朴素贝叶斯分类器训练函数

In [6]:
import numpy as np


def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class word probabilities and the class-1 prior (unsmoothed).

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: Sequence of labels parallel to ``trainMatrix``. A label
            equal to 1 puts the document in class 1; any other value puts it
            in class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: for each class, the summed word
        vector divided by the total of all entries seen for that class, plus
        the sum of the labels divided by the number of documents. No
        smoothing is applied, so a word never seen in a class gets 0 and a
        class with no counts at all divides by zero.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = np.sum(trainCategory) / float(docCount)

    # Row 0 / slot 0 accumulate class 0, row 1 / slot 1 accumulate class 1.
    wordTotals = np.zeros((2, vocabSize))
    tokenTotals = [0.0, 0.0]
    for docIdx, row in enumerate(trainMatrix):
        cls = 1 if trainCategory[docIdx] == 1 else 0
        wordTotals[cls] += row
        tokenTotals[cls] += np.sum(row)

    p1Vect = wordTotals[1] / tokenTotals[1]
    p0Vect = wordTotals[0] / tokenTotals[0]
    return p0Vect, p1Vect, pAbusive

In [7]:
# Build the training matrix: one 0/1 presence vector per post.
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [8]:
# Show the training matrix (a list with one row per post).
print(trainMat)

[[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]]


In [9]:
# Train on the toy corpus and show the two per-class word vectors and the
# class-1 prior returned by trainNB0.
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print("p0V: ", p0V)
print("p1V: ", p1V)
print("pAb: ", pAb)

p0V:  [0.         0.04166667 0.         0.         0.         0.04166667
 0.04166667 0.04166667 0.04166667 0.         0.         0.04166667
 0.         0.04166667 0.         0.04166667 0.04166667 0.
 0.         0.04166667 0.08333333 0.04166667 0.125      0.04166667
 0.04166667 0.04166667 0.04166667 0.04166667 0.         0.04166667
 0.04166667 0.04166667]
p1V:  [0.05263158 0.         0.05263158 0.05263158 0.15789474 0.
 0.         0.05263158 0.         0.05263158 0.05263158 0.
 0.05263158 0.         0.05263158 0.         0.         0.10526316
 0.05263158 0.         0.05263158 0.         0.         0.
 0.05263158 0.         0.         0.10526316 0.05263158 0.
 0.         0.        ]
pAb:  0.5


## 根据现实情况修改分类器

In [10]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate smoothed per-class word log-probabilities and the class-1 prior.

    Word counts start at one and each class denominator starts at 2.0, so no
    word ends up with probability zero; the per-word ratios are returned as
    natural logarithms.

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: Sequence of labels parallel to ``trainMatrix``. A label
            equal to 1 puts the document in class 1; any other value puts it
            in class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the log-probability vector for class
        0, the one for class 1, and the sum of the labels divided by the
        number of documents.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Row 0 / slot 0 accumulate class 0, row 1 / slot 1 accumulate class 1.
    wordTotals = np.ones((2, vocabSize))
    tokenTotals = [2.0, 2.0]
    for docIdx, row in enumerate(trainMatrix):
        cls = 1 if trainCategory[docIdx] == 1 else 0
        wordTotals[cls] += row
        tokenTotals[cls] += sum(row)

    p1Vect = np.log(wordTotals[1] / tokenTotals[1])
    p0Vect = np.log(wordTotals[0] / tokenTotals[0])
    return p0Vect, p1Vect, pAbusive

In [11]:
# Retrain on the same matrix with the current trainNB0 and print the two
# per-class vectors and the class-1 prior again.
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print("p0V: ", p0V)
print("p1V: ", p1V)
print("pAb: ", pAb)

p0V:  [-3.25809654 -2.56494936 -3.25809654 -3.25809654 -3.25809654 -2.56494936
 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -3.25809654 -2.56494936
 -3.25809654 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654
 -3.25809654 -2.56494936 -2.15948425 -2.56494936 -1.87180218 -2.56494936
 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -2.56494936 -2.56494936]
p1V:  [-2.35137526 -3.04452244 -2.35137526 -2.35137526 -1.65822808 -3.04452244
 -3.04452244 -2.35137526 -3.04452244 -2.35137526 -2.35137526 -3.04452244
 -2.35137526 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -1.94591015
 -2.35137526 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -3.04452244
 -2.35137526 -3.04452244 -3.04452244 -1.94591015 -2.35137526 -3.04452244
 -3.04452244 -3.04452244]
pAb:  0.5


In [12]:
def classifyNB(vec2classify, p0Vec, p1Vect, pClass1):
    """Pick the class with the larger log-space score for one word vector.

    Args:
        vec2classify: Word vector of the document to classify.
        p0Vec: Per-word weights for class 0 (log-probabilities, given that the
            log of the class prior is added to their weighted sum).
        p1Vect: Per-word weights for class 1, same layout as ``p0Vec``.
        pClass1: Prior probability of class 1; class 0 uses ``1.0 - pClass1``.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    logScore1 = sum(vec2classify * p1Vect) + np.log(pClass1)
    logScore0 = sum(vec2classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if logScore1 > logScore0 else 0
    
    
def testingNB():
    """Train on the toy corpus and print the predicted class of two sample posts."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWords2Vec(vocab, post) for post in posts]
    p0Vec, p1Vect, pAb = trainNB0(np.array(trainMat), np.array(labels))

    # Classify each sample in turn; the printed line has the same layout for both.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, "classified as: ", classifyNB(thisDoc, p0Vec, p1Vect, pAb))

In [13]:
# Run the toy-corpus demo.
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


## 文档词袋模型

In [14]:
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over the vocabulary.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of (hashable) words making up the document; a word
            may occur more than once.

    Returns:
        A list of ``len(vocabList)`` ints holding how many times each
        vocabulary word occurs in the document. Words missing from the
        vocabulary are silently ignored.
    """
    # Build the word -> position lookup once rather than scanning the list
    # twice per word. setdefault keeps the first position of a repeated
    # vocabulary word, which is what list.index would have returned.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec

## 使用朴素贝叶斯过滤垃圾邮件

### 准备数据：切分文本

In [15]:
# Plain whitespace splitting keeps punctuation attached to the tokens
# (e.g. 'M.L.' and 'upon.').
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
print(mySent.split())

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']


In [16]:
import re

# Split on runs of non-alphanumeric characters instead. This removes the
# punctuation, but the trailing '.' leaves an empty string as the last token.
regEx = re.compile('[^a-zA-Z0-9]+')
listOfTokens = regEx.split(mySent)
print(listOfTokens)

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']


In [17]:
# Drop the empty tokens.
print([tok for tok in listOfTokens if len(tok) > 0])

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon']


In [18]:
# Drop the empty tokens and lower-case the rest.
print([tok.lower() for tok in listOfTokens if len(tok) > 0])

['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']


In [19]:
# Tokenise one sample e-mail. The with-block closes the file as soon as its
# text has been read, instead of leaving the handle open.
with open('/home/coco/Documents/MachineLearninginAction/Ch04/email/ham/6.txt') as emailFile:
    emailText = emailFile.read()
listOfTokens = regEx.split(emailText)
print([tok.lower() for tok in listOfTokens if len(tok) > 0])

['hello', 'since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'google', 'groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message', 'pages', 'or', 'files', 'we', 'are', 'writing', 'to', 'inform', 'you', 'that', 'we', 'will', 'no', 'longer', 'be', 'supporting', 'these', 'features', 'starting', 'february', '2011', 'we', 'made', 'this', 'decision', 'so', 'that', 'we', 'can', 'focus', 'on', 'improving', 'the', 'core', 'functionalities', 'of', 'google', 'groups', 'mailing', 'lists', 'and', 'forum', 'discussions', 'instead', 'of', 'these', 'features', 'we', 'encourage', 'you', 'to', 'use', 'products', 'that', 'are', 'designed', 'specifically', 'for', 'file', 'storage', 'and', 'page', 'creation', 'such', 'as', 'google', 'docs', 'and', 'google', 'sites', 'for', 'example', 'you', 'can', 'easily', 'create', 'your', 'pages', 'on', 'google', 'sites', 'and', 'share', 'the', 'site', 'http', 'www', 'google', 'com', 'support', 'sites', 'bin', 'answer', 'py', 'hl', 'en',

### 使用朴素贝叶斯进行交叉验证

In [20]:
def textParse(bigString):
    """Split text into lower-cased alphanumeric tokens longer than two characters.

    Args:
        bigString: The text to tokenise.

    Returns:
        A list of the pieces between runs of non-alphanumeric characters,
        lower-cased, keeping only pieces of length three or more.
    """
    pieces = re.split('[^a-zA-Z0-9]+', bigString)
    longEnough = (piece for piece in pieces if len(piece) > 2)
    return list(map(str.lower, longEnough))


def spamTest(emailDir='/home/coco/Documents/MachineLearninginAction/Ch04/email'):
    """Run one random hold-out evaluation of the naive Bayes spam filter.

    Reads ``spam/1.txt`` .. ``spam/25.txt`` and ``ham/1.txt`` .. ``ham/25.txt``
    under ``emailDir`` (spam labelled 1, ham labelled 0), holds out 10 randomly
    chosen messages, trains on the rest with bag-of-words vectors and prints
    the error rate on the held-out messages.

    Args:
        emailDir: Directory containing the ``spam`` and ``ham`` sub-folders.
            Defaults to the location this notebook was originally run from.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam first, then ham, so documents alternate 1, 0, 1, 0, ...
        for label, folder in ((1, 'spam'), (0, 'ham')):
            path = '{}/{}/{:d}.txt'.format(emailDir, folder, i)
            # Close each file as soon as it is read instead of leaking the handle.
            with open(path) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # Move 10 randomly chosen document indices from the training set to the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('The error rate is: {:.3f}'.format(float(errorCount) / len(testSet)))

In [21]:
# Run one hold-out evaluation; the test messages are drawn at random, so the
# printed error rate can differ from run to run.
spamTest()

The error rate is: 0.000


### 从个人发布的征婚广告中获取区域倾向

In [22]:
import feedparser
import operator


def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words in a token stream.

    Args:
        vocabList: Words to rank.
        fullText: Iterable of all tokens (with repeats) to count occurrences in.
        topN: How many entries to return; defaults to 30.

    Returns:
        A list of at most ``topN`` ``(word, count)`` tuples sorted by count,
        highest first. Words with equal counts keep their ``vocabList`` order,
        and words that never occur are ranked with a count of 0.
    """
    from collections import Counter

    # Count every token in one pass instead of rescanning fullText once per
    # vocabulary word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]


def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier that tells two feeds apart.

    Takes the same number of entries from each feed, tokenises their
    ``'summary'`` text, removes the 30 most frequent words from the
    vocabulary, holds out 20 random documents for testing, trains on the rest
    and prints the error rate on the held-out documents.

    Args:
        feed1: Mapping with an ``'entries'`` list whose items have a
            ``'summary'`` string; its documents are labelled 1.
        feed0: Same structure as ``feed1``; its documents are labelled 0.

    Returns:
        ``(vocabList, p0V, p1V)``: the reduced vocabulary and the two
        per-class vectors returned by ``trainNB0``.

    Raises:
        IndexError: If the two feeds provide fewer than 20 documents in total,
            because the hold-out loop then pops from an empty list.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):  # the data set ends up with 2 * minLen documents
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        # Drop the most frequent words before the vocabulary is used to build
        # the word vectors.
        vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # Store the document index held at that position, not the position
        # itself; otherwise the evaluated documents are not the ones removed
        # from the training set.
        testSet.append(trainingSet.pop(randIndex))
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error is: {:.3f}".format(float(errorCount/len(testSet))))
    return vocabList, p0V, p1V


# Fetch the two Craigslist RSS feeds and dump the parsed results.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
# NOTE(review): the recorded output shows both feeds parsed with an empty
# 'entries' list (status 301, bozo set), which is presumably why the
# localWords call below is left disabled - confirm before re-enabling.
# vocabList, pSF, pNF = localWords(ny, sf)
print(ny)
print(sf)

{'bozo': 1, 'entries': [], 'feed': {}, 'headers': {'strict-transport-security': 'max-age=63072000'}, 'href': 'https://newyork.craigslist.org/stp/index.rss', 'status': 301, 'encoding': 'iso-8859-1', 'bozo_exception': SAXParseException('no element found'), 'version': '', 'namespaces': {}}
{'bozo': 1, 'entries': [], 'feed': {}, 'headers': {'strict-transport-security': 'max-age=63072000'}, 'href': 'https://sfbay.craigslist.org/stp/index.rss', 'status': 301, 'encoding': 'iso-8859-1', 'bozo_exception': SAXParseException('no element found'), 'version': '', 'namespaces': {}}


In [23]:
# Small check of the sorting idiom used in calcMostFreq: order the dict's
# (key, value) pairs by value, largest first, and keep the leading entries.
freq = {'dog': 5, 'cat':2, 'pig': 1, 'monkey': 10}
sortedfreq = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
print(sortedfreq[:2])
freq.items()

[('monkey', 10), ('dog', 5)]


dict_items([('dog', 5), ('cat', 2), ('pig', 1), ('monkey', 10)])

### 分析数据：显示地域相关的用词

In [24]:
def getTopWords(ny, sf):
    """Print the words whose class score exceeds -6.0, once per feed.

    Trains via ``localWords(ny, sf)`` and then prints two sections, each
    headed by a banner: first the words selected from the class-0 vector
    (banner ``SF**``), then those selected from the class-1 vector (banner
    ``NY**``). Within a section the words are listed from highest to lowest
    score.

    Args:
        ny: Parsed feed passed to ``localWords`` as the class-1 feed.
        sf: Parsed feed passed to ``localWords`` as the class-0 feed.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    threshold = -6.0

    def printRanked(banner, scores):
        # Keep only the words scoring above the threshold, best first.
        ranked = [(word, score) for word, score in zip(vocabList, scores)
                  if score > threshold]
        ranked.sort(key=lambda entry: entry[1], reverse=True)
        print(banner * 16)
        for word, _ in ranked:
            print(word)

    printRanked("SF**", p0V)
    printRanked("NY**", p1V)