## 基于概率论的分类方法：朴素贝叶斯

### 使用python进行文本分类

In [1]:
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid','worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

In [3]:
def createVocabList(dataSet):
    vocabSet = set([]) #创建一个空的set
    for document in dataSet:
        vocabSet = vocabSet | set(document) #创建两个Set的并集
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)   #产生一个包含len(voacbList)个0元素的列表
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % woed)
    return returnVec

In [4]:
listOposts, listClasses = loadDataSet()
print(listOposts)
print(listClasses)

[['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
[0, 1, 0, 1, 0, 1]


In [5]:
myVocabList = createVocabList(listOposts)
myVocabList

['help',
 'take',
 'posting',
 'dog',
 'cute',
 'not',
 'dalmation',
 'flea',
 'has',
 'maybe',
 'to',
 'garbage',
 'my',
 'so',
 'mr',
 'stupid',
 'I',
 'stop',
 'him',
 'love',
 'buying',
 'ate',
 'problem',
 'food',
 'worthless',
 'is',
 'please',
 'how',
 'steak',
 'quit',
 'licks',
 'park']

In [7]:
setOfWords2Vec(myVocabList,listOposts[1])

[0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

### 训练算法：从词向量计算概率

In [14]:
import numpy as np
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs) #标签为1的样本数除以总数
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])  ###为什么要计算p1Denom???
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)   #为什么要这样计算??
    p0Vect = np.log(p0Num/p0Denom)
    return p0Vect, p1Vect,pAbusive

In [15]:
trainMat=[]
for postinDoc in listOposts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
trainMat

[[1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0],
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0]]

In [18]:
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print(p0V)
print(p1V)
print(pAb)
print(len(p0V))

[-2.56494936 -3.25809654 -3.25809654 -2.56494936 -2.56494936 -3.25809654
 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -3.25809654
 -1.87180218 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.56494936
 -2.15948425 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654
 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -3.25809654
 -2.56494936 -3.25809654]
[-3.04452244 -2.35137526 -2.35137526 -1.94591015 -3.04452244 -2.35137526
 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -2.35137526
 -3.04452244 -3.04452244 -3.04452244 -1.65822808 -3.04452244 -2.35137526
 -2.35137526 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -2.35137526
 -1.94591015 -3.04452244 -3.04452244 -3.04452244 -3.04452244 -2.35137526
 -3.04452244 -2.35137526]
0.5
32


In [19]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1) ###加号是因为取对数，相加即相乘
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

In [22]:
def testingNB():
    listOposts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOposts)
    trainMat = []
    for postinDoc in listOposts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

In [23]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


- 文本的切分

In [30]:
mySent = 'This book is the beat book on Python or M.L. I have ever laid eyes upon.'
mySent.split()

['This',
 'book',
 'is',
 'the',
 'beat',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [31]:
import re

In [32]:
regEx = re.compile('\\w*')
listOfTokens = regEx.split(mySent)
listOfTokens

  


['',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 '.',
 '. ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 '.']

In [35]:
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [36]:
textParse(mySent)

['this',
 'book',
 'the',
 'beat',
 'book',
 'python',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [61]:
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('./email/spam/%d.txt' % i, encoding="ISO-8859-1").read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('./email/ham/%d.txt' % i, encoding="ISO-8859-1").read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = range(50);testSet =[]   #create test set
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(list(trainingSet)[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('The error rate is: ', float(errorCount)/len(testSet))

In [62]:
spamTest()

The error rate is:  0.0


In [64]:
!conda list

# packages in environment at C:\ProgramData\Anaconda3:
#
# Name                    Version                   Build  Channel
_ipyw_jlab_nb_ext_conf    0.1.0            py36he6757f0_0    defaults
absl-py                   0.6.0                     <pip>
alabaster                 0.7.10           py36hcd07829_0    defaults
anaconda                  5.2.0                    py36_3    defaults
anaconda-client           1.6.14                   py36_0    defaults
anaconda-navigator        1.8.7                    py36_0    defaults
anaconda-project          0.8.2            py36hfad2e28_0    defaults
asn1crypto                0.24.0                   py36_0    defaults
astor                     0.7.1                     <pip>
astroid                   1.6.3                    py36_0    defaults
astropy                   3.0.2            py36h452e1ab_1    defaults
attrs                     18.1.0                   py36_0    defaults
babel                     2.5.3                    py36_0   

markupsafe                1.0              py36h0e26971_1    defaults
matplotlib                2.2.2            py36h153e9ff_1    defaults
mccabe                    0.6.1            py36hb41005a_1    defaults
menuinst                  1.4.14           py36hfa6e2cd_0    defaults
mistune                   0.8.3            py36hfa6e2cd_1    defaults
mkl                       2018.0.2                      1    defaults
mkl-service               1.1.2            py36h57e144c_4    defaults
mkl_fft                   1.0.1            py36h452e1ab_0    defaults
mkl_random                1.0.1            py36h9258bd6_0    defaults
more-itertools            4.1.0                    py36_0    defaults
mpmath                    1.0.0            py36hacc8adf_2    defaults
msgpack                   0.5.6                     <pip>
msgpack-python            0.5.6            py36he980bc4_0    defaults
msys2-conda-epoch         20160418                      1    defaults
multipledispatch          0.5.0 

In [65]:
!pip install feedparser

Collecting feedparser
  Downloading https://files.pythonhosted.org/packages/91/d8/7d37fec71ff7c9dbcdd80d2b48bcdd86d6af502156fc93846fb0102cb2c4/feedparser-5.2.1.tar.bz2 (192kB)
Building wheels for collected packages: feedparser
  Running setup.py bdist_wheel for feedparser: started
  Running setup.py bdist_wheel for feedparser: finished with status 'done'
  Stored in directory: C:\Users\28614\AppData\Local\pip\Cache\wheels\8c\69\b7\f52763c41c5471df57703a0ef718a32a5e81ee35dcf6d4f97f
Successfully built feedparser
Installing collected packages: feedparser
Successfully installed feedparser-5.2.1


You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [66]:
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq

In [67]:
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

In [83]:
def localWords(feed1, feed0):  ###fees0 and feed1 是两个RSS源
    import feedparser
    docList = []; classList = []; fullText =[]
    minLen = min(len(feed1['links']), len(feed0['links'])) ### ???
    
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])  #删除高频的前30个词汇
    trainingSet = range(2*minLen); testSet = []
    print(2*minLen)
    for i in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(list(trainingSet)[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList(docIndex)))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V

In [84]:
import feedparser
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')

In [85]:
print(sf)

{'feed': {'html': {'class': 'no-js'}, 'links': [{'type': 'text/css', 'rel': 'stylesheet', 'media': 'all', 'href': 'https://www.craigslist.org/styles/simple-page.css?v=3c90c8a90be2e9f4e6248d38855be90c'}, {'type': 'text/css', 'rel': 'stylesheet', 'media': 'all', 'href': 'https://www.craigslist.org/styles/jquery-ui-clcustom.css?v=3b05ddffb7c7f5b62066deff2dda9339'}, {'type': 'text/css', 'rel': 'stylesheet', 'media': 'all', 'href': 'https://www.craigslist.org/styles/jquery.qtip-2.2.1.css?v=cd202aead4d1dd4894fbae4ade23fcf8'}], 'meta': {'name': 'viewport', 'content': 'width=device-width,initial-scale=1'}}, 'entries': [], 'bozo': 1, 'headers': {'Connection': 'keep-alive', 'Cache-control': 'private', 'Last-Modified': 'Thu, 20 Dec 2018 13:06:42 GMT', 'Date': 'Thu, 20 Dec 2018 13:06:42 GMT', 'Content-Encoding': 'gzip', 'Vary': 'Accept-Encoding', 'Content-Length': '1863', 'Content-Type': 'text/html; charset=UTF-8', 'X-Frame-Options': 'SAMEORIGIN', 'Pragma': 'no-cache', 'Set-Cookie': 'cl_b=tENOGFgE

In [80]:
len(sf['entries'])

0

In [86]:
vocabList, psF, pNY = localWords(ny, sf)

KeyError: 'links'

In [87]:
def getTopWords(ny, sf):
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -0.6:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -0.6:
            topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF, key=lamba pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedSF = sorted(topNY, key=lamba pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedNY:
        print(item[0])

SyntaxError: invalid syntax (<ipython-input-87-4936d0c0081c>, line 10)