# 朴素贝叶斯

- 优点：在数据较少的情况下仍然有效，可以处理多类别的问题。
- 缺点：对输入数据的准备方式较为敏感

In [4]:
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea',\
                   'problems', 'help', 'please'],
                  ['maybe', 'not', 'take', 'him', \
                  'to', 'dog', 'park', 'stupid'], \
                  ['my', 'dalmation', 'is', 'so', 'cute', \
                  'I', 'love', 'him'],
                  ['stop', 'posting', 'stupid', 'worthless', 'grabage'],
                  ['mr', 'licks', 'ate', 'my', 'steak', 'how', \
                  'to', 'stop', 'him'],
                  ['quit', 'buting', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

def createVocabList(dataSet):
    '''创建一个包含所有词汇（不重复）的列表'''
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1 # 这里只是赋值为1，所以这里只考虑某个单词是否出现
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [5]:
# test
listPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listPosts)
print(myVocabList)

['quit', 'stupid', 'steak', 'please', 'worthless', 'ate', 'flea', 'take', 'buting', 'maybe', 'how', 'has', 'dalmation', 'food', 'I', 'him', 'help', 'grabage', 'posting', 'dog', 'to', 'mr', 'not', 'love', 'problems', 'so', 'is', 'cute', 'licks', 'park', 'my', 'stop']


In [6]:
import numpy as np
from numpy import log



In [7]:
setOfWords2Vec(myVocabList, listPosts[0])

[0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0]

要计算的是当已知某个词向量时，出现某个类别的概率，由贝叶斯准则可知：
$$P(c_{i} | \vec W) = \frac {P(\vec W | c_{i})P(c_i)}{p(\vec W)}$$

In [8]:
import numpy as np
from math import log2

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs) # P( trainCategory = 1 ) 
    # p0Num, p1Num = np.zeros(numWords), np.zeros(numWords) # 分别是在类别为1，0时，各个单词出现的次数
    # p0Denom, p1Denom = 0.0, 0.0
    
    # 对每个概率进行求log， 为了避免概率为0的情况，用add one平滑
    p0Num, p1Num = np.ones(numWords), np.ones(numWords)
    p0Denom, p1Denom = 2.0, 2.0 # 之所以是2，是因为有两个类别
    
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i] # 这是向量相加
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log2(p1Num / p1Denom) # P(c1 | W)
    p0Vect = np.log2(p0Num / p0Denom) # P(c0 | W)
    return p0Vect, p1Vect, pAbusive
        

In [9]:
listOfPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)
trainMat = []
for postinDoc in listOfPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0v, p1v, pAb = trainNB0(trainMat, listClasses)

In [10]:
print(p0v)
print(p1v)
print(pAb)

[-4.70043972 -4.70043972 -3.70043972 -3.70043972 -4.70043972 -3.70043972
 -3.70043972 -4.70043972 -4.70043972 -4.70043972 -3.70043972 -3.70043972
 -3.70043972 -4.70043972 -3.70043972 -3.11547722 -3.70043972 -4.70043972
 -4.70043972 -3.70043972 -3.70043972 -3.70043972 -4.70043972 -3.70043972
 -3.70043972 -3.70043972 -3.70043972 -3.70043972 -3.70043972 -4.70043972
 -2.70043972 -3.70043972]
[-3.39231742 -2.39231742 -4.39231742 -4.39231742 -2.80735492 -4.39231742
 -4.39231742 -3.39231742 -3.39231742 -3.39231742 -4.39231742 -4.39231742
 -4.39231742 -3.39231742 -4.39231742 -3.39231742 -4.39231742 -3.39231742
 -3.39231742 -2.80735492 -3.39231742 -4.39231742 -3.39231742 -4.39231742
 -4.39231742 -4.39231742 -4.39231742 -4.39231742 -4.39231742 -3.39231742
 -4.39231742 -3.39231742]
0.5


In [13]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB():
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    trainMat = []
    for postinDoc in listOfPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0v, p1v, pAb = trainNB0(np.array(trainMat), np.array(listClasses) )
    
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0v, p1v, pAb))
    
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0v, p1v, pAb))

In [14]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
the word: garbage is not in my Vocabulary!
['stupid', 'garbage'] classified as:  1


# 词袋模型
前面的模型是只考虑某个词是否出现，这里是考虑出现了多少次。

In [15]:
def bagOfWord2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in returnVec:
            returnVec[returnVec.index(word)] += 1
    return returnVec

In [155]:
import random
def textParse(bigString):
    import re
    pattern = re.compile(r'\W*')
    listOfTokens = re.split(pattern, bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 0]

def spamText():
    docList, classList, fullText = [], [], []
    for i in range(1, 26):
        with open('./email/spam/%d.txt' % i, encoding='cp1252') as f:
            wordList = textParse(f.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        
        with open('./email/ham/%d.txt' % i, encoding='cp1252') as f:
            wordList = textParse(f.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat, trainClass = [], []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0v, p1v, pSpam = trainNB0(np.array(trainMat), np.array(trainClass))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0v, p1v, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))

In [156]:
spamText()

the error rate is:  0.0


  return _compile(pattern, flags).split(string, maxsplit)


## 使用朴素贝叶斯分类器从个人广告中获取区域倾向

In [92]:
import feedparser
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')

In [97]:
ny['entries']

[{'dc_source': 'https://newyork.craigslist.org/que/stp/d/seeking-fellow-psycho-m4w/6486246554.html',
  'dc_type': 'text',
  'id': 'https://newyork.craigslist.org/que/stp/d/seeking-fellow-psycho-m4w/6486246554.html',
  'language': 'en-us',
  'link': 'https://newyork.craigslist.org/que/stp/d/seeking-fellow-psycho-m4w/6486246554.html',
  'links': [{'href': 'https://newyork.craigslist.org/que/stp/d/seeking-fellow-psycho-m4w/6486246554.html',
    'rel': 'alternate',
    'type': 'text/html'}],
  'published': '2018-02-28T03:10:00-05:00',
  'published_parsed': time.struct_time(tm_year=2018, tm_mon=2, tm_mday=28, tm_hour=8, tm_min=10, tm_sec=0, tm_wday=2, tm_yday=59, tm_isdst=0),
  'rights': 'copyright 2018 craigslist',
  'rights_detail': {'base': 'https://newyork.craigslist.org/search/stp?format=rss',
   'language': None,
   'type': 'text/plain',
   'value': 'copyright 2018 craigslist'},
  'summary': "Hi, \nI'm looking for someone who's pretty similar to me. \nI suffer from mild psychosis, bi-

In [101]:
len(ny['entries'])

25

In [100]:
import operator
def calcMostFreq(vocabList, fullText):
    '''计算前三十个频率最高的词'''
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

In [148]:
import feedparser

def localWords(feed1, feed0):
    docList , classList, fullText = [], [], []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        classList.append(1)
        fullText.append(wordList)
        
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        classList.append(0)
        fullText.extend(wordList)
    vocabList = createVocabList(fullText)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    
    trainingSet, testSet = list(range(2*minLen)), []
    for i in range(int(minLen*0.3)):
        randIndex = random.randint(0, len(trainingSet)-1)
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat, trainClass = [], []
    for docIndex in trainingSet:
        trainMat.append(bagOfWord2VecMN(vocabList, docList))
        trainClass.append(classList[docIndex])
    p0v, p1v, pSpam = trainNB0(np.array(trainMat), np.array(trainClass))
    errorCount = 0
    for docIndex in testSet:
        wordVec = bagOfWord2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVec), p0v, p1v, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is ', float(errorCount)/len(testSet))
    return vocabList, p0v, p1v
        

In [113]:
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')


In [158]:
vocabList, p0v, p1v = localWords(ny, sf)

the error rate is  0.42857142857142855


  return _compile(pattern, flags).split(string, maxsplit)


In [157]:
def getTopWords(ny, sf):
    vocalList, p0v, p1v = localWords(ny, sf)
    topNY, topSF = [], []
    for i in range(len(p0v)):
        if p0v[i] > -6.0: topSF.append((vocalList[i], p0v[i]))
        if p1v[i] > -6.0: topNY.append((vocalList[i], p1v[i]))
    sortedSF = sorted(topSF, key=lambda pair:pair[1], reverse=True)
    sortedNY = sorted(topNY, key=lambda pair:pair[1], reverse=True)
    print('SF--SF--SF--SF--SF--SF--SF--SF')
    for item in sortedSF:
        print(item[0])
    print('NY--NY--NY--NY--NY--NY--NY--NY')
    for item in sortedNY:
        print(item[0])

In [159]:
getTopWords(ny, sf)

the error rate is  0.42857142857142855
SF--SF--SF--SF--SF--SF--SF--SF
pictures
bronx
women
sma
x
cliche
9
first
21
blog
k
type
guy
know
how
pretty
ll
pilates
always
hurt
slightly
fee
bust
e
articulate
h
chill
since
cl
relief
f
long
flushing
honestly
eclecticism
camp
demanding
queens
often
25
lots
form
people
bring
among
biking
introvert
also
nature
hello
p
o
guess
jerk
sleep
cast
clean
comes
another
find
our
work
1
above
art
end
weekend
short
shooting
get
from
goth
there
gt
all
woman
talk
where
psychosis
8
usually
its
theres
lol
areas
shot
drummer
year
then
later
live
pic
but
6
interested
what
person
muse
mature
fit
full
sooo
everything
easy
dark
hav
travel
together
hi
send
shy
attract
successful
other
buddies
white
sent
hung
give
korean
annonymus
off
special
feel
really
tell
on
w
latina
wants
span
yourself
into
no
college
kneading
proximity
smoking
much
connect
j
man
cool
care
great
chat
as
face
needed
right
say
party
hours
wow
fitness
serious
feels
either
forgets
we
discrete
hate
may

  return _compile(pattern, flags).split(string, maxsplit)
