# 朴素贝叶斯

In [1]:
def loadDataSet():
    """Return the toy message-board corpus together with its labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of
        tokenised posts (each a list of words) and ``classVec`` is the
        parallel list of labels, where 1 marks an abusive post and 0 a
        normal one.
    """
    # Each entry pairs the raw sentence with its label (1 = abusive, 0 = not).
    labelledPosts = (
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him', 0),
        ('stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how to stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    )
    postingList = [sentence.split() for sentence, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec

In [2]:
# Collect every distinct word that occurs in the data set.
def createVocabList(dataSet):
    """Build the vocabulary of a corpus.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list holding each distinct word exactly once. The order is
        whatever the underlying set yields and is not sorted.
    """
    uniqueWords = set()
    for doc in dataSet:
        uniqueWords.update(doc)
    return list(uniqueWords)

In [3]:
# Build a document vector marking which vocabulary words occur in the input.
def words2vec(vocabList, inputSet):
    """Encode a document as a presence/absence (set-of-words) vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere. Words missing from the
        vocabulary are reported with ``print`` and otherwise ignored.
    """
    # Map each word to its position once, instead of scanning the list twice
    # per input word. setdefault keeps the first position of a repeated
    # vocabulary entry, which is what list.index() would return.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0]*len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] = 1
        else:
            print(word,"is not in my Vocabulary!")
    return returnVec

In [4]:
# Bag-of-words model: like the document vector, but it records how many
# times each vocabulary word occurs.
def words2vecBag(vocabList, inputSet):
    """Encode a document as a bag-of-words count vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document (repeats count).

    Returns:
        A list of ``len(vocabList)`` ints giving the number of occurrences
        of each vocabulary word in ``inputSet``. Words missing from the
        vocabulary are reported with ``print`` and otherwise ignored.
    """
    # Map each word to its position once, instead of scanning the list twice
    # per input word. setdefault keeps the first position of a repeated
    # vocabulary entry, which is what list.index() would return.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0]*len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
        else:
            print(word,"is not in my Vocabulary!")
    return returnVec

In [6]:
# Load the toy corpus, build the vocabulary from all of its documents and
# print the vocabulary.
sentenceList,sentenceClass = loadDataSet()
myVocabulary = createVocabList(sentenceList)
print(myVocabulary)

['please', 'take', 'I', 'dog', 'ate', 'mr', 'maybe', 'him', 'steak', 'not', 'stop', 'stupid', 'dalmation', 'so', 'quit', 'park', 'posting', 'buying', 'has', 'my', 'to', 'problems', 'garbage', 'food', 'licks', 'help', 'cute', 'is', 'flea', 'worthless', 'how', 'love']


In [7]:
# Show the bag-of-words count vector of the first document in the corpus.
print(words2vecBag(myVocabulary,sentenceList[0]))

[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]


#### 训练

In [8]:
import numpy as np
# samplesVec: the feature vector generated for each training sample
# classList: the class label of each training sample
def trainNB(samplesVec,classList):
    """Train a naive Bayes model with add-one (Laplace) smoothing.

    Args:
        samplesVec: sequence of equal-length numeric feature vectors, one
            per training sample (e.g. the output of words2vec or
            words2vecBag).
        classList: class label of each sample, parallel to ``samplesVec``.

    Returns:
        A ``(classP, classWP)`` tuple of dicts keyed by ``str(label)``:
        ``classP[label]`` is the smoothed prior probability of the class
        (a plain probability, not a log), and ``classWP[label]`` is a numpy
        array with the log of the smoothed conditional probability of each
        feature given the class. Both dicts are empty for empty input.

    Raises:
        ValueError: if ``samplesVec`` and ``classList`` differ in length.
    """
    numSample = len(samplesVec)
    if numSample != len(classList):
        raise ValueError(
            "samplesVec and classList must have the same length "
            "(got %d and %d)" % (numSample, len(classList)))
    numClass = len(set(classList))
    labels = [str(aclass) for aclass in classList]

    # Prior of each class. To avoid zero probabilities the numerator gets
    # +lambda and the denominator +S*lambda, with lambda = 1 and
    # S = number of classes.
    classCount = {}
    for label in labels:
        classCount[label] = classCount.get(label, 0) + 1
    classP = {}
    for label, count in classCount.items():
        classP[label] = (count+1)/float(numSample+numClass*1)

    # Per-class feature totals. Every counter starts at 1 so that no
    # conditional probability is zero when the log-probabilities are summed.
    numFeatures = len(samplesVec[0]) if numSample else 0
    featureTotals = {label: np.ones(numFeatures) for label in classCount}
    for label, sample in zip(labels, samplesVec):
        featureTotals[label] = featureTotals[label] + sample

    # Normalise by the smoothed total of the class and move to log space.
    classWP = {}
    for label, totals in featureTotals.items():
        classWP[label] = np.log(totals/np.sum(totals))
    return classP,classWP
    

In [9]:
# Rebuild the corpus and vocabulary, then encode every document as a
# presence/absence vector to form the training matrix.
sentenceList,sentenceClass = loadDataSet()
myVocabulary = createVocabList(sentenceList)
trainMat=[]
for item in sentenceList:
    trainMat.append(words2vec(myVocabulary,item))

In [10]:
# Fit the naive Bayes model on the toy training matrix and its labels.
classP,classWP = trainNB(trainMat,sentenceClass)

In [11]:
# Display the class priors returned by trainNB.
classP

{'0': 0.5, '1': 0.5}

In [12]:
# Display the per-class log conditional probabilities returned by trainNB.
classWP

{'0': array([-3.33220451, -4.02535169, -3.33220451, -3.33220451, -3.33220451,
        -3.33220451, -4.02535169, -2.9267394 , -3.33220451, -4.02535169,
        -3.33220451, -4.02535169, -3.33220451, -3.33220451, -4.02535169,
        -4.02535169, -4.02535169, -4.02535169, -3.33220451, -2.63905733,
        -3.33220451, -3.33220451, -4.02535169, -4.02535169, -3.33220451,
        -3.33220451, -3.33220451, -3.33220451, -3.33220451, -4.02535169,
        -3.33220451, -3.33220451]),
 '1': array([-3.93182563, -3.23867845, -3.93182563, -2.83321334, -3.93182563,
        -3.93182563, -3.23867845, -3.23867845, -3.93182563, -3.23867845,
        -3.23867845, -2.54553127, -3.93182563, -3.93182563, -3.23867845,
        -3.23867845, -3.23867845, -3.23867845, -3.93182563, -3.93182563,
        -3.23867845, -3.93182563, -3.23867845, -3.23867845, -3.93182563,
        -3.93182563, -3.93182563, -3.93182563, -3.93182563, -2.83321334,
        -3.93182563, -3.93182563])}

#### 分类

In [13]:
def classifyNB(sampleVec,classP,classWP):
    """Return the most probable class key for a feature vector.

    Each class is scored as ``sum(sampleVec * classWP[key]) +
    log(classP[key])`` and the key with the highest score wins; when
    several classes tie, the one that comes first in ``classWP`` is
    returned.

    Args:
        sampleVec: numeric feature vector with one entry per feature of the
            arrays in ``classWP``.
        classP: dict mapping class key to prior probability.
        classWP: dict mapping class key to a numpy array of log conditional
            probabilities.

    Returns:
        The winning class key, exactly as it is stored in ``classWP``.

    Raises:
        ValueError: if ``classWP`` contains no classes.
    """
    if not classWP:
        raise ValueError("classWP contains no classes to choose from")
    # Convert once so a plain list input is not re-wrapped for every class.
    features = np.asarray(sampleVec)
    classResult = {}
    for key, logProbs in classWP.items():
        classResult[key] = np.sum(features*logProbs) + np.log(classP[key])
    # Only the best score is needed, so take the maximum instead of sorting.
    return max(classResult, key=classResult.get)

In [14]:
# Classify a message whose words occur only in the non-abusive (label 0)
# training posts.
testEntry = ['love','my','dalmation']
classifyNB(words2vec(myVocabulary,testEntry),classP,classWP)

'0'

In [15]:
# Classify a message whose words occur only in the abusive (label 1)
# training posts.
testEntry = ['stupid','garbage']
classifyNB(words2vec(myVocabulary,testEntry),classP,classWP)

'1'

### 过滤垃圾邮件

In [16]:
import os,random
from os.path import join
def textParse(bigString):    #input is big string, #output is word list
    """Split raw text into lower-case word tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of the lower-cased runs of word characters in ``bigString``
        that are at least three characters long, in order of appearance.
    """
    import re
    # Split on one or more non-word characters. The pattern must not be able
    # to match the empty string: since Python 3.7 re.split also splits on
    # empty matches, so r'\W*' cuts the text into single characters and the
    # length filter below then discards every token.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
    
def spamTest():
    """Run one random hold-out evaluation of the naive Bayes spam filter.

    Every file in the ``spam`` directory is read as a spam example (label 1)
    and every file in the ``ham`` directory as a normal example (label 0).
    Ten randomly chosen documents are held out for testing, the classifier
    is trained on the bag-of-words vectors of the rest, and each
    misclassified test document plus the overall error rate is printed.

    Raises:
        ValueError: if there are not more than ten documents in total, so
            that nothing would be left to train on.
    """
    numTest = 10    # size of the hold-out test set

    docList = []
    classList = []
    for dirname, label in (('spam', 1), ('ham', 0)):
        for filename in os.listdir(dirname):
            with open(join(dirname,filename),errors='ignore')as fp:
                docList.append(textParse(fp.read()))
            classList.append(label)
    #create vocabulary
    vocabList = createVocabList(docList)

    # Split on the real corpus size rather than assuming exactly 50 files.
    numDocs = len(docList)
    if numDocs <= numTest:
        raise ValueError(
            "need more than %d documents to hold out a test set, found %d"
            % (numTest, numDocs))
    # random.sample picks distinct indices directly; int(random.uniform(0, n))
    # can return n when uniform() rounds up to its end-point.
    testSet = random.sample(range(numDocs), numTest)    #create test set
    heldOut = set(testSet)
    trainingSet = [docIndex for docIndex in range(numDocs)
                   if docIndex not in heldOut]

    #train the classifier (get probs) trainNB
    trainMat = [words2vecBag(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    classP,classWP = trainNB(trainMat,trainClasses)

    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = words2vecBag(vocabList, docList[docIndex])
        # trainNB keys its classes by str(label), so compare as strings.
        if classifyNB(wordVector,classP,classWP) != str(classList[docIndex]):
            errorCount += 1
            print ("classification error:\n",docList[docIndex])
    print ('the error rate is: ',float(errorCount)/len(testSet))

In [30]:
# Run one random hold-out evaluation of the spam filter.
spamTest()

classification error:
 ['oem', 'adobe', 'microsoft', 'softwares', 'fast', 'order', 'and', 'download', 'microsoft', 'office', 'professional', 'plus', '2007', '2010', '129', 'microsoft', 'windows', 'ultimate', '119', 'adobe', 'photoshop', 'cs5', 'extended', 'adobe', 'acrobat', 'pro', 'extended', 'windows', 'professional', 'thousand', 'more', 'titles']
classification error:
 ['yeah', 'ready', 'may', 'not', 'here', 'because', 'jar', 'jar', 'has', 'plane', 'tickets', 'germany', 'for']
the error rate is:  0.2


  return _compile(pattern, flags).split(string, maxsplit)
