# In [25]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
'''
edited by:Qingping Zheng
2018-10-09
'''
from numpy import *

def loadDataSet():
    """Return a tiny hand-labelled corpus for exercising the classifier.

    Returns:
        tuple: ``(postingList, classVec)`` where ``postingList`` is a list of
        six tokenized posts and ``classVec`` holds their labels in the same
        order (1 = abusive wording, 0 = not abusive).
    """
    # Each post is kept next to its label so the two cannot drift apart.
    labelledPosts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [tokens for tokens, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec

def createVocabList(dataSet):
    """Collect every distinct word that appears in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: every distinct token exactly once. The order is whatever the
        underlying set yields, so callers must not rely on it.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place update avoids copying the accumulated vocabulary for
        # every document.
        vocabSet.update(document)
    return list(vocabSet)

def setofWords2Vec(vocabList, inputVec):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``.
        inputVec: iterable of words making up the document.

    Returns:
        list of int with the same length as ``vocabList``: 1 where the
        vocabulary word occurs in the document at least once, otherwise 0.

    Words that are not in the vocabulary are reported on stdout and ignored.
    """
    # Map each word to its first position once, so every lookup is a dict
    # access instead of a linear membership scan followed by a linear
    # ``index`` scan.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputVec:
        position = wordIndex.get(word)
        if position is None:
            print('the word: %s is not in vocabList' % word)
        else:
            returnVec[position] = 1
    return returnVec

################################################################################
#朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: sequence of per-document word vectors; the vocabulary
            size is taken from the first row. Callers in this file pass a
            numpy array.
        trainCategory: per-document labels. A label equal to 1 counts toward
            class 1; any other value counts toward class 0.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)`` where the two vectors hold the
        log of the smoothed per-word conditional probabilities for class 0
        and class 1, and ``pAbusive`` is ``sum(trainCategory)`` divided by
        the number of documents.
    """
    docTotal = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    # Class prior p(c = 1).
    pAbusive = sum(trainCategory) / float(docTotal)

    # Smoothing: every word count starts at 1 and every class total at 2, so
    # no conditional probability ends up exactly zero.
    wordCounts = {0: ones(vocabSize), 1: ones(vocabSize)}
    classTotals = {0: 2, 1: 2}
    for docIdx, row in enumerate(trainMatrix):
        label = 1 if trainCategory[docIdx] == 1 else 0
        wordCounts[label] += row
        classTotals[label] += sum(row)

    # Logs are stored so that combining many small probabilities later does
    # not underflow.
    p1Vect = log(wordCounts[1] / classTotals[1])   # log p(x_i | c = 1)
    p0Vect = log(wordCounts[0] / classTotals[0])   # log p(x_i | c = 0)
    return p0Vect, p1Vect, pAbusive

###################################################################################
#朴素贝叶斯分类函数
def nbClassify(vec2Classify, p0Vect, p1Vect, pAbusive):
    """Pick the more probable class for one word vector.

    Args:
        vec2Classify: numeric word vector of the document to classify.
        p0Vect: per-word log conditional probabilities for class 0.
        p1Vect: per-word log conditional probabilities for class 1.
        pAbusive: prior probability of class 1.

    Returns:
        int: 1 if the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    logPrior1 = log(pAbusive)
    logPrior0 = log(1 - pAbusive)
    # The element-wise product keeps only the log-probabilities of words that
    # are present (weighted by their count). Adding the log prior gives an
    # unnormalised log posterior, which is all that is needed to compare the
    # two classes.
    score1 = sum(vec2Classify * p1Vect) + logPrior1
    score0 = sum(vec2Classify * p0Vect) + logPrior0
    return 1 if score1 > score0 else 0
    
def testingNBayes():
    """Train on the toy corpus and print the predicted class of two posts.

    Builds set-of-words vectors for the posts returned by ``loadDataSet``,
    fits the model with ``trainNB0`` and prints the ``nbClassify`` result for
    two fixed sample posts.
    """
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainRows = [setofWords2Vec(vocab, post) for post in posts]
    p0Vect, p1Vect, pAbusive = trainNB0(array(trainRows), array(labels))

    samplePosts = (['love', 'my', 'dalmation'], ['stupid', 'garbage'])
    for sample in samplePosts:
        encoded = array(setofWords2Vec(vocab, sample))
        predicted = nbClassify(encoded, p0Vect, p1Vect, pAbusive)
        print(sample, 'classified as:', predicted)
    
##################################################################################
##朴素贝叶斯词袋模型
def bagofWords2Vec(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document; repeated words
            are counted each time they appear.

    Returns:
        list of int with the same length as ``vocabList`` holding how many
        times each vocabulary word occurs in the document.

    Words that are not in the vocabulary are silently skipped.
    """
    # Map each word to its first position once, so every lookup is a dict
    # access instead of a linear membership scan followed by a linear
    # ``index`` scan.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

##文件解析及完整的垃圾邮件测试函数
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        list of str: the pieces between non-word characters, lower-cased,
        keeping only pieces whose length is greater than 2.
    """
    import re
    keptTokens = []
    # re.split takes a regular expression, so any single non-word character
    # acts as a separator; the empty pieces between adjacent separators are
    # dropped by the length filter.
    for piece in re.split(r'\W', bigString):
        if len(piece) <= 2:
            continue
        keptTokens.append(piece.lower())
    return keptTokens

def spamTest():
    """Run one random hold-out evaluation on the e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), draws 10 of the
    documents at random as the test set, trains on the remaining ones with
    set-of-words vectors and prints the error rate on the test set.

    Raises:
        OSError: if one of the e-mail files cannot be opened.
    """
    docList = []
    classList = []
    labelledPaths = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    for i in range(1, 26):
        for pathPattern, label in labelledPaths:
            # ``with`` closes each file as soon as it has been read instead
            # of leaving 50 handles for the garbage collector.
            with open(pathPattern % i, 'r', encoding='UTF-8', errors='ignore') as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # The drawn index is removed from trainingSet immediately, so the
        # same document can never be selected twice.
        # NOTE(review): ``random`` is presumably numpy.random, brought in by
        # the file's ``from numpy import *`` -- confirm.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMatrix = [setofWords2Vec(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0Vect, p1Vect, pSpam = trainNB0(array(trainMatrix), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        testWordVect = setofWords2Vec(vocabList, docList[docIndex])
        tempClass = nbClassify(array(testWordVect), p0Vect, p1Vect, pSpam)
        if tempClass != classList[docIndex]:
            errorCount += 1
    print('the total error rate is:', float(errorCount)/len(testSet))

#########################################################################################
##RSS源分类器及高频词去除函数
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words with their counts.

    Args:
        vocabList: iterable of vocabulary words to rank.
        fullText: iterable of all tokens of the corpus (hashable items).
        topN: maximum number of entries to return; defaults to 30.

    Returns:
        list of ``(word, count)`` tuples sorted by count in descending order
        and truncated to ``topN`` entries. Words with equal counts keep their
        relative ``vocabList`` order because the sort is stable.
    """
    import operator
    from collections import Counter

    # Count the corpus once; calling ``fullText.count`` for every vocabulary
    # word would rescan the whole corpus each time.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]

def localWords(feed1, feed0, numTest=20):
    """Train and evaluate a bag-of-words classifier on two feeds.

    The first ``min(len(feed1['entries']), len(feed0['entries']))`` entries of
    each feed are tokenized from their ``'summary'`` text. The 30 most
    frequent words are dropped from the vocabulary, ``numTest`` documents are
    held out at random, the model is trained on the rest and the hold-out
    error rate is printed.

    Args:
        feed1: mapping whose ``'entries'`` item is a sequence of mappings
            with a ``'summary'`` string; its documents get label 1.
            (Presumably a parsed RSS feed -- confirm against the caller.)
        feed0: same structure as ``feed1``; its documents get label 0.
        numTest: number of documents to hold out for testing; defaults to 20.

    Returns:
        tuple: ``(vocabList, p0V, p1V)`` -- the reduced vocabulary and the
        per-word log-probability vectors for class 0 and class 1.

    Raises:
        ValueError: if ``numTest`` is not positive or would leave no document
            to train on.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Alternate class-1 and class-0 documents, one entry from each feed.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    # Fail with a clear message rather than an opaque IndexError further down
    # when the feeds are too short for the requested hold-out size.
    if not 0 < numTest < len(docList):
        raise ValueError(
            'numTest must be between 1 and %d (documents available: %d), got %r'
            % (len(docList) - 1, len(docList), numTest)
        )

    vocabList = createVocabList(docList)
    # Drop the most frequent words from the vocabulary.
    frequentWords = {pairW[0] for pairW in calcMostFreq(vocabList, fullText)}
    vocabList = [word for word in vocabList if word not in frequentWords]

    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(numTest):
        # The drawn index is removed from trainingSet, so no document is
        # held out twice.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMatrix = [bagofWords2Vec(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    # Train once, after the whole training matrix has been assembled.
    p0V, p1V, pSpam = trainNB0(array(trainMatrix), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVec = bagofWords2Vec(vocabList, docList[docIndex])
        if nbClassify(array(wordVec), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V

####################################################################################
##最具表征性的词汇显示函数
def getTopWords(ny, sf, threshold=-6.0):
    """Print the most characteristic words of each of the two feeds.

    Trains via ``localWords(ny, sf)``, so ``ny`` is class 1 (scored by the
    class-1 vector) and ``sf`` is class 0 (scored by the class-0 vector).
    For each feed, the words whose log-probability is above ``threshold`` are
    printed in descending order of log-probability, SF first and then NY.

    Args:
        ny: feed passed to ``localWords`` as the class-1 feed.
        sf: feed passed to ``localWords`` as the class-0 feed.
        threshold: log-probability cut-off; only words scoring strictly
            above it are listed. Defaults to -6.0.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topSf = []
    topNy = []
    for word, logP0, logP1 in zip(vocabList, p0V, p1V):
        if logP0 > threshold:
            topSf.append((word, logP0))
        if logP1 > threshold:
            topNy.append((word, logP1))

    sections = (
        ('SF*****************************************************************', topSf),
        ('NY*****************************************************************', topNy),
    )
    for header, scoredWords in sections:
        print(header)
        for item in sorted(scoredWords, key=lambda pair: pair[1], reverse=True):
            print(item[0])