# 文本情感分析（基于伯努利模型的朴素贝叶斯）

In [47]:
from functools import reduce
import numpy as np

## 创建实验样本

In [48]:
def loadDataSet():
    """Return the toy training corpus used throughout this notebook.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenised posts (each a list of words) and ``classVec`` is the
        parallel list of labels, where 1 marks an abusive post and 0 a
        non-abusive one.
    """
    # Each sample keeps its label next to its tokens so the two cannot drift
    # out of step; they are split into parallel lists below.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return postingList, classVec

## 创建词汇表（去重）

In [49]:
def createVocabList(dataSet):
    """Build the vocabulary: every distinct token that occurs in ``dataSet``.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token exactly once, ordered by first
        appearance in ``dataSet``.
    """
    # dict.fromkeys de-duplicates while keeping insertion order. Going through
    # a set instead would make the vocabulary order (and therefore the column
    # layout of every feature vector) change from one interpreter run to the
    # next, because string hashing is randomised per process.
    return list(dict.fromkeys(word for document in dataSet for word in document))

## 向量化

In [50]:
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: Sequence of vocabulary tokens; position ``i`` of the result
            corresponds to ``vocabList[i]``. Tokens must be hashable.
        inputSet: Iterable of tokens making up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary token occurs
        in ``inputSet`` and 0 elsewhere. Tokens not in the vocabulary are
        ignored, and repeated tokens still yield 1.
    """
    # Build the token -> position lookup once instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every input token.
    # setdefault keeps the FIRST position of a duplicated vocabulary entry,
    # which is what list.index would have returned.
    word_index = {}
    for position, word in enumerate(vocabList):
        word_index.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = word_index.get(word)
        if position is not None:
            returnVec[position] = 1
    return returnVec

## 朴素贝叶斯分类器训练函数

In [51]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate smoothed per-class log word probabilities and the class prior.

    Args:
        trainMatrix: Sequence of equal-length numeric document vectors, one
            row per training document (the notebook passes 0/1 set-of-words
            lists).
        trainCategory: Sequence of labels aligned with ``trainMatrix``. A
            label equal to 1 counts toward class 1; any other value counts
            toward class 0.

    Returns:
        ``(p0Vec, p1Vec, pAbusive)``: two NumPy arrays holding
        ``log(count / total)`` for every vocabulary position in class 0 and
        class 1 respectively, and ``sum(trainCategory) / len(trainCategory)``,
        which is the fraction of documents labelled 1 when labels are 0/1.
    """
    vocab_size = len(trainMatrix[0])  # width of one document vector
    pAbusive = sum(trainCategory) / len(trainCategory)

    # Smoothing: every word count starts at 1 and every class total at 2.0,
    # so no probability is zero and the log below is always finite.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    class_totals = [2.0, 2.0]

    for i, row in enumerate(trainMatrix):
        label = 1 if trainCategory[i] == 1 else 0
        word_counts[label] += row  # element-wise accumulation per word
        class_totals[label] += sum(row)  # total entries seen in this class

    # Work in log space so that combining many small probabilities at
    # classification time is a sum rather than an underflowing product.
    p1Vec = np.log(word_counts[1] / class_totals[1])
    p0Vec = np.log(word_counts[0] / class_totals[0])
    return p0Vec, p1Vec, pAbusive

## 利用朴素贝叶斯进行分类

In [52]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log-posterior score for one document.

    Args:
        vec2Classify: Document vector (NumPy array) aligned with the
            vocabulary used to build ``p0Vec`` / ``p1Vec``.
        p0Vec: Per-word log probabilities for class 0.
        p1Vec: Per-word log probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        0 if the class-0 score is strictly greater than the class-1 score,
        otherwise 1 (so ties go to class 1).
    """
    # Because the vectors hold logs, multiplying word probabilities becomes
    # summing the entries selected by the document vector, and the prior is
    # added rather than multiplied: log(a * b) = log(a) + log(b).
    class1_likelihood = sum(vec2Classify * p1Vec)
    class1_score = class1_likelihood + np.log(pClass1)

    class0_likelihood = sum(vec2Classify * p0Vec)
    class0_score = class0_likelihood + np.log(1 - pClass1)

    return 0 if class0_score > class1_score else 1

## 主函数

In [55]:
# Driver: vectorise the toy corpus, train the classifier, then classify two
# sample posts and print the predicted class of each.
postingList, classVec = loadDataSet()
myVocabList = createVocabList(postingList)
trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in postingList]
p0V, p1V, pAb = trainNB0(trainMat, classVec)

# Test sample 1, then test sample 2.
for testEntry in (['love','my','dalmation'], ['stupid','garbage']):
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    # classifyNB returns 1 for the abusive class and 0 otherwise.
    verdict = '属于侮辱类' if classifyNB(thisDoc, p0V, p1V, pAb) else '属于非侮辱类'
    print(testEntry, verdict)

['love', 'my', 'dalmation'] 属于非侮辱类
['stupid', 'garbage'] 属于侮辱类
