### 1.加载数据

In [28]:
def loadDataSet():
    """Return the toy corpus used throughout this notebook.

    Returns:
        tuple: ``(dataSet, classLabel)`` where ``dataSet`` is a list of six
        tokenized posts (each a list of words) and ``classLabel`` is the
        parallel list of labels (1 = abusive post, 0 = normal post).
    """
    # Keep each post next to its label so the pairing is visible at a glance.
    labelledPosts = [
        (0, ['my','dog','has','flea','problems','help','please']),
        (1, ['maybe','not','take','him','to','dog','park','stupid']),
        (0, ['my','dalmation','is','so','cute','I','love','him']),
        (1, ['stop','posting','stupid','worthless','garbage']),
        (0, ['mr','licks','ate','my','steak','how','to','stop','him']),
        (1, ['quit','buying','worthless','dog','food','stupid']),
    ]
    dataSet = [words for _, words in labelledPosts]
    classLabel = [label for label, _ in labelledPosts]
    return dataSet,classLabel

In [29]:
# Load the toy corpus and its parallel label list, then display the tokenized documents.
dataSet,classLabel = loadDataSet()
dataSet

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

### 2.创建一个在所有文档中出现的不重复的词表

In [33]:
def vocabularyTable(dataSet):
    """Collect every distinct word that appears in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        list: The unique words. The order comes from iterating a ``set`` and
        is therefore unspecified.
    """
    uniqueWords = set()
    for doc in dataSet:
        # Merge this document's words into the running set in place.
        uniqueWords.update(doc)
    return list(uniqueWords)

In [34]:
# Build the vocabulary from every document and display it.
# NOTE: despite its name, vocabSet is a list (vocabularyTable returns list(...)).
vocabSet = vocabularyTable(dataSet)
vocabSet

['worthless',
 'to',
 'posting',
 'has',
 'my',
 'ate',
 'I',
 'buying',
 'flea',
 'love',
 'stop',
 'licks',
 'please',
 'dalmation',
 'mr',
 'maybe',
 'him',
 'not',
 'so',
 'garbage',
 'help',
 'dog',
 'steak',
 'take',
 'quit',
 'problems',
 'cute',
 'stupid',
 'how',
 'is',
 'food',
 'park']

### 3.对于每一个文档，即每一条评论，生成一个文档向量
### 首先创建一个和词表长度一样长的向量，并将其元素都置为0，
### 接着遍历文档中的所有单词，如果在词表中出现，则将文档向量中对应值设为1

In [35]:
def doc2vec(vocabSet,document):
    """Convert a tokenized document into a binary set-of-words vector.

    Args:
        vocabSet: Sequence of vocabulary words; element ``i`` of the result
            corresponds to ``vocabSet[i]``.
        document: Iterable of (hashable) words.

    Returns:
        list: A list of ``len(vocabSet)`` ints where position ``i`` is 1 if
        ``vocabSet[i]`` occurs in ``document`` and 0 otherwise. Words that are
        not in the vocabulary are ignored.
    """
    # Build the word -> position lookup once so each document word costs O(1)
    # instead of two linear scans of the vocabulary (`in` followed by `.index`).
    wordIndex = {}
    for position, word in enumerate(vocabSet):
        # setdefault keeps the first position of a repeated vocabulary entry,
        # matching what list.index() would return.
        wordIndex.setdefault(word, position)

    docVec = [0]*len(vocabSet)
    for word in document:
        position = wordIndex.get(word)
        if position is not None:
            docVec[position] = 1
    return docVec

In [36]:
# Vectorize the first document against the vocabulary and display the 0/1 vector.
docVec = doc2vec(vocabSet,dataSet[0])
docVec

[0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0]

### 4.朴素贝叶斯分类器训练函数

In [37]:
import numpy as np
def trainBayes(trainVec,classLabel):
    """Estimate naive Bayes parameters from binary document vectors (no smoothing).

    Args:
        trainVec: List of equal-length 0/1 word vectors, one per document.
        classLabel: Parallel list of labels; documents labelled 1 are counted
            for class 1, all others for class 0.

    Returns:
        tuple: ``(pAbusive, p1Vect, p0Vect)`` where ``pAbusive`` is
        ``sum(classLabel) / len(trainVec)`` and ``p1Vect`` / ``p0Vect`` are
        NumPy arrays holding, per vocabulary position, the class's count for
        that word divided by the total number of word hits in that class.
        Words never seen in a class get probability 0.
    """
    docCount = len(trainVec)
    vocabSize = len(trainVec[0])
    pAbusive = sum(classLabel)/float(docCount)

    # Per-class word counts and per-class totals of word hits.
    counts1 = np.zeros(vocabSize)
    counts0 = np.zeros(vocabSize)
    total1 = 0.0
    total0 = 0.0

    for idx, vec in enumerate(trainVec):
        if classLabel[idx] != 1:
            counts0 += vec
            total0 += sum(vec)
        else:
            counts1 += vec
            total1 += sum(vec)

    p1Vect = counts1/total1
    p0Vect = counts0/total0
    return pAbusive,p1Vect,p0Vect

In [38]:
# Vectorize every document in the corpus against the shared vocabulary.
trainVec = [doc2vec(vocabSet, doc) for doc in dataSet]

In [39]:
# Fit the (unsmoothed) model: class-1 prior plus per-word probabilities for each class.
pAbusive,p1Vect,p0Vect =  trainBayes(trainVec,classLabel)

In [40]:
# Prior probability of class 1: the fraction of training documents labelled 1.
pAbusive

0.5

In [41]:
# Per-word probabilities for class 1, indexed like vocabSet.
p1Vect

array([0.10526316, 0.05263158, 0.05263158, 0.        , 0.        ,
       0.        , 0.        , 0.05263158, 0.        , 0.        ,
       0.05263158, 0.        , 0.        , 0.        , 0.        ,
       0.05263158, 0.05263158, 0.05263158, 0.        , 0.05263158,
       0.        , 0.10526316, 0.        , 0.05263158, 0.05263158,
       0.        , 0.        , 0.15789474, 0.        , 0.        ,
       0.05263158, 0.05263158])

In [42]:
# Per-word probabilities for class 0, indexed like vocabSet.
p0Vect

array([0.        , 0.04166667, 0.        , 0.04166667, 0.125     ,
       0.04166667, 0.04166667, 0.        , 0.04166667, 0.04166667,
       0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.04166667,
       0.        , 0.08333333, 0.        , 0.04166667, 0.        ,
       0.04166667, 0.04166667, 0.04166667, 0.        , 0.        ,
       0.04166667, 0.04166667, 0.        , 0.04166667, 0.04166667,
       0.        , 0.        ])

In [43]:
# Re-display the vocabulary so the probability positions above can be mapped back to words.
vocabSet

['worthless',
 'to',
 'posting',
 'has',
 'my',
 'ate',
 'I',
 'buying',
 'flea',
 'love',
 'stop',
 'licks',
 'please',
 'dalmation',
 'mr',
 'maybe',
 'him',
 'not',
 'so',
 'garbage',
 'help',
 'dog',
 'steak',
 'take',
 'quit',
 'problems',
 'cute',
 'stupid',
 'how',
 'is',
 'food',
 'park']

In [44]:
#我们可以看到，p1Vect 与 p0Vect 中第27个位置（下标26）的概率分别是 0 与 0.04166667，词汇表中该位置的单词是 cute，其在类别0中出现1次，在类别1中未出现。
#所有概率中的最大值出现在 p1Vect 的第28个位置（下标27），大小为 0.15789474，词汇表中该位置的单词是 stupid，
#这意味着 stupid 是最能表征类别1（侮辱性文档类）的单词。
#（注意：词汇表由 set 生成，单词顺序可能随每次运行而变化，以上位置以本次输出为准。）

### 5.根据现实情况修改分类器

In [45]:
import numpy as np
import math
def trainBayes(trainVec,classLabel):
    """Estimate naive Bayes parameters with add-one counts, returned as log-probabilities.

    Compared with the unsmoothed version, every word count starts at 1 and
    every class total starts at 2, so a word never seen in a class no longer
    gets probability 0. The natural log of each probability is returned so
    that a classifier can add log-probabilities instead of multiplying many
    small numbers (which underflows).

    Args:
        trainVec: List of equal-length 0/1 word vectors, one per document.
        classLabel: Parallel list of labels; documents labelled 1 are counted
            for class 1, all others for class 0.

    Returns:
        tuple: ``(pAbusive, p1Vect, p0Vect)`` where ``pAbusive`` is
        ``sum(classLabel) / len(trainVec)`` and ``p1Vect`` / ``p0Vect`` are
        freshly allocated NumPy arrays of per-word log-probabilities for
        class 1 and class 0.
    """
    numData = len(trainVec)
    numWords = len(trainVec[0])
    pAbusive = sum(classLabel)/float(numData)

    # Smoothing: start counts at 1 and totals at 2 instead of 0.
    p0num = np.ones(numWords); p1num = np.ones(numWords)
    p0sum = 2.0; p1sum = 2.0
    for i in range(numData):
        if(classLabel[i]==1):
            p1num += trainVec[i]
            p1sum += sum(trainVec[i])
        else:
            p0num += trainVec[i]
            p0sum += sum(trainVec[i])

    # Build the result arrays locally. The previous version assigned into
    # p1Vect/p0Vect without ever creating them, so it only worked when an
    # earlier cell had left globals of the same name behind (and it then
    # overwrote those globals in place).
    p1Vect = np.log(p1num/p1sum)
    p0Vect = np.log(p0num/p0sum)
    return pAbusive,p1Vect,p0Vect

### 6.朴素贝叶斯分类函数

In [46]:
def classify(docVec,pClass1,p1Vect,p0Vect):
    """Pick the more likely class for a document vector using log-probabilities.

    Args:
        docVec: 0/1 word vector for the document.
        pClass1: Prior probability of class 1; must be strictly between 0 and 1
            because its log and the log of ``1 - pClass1`` are both taken.
        p1Vect: Per-word log-probabilities for class 1 (expected to be a NumPy
            array so the product with ``docVec`` is element-wise).
        p0Vect: Per-word log-probabilities for class 0 (same expectation).

    Returns:
        int: 1 if the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    def logScore(wordLogProbs, prior):
        # Sum of the log-probabilities of the words present, plus the log prior.
        return sum(docVec * wordLogProbs) + math.log(prior)

    return 1 if logScore(p1Vect, pClass1) > logScore(p0Vect, 1-pClass1) else 0

In [47]:
# Classify a short post: vectorize it, retrain the smoothed model, and predict its class.
test = ['love','my','dalmation']
testVec = doc2vec(vocabSet,test)
pClass1,p1Vect,p0Vect = trainBayes(trainVec,classLabel)
classify(testVec,pClass1,p1Vect,p0Vect)

0

In [50]:
# Classify a post made of words that appear only in class-1 training documents.
test = ['stupid','garbage']
testVec = doc2vec(vocabSet,test)
pClass1,p1Vect,p0Vect = trainBayes(trainVec,classLabel)
classify(testVec,pClass1,p1Vect,p0Vect)

1

In [51]:
# Test on the training set: classify the first training document (labelled 0).
test = dataSet[0]
testVec = doc2vec(vocabSet,test)
pClass1,p1Vect,p0Vect = trainBayes(trainVec,classLabel)
classify(testVec,pClass1,p1Vect,p0Vect)

0