### 使用Bayesian模型处理垃圾邮件

#### 词汇表

In [1]:
def createVocabList(dataSet):
    """Build the vocabulary for a collection of tokenized documents.

    Args:
        dataSet: iterable of documents, each an iterable of ``str`` tokens.

    Returns:
        A sorted list of the distinct tokens, lower-cased, across all documents.
    """
    vocabSet = set()  # accumulates every distinct lower-cased token
    for data in dataSet:
        vocabSet.update(map(str.lower, data))
    return sorted(vocabSet)

In [2]:
# Two toy documents with mixed-case tokens, used to show that the
# vocabulary is case-insensitive and de-duplicated.
dataSet = [
    ['a', 'i', 'love', 'You'],
    ['he', 'Love', 'you'],
]

vocabList = createVocabList(dataSet)
print(vocabList)

['a', 'he', 'i', 'love', 'you']


##### 词集模型

In [3]:
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of ``str`` tokens; they are lower-cased before
            being compared with the vocabulary.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word occurs
        in the document at least once, otherwise 0.
    """
    # A set gives constant-time membership tests, instead of scanning the
    # whole token list once per vocabulary word.
    lowerInputSet = {str.lower(x) for x in inputSet}
    return [1 if word in lowerInputSet else 0 for word in vocabList]

In [4]:
# Encode a small mixed-case document against the vocabulary built above;
# each position is 1 if that vocabulary word is present, else 0.
print(setOfWords2Vec(vocabList,['I','a','you']))

[1, 0, 1, 0, 1]


##### 词袋模型

In [5]:
def bagOfWords2Vec(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of ``str`` tokens; they are lower-cased before
            being looked up in the vocabulary.

    Returns:
        A list of ``len(vocabList)`` ints where entry i is the number of
        times ``vocabList[i]`` occurs in the document. Tokens that are not
        in the vocabulary are ignored.
    """
    returnVec = [0] * len(vocabList)

    # Map each word to its position once, instead of scanning the list twice
    # per token (`in` followed by `.index`). setdefault keeps the FIRST
    # position of a repeated word, which is what list.index would return.
    wordIndex = {}
    for i, word in enumerate(vocabList):
        wordIndex.setdefault(word, i)

    for token in inputSet:
        i = wordIndex.get(str.lower(token))
        if i is not None:
            returnVec[i] += 1
    return returnVec

In [6]:
# Same vocabulary, but the repeated 'I' is counted twice rather than
# collapsed to a presence flag.
print(bagOfWords2Vec(vocabList,['I','I','a','you']))

[1, 0, 2, 0, 1]


##### 处理邮件内容

In [7]:
def textParse(bigString):
    """Split raw text into lower-cased word tokens.

    The text is split on runs of non-word characters, so each token is a
    run of letters, digits, or underscores. Tokens of one or two
    characters are discarded.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of lower-cased tokens, each at least three characters long,
        in their original order.
    """
    import re
    tokens = []
    for piece in re.split(r'\W+', bigString):
        if len(piece) <= 2:
            # Drops very short tokens and the empty strings that re.split
            # yields at the edges of the text.
            continue
        tokens.append(piece.lower())
    return tokens

In [8]:
# 'I' is dropped for being shorter than three characters; punctuation such
# as '--' only acts as a separator.
textParse('I love You 30Kg --20ml')

['love', 'you', '30kg', '20ml']

##### 加载数据并训练模型

In [9]:
def _readEmail(path, encoding=None):
    """Read one email file and return its tokens as produced by textParse."""
    # The context manager closes the handle as soon as the text is read;
    # encoding=None leaves open() on its default encoding.
    with open(path, encoding=encoding) as f:
        return textParse(f.read())


def loaddata():
    """Load the spam/ham emails and vectorize them with the set-of-words model.

    Reads ``data/email/spam/<i>.txt`` and ``data/email/ham/<i>.txt`` for
    i = 1..25, appending the spam document and then the ham document for
    each i.

    Returns:
        A tuple ``(X, classList, vocabList)`` where
        X is a list of set-of-words vectors, one per document,
        classList holds the matching labels (1 = spam, 0 = ham), and
        vocabList is the vocabulary built from all loaded documents.
    """
    docList = []
    classList = []

    num = 26  # range(1, num) covers 1.txt .. 25.txt: 25 spam and 25 ham emails

    # Files that are encoded as Windows-1252 and must be opened with an
    # explicit encoding; every other file uses open()'s default.
    spamEncodings = {17: 'Windows-1252'}
    hamEncodings = {6: 'Windows-1252'}

    for i in range(1, num):
        docList.append(_readEmail('data/email/spam/%d.txt' % i, spamEncodings.get(i)))
        classList.append(1)

        docList.append(_readEmail('data/email/ham/%d.txt' % i, hamEncodings.get(i)))
        classList.append(0)

    vocabList = createVocabList(docList)
    X = [setOfWords2Vec(vocabList, doc) for doc in docList]
    return X, classList, vocabList

In [10]:
from sklearn import naive_bayes as nb
from sklearn.metrics import accuracy_score
X, y, vocabList = loaddata()

model = nb.MultinomialNB()
model.fit(X, y)

# NOTE: the model is scored on the same samples it was fitted on, so the
# number printed below is training accuracy, not an estimate of how well
# the classifier generalizes to unseen emails.
y_hat = model.predict(X)
# accuracy_score expects (y_true, y_pred): ground-truth labels come first.
print("正确率：",accuracy_score(y,y_hat))

正确率： 1.0


In [11]:
# Tokenize ham email #5 to inspect what textParse produces on real data.
# Reading inside a context manager closes the file handle, which a bare
# open(...).read() would leave open.
with open('data/email/ham/%d.txt' % 5) as f:
    ham5Text = f.read()
textParse(ham5Text)

['there',
 'was',
 'guy',
 'the',
 'gas',
 'station',
 'who',
 'told',
 'that',
 'knew',
 'mandarin',
 'and',
 'python',
 'could',
 'get',
 'job',
 'with',
 'the',
 'fbi']

In [12]:
import chardet
 
# Detect the encoding of ham email #6 from its raw bytes.
with open('data/email/ham/6.txt', mode='rb') as f:
    result = chardet.detect(f.read())
print(result['encoding'])

Windows-1252
