## 导入必要的库

In [1]:
%matplotlib inline

import random
import numpy as np
from math import *
from matplotlib import pyplot as plt

In [2]:
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec

## 根据样本生成词向量
这里使用map也可以，使用set也可以

In [3]:
def buildWordVec(wordLists):
    wordVec = set([])
    for words in wordLists:
        wordVec = wordVec | set(words)
    return list(wordVec)

In [4]:
dataset, datalabel = loadDataSet()
wordVec = buildWordVec(dataset)
print(wordVec)

['my', 'ate', 'licks', 'so', 'not', 'to', 'maybe', 'dalmation', 'him', 'how', 'help', 'park', 'stop', 'problems', 'love', 'dog', 'has', 'please', 'cute', 'steak', 'stupid', 'worthless', 'mr', 'I', 'flea', 'food', 'quit', 'is', 'garbage', 'take', 'buying', 'posting']


## 把句子转换为词向量
使用list

In [5]:
def setOfWordVec(wordVec, words):
    N = len(wordVec)
    res = [0]*N
    for word in words:
        if word in wordVec:
            res[wordVec.index(word)] = 1
    return res

In [6]:
test_words = ['my', 'dog', 'is', 'quit']
test_res = setOfWordVec(wordVec, test_words)
index = list(range(len(wordVec)))
print(wordVec)
print(index)
print(test_res)

['my', 'ate', 'licks', 'so', 'not', 'to', 'maybe', 'dalmation', 'him', 'how', 'help', 'park', 'stop', 'problems', 'love', 'dog', 'has', 'please', 'cute', 'steak', 'stupid', 'worthless', 'mr', 'I', 'flea', 'food', 'quit', 'is', 'garbage', 'take', 'buying', 'posting']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]


## 训练函数

In [7]:
def trainNBS(train_data, train_lables, wordVec):
    ''' set of train labels '''
    label_v = list(set(train_lables))
    label_N = len(label_v)
    vec_len = len(wordVec)
    Ps = np.zeros((label_N, vec_len))
    Ns = np.zeros((label_N, 1))
    for data, label in zip(train_data, train_lables):
        vec = setOfWordVec(wordVec, data)
        idx = label_v.index(label)
        Ps[idx] += np.array(vec)
        Ns[idx] += 1
    return Ps/Ns, Ns.squeeze()/len(train_lables)

In [8]:
train_Pxy, train_Py = trainNBS(dataset, datalabel, wordVec)

In [9]:
print("x|y\n", train_Pxy)
print("y\n", train_Py)

x|y
 [[1.         0.33333333 0.33333333 0.33333333 0.         0.33333333
  0.         0.33333333 0.66666667 0.33333333 0.33333333 0.
  0.33333333 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333
  0.33333333 0.33333333 0.         0.         0.33333333 0.33333333
  0.33333333 0.         0.         0.33333333 0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.33333333 0.33333333
  0.33333333 0.         0.33333333 0.         0.         0.33333333
  0.33333333 0.         0.         0.66666667 0.         0.
  0.         0.         1.         0.66666667 0.         0.
  0.         0.33333333 0.33333333 0.         0.33333333 0.33333333
  0.33333333 0.33333333]]
y
 [0.5 0.5]


In [10]:
def classifyNBS(testDoc, Pxy, Py, wordVec):
    vec = setOfWordVec(wordVec, testDoc)
    mat = np.dot([vec], Pxy.T) * Py.reshape(1, -1)
    mat = mat.squeeze()
    print(mat)
    idx = np.argmax(mat)
    return idx

In [11]:
''' test naive bayes '''
test_words = ['love', 'my', 'dalmation']
print(classifyNBS(test_words, train_Pxy, train_Py, wordVec))
test_words = ['I', 'stupid']
print(classifyNBS(test_words, train_Pxy, train_Py, wordVec))

[0.83333333 0.        ]
0
[0.16666667 0.5       ]
1


## 使用词袋模型

In [12]:
def bagOfWordVec(wordVec, words):
    N = len(wordVec)
    res = [0]*N
    for word in words:
        if word in wordVec:
            res[wordVec.index(word)] += 1
    return res

## 对样本字词进行拆解

In [13]:
def text_split(text):
    import re
    token_list = re.split(r'\W*', text)
    return [token.lower() for token in token_list if len(token) > 2]

In [14]:
def load_dataset(pos_dir, neg_dir):
    import os
    
    labels = []
    dataset = []
    
    pos_files = os.listdir(pos_dir)
    neg_files = os.listdir(neg_dir)
    
    for file in pos_files:
        with open(os.path.join(pos_dir, file), 'r', encoding='ISO-8859-1') as fp:
            words = []
            line = fp.read()
            words.extend(text_split(line))
            dataset.append(words)
            labels.append(1)
    
    for file in neg_files:
        with open(os.path.join(neg_dir, file), 'r', encoding='ISO-8859-1') as fp:
            words = []
            line = fp.read()
            words.extend(text_split(line))
            dataset.append(words)
            labels.append(0)
    
    return dataset, labels

In [15]:
pos_dir = './email/ham'
neg_dir = './email/spam'
dataset, labels = load_dataset(pos_dir, neg_dir)

print(dataset[:10])
print(labels[:10])

[['jay', 'stepp', 'commented', 'your', 'status', 'jay', 'wrote', 'the', 'reply', 'this', 'email', 'comment', 'this', 'status', 'see', 'the', 'comment', 'thread', 'follow', 'the', 'link', 'below'], ['yeah', 'ready', 'may', 'not', 'here', 'because', 'jar', 'jar', 'has', 'plane', 'tickets', 'germany', 'for'], ['scifinance', 'now', 'automatically', 'generates', 'gpu', 'enabled', 'pricing', 'risk', 'model', 'source', 'code', 'that', 'runs', '300x', 'faster', 'than', 'serial', 'code', 'using', 'new', 'nvidia', 'fermi', 'class', 'tesla', 'series', 'gpu', 'scifinance', 'derivatives', 'pricing', 'and', 'risk', 'model', 'development', 'tool', 'that', 'automatically', 'generates', 'and', 'gpu', 'enabled', 'source', 'code', 'from', 'concise', 'high', 'level', 'model', 'specifications', 'parallel', 'computing', 'cuda', 'programming', 'expertise', 'required', 'scifinance', 'automatic', 'gpu', 'enabled', 'monte', 'carlo', 'pricing', 'model', 'source', 'code', 'generation', 'capabilities', 'have', 'be

  return _compile(pattern, flags).split(string, maxsplit)


In [29]:
'''
shuffled the dataset and labels
'''
index = list(range(len(labels)))
random.shuffle(index)

print(index)
dataset_new = []
labels_new = []
for i in index:
    dataset_new.append(dataset[i])
    labels_new.append(labels[i])

print(dataset_new[:5])
print(labels_new[:5])

[2, 12, 13, 11, 40, 7, 3, 14, 35, 33, 16, 45, 39, 8, 34, 27, 29, 24, 47, 21, 31, 6, 37, 43, 22, 10, 20, 41, 26, 23, 42, 36, 5, 46, 18, 44, 38, 0, 4, 17, 48, 15, 9, 25, 32, 30, 28, 1, 49, 19]
[['scifinance', 'now', 'automatically', 'generates', 'gpu', 'enabled', 'pricing', 'risk', 'model', 'source', 'code', 'that', 'runs', '300x', 'faster', 'than', 'serial', 'code', 'using', 'new', 'nvidia', 'fermi', 'class', 'tesla', 'series', 'gpu', 'scifinance', 'derivatives', 'pricing', 'and', 'risk', 'model', 'development', 'tool', 'that', 'automatically', 'generates', 'and', 'gpu', 'enabled', 'source', 'code', 'from', 'concise', 'high', 'level', 'model', 'specifications', 'parallel', 'computing', 'cuda', 'programming', 'expertise', 'required', 'scifinance', 'automatic', 'gpu', 'enabled', 'monte', 'carlo', 'pricing', 'model', 'source', 'code', 'generation', 'capabilities', 'have', 'been', 'significantly', 'extended', 'the', 'latest', 'release', 'this', 'includes'], ['peter', 'with', 'jose', 'out', 

In [30]:
'''
split them to train set and test set
'''
N = int(len(labels)*0.8)
train_dataset = dataset_new[:N]
train_labels  = labels_new[:N]

test_dataset = dataset_new[N+1:]
test_labels  = labels_new[N+1:]

In [31]:
wordVec = buildWordVec(train_dataset)
print(len(wordVec))

554


In [34]:
def trainNBS1(train_data, train_lables, wordVec):
    ''' set of train labels '''
    label_v = list(set(train_lables))
    label_N = len(label_v)
    vec_len = len(wordVec)
    Ps = np.zeros((label_N, vec_len))
    Ns = np.zeros((label_N, 1))
    for data, label in zip(train_data, train_lables):
        vec = bagOfWordVec(wordVec, data)
        idx = label_v.index(label)
        Ps[idx] += np.array(vec)
        Ns[idx] += 1
    return Ps/Ns, Ns.squeeze()/len(train_lables)

def classifyNBS1(testDoc, Pxy, Py, wordVec):
    vec = bagOfWordVec(wordVec, testDoc)
    mat = np.dot([vec], Pxy.T) * Py.reshape(1, -1)
    mat = mat.squeeze()
    print(mat)
    idx = np.argmax(mat)
    return idx

In [35]:
Pxy, Py = trainNBS1(train_dataset, train_labels, wordVec)

cur = 0
for i in range(len(test_dataset)):
    test_data = test_dataset[i]
    test_label = test_labels[i]
    l = classifyNBS1(test_data, Pxy, Py, wordVec)
    if l == test_label:
        cur+=1
print(">>> right/all: {}/{}".format(cur, len(test_dataset)))
print(cur/len(test_dataset))

[2.125 7.3  ]
[0.625 2.175]
[1.9   0.975]
[1.3   2.725]
[0.9 0.2]
[8.025 2.225]
[0.325 0.375]
[4.5  2.25]
[ 6.1 19.6]
>>> right/all: 8/9
0.8888888888888888
