In [6]:
# coding:utf-8

#------------ 项目案例1 屏蔽社区留言板的侮辱性言论 -----------
import numpy as np

"""
贝叶斯
p(xy) = p(x|y)p(y)=p(y|x)p(x)
"""

def load_data_set():
    """
    创建数据集
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'gar e'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']
    ]
    class_vec = [0, 1, 0, 1, 0, 1]
    return posting_list, class_vec

def create_vocab_list(dataset):
    """
    获取所有单词的集合
    :param data_set
    :return
    """
    vocab_set = set()
    for item in dataset:
        vocab_set = vocab_set | set(item)
    return list(vocab_set)

def set_of_words2vec(vocab_list, input_set):
    """
    遍历查看该单词是否出现,出现该单词则将单词置为1
    :param vocab_list: 所有单词集合列表
    :param input_set:  输入数据集
    :return: 匹配列表[0, 1, 0, 1...]
    """
    result = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            result[vocab_list.index(word)] = 1
        else:
            pass
    return result

def _train_navie_bayes(train_mat, train_category):
    """
    朴素贝叶斯分类原版
    :param train_mat: type is ndarray
    :param train_category: [0, 1, 0]
    :return:
    """
    train_doc_num = len(train_mat)
    words_num = len(train_mat[0])
    # 因为侮辱性的被标记为1
    pos_abusive = np.sum(train_category)/train_doc_num
    p0num = np.zeros(words_num)
    p1num = np.zeros(words_num)
    
    #整个数据集单词出现的次数
    p0num_all = 0
    p1num_all = 0
    
    for i in range(train_doc_num):
        # 遍历所有的文件
        if train_category[i] == 1:
            p1num += train_mat[i]
            p1num_all += np.sum(train_mat[i])
        else:
            p0num += train_mat[i]
            p0num_all += np.sum(train_mat[i])
    # 后面需要改成取log函数
    p1vec = p1num/p1num_all
    p0vec = p0num/p0num_all
    return p0vec, p1vec, pos_abusive

def train_navie_bayes(train_mat, train_category):
    """
    朴素贝叶斯分类修正版
    :param train_mat: type is ndarray
    :param train_category:
    :return
    """
    train_doc_num = len(train_mat)
    words_num = len(train_mat[0])
    
    pos_abusive = np.sum(train_category)/train_doc_num
    p0num = np.ones(words_num)
    p1num = np.ones(words_num)
    p0num_all = 2.0
    p1num_all = 2.0
    
    for i in range(train_doc_num):
        if train_category[i] == 1:
            p1num += train_mat[i]
            p1num_all += np.sum(train_mat[i])
        else:
            p0num += train_mat[i]
            p0num_all += np.sum(train_mat[i])
    # 取log函数
    p1vec = np.log(p1num/p1num_all)
    p0vec = np.log(p0num/p0num_all)
    return p1vec, p0vec, pos_abusive

def classify_naive_bayes(vec2classify, p0vec, p1vec, p_class1):
    p1 = np.sum(vec2classify * p1vec) + np.log(p_class1)
    p0 = np.sum(vec2classify * p0vec) + np.log(1 - p_class1)
    if p1 > p0:
        return 1
    else:
        return 0

def bag_words2vec(vocab_list, input_set):
    result = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            result[vocab_list.index(word)] += 1
        else:
            print('the word: {} is not in my vocabulary'.format(word))
    return result

def testing_naive_bayes():
    #1. 加载数据集
    list_post, list_classes = load_data_set()
    #2. 创建单词集合
    vocab_list = create_vocab_list(list_post)
    #3.计算单词是否出现并创建数据矩阵
    train_mat = []
    for post_in in list_post:
        train_mat.append(set_of_words2vec(vocab_list, post_in))
    #4.训练数据
    p0v, p1v, p_abusive = train_navie_bayes(np.array(train_mat), np.array(list_classes))
    #5.测试数据
    test_one = ['love', 'my', 'dalmation']
    test_one_doc = np.array(set_of_words2vec(vocab_list, test_one))
    print('the result is: {}'.format(classify_naive_bayes(test_one_doc, p0v, p1v, p_abusive)))
    test_two = ['stupid', 'garbage']
    test_two_doc = np.array(set_of_words2vec(vocab_list, test_two))
    print('the result is :{}'.format(classify_naive_bayes(test_two_doc, p0v, p1v, p_abusive)))


#testing_naive_bayes()


    

the result is: 1
the result is :0
