<a href="https://colab.research.google.com/github/wakerh1/SF/blob/master/%E5%A4%9A%E5%88%86%E7%B1%BB%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from collections import Counter
from sklearn import datasets
import pandas as pd
import time
from sklearn.model_selection import train_test_split

In [0]:
class NaiveBayes:
    """Naive Bayes classifier for discrete features and any number of classes.

    Probabilities are estimated with additive (Bayesian) smoothing controlled
    by ``lamb``; ``lamb=1`` is Laplace smoothing.
    """

    def __init__(self, lamb=1):
        self.lamb = lamb  # additive smoothing parameter
        self.prior = dict()  # label -> smoothed prior probability
        self.conditional = dict()  # feature index -> value -> label -> smoothed P(value | label)
        self._class_count = dict()  # label -> number of training samples carrying that label

    def training(self, features, target):
        """
        Estimate smoothed prior and conditional probabilities from the data.

        Any state left over from a previous call is discarded first, so the
        model always reflects only the most recent training set.

        :param features: m*n array-like of discrete feature values
            (m samples, n features)
        :param target: array-like holding the m class labels
        :return: nothing; ``prior`` and ``conditional`` are updated in place
        """
        features = np.array(features)
        m, n = features.shape
        target = np.array(target).reshape(m)

        # Start from a clean slate so labels from an earlier fit do not linger.
        self.prior = {}
        self.conditional = {}

        labels = Counter(target.tolist())  # number of samples per class
        self._class_count = dict(labels)
        k = len(labels)  # number of classes
        for label, amount in labels.items():
            self.prior[label] = (amount + self.lamb) / (m + k * self.lamb)

        # Row masks depend only on the label, so build them once instead of
        # once per (feature, value, label) combination.
        masks = {label: target == label for label in labels}

        for feature in range(n):
            if feature % 800 == 0:
                print("-----百分之:%f-----"%(feature/n*100))
            column = features[:, feature]
            values = np.unique(column)
            # Occurrences of each value of this feature within each class.
            counts = {label: Counter(column[mask].tolist())
                      for label, mask in masks.items()}
            table = {}
            for value in values:
                table[value] = {
                    label: (counts[label].get(value, 0) + self.lamb)
                    / (amount + len(values) * self.lamb)
                    for label, amount in labels.items()
                }
            self.conditional[feature] = table
        return

    def predict(self, features):
        """Predict the label of a single sample.

        Log-probabilities are summed instead of multiplying probabilities to
        avoid floating-point underflow. A feature value that never occurred
        in training is scored with the smoothed estimate for a zero count
        rather than raising ``KeyError``.

        :param features: 1-D sequence of feature values, one per trained feature
        :return: the label with the highest posterior, or -1 if none was scored
        """
        best_poster, best_label = -np.inf, -1
        for label, prior in self.prior.items():
            poster = np.log(prior)
            for feature, value in enumerate(features):
                table = self.conditional[feature]
                by_label = table.get(value)
                if by_label is None:
                    # Unseen value: zero observed count, same denominator as
                    # the seen values of this feature.
                    prob = self.lamb / (self._class_count[label] + len(table) * self.lamb)
                else:
                    prob = by_label[label]
                poster += np.log(prob)
            if poster > best_poster:  # keep the class with the largest posterior
                best_poster = poster
                best_label = label
        return best_label

In [0]:
def loadDataSet(doc):
    """Load a CSV dataset and encode its class column as integer ids.

    The first column of the file is taken as the class label. Each distinct
    label is mapped to an integer id in order of first appearance, so the
    encoding is the same on every run (iterating a ``set`` of string labels
    is not).

    NOTE(review): the feature slice ``1:-1`` leaves out the last column of
    the file as well as the label column - confirm this is intended.

    :param doc: path (or anything ``pandas.read_csv`` accepts) of the CSV file
    :return: ``(features, labels)`` - the feature columns and the encoded
        label column, both taken from the same loaded array
    """
    data = np.array(pd.read_csv(doc))
    wordsList = data[:, 1:-1]
    reslable = data[:, 0]

    # dict.fromkeys de-duplicates while preserving first-appearance order.
    classVec = {name: idx for idx, name in enumerate(dict.fromkeys(reslable))}

    # Iterate over a copy because the column is overwritten in place.
    for i, name in enumerate(reslable.copy()):
        reslable[i] = classVec.get(name)
    return wordsList, reslable

In [0]:
def test(doc="retrain.csv"):
    """Train NaiveBayes on a CSV dataset and print timing and hold-out accuracy.

    The data is split 67/33; the model is fitted on the training part only
    and scored on the held-out part, so the printed accuracy is not inflated
    by samples the model has already seen.

    :param doc: path of the CSV file handed to ``loadDataSet``
    :return: nothing; the sample count, elapsed seconds and accuracy are printed
    """
    dataset, lable = loadDataSet(doc)
    print("数据量为:%d"%(dataset.shape[0]))
    dataset = np.concatenate((dataset, lable.reshape(-1, 1)), axis=1)  # features + label column
    # This unseeded shuffle means the split differs between runs even though
    # train_test_split is given a fixed random_state.
    np.random.shuffle(dataset)
    X_train, X_test, y_train, y_test = train_test_split(
        dataset[:, :-1], dataset[:, -1], test_size=0.33, random_state=42)

    start = time.time()
    nb = NaiveBayes()
    nb.training(X_train, y_train)  # fit on the training split only

    correct = []
    for features, expected in zip(X_test, y_test):
        try:
            hit = nb.predict(features) == expected
        except KeyError:
            # The classifier has no table entry for one of this sample's
            # feature values; count it as a miss instead of aborting.
            hit = False
        correct.append(1 if hit else 0)
    end = time.time()
    print(end - start)
    print(correct.count(1) / len(correct))  # accuracy on the held-out split

In [30]:
# Run the evaluation; the CSV file that test() loads must exist in the working directory.
test()

FileNotFoundError: ignored