In [1]:
# import

import numpy as np
import pandas as pd
import math

In [2]:
# read excel and geenrate dataframe
np.random.seed(3)
raw_dataset = pd.DataFrame(pd.read_excel(r'dataset_watermelon3.xlsx'))
raw_dataset = raw_dataset.sample(frac = 1).reset_index(drop=True) # shuffle
raw_dataset.loc[raw_dataset['好瓜']=='是','好瓜']=1
raw_dataset.loc[raw_dataset['好瓜']=='否','好瓜']=0
print(raw_dataset)

# one-hot encoding
dataset = pd.DataFrame()
for col in raw_dataset.iloc[:,:-3].columns :
    for element in raw_dataset[col].unique():
        dataset[col+element]=np.zeros_like(raw_dataset[col])
        dataset.loc[raw_dataset[col]==element,col+element]=1
for col in raw_dataset.iloc[:,-3:].columns :
    dataset[col]=raw_dataset[col]
    
print(dataset)


    色泽  根蒂  敲声  纹理  脐部  触感     密度    含糖率  好瓜
0   青绿  蜷缩  沉闷  稍糊  稍凹  硬滑  0.719  0.103   0
1   乌黑  稍蜷  浊响  稍糊  稍凹  软粘  0.481  0.149   1
2   浅白  蜷缩  浊响  清晰  凹陷  硬滑  0.556  0.215   1
3   乌黑  稍蜷  浊响  清晰  稍凹  硬滑  0.437  0.211   1
4   乌黑  稍蜷  浊响  清晰  稍凹  软粘  0.360  0.370   0
5   乌黑  蜷缩  沉闷  清晰  凹陷  硬滑  0.774  0.376   1
6   浅白  蜷缩  浊响  模糊  平坦  软粘  0.343  0.099   0
7   乌黑  蜷缩  浊响  清晰  凹陷  硬滑  0.634  0.264   1
8   浅白  稍蜷  沉闷  稍糊  凹陷  硬滑  0.657  0.198   0
9   青绿  稍蜷  浊响  清晰  稍凹  软粘  0.403  0.237   1
10  青绿  蜷缩  浊响  清晰  凹陷  硬滑  0.697  0.460   1
11  青绿  稍蜷  浊响  稍糊  凹陷  硬滑  0.639  0.161   0
12  浅白  蜷缩  浊响  模糊  平坦  硬滑  0.593  0.042   0
13  青绿  蜷缩  沉闷  清晰  凹陷  硬滑  0.608  0.318   1
14  青绿  硬挺  清脆  清晰  平坦  软粘  0.243  0.267   0
15  乌黑  稍蜷  沉闷  稍糊  稍凹  硬滑  0.666  0.091   0
16  浅白  硬挺  清脆  模糊  平坦  硬滑  0.245  0.057   0
    色泽青绿  色泽乌黑  色泽浅白  根蒂蜷缩  根蒂稍蜷  根蒂硬挺  敲声沉闷  敲声浊响  敲声清脆  纹理稍糊  纹理清晰  纹理模糊  \
0      1     0     0     1     0     0     1     0     0     1     0     0   
1      0     1     0     0     1  

In [3]:
# define the class
class Naive_Bayes:
    
    def __init__(self,dataset):
        self.dataset = dataset
            
        
    def MLE(self,sample_density,sample_sugar_content,class_i_dataset):
        '''
        实现了正态分布样本的极大似然估计，返回样本特征满足正态分布条件的概率
        '''
        def compute_probability(x,mu,sigma):
            exp = math.exp(-(x-mu)**2/(2*(sigma**2)))
            coe = 1/(((2*math.pi)**0.5)*sigma)
            return coe*exp
        density = class_i_dataset[:,-3] # [N,1]
        sugar_content = class_i_dataset[:,-2] # [N,1]
        
        mu_d = density.mean()
        mu_s = sugar_content.mean()
        
        sigma_d = np.std(density)
        sigma_s = np.std(sugar_content)
        
        return (compute_probability(sample_density, mu_d, sigma_d),
                        compute_probability(sample_sugar_content, mu_s, sigma_s) )
        
    def classification(self,sample_property,train_classification,class_0_set,class_1_set):
        # 样本为0类的概率（贝叶斯）
        num_of_train_set = len(train_classification)
        num_of_class_0 = len(train_classification[train_classification==0])
        class_0_probability = 1*num_of_class_0/num_of_train_set
        for i in range(len(sample_property)-2):
            class_0_probability *= len(class_0_set[class_0_set[:,i]==sample_property[i]]) \
                                    /num_of_class_0
        
        p_den_0,p_sug_0 = self.MLE(sample_property[-2],sample_property[-1],class_0_set)
        class_0_probability = class_0_probability*p_den_0*p_sug_0
        #-------------------------------------------------------------
        # 样本为1类的概率（贝叶斯）
        num_of_class_1 = len(train_classification[train_classification==1])
        class_1_probability = 1*num_of_class_1/num_of_train_set
        for i in range(len(sample_property)-2):
            class_1_probability *= len(class_1_set[class_1_set[:,i]==sample_property[i]]) \
                                    /num_of_class_1
        p_den_1,p_sug_1 = self.MLE(sample_property[-2],sample_property[-1],class_1_set)
        class_1_probability = class_1_probability*p_den_1*p_sug_1
        
        return (1 if class_0_probability<class_1_probability else 0)   

    def Cross_validation(self, k=5):
        '''
        实现了交叉验证
        '''
        num_in_each_group = int(len(self.dataset)/k)
        correct_classification = 0
        wrong_classification = 0

        for i in range(0,k):
            # 分为k组，每次选取一组作为验证集，其余组作为训练集
            k_train_set = pd.concat([self.dataset.iloc[0:i*num_in_each_group,:],  \
                                self.dataset.iloc[(i+1)*num_in_each_group:,:]])
            k_train_classification =  k_train_set.iloc[:,-1].values
            k_class_0_set = k_train_set[k_train_set['好瓜']==0].values 
            k_class_1_set = k_train_set[k_train_set['好瓜']==1].values 

            k_val_set = self.dataset.iloc[i*num_in_each_group:(i+1)*num_in_each_group,:]
            k_val_property = k_val_set.iloc[:,:-1].values

            for j in range((k_val_property.shape[0])) :
                # 对验证集的每一个样本进行验证
                if(k_val_set.iloc[j,-1]==   \
                        self.classification(k_val_property[j,:],k_train_classification,k_class_0_set,k_class_1_set)):
                    correct_classification += 1
                else :
                    wrong_classification += 1
        # print(correct_classification,wrong_classification)
        return correct_classification/(correct_classification+wrong_classification)
    

In [4]:
Bayes_watermelon = Naive_Bayes(dataset)
print('Accuracy with Cross Validation:',"%.2f%%"  %(Bayes_watermelon.Cross_validation()*100))

Accuracy with Cross Validation: 66.67%
