In [1]:
import numpy as np
import operator
from collections import defaultdict

In [2]:
def getSigmaMean(dataSet, labels):
    '''
    Instruction: training step of the Naive Bayes classifier for numeric features

    Parameters:
        dataSet - sequence of rows, one row of numeric feature values per sample
        labels - tag of each row, indexed in step with dataSet

    Returns:
        statsByTag - dict keyed by tag; each value is an array wrapping a single
            (means, stds) pair computed column-wise over the rows of that tag
            {0: [([avrg1, avrg2, ...], [sigma1, sigma2, ...])], 1: [([avrg1, ...], [sigma1, ...])], ...}
    '''
    # tag -> per-feature statistics (filled in below)
    statsByTag = defaultdict(list)
    # tag -> rows that carry this tag
    rowsByTag = defaultdict(list)
    # the distinct tags, in the order the statistics will be stored
    distinctTags = list(set(labels))

    # bucket every row under its own tag
    for rowIdx in range(len(dataSet)):
        rowsByTag[labels[rowIdx]].append(dataSet[rowIdx])

    # column-wise mean and standard deviation (np.std default) for each bucket
    for tag in distinctTags:
        block = np.array(rowsByTag[tag])
        colMeans = np.mean(block, axis=0)
        colStds = np.std(block, axis=0)
        # keep the nested layout callers index as value[0][0] (means) and value[0][1] (stds)
        statsByTag[tag] = np.array([(list(colMeans), list(colStds))])

    return statsByTag

In [3]:
def classerProb(labels):
    '''
    Instruction: calculate the Laplace-smoothed prior probability of each tag

    Parameters:
        labels: the label list

    Return:
        dict mapping each tag to log10((count + 1) / (len(labels) + number of distinct tags)),
        rounded to 4 decimal places

    '''
    # count how many times each tag occurs
    classesDict = defaultdict(int)
    for each in labels:
        classesDict[each] += 1

    # number of distinct tags and total number of labels, both used by the smoothing term
    labelNum = len(classesDict)
    lengthLabels = len(labels)

    probResults = defaultdict(float)
    for eachClass, eachNum in classesDict.items():
        # +1 in the numerator and +labelNum in the denominator is the Laplace smoothing
        smoothed = (1 + eachNum) / (lengthLabels + labelNum)
        # np.log has no base argument (its second positional parameter is `out`),
        # so the base-10 logarithm has to come from np.log10
        probResults[eachClass] = round(np.log10(smoothed), 4)

    return probResults

In [4]:
def gaussProb( mean, sigam, x):
    '''
    Instruction: calculate the log10 probability density under the Gauss distribution

    Parameters:
        mean - the average of each feature (scalar or array)
        sigam - the std of each feature (scalar or array, same layout as mean)
        x - input data (scalar or array that broadcasts against mean and sigam)

    Return:
        log10 of the Gaussian density of x, element-wise, rounded to 4 decimal places
        (one value per feature when arrays are passed)

    '''
    mean = np.asarray(mean, dtype=float)
    x = np.asarray(x, dtype=float)
    # floor the variance so a feature that is constant within a class (std == 0)
    # gives a finite value instead of a division by zero
    variance = np.maximum(np.asarray(sigam, dtype=float) ** 2, 1e-18)

    # log10 of the normalisation factor 1 / sqrt(2 * pi * sigma^2);
    # np.log takes no base argument (its second positional parameter is `out`), hence np.log10
    normTerm = -0.5 * np.log10(2 * np.pi * variance)
    # log10(exp(-z)) == -z * log10(e); evaluating it analytically avoids exp() underflowing
    # to 0 (and the log turning into -inf) when x is far from the mean
    expTerm = -((x - mean) ** 2) / (2 * variance) * np.log10(np.e)

    return np.round(normTerm + expTerm, 4)

In [5]:
def NB_classify(trains, tlabels, testss):
    '''
    Instructions: classify input data with the Naive Bayes model fitted on the training set

    Parameters:
        trains: the feature rows of the training dataset
        tlabels: the labels of the training dataset
        testss: the data to classify; it is passed as-is to gaussProb and every
            returned value is summed into one score per tag
            (presumably a single sample's feature vector - confirm with callers)

    Returns:
        the tag with the highest score, where the score of a tag is the sum of the
        gaussProb values for testss plus the classerProb value of that tag

    '''
    # fit: per-tag feature statistics and per-tag prior
    statsByTag = getSigmaMean(trains, tlabels)
    priorByTag = classerProb(tlabels)

    # score every tag: summed per-feature densities plus the prior of the tag
    scoreByTag = {}
    for tag, stats in statsByTag.items():
        featureMeans, featureStds = stats[0][0], stats[0][1]
        densitySum = np.sum(gaussProb(featureMeans, featureStds, testss))
        scoreByTag[tag] = densitySum + priorByTag[tag]

    # rank the tags from the largest score to the smallest and return the winner
    ranking = sorted(scoreByTag.items(), key=operator.itemgetter(1), reverse=True)
    bestTag = ranking[0][0]
    return bestTag