In [1]:
import numpy as np

In [2]:
# Train the naive Bayes classifier
def trainNB0(trainMatrix, trainCategory):
    '''
    Estimate the parameters of a two-class naive Bayes model.

    input:
        trainMatrix: 2-D array-like of numeric feature values, one row per
            training document and one column per feature
        trainCategory: 1-D array-like with one label per row of trainMatrix;
            rows labelled 1 are counted for tag 1, every other row for tag 0

    return:
        p0Vect: log of the smoothed per-feature probabilities for tag 0
        p1Vect: log of the smoothed per-feature probabilities for tag 1
        pAbusive: fraction of training rows labelled 1

    raises:
        ValueError: if trainMatrix is not a non-empty 2-D matrix, or if
            trainCategory does not hold exactly one label per row
    '''
    # dtype=float matches the float accumulators (np.ones) used for the counts
    docs = np.asarray(trainMatrix, dtype=float)
    labels = np.asarray(trainCategory)

    if docs.ndim != 2 or docs.shape[0] == 0:
        raise ValueError('trainMatrix must be a non-empty 2-D matrix '
                         '(one row per training document)')

    numTrainDocs = docs.shape[0]
    if labels.shape != (numTrainDocs,):
        # a longer label list would otherwise silently distort the prior
        raise ValueError('trainCategory must contain exactly one label per '
                         'row of trainMatrix')

    # the same "== 1" test drives both the prior and the per-class counts,
    # so the two can never disagree about which rows belong to tag 1
    isTag1 = labels == 1
    pAbusive = np.count_nonzero(isTag1) / float(numTrainDocs)

    tag1Docs = docs[isTag1]
    tag0Docs = docs[~isTag1]

    # smoothing: every per-feature count starts at 1 and every denominator
    # starts at 2.0 (constants kept unchanged), so no probability is 0
    p1Num = 1.0 + tag1Docs.sum(axis=0)
    p0Num = 1.0 + tag0Docs.sum(axis=0)
    p1Denom = 2.0 + tag1Docs.sum()
    p0Denom = 2.0 + tag0Docs.sum()

    # work in log space so that products of many small probabilities
    # become sums and do not underflow
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)

    return p0Vect, p1Vect, pAbusive


In [3]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    Pick tag 0 or tag 1 for one feature vector by comparing two log scores.

    Each score is the sum of the element-wise product of the feature vector
    with that tag's vector, plus the log of that tag's prior. Both scores
    are printed before the decision is returned.

    Parameters:
        vec2Classify - feature vector of the sample to classify
        p0Vec - per-feature values for tag 0 (expected to be log-probabilities,
                since they are added to a log prior)
        p1Vec - per-feature values for tag 1 (same expectation as p0Vec)
        pClass1 - the prob of tag 1; tag 0 uses 1.0 - pClass1
    Returns:
        1 - the score of tag 1 is strictly greater than the score of tag 0
        0 - otherwise (a tie also yields 0)
    """
    # log(A * B) = log(A) + log(B): the product of probabilities becomes a
    # sum of logs, so the log prior is added to the summed feature terms
    featureTerm1 = np.sum(vec2Classify * p1Vec)
    score1 = featureTerm1 + np.log(pClass1)

    featureTerm0 = np.sum(vec2Classify * p0Vec)
    score0 = featureTerm0 + np.log(1.0 - pClass1)

    # report both scores, tag 0 first
    for caption, score in (('p0:', score0), ('p1:', score1)):
        print(caption, score)

    return 1 if score1 > score0 else 0

In [4]:
def testingNB(testEntry, myVocabList, p0V, p1V, pAb):
    '''
    Instructions: classify a single test sample

    The sample is turned into a vector with wordSet2Vec and then handed to
    classifyNB together with the trained parameters.

    Parameters:
        testEntry: the testing data
        myVocabList: the vocabulary list
        p0V: the prob arrays of features in tag 0
        p1V: the prob arrays of features in tag 1
        pAb: the prob of tag 1

    Returns:
        1 when classifyNB gives a truthy result, otherwise 0
    '''
    # build the feature vector for the sample against the vocabulary
    sampleVec = np.array(wordSet2Vec(myVocabList, testEntry))

    # normalise the classifier's answer to a plain 1 / 0 label
    predicted = classifyNB(sampleVec, p0V, p1V, pAb)
    return 1 if predicted else 0