In [1]:
import numpy as np

In [2]:
def createDataSet():
    '''
    Instructions: Create the sample
    
    Parameters: 
        None
    
    Returns:
        postingList: the words list after splitting
        classVec: the labels array
    '''
    postingList = np.array([['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']])

    # labels array, 1 is positive and 0 is negative
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

postingList, classVec = createDataSet()

In [3]:
def createVocabList(dataSet):
    '''
    Instructions: generate a vocabulary applying the word lists in sample. There are no duplicated values in vocabulary
    
    Parameters:
        dataSet: the word lists
    
    Return 
        vocabSet: the non-duplicated word lists
    
    '''
    # create an empty set to store the vocabulary
    vocabSet = set([])
    for document in dataSet:
        # union operatino of set
        vocabSet = vocabSet | set(document)
    
    return list(vocabSet)


def wordSet2Vec(vocabList, DataSet):
    '''
    Instructions: vectorize the inputsSet based on the exsiting vocabulary list, the value of vectors are 1 or 2
    
    Parameters:
        vocabList - the vocabulary list of createVocabList
        inputSet - the word lists
        
    Returns:
        returnVec - the vector of word lists, 
                    the row of returnVec equals to row of word lists, the col of returnVec equals to the length of vocabList
    
    '''
    DataSet = np.array(DataSet)
    # get the row number
    rowNum = len(DataSet)
    # get the number of vocab features
    featureNum = len(vocabList)
    # initialize the vecotors, values the 0
    returnVec = np.zeros((rowNum, featureNum))

    # iter each row
    for row in range(rowNum):
        
        if type(DataSet[row]) != np.str_:
            for col in DataSet[row]:
                index = vocabList.index(col)
                returnVec[row, index] = 1
                
        else:
            for col in DataSet:
                index = vocabList.index(col)
                returnVec[row, index] = 1
                
    return returnVec

myVocabList = createVocabList(postingList)
print('myVocabList:\n',myVocabList)
trainMat = wordSet2Vec(myVocabList, postingList)
print('trainMat:\n', trainMat)

myVocabList:
 ['buying', 'quit', 'cute', 'mr', 'problems', 'posting', 'I', 'please', 'dalmation', 'stupid', 'garbage', 'worthless', 'licks', 'flea', 'my', 'maybe', 'steak', 'take', 'not', 'ate', 'food', 'park', 'is', 'dog', 'so', 'love', 'how', 'stop', 'to', 'him', 'help', 'has']
trainMat:
 [[0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1.
  0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  1. 1. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 1. 1. 1. 1. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 0.]]


In [4]:
# training the naive bayes classificaiton
def trainNB0(trainMatrix, trainCategory):
    '''
    input:
        trainMatrix: the training dataset
        trainCategory: the label, 0 and 1
    
    return:
        the prob of each features to tag 1
        the prob of each features to tag 0
        the prob of tag1
    
    '''
    # calcualte the number of training set
    numTrainDocs = len(trainMatrix)
    # calculate the number of features
    numWords = len(trainMatrix[0])
    # calculate the prob of label 1
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # initialize the frequency of each features in both tags
    # set the value of 1, in order to Laplace smoothing
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)

    # initialize the denumerate of 2, using laplace smoothing
    p0Denom = 2.0
    p1Denom = 2.0

    for i in range(numTrainDocs):

    # count the date of label 1, P(w0|1),P(w1|1),P(w2|1)···
    # P(w0 | 1) equals the counts of w0 in tags 1 divided by the counts of all words in tags 1
        if trainCategory[i] == 1:
            # here using the matrix operation, because the dimension of p1Num equals that of each row
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        
        # count the date of label 0, P(w0|1),P(w1|1),P(w2|1)···
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])

    # applying the log operation in case the product of prob is too small 
    p1Vect = np.log(p1Num/p1Denom)
    p0Vect = np.log(p0Num/p0Denom)

    return p0Vect, p1Vect, pAbusive

In [5]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): 
    """
    Parameters:
        vec2Classify - the sentences will be classified
        p0Vec - the prob arrays of features in tag 0
        p1Vec -the prob arrays of features in tag 1
        pClass1 - the prob of tag 1
    Returns:
        0 - the label 0
        1 - the label 1
    """
    # calculate the prob of belonging to tag 1, tag 0, respectively. logA * B = logA + logB, so here should add the log(pClass1)
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)

    # if there is no log operation, using multiplying other than adding of log
    # p1 = reduce(lambda x,y: x * y, vec2Classify * p1Vec) * pClass1
    # p0 = reduce(lambda x,y: x * y, vec2Classify * p0Vec) * (1.0 - pClass1)
    print('p0:', p0)
    print('p1:', p1)

    if p1 > p0:
        return 1
    else:
        return 0

In [6]:
def testingNB(testEntry, myVocabList, p0V, p1V, pAb):
    '''
    Instructions: testing the sample
    
    Parameters:
        testEntry: the testing data
        myVocabList: the vocabulary list
        p0V: the prob arrays of features in tag 0
        p1V: the prob arrays of features in tag 1
        pAb: the prob of tag 1
        
    Returns:
        the tag of test sample   
    '''
    # vectorize the test sample
    thisDoc = np.array(wordSet2Vec(myVocabList, testEntry))

    # return positive label
    if classifyNB(thisDoc, p0V, p1V, pAb):
        print(testEntry,'belongs to positive one')
        return 1
    # return negative label 0
    else:
        print(testEntry,'belongs to negative one')
        return 0

In [7]:
p0V, p1V, pAb = trainNB0(trainMat, classVec)
print('p0V:\n', p0V)
print('p1V:\n', p1V)
print('classVec:\n', classVec)
print('pAb:\n', pAb)

# testing sample 1
testEntry = ['love', 'my', 'dalmation']
testingNB(testEntry, myVocabList, p0V, p1V, pAb)

# testing sample 2
testEntry = ['stupid', 'garbage']
testingNB(testEntry, myVocabList, p0V, p1V, pAb)

p0V:
 [-3.25809654 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654
 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -3.25809654 -3.25809654
 -2.56494936 -2.56494936 -1.87180218 -3.25809654 -2.56494936 -3.25809654
 -3.25809654 -2.56494936 -3.25809654 -3.25809654 -2.56494936 -2.56494936
 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -2.15948425
 -2.56494936 -2.56494936]
p1V:
 [-2.35137526 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -2.35137526
 -3.04452244 -3.04452244 -3.04452244 -1.65822808 -2.35137526 -1.94591015
 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -3.04452244 -2.35137526
 -2.35137526 -3.04452244 -2.35137526 -2.35137526 -3.04452244 -1.94591015
 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -2.35137526
 -3.04452244 -3.04452244]
classVec:
 [0, 1, 0, 1, 0, 1]
pAb:
 0.5
p0: -21.69824985603394
p1: -28.093849120070754
['love', 'my', 'dalmation'] belongs to negative one
p0: -13.725533332645874
p1: -8.712353848093965
['stupid', 'garbage'

1