In [None]:
import os
from operator import itemgetter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Some utilities for the digit dataset.

In [None]:
def img2vector(filename):
    """Read a text-encoded 32x32 digit image into a 1x1024 row vector.

    Only the first 32 characters of the first 32 lines are read; each
    character must be parseable by ``int``.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, 1024); the pixel at row ``i``, column ``j`` of
        the image is stored at index ``32*i + j``.

    Raises
    ------
    IndexError
        If one of the first 32 lines has fewer than 32 characters.
    ValueError
        If a character is not a valid integer literal.
    """
    returnVect = np.zeros((1, 1024))  # images are 32x32, constituting 1024-dim vectors
    # The context manager guarantees the handle is closed, even when a
    # malformed line makes the parsing below raise.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

def loadDigits(dataDir):
    """Load every digit file found in ``dataDir``.

    The class label of a file is the integer that precedes the first
    underscore in its name (after stripping everything from the first
    dot onwards).

    Parameters
    ----------
    dataDir : str
        Directory whose entries are all read as digit images.

    Returns
    -------
    (numpy.ndarray, list)
        A matrix with one 1024-dim row per file, and the list of integer
        labels in the same order.
    """
    fileNames = os.listdir(dataDir)
    dataMat = np.zeros((len(fileNames), 1024))
    labels = []
    for row, fileName in enumerate(fileNames):
        stem = fileName.split('.')[0]  # drop the extension
        labels.append(int(stem.split('_')[0]))  # leading number is the class
        dataMat[row, :] = img2vector('%s/%s' % (dataDir, fileName))
    return dataMat, labels

Load the dataset first.

In [None]:
# Read both digit folders into (feature matrix, label list) pairs.
trainMat, trainLabels = loadDigits('trainingDigits')
testMat, testLabels = loadDigits('testDigits')
nTest = len(testMat)  # number of testing data (one row per test image)

Exercise 3-1: complete the code for kNN classification

In [None]:
def kNNclassify(inX, dataMat, labels, k):
    """Classify ``inX`` by majority vote among its k nearest data points.

    Distances are Euclidean. When several labels receive the same number
    of votes, the one met first while walking the neighbours from nearest
    to farthest wins.

    Parameters
    ----------
    inX : array-like
        Query point; must broadcast against the rows of ``dataMat``
        (e.g. shape ``(d,)`` or ``(1, d)``).
    dataMat : numpy.ndarray
        Matrix with one data point per row.
    labels : sequence
        Label of each row of ``dataMat``.
    k : int
        Number of neighbours that vote. Values larger than the number of
        rows are clamped to that number.

    Returns
    -------
    The label that collected the most votes.

    Raises
    ------
    ValueError
        If ``dataMat`` has no rows or ``k`` is smaller than 1.
    """
    dataSetSize = dataMat.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataMat must contain at least one row")
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(k, dataSetSize)  # cannot consult more neighbours than exist

    # Euclidean distance from inX to every row; broadcasting subtracts
    # inX from each data point at once.
    diffMat = dataMat - np.asarray(inX)
    distances = np.sqrt((diffMat ** 2).sum(axis=1))

    sortedDistIndicies = distances.argsort()  # sort in terms of distance (ascending)
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first maximal item, i.e. ties go to the label
    # inserted first (the one with the nearest neighbour).
    return max(classCount.items(), key=itemgetter(1))[0]

Then let's run kNN!

In [None]:
k = 10  # number of neighbours consulted by kNN
errorCount = 0.0
for i in range(nTest):
    expected = testLabels[i]
    classifierResult = kNNclassify(testMat[i, :], trainMat, trainLabels, k)
    print("the classifier came back with: %d, the real answer is: %s" % (classifierResult, expected))
    if classifierResult != expected:
        errorCount += 1.0
# Summary: absolute number of mistakes, then the fraction of the test set.
print("\nthe total number of errors is: %d" % errorCount)
print("\nthe total error rate is: %f" % (errorCount / float(nTest)))

Some utilities for text processing.

In [None]:
def textParse(bigString):
    """Split raw text into lowercase word tokens longer than two characters.

    Parameters
    ----------
    bigString : str
        The text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance, lowercased; tokens of length two
        or less are dropped.
    """
    import re
    # Split on runs of one or more non-word characters. A pattern that can
    # match the empty string (such as \W*) makes re.split cut between
    # every character on Python 3.7+, leaving only one-letter tokens.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def createVocabList(dataSet):
    """Return the distinct words that occur in any document of ``dataSet``.

    Parameters
    ----------
    dataSet : iterable of iterables
        Each element is one document given as a collection of words.

    Returns
    -------
    list
        Every distinct word exactly once; the order is that of set
        iteration and therefore not meaningful.
    """
    uniqueWords = set()
    for doc in dataSet:
        uniqueWords.update(doc)  # merge this document's words into the vocabulary
    return list(uniqueWords)

def bagOfWords2VecMN(vocabList, inputSet):
    """Count how often each vocabulary word occurs in ``inputSet``.

    Parameters
    ----------
    vocabList : list
        The vocabulary; position ``i`` of the result counts ``vocabList[i]``.
    inputSet : iterable
        The words of one document (repetitions are counted).

    Returns
    -------
    list of int
        Occurrence counts, same length as ``vocabList``. Words that are
        not in the vocabulary are ignored.
    """
    # Build a word -> position table once so every lookup is O(1) instead
    # of two linear scans (`in` and `.index`) per word. setdefault keeps
    # the first position of a repeated vocabulary entry, like list.index.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

Prepare the dataset for spam classification.

In [None]:
#load text data and label (1 for spam or 0 for not)
docList=[]; classList = []; fullText =[]
for i in range(1,26):
    # `with` closes each e-mail file as soon as it has been read
    with open('email/spam/%d.txt' % i) as fr:
        wordList = textParse(fr.read())
    docList.append(wordList)
    fullText.extend(wordList)
    classList.append(1)
    with open('email/ham/%d.txt' % i) as fr:
        wordList = textParse(fr.read())
    docList.append(wordList)
    fullText.extend(wordList)
    classList.append(0)

#create vocabulary
vocabList = createVocabList(docList)

#create training & test set
#NOTE: the split is random and unseeded, so the test set differs between runs
trainingSet = list(range(50)); testSet=[]
for i in range(10):  #randomly sample 10 examples for testing and rest for training
    randIndex = int(np.random.uniform(0,len(trainingSet)))
    testSet.append(trainingSet[randIndex])
    del(trainingSet[randIndex])

#convert to bag-of-words matrix (and corresponding labels)
trainMat=[]; trainClasses = []
for docIndex in trainingSet:  #one count vector and one label per training document
    trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
    trainClasses.append(classList[docIndex])

Exercise 3-2: complete the code for Naive Bayes classification.

In [None]:
def trainNB(trainMatrix, trainCategory):
    """Estimate the parameters of a two-class Naive Bayes text classifier.

    Word counts start at one and the per-class totals at 2.0 so that no
    word gets a zero probability; the per-word probabilities are returned
    as logarithms to avoid underflow when many of them are combined.

    Parameters
    ----------
    trainMatrix : sequence of bag-of-words vectors
        One row of word counts per training document.
    trainCategory : sequence of int
        Class of each document: 1 (spam) or 0.

    Returns
    -------
    (p0Vect, p1Vect, pClass1)
        ``p0Vect`` / ``p1Vect`` are the log conditional probabilities of
        each word given class 0 / class 1, and ``pClass1`` is the
        fraction of training documents in class 1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pClass1 = sum(trainCategory)/float(numTrainDocs)   #remember that we put one for spam
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)      #start at one (avoiding zero counts)
    p0Denom = 2.0; p1Denom = 2.0                        #start at 2.0 (avoiding zero)
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            #remember that trainMatrix[i] is a bag-of-words vector
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            #same accumulation for class 0: per-word counts and total word count
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)          #log() avoids multiplying many small probabilities
    p0Vect = np.log(p0Num/p0Denom)
    return p0Vect,p1Vect,pClass1

#p1Vect: vector of log conditional probabilities, log P(word | class 1), one entry per vocabulary word
#p0Vect: vector of log conditional probabilities, log P(word | class 0), one entry per vocabulary word
#pClass1: prior probability for class 1 (spam)

In [None]:
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    """Pick the more likely class for one bag-of-words vector.

    Parameters
    ----------
    vec2Classify : numpy.ndarray
        Word counts of the document to classify.
    p0Vect, p1Vect : numpy.ndarray
        Log conditional word probabilities for class 0 and class 1.
    pClass1 : float
        Prior probability of class 1; class 0 gets ``1 - pClass1``.

    Returns
    -------
    int
        1 if the class-1 log score is strictly larger, otherwise 0.
    """
    # Each score is the count-weighted sum of the word log-probabilities
    # plus the log prior of the class.
    p1 = sum(vec2Classify * p1Vect) + np.log(pClass1)    #element-wise mult
    p0 = sum(vec2Classify * p0Vect) + np.log(1.0 - pClass1)    #log probability for class 0
    if p1 > p0:
        return 1
    else:
        return 0

Prepare a Naive Bayes classifier.

In [None]:
# Fit the Naive Bayes model on the bag-of-words training matrix and labels.
p0V, p1V, pSpam = trainNB(
    np.array(trainMat),
    np.array(trainClasses),
)

Now we are ready to run the classifier.

In [None]:
errorCount = 0
for docIndex in testSet:  # classify the held-out documents
    wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])  # a testing example
    predicted = classifyNB(np.array(wordVector), p0V, p1V, pSpam)
    if predicted != classList[docIndex]:
        errorCount += 1
        print("classification error", docList[docIndex])
# Fraction of held-out documents that were misclassified.
print('the error rate is: ', float(errorCount) / len(testSet))