## 使用k-近邻算法的手写识别系统

（1）收集数据
（2）准备数据：编写函数img2vector，将图像格式转换为分类器使用的向量格式

In [1]:
import numpy as np
import operator

import matplotlib
import matplotlib.pyplot as plt

##### 为了使用前面两个例子的分类器，必须将图像格式化处理为一个向量。我们将把一个32x32的二进制图像矩阵转换成一个1x1024的向量，这样前两节使用的分类器就可以处理数字图像信息了。

##### 首先编写函数img2vector，将图像转换为向量：该函数创建1x1024的NumPy数组，然后打开给定的文件，循环读出文件的前32行，并将每行的前32个字符值存储在NumPy数组中，最后返回数组。

In [2]:
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 lines of the file are read and the first 32 characters of
    each line are converted with ``int``.  Pixel ``(i, j)`` is stored at
    column ``32 * i + j`` of the result.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy float array of shape ``(1, 1024)``.

    Raises:
        IndexError, ValueError: When the file does not provide 32 lines of
            at least 32 digit characters each.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even if parsing fails part-way;
    # previously the handle was never closed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


In [26]:
# Load one test digit as a 1x1024 vector, then display the first 31 of the
# 32 pixels in image row 0 (the slice 0:32 would cover the whole row).
tVector = img2vector('D:/10-Book/MachineLearninginaction/Ch02/testDigits/0_13.txt')
tVector[0,0:31]


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [27]:
# Entries 32..62: the first 31 pixels of image row 1 (row 1 spans 32:64).
tVector[0,32:63]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [8]:
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Args:
        inX: Feature vector to classify; it is tiled once per training row.
        dataSet: 2-D NumPy array with one training sample per row.
        labels: Sequence where ``labels[i]`` is the class of ``dataSet[i]``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the k nearest rows.
    """
    sampleCount = dataSet.shape[0]  # number of training rows
    # Repeat inX once per training row so the subtraction is element-wise.
    offsets = np.tile(inX, (sampleCount, 1)) - dataSet
    # Euclidean distance: squared differences are summed within each row
    # (axis=1), giving one distance per training sample.
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    nearestFirst = distances.argsort()  # indices ordered by ascending distance
    votes = {}
    # Tally the labels of the k closest samples.
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1
    # Order (label, count) pairs by descending vote count; the sort is stable,
    # so equally voted labels keep the dict's iteration order.
    ranked = list(votes.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[0][0]

### 手写数字识别系统的测试代码

In [37]:
from os import listdir

def handwritingClassTest(trainingDir='D:/10-Book/MachineLearninginaction/Ch02/trainingDigits',
                         testDir='D:/10-Book/MachineLearninginaction/Ch02/testDigits',
                         k=3):
    """Evaluate the kNN digit classifier and print the misclassifications.

    Every file in ``trainingDir`` becomes one row of the training matrix;
    every file in ``testDir`` is classified with ``classify0`` and compared
    with the label parsed from its file name.  Each wrong prediction is
    printed as ``predicted actual``, followed by the total error count and
    the error rate.

    Relies on ``img2vector``, ``classify0``, ``listdir`` and ``np`` from the
    surrounding module.

    Args:
        trainingDir: Directory holding the training digit files.
        testDir: Directory holding the test digit files.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None; results are printed.
    """
    hwLabels = []
    trainingFileList = listdir(trainingDir)  # directory contents
    m = len(trainingFileList)
    # One row per training image, 1024 pixel values each.
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # A name such as 9_45.txt denotes the 45th sample of digit 9: strip
        # the extension, then take the part before '_' as the class label.
        fileStr = fileNameStr.split('.')[0]
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))
    # The test files are parsed the same way, but each vector is classified.
    # The original note states the values already lie in 0-1, so no
    # normalisation step is applied.
    testFileList = listdir(testDir)
    errorCount = 0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        if classifierResult != classNumStr:
            print(classifierResult,classNumStr)
            errorCount += 1
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("总错误数量为：%d" % errorCount)
    print("错误率为：%f"  % errorRate)

In [38]:
 # Run the evaluation; the output below lists each misclassification as
 # "predicted actual", then the error totals.
 handwritingClassTest()

7 1
9 3
3 5
6 5
6 8
3 8
1 8
1 8
1 9
7 9
总错误数量为：10
错误率为：0.010571


## 小结

k近邻算法是分类数据最简单最有效的算法，本章通过两个例子讲述了如何使用k近邻算法构造分类器。k近邻算法是基于实例的学习，使用算法时我们必须有接近实际数据的训练样本数据。k近邻算法必须保存全部数据集，如果训练数据集很大，必须使用大量的存储空间。
此外，由于必须对数据集中的每个数据计算距离值，实际使用时可能非常耗时。
k近邻算法的另一个缺陷是它无法给出任何数据的基础结构信息，因此我们无法知晓平均实例样本和典型实例样本具有什么特征。下一章使用概率测量方法处理分类问题，该算法可以解决这个问题。

In [30]:
def handwritingClassTest(trainingDir='D:/10-Book/MachineLearninginaction/Ch02/trainingDigits',
                         testDir='D:/10-Book/MachineLearninginaction/Ch02/testDigits',
                         k=3):
    """Evaluate the kNN digit classifier, printing every prediction.

    Every file in ``trainingDir`` becomes one row of the training matrix;
    every file in ``testDir`` is classified with ``classify0`` and the
    prediction is printed next to the label parsed from its file name.  The
    total error count and the error rate are printed at the end.

    Relies on ``img2vector``, ``classify0``, ``listdir`` and ``np`` from the
    surrounding module.

    Args:
        trainingDir: Directory holding the training digit files.
        testDir: Directory holding the test digit files.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None; results are printed.
    """
    hwLabels = []
    trainingFileList = listdir(trainingDir)  # load the training set
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        # The part of the name before '_' is the digit the file depicts.
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))
    testFileList = listdir(testDir)  # iterate through the test set
    errorCount = 0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % errorRate)

In [31]:
# Run the verbose variant, which prints every prediction next to the true label.
handwritingClassTest()

the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answe

the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answe

the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answe

the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answe

the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 6, the real answe

the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 8, the real answer is: 8
the classifier came back with: 8, the real answer is: 8
the classifier came back with: 8, the real answe

the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 7, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answe